linux/mm/huge_memory.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  Copyright (C) 2009  Red Hat, Inc.
   4 */
   5
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7
   8#include <linux/mm.h>
   9#include <linux/sched.h>
  10#include <linux/sched/coredump.h>
  11#include <linux/sched/numa_balancing.h>
  12#include <linux/highmem.h>
  13#include <linux/hugetlb.h>
  14#include <linux/mmu_notifier.h>
  15#include <linux/rmap.h>
  16#include <linux/swap.h>
  17#include <linux/shrinker.h>
  18#include <linux/mm_inline.h>
  19#include <linux/swapops.h>
  20#include <linux/dax.h>
  21#include <linux/khugepaged.h>
  22#include <linux/freezer.h>
  23#include <linux/pfn_t.h>
  24#include <linux/mman.h>
  25#include <linux/memremap.h>
  26#include <linux/pagemap.h>
  27#include <linux/debugfs.h>
  28#include <linux/migrate.h>
  29#include <linux/hashtable.h>
  30#include <linux/userfaultfd_k.h>
  31#include <linux/page_idle.h>
  32#include <linux/shmem_fs.h>
  33#include <linux/oom.h>
  34#include <linux/numa.h>
  35#include <linux/page_owner.h>
  36
  37#include <asm/tlb.h>
  38#include <asm/pgalloc.h>
  39#include "internal.h"
  40
  41/*
  42 * By default, transparent hugepage support is disabled in order to avoid
  43 * risking an increased memory footprint for applications that are not
  44 * guaranteed to benefit from it. When transparent hugepage support is
  45 * enabled, it is for all mappings, and khugepaged scans all mappings.
  46 * Defrag is invoked by khugepaged hugepage allocations and by page faults
  47 * for all hugepage allocations.
  48 */
  49unsigned long transparent_hugepage_flags __read_mostly =
  50#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
  51        (1<<TRANSPARENT_HUGEPAGE_FLAG)|
  52#endif
  53#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
  54        (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
  55#endif
  56        (1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
  57        (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
  58        (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
  59
  60static struct shrinker deferred_split_shrinker;
  61
  62static atomic_t huge_zero_refcount;
  63struct page *huge_zero_page __read_mostly;
  64
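     /*
      * Report whether THP may be used for @vma: the vma must be able to
      * hold at least one properly aligned PMD-sized page, and must be
      * either anonymous (gated by the global enabled/madvise setting) or
      * shmem (gated by the shmem huge= policy).
      */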
  65bool transparent_hugepage_enabled(struct vm_area_struct *vma)
  66{
  67        /* The addr is used to check if the vma size fits */
  68        unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
  69
  70        if (!transhuge_vma_suitable(vma, addr))
  71                return false;
  72        if (vma_is_anonymous(vma))
  73                return __transparent_hugepage_enabled(vma);
  74        if (vma_is_shmem(vma))
  75                return shmem_huge_enabled(vma);
  76
  77        return false;
  78}
  79
  80static struct page *get_huge_zero_page(void)
  81{
  82        struct page *zero_page;
  83retry:
  84        if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
  85                return READ_ONCE(huge_zero_page);
  86
  87        zero_page = alloc_pages((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
  88                        HPAGE_PMD_ORDER);
  89        if (!zero_page) {
  90                count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
  91                return NULL;
  92        }
  93        count_vm_event(THP_ZERO_PAGE_ALLOC);
  94        preempt_disable();
  95        if (cmpxchg(&huge_zero_page, NULL, zero_page)) {
  96                preempt_enable();
  97                __free_pages(zero_page, compound_order(zero_page));
  98                goto retry;
  99        }
 100
  101        /* We take an additional reference here. It will be put back by the shrinker */
 102        atomic_set(&huge_zero_refcount, 2);
 103        preempt_enable();
 104        return READ_ONCE(huge_zero_page);
 105}
 106
 107static void put_huge_zero_page(void)
 108{
 109        /*
 110         * Counter should never go to zero here. Only shrinker can put
 111         * last reference.
 112         */
 113        BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 114}
 115
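     /*
      * Per-mm access to the huge zero page.  The first caller in an mm
      * takes a reference on the global page and records that with
      * MMF_HUGE_ZERO_PAGE; mm_put_huge_zero_page() drops it again when
      * the mm is torn down.  The shrinker frees the page once only its
      * own reference is left.
      */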
 116struct page *mm_get_huge_zero_page(struct mm_struct *mm)
 117{
 118        if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
 119                return READ_ONCE(huge_zero_page);
 120
 121        if (!get_huge_zero_page())
 122                return NULL;
 123
 124        if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
 125                put_huge_zero_page();
 126
 127        return READ_ONCE(huge_zero_page);
 128}
 129
 130void mm_put_huge_zero_page(struct mm_struct *mm)
 131{
 132        if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
 133                put_huge_zero_page();
 134}
 135
 136static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
 137                                        struct shrink_control *sc)
 138{
 139        /* we can free zero page only if last reference remains */
 140        return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
 141}
 142
 143static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
 144                                       struct shrink_control *sc)
 145{
 146        if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
 147                struct page *zero_page = xchg(&huge_zero_page, NULL);
 148                BUG_ON(zero_page == NULL);
 149                __free_pages(zero_page, compound_order(zero_page));
 150                return HPAGE_PMD_NR;
 151        }
 152
 153        return 0;
 154}
 155
 156static struct shrinker huge_zero_page_shrinker = {
 157        .count_objects = shrink_huge_zero_page_count,
 158        .scan_objects = shrink_huge_zero_page_scan,
 159        .seeks = DEFAULT_SEEKS,
 160};
 161
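     /*
      * sysfs interface, rooted at /sys/kernel/mm/transparent_hugepage/.
      * The "enabled" and "defrag" files list every accepted keyword and
      * bracket the current selection, e.g. (as root):
      *
      *    echo madvise > /sys/kernel/mm/transparent_hugepage/enabled
      *    cat /sys/kernel/mm/transparent_hugepage/enabled
      *    always [madvise] never
      */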
 162#ifdef CONFIG_SYSFS
 163static ssize_t enabled_show(struct kobject *kobj,
 164                            struct kobj_attribute *attr, char *buf)
 165{
 166        if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
 167                return sprintf(buf, "[always] madvise never\n");
 168        else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
 169                return sprintf(buf, "always [madvise] never\n");
 170        else
 171                return sprintf(buf, "always madvise [never]\n");
 172}
 173
 174static ssize_t enabled_store(struct kobject *kobj,
 175                             struct kobj_attribute *attr,
 176                             const char *buf, size_t count)
 177{
 178        ssize_t ret = count;
 179
 180        if (!memcmp("always", buf,
 181                    min(sizeof("always")-1, count))) {
 182                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
 183                set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
 184        } else if (!memcmp("madvise", buf,
 185                           min(sizeof("madvise")-1, count))) {
 186                clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
 187                set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
 188        } else if (!memcmp("never", buf,
 189                           min(sizeof("never")-1, count))) {
 190                clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
 191                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
 192        } else
 193                ret = -EINVAL;
 194
 195        if (ret > 0) {
 196                int err = start_stop_khugepaged();
 197                if (err)
 198                        ret = err;
 199        }
 200        return ret;
 201}
 202static struct kobj_attribute enabled_attr =
 203        __ATTR(enabled, 0644, enabled_show, enabled_store);
 204
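     /*
      * Generic show/store helpers for single-bit flags in
      * transparent_hugepage_flags (use_zero_page, debug_cow, ...).
      * Writes accept only "0" or "1".
      */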
 205ssize_t single_hugepage_flag_show(struct kobject *kobj,
 206                                struct kobj_attribute *attr, char *buf,
 207                                enum transparent_hugepage_flag flag)
 208{
 209        return sprintf(buf, "%d\n",
 210                       !!test_bit(flag, &transparent_hugepage_flags));
 211}
 212
 213ssize_t single_hugepage_flag_store(struct kobject *kobj,
 214                                 struct kobj_attribute *attr,
 215                                 const char *buf, size_t count,
 216                                 enum transparent_hugepage_flag flag)
 217{
 218        unsigned long value;
 219        int ret;
 220
 221        ret = kstrtoul(buf, 10, &value);
 222        if (ret < 0)
 223                return ret;
 224        if (value > 1)
 225                return -EINVAL;
 226
 227        if (value)
 228                set_bit(flag, &transparent_hugepage_flags);
 229        else
 230                clear_bit(flag, &transparent_hugepage_flags);
 231
 232        return count;
 233}
 234
 235static ssize_t defrag_show(struct kobject *kobj,
 236                           struct kobj_attribute *attr, char *buf)
 237{
 238        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
 239                return sprintf(buf, "[always] defer defer+madvise madvise never\n");
 240        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
 241                return sprintf(buf, "always [defer] defer+madvise madvise never\n");
 242        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
 243                return sprintf(buf, "always defer [defer+madvise] madvise never\n");
 244        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
 245                return sprintf(buf, "always defer defer+madvise [madvise] never\n");
 246        return sprintf(buf, "always defer defer+madvise madvise [never]\n");
 247}
 248
 249static ssize_t defrag_store(struct kobject *kobj,
 250                            struct kobj_attribute *attr,
 251                            const char *buf, size_t count)
 252{
 253        if (!memcmp("always", buf,
 254                    min(sizeof("always")-1, count))) {
 255                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
 256                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
 257                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
 258                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
 259        } else if (!memcmp("defer+madvise", buf,
 260                    min(sizeof("defer+madvise")-1, count))) {
 261                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
 262                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
 263                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
 264                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
 265        } else if (!memcmp("defer", buf,
 266                    min(sizeof("defer")-1, count))) {
 267                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
 268                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
 269                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
 270                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
 271        } else if (!memcmp("madvise", buf,
 272                           min(sizeof("madvise")-1, count))) {
 273                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
 274                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
 275                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
 276                set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
 277        } else if (!memcmp("never", buf,
 278                           min(sizeof("never")-1, count))) {
 279                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
 280                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
 281                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
 282                clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
 283        } else
 284                return -EINVAL;
 285
 286        return count;
 287}
 288static struct kobj_attribute defrag_attr =
 289        __ATTR(defrag, 0644, defrag_show, defrag_store);
 290
 291static ssize_t use_zero_page_show(struct kobject *kobj,
 292                struct kobj_attribute *attr, char *buf)
 293{
 294        return single_hugepage_flag_show(kobj, attr, buf,
 295                                TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
 296}
 297static ssize_t use_zero_page_store(struct kobject *kobj,
 298                struct kobj_attribute *attr, const char *buf, size_t count)
 299{
 300        return single_hugepage_flag_store(kobj, attr, buf, count,
 301                                 TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
 302}
 303static struct kobj_attribute use_zero_page_attr =
 304        __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
 305
 306static ssize_t hpage_pmd_size_show(struct kobject *kobj,
 307                struct kobj_attribute *attr, char *buf)
 308{
 309        return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE);
 310}
 311static struct kobj_attribute hpage_pmd_size_attr =
 312        __ATTR_RO(hpage_pmd_size);
 313
 314#ifdef CONFIG_DEBUG_VM
 315static ssize_t debug_cow_show(struct kobject *kobj,
 316                                struct kobj_attribute *attr, char *buf)
 317{
 318        return single_hugepage_flag_show(kobj, attr, buf,
 319                                TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
 320}
 321static ssize_t debug_cow_store(struct kobject *kobj,
 322                               struct kobj_attribute *attr,
 323                               const char *buf, size_t count)
 324{
 325        return single_hugepage_flag_store(kobj, attr, buf, count,
 326                                 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
 327}
 328static struct kobj_attribute debug_cow_attr =
 329        __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
 330#endif /* CONFIG_DEBUG_VM */
 331
 332static struct attribute *hugepage_attr[] = {
 333        &enabled_attr.attr,
 334        &defrag_attr.attr,
 335        &use_zero_page_attr.attr,
 336        &hpage_pmd_size_attr.attr,
 337#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
 338        &shmem_enabled_attr.attr,
 339#endif
 340#ifdef CONFIG_DEBUG_VM
 341        &debug_cow_attr.attr,
 342#endif
 343        NULL,
 344};
 345
 346static const struct attribute_group hugepage_attr_group = {
 347        .attrs = hugepage_attr,
 348};
 349
 350static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 351{
 352        int err;
 353
 354        *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
 355        if (unlikely(!*hugepage_kobj)) {
 356                pr_err("failed to create transparent hugepage kobject\n");
 357                return -ENOMEM;
 358        }
 359
 360        err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group);
 361        if (err) {
 362                pr_err("failed to register transparent hugepage group\n");
 363                goto delete_obj;
 364        }
 365
 366        err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group);
 367        if (err) {
 368                pr_err("failed to register transparent hugepage group\n");
 369                goto remove_hp_group;
 370        }
 371
 372        return 0;
 373
 374remove_hp_group:
 375        sysfs_remove_group(*hugepage_kobj, &hugepage_attr_group);
 376delete_obj:
 377        kobject_put(*hugepage_kobj);
 378        return err;
 379}
 380
 381static void __init hugepage_exit_sysfs(struct kobject *hugepage_kobj)
 382{
 383        sysfs_remove_group(hugepage_kobj, &khugepaged_attr_group);
 384        sysfs_remove_group(hugepage_kobj, &hugepage_attr_group);
 385        kobject_put(hugepage_kobj);
 386}
 387#else
 388static inline int hugepage_init_sysfs(struct kobject **hugepage_kobj)
 389{
 390        return 0;
 391}
 392
 393static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
 394{
 395}
 396#endif /* CONFIG_SYSFS */
 397
 398static int __init hugepage_init(void)
 399{
 400        int err;
 401        struct kobject *hugepage_kobj;
 402
 403        if (!has_transparent_hugepage()) {
 404                transparent_hugepage_flags = 0;
 405                return -EINVAL;
 406        }
 407
 408        /*
 409         * hugepages can't be allocated by the buddy allocator
 410         */
 411        MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
 412        /*
 413         * we use page->mapping and page->index in second tail page
 414         * as list_head: assuming THP order >= 2
 415         */
 416        MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
 417
 418        err = hugepage_init_sysfs(&hugepage_kobj);
 419        if (err)
 420                goto err_sysfs;
 421
 422        err = khugepaged_init();
 423        if (err)
 424                goto err_slab;
 425
 426        err = register_shrinker(&huge_zero_page_shrinker);
 427        if (err)
 428                goto err_hzp_shrinker;
 429        err = register_shrinker(&deferred_split_shrinker);
 430        if (err)
 431                goto err_split_shrinker;
 432
 433        /*
 434         * By default disable transparent hugepages on smaller systems,
 435         * where the extra memory used could hurt more than TLB overhead
 436         * is likely to save.  The admin can still enable it through /sys.
 437         */
 438        if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
 439                transparent_hugepage_flags = 0;
 440                return 0;
 441        }
 442
 443        err = start_stop_khugepaged();
 444        if (err)
 445                goto err_khugepaged;
 446
 447        return 0;
 448err_khugepaged:
 449        unregister_shrinker(&deferred_split_shrinker);
 450err_split_shrinker:
 451        unregister_shrinker(&huge_zero_page_shrinker);
 452err_hzp_shrinker:
 453        khugepaged_destroy();
 454err_slab:
 455        hugepage_exit_sysfs(hugepage_kobj);
 456err_sysfs:
 457        return err;
 458}
 459subsys_initcall(hugepage_init);
 460
 461static int __init setup_transparent_hugepage(char *str)
 462{
 463        int ret = 0;
 464        if (!str)
 465                goto out;
 466        if (!strcmp(str, "always")) {
 467                set_bit(TRANSPARENT_HUGEPAGE_FLAG,
 468                        &transparent_hugepage_flags);
 469                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 470                          &transparent_hugepage_flags);
 471                ret = 1;
 472        } else if (!strcmp(str, "madvise")) {
 473                clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
 474                          &transparent_hugepage_flags);
 475                set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 476                        &transparent_hugepage_flags);
 477                ret = 1;
 478        } else if (!strcmp(str, "never")) {
 479                clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
 480                          &transparent_hugepage_flags);
 481                clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
 482                          &transparent_hugepage_flags);
 483                ret = 1;
 484        }
 485out:
 486        if (!ret)
 487                pr_warn("transparent_hugepage= cannot parse, ignored\n");
 488        return ret;
 489}
 490__setup("transparent_hugepage=", setup_transparent_hugepage);
 491
 492pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 493{
 494        if (likely(vma->vm_flags & VM_WRITE))
 495                pmd = pmd_mkwrite(pmd);
 496        return pmd;
 497}
 498
 499#ifdef CONFIG_MEMCG
 500static inline struct deferred_split *get_deferred_split_queue(struct page *page)
 501{
 502        struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
 503        struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
 504
 505        if (memcg)
 506                return &memcg->deferred_split_queue;
 507        else
 508                return &pgdat->deferred_split_queue;
 509}
 510#else
 511static inline struct deferred_split *get_deferred_split_queue(struct page *page)
 512{
 513        struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
 514
 515        return &pgdat->deferred_split_queue;
 516}
 517#endif
 518
 519void prep_transhuge_page(struct page *page)
 520{
 521        /*
  522         * we use page->mapping and page->index in second tail page
 523         * as list_head: assuming THP order >= 2
 524         */
 525
 526        INIT_LIST_HEAD(page_deferred_list(page));
 527        set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
 528}
 529
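     /*
      * Look for a @size-aligned area: over-ask by @size, then shift the
      * result so that (addr & (size - 1)) == (off & (size - 1)), which is
      * what a PMD-sized DAX mapping needs.  Returns 0 if no such area can
      * be provided, letting the caller fall back to the regular
      * get_unmapped_area().
      */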
 530static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
 531                loff_t off, unsigned long flags, unsigned long size)
 532{
 533        unsigned long addr;
 534        loff_t off_end = off + len;
 535        loff_t off_align = round_up(off, size);
 536        unsigned long len_pad;
 537
 538        if (off_end <= off_align || (off_end - off_align) < size)
 539                return 0;
 540
 541        len_pad = len + size;
 542        if (len_pad < len || (off + len_pad) < off)
 543                return 0;
 544
 545        addr = current->mm->get_unmapped_area(filp, 0, len_pad,
 546                                              off >> PAGE_SHIFT, flags);
 547        if (IS_ERR_VALUE(addr))
 548                return 0;
 549
 550        addr += (off - addr) & (size - 1);
 551        return addr;
 552}
 553
 554unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 555                unsigned long len, unsigned long pgoff, unsigned long flags)
 556{
 557        loff_t off = (loff_t)pgoff << PAGE_SHIFT;
 558
 559        if (addr)
 560                goto out;
 561        if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
 562                goto out;
 563
 564        addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
 565        if (addr)
 566                return addr;
 567
 568 out:
 569        return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
 570}
 571EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
 572
 573static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 574                        struct page *page, gfp_t gfp)
 575{
 576        struct vm_area_struct *vma = vmf->vma;
 577        struct mem_cgroup *memcg;
 578        pgtable_t pgtable;
 579        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
 580        vm_fault_t ret = 0;
 581
 582        VM_BUG_ON_PAGE(!PageCompound(page), page);
 583
 584        if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
 585                put_page(page);
 586                count_vm_event(THP_FAULT_FALLBACK);
 587                return VM_FAULT_FALLBACK;
 588        }
 589
 590        pgtable = pte_alloc_one(vma->vm_mm);
 591        if (unlikely(!pgtable)) {
 592                ret = VM_FAULT_OOM;
 593                goto release;
 594        }
 595
 596        clear_huge_page(page, vmf->address, HPAGE_PMD_NR);
 597        /*
 598         * The memory barrier inside __SetPageUptodate makes sure that
 599         * clear_huge_page writes become visible before the set_pmd_at()
 600         * write.
 601         */
 602        __SetPageUptodate(page);
 603
 604        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
 605        if (unlikely(!pmd_none(*vmf->pmd))) {
 606                goto unlock_release;
 607        } else {
 608                pmd_t entry;
 609
 610                ret = check_stable_address_space(vma->vm_mm);
 611                if (ret)
 612                        goto unlock_release;
 613
 614                /* Deliver the page fault to userland */
 615                if (userfaultfd_missing(vma)) {
 616                        vm_fault_t ret2;
 617
 618                        spin_unlock(vmf->ptl);
 619                        mem_cgroup_cancel_charge(page, memcg, true);
 620                        put_page(page);
 621                        pte_free(vma->vm_mm, pgtable);
 622                        ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
 623                        VM_BUG_ON(ret2 & VM_FAULT_FALLBACK);
 624                        return ret2;
 625                }
 626
 627                entry = mk_huge_pmd(page, vma->vm_page_prot);
 628                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 629                page_add_new_anon_rmap(page, vma, haddr, true);
 630                mem_cgroup_commit_charge(page, memcg, false, true);
 631                lru_cache_add_active_or_unevictable(page, vma);
 632                pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
 633                set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
 634                add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
 635                mm_inc_nr_ptes(vma->vm_mm);
 636                spin_unlock(vmf->ptl);
 637                count_vm_event(THP_FAULT_ALLOC);
 638                count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
 639        }
 640
 641        return 0;
 642unlock_release:
 643        spin_unlock(vmf->ptl);
 644release:
 645        if (pgtable)
 646                pte_free(vma->vm_mm, pgtable);
 647        mem_cgroup_cancel_charge(page, memcg, true);
 648        put_page(page);
 649        return ret;
 650
 651}
 652
 653/*
 654 * always: directly stall for all thp allocations
 655 * defer: wake kswapd and fail if not immediately available
 656 * defer+madvise: wake kswapd and directly stall for MADV_HUGEPAGE, otherwise
 657 *                fail if not immediately available
 658 * madvise: directly stall for MADV_HUGEPAGE, otherwise fail if not immediately
 659 *          available
 660 * never: never stall for any thp allocation
 661 */
 662static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
 663{
 664        const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
 665
 666        /* Always do synchronous compaction */
 667        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
 668                return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
 669
 670        /* Kick kcompactd and fail quickly */
 671        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
 672                return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
 673
 674        /* Synchronous compaction if madvised, otherwise kick kcompactd */
 675        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
 676                return GFP_TRANSHUGE_LIGHT |
 677                        (vma_madvised ? __GFP_DIRECT_RECLAIM :
 678                                        __GFP_KSWAPD_RECLAIM);
 679
 680        /* Only do synchronous compaction if madvised */
 681        if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
 682                return GFP_TRANSHUGE_LIGHT |
 683                       (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
 684
 685        return GFP_TRANSHUGE_LIGHT;
 686}
 687
 688/* Caller must hold page table lock. */
 689static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
 690                struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
 691                struct page *zero_page)
 692{
 693        pmd_t entry;
 694        if (!pmd_none(*pmd))
 695                return false;
 696        entry = mk_pmd(zero_page, vma->vm_page_prot);
 697        entry = pmd_mkhuge(entry);
 698        if (pgtable)
 699                pgtable_trans_huge_deposit(mm, pmd, pgtable);
 700        set_pmd_at(mm, haddr, pmd, entry);
 701        mm_inc_nr_ptes(mm);
 702        return true;
 703}
 704
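     /*
      * Anonymous huge page fault.  Read faults may map the shared huge
      * zero page when that is enabled and the mm allows it; otherwise a
      * THP is allocated according to the defrag policy.  Allocation
      * failures return VM_FAULT_FALLBACK so the fault is retried with
      * ordinary pages.
      */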
 705vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 706{
 707        struct vm_area_struct *vma = vmf->vma;
 708        gfp_t gfp;
 709        struct page *page;
 710        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
 711
 712        if (!transhuge_vma_suitable(vma, haddr))
 713                return VM_FAULT_FALLBACK;
 714        if (unlikely(anon_vma_prepare(vma)))
 715                return VM_FAULT_OOM;
 716        if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
 717                return VM_FAULT_OOM;
 718        if (!(vmf->flags & FAULT_FLAG_WRITE) &&
 719                        !mm_forbids_zeropage(vma->vm_mm) &&
 720                        transparent_hugepage_use_zero_page()) {
 721                pgtable_t pgtable;
 722                struct page *zero_page;
 723                bool set;
 724                vm_fault_t ret;
 725                pgtable = pte_alloc_one(vma->vm_mm);
 726                if (unlikely(!pgtable))
 727                        return VM_FAULT_OOM;
 728                zero_page = mm_get_huge_zero_page(vma->vm_mm);
 729                if (unlikely(!zero_page)) {
 730                        pte_free(vma->vm_mm, pgtable);
 731                        count_vm_event(THP_FAULT_FALLBACK);
 732                        return VM_FAULT_FALLBACK;
 733                }
 734                vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
 735                ret = 0;
 736                set = false;
 737                if (pmd_none(*vmf->pmd)) {
 738                        ret = check_stable_address_space(vma->vm_mm);
 739                        if (ret) {
 740                                spin_unlock(vmf->ptl);
 741                        } else if (userfaultfd_missing(vma)) {
 742                                spin_unlock(vmf->ptl);
 743                                ret = handle_userfault(vmf, VM_UFFD_MISSING);
 744                                VM_BUG_ON(ret & VM_FAULT_FALLBACK);
 745                        } else {
 746                                set_huge_zero_page(pgtable, vma->vm_mm, vma,
 747                                                   haddr, vmf->pmd, zero_page);
 748                                spin_unlock(vmf->ptl);
 749                                set = true;
 750                        }
 751                } else
 752                        spin_unlock(vmf->ptl);
 753                if (!set)
 754                        pte_free(vma->vm_mm, pgtable);
 755                return ret;
 756        }
 757        gfp = alloc_hugepage_direct_gfpmask(vma);
 758        page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
 759        if (unlikely(!page)) {
 760                count_vm_event(THP_FAULT_FALLBACK);
 761                return VM_FAULT_FALLBACK;
 762        }
 763        prep_transhuge_page(page);
 764        return __do_huge_pmd_anonymous_page(vmf, page, gfp);
 765}
 766
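     /*
      * Install a PMD-sized pfn mapping; used by vmf_insert_pfn_pmd() for
      * DAX and other VM_PFNMAP/VM_MIXEDMAP users.  An already populated
      * pmd is only refreshed for a write fault on the same pfn.
      */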
 767static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 768                pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
 769                pgtable_t pgtable)
 770{
 771        struct mm_struct *mm = vma->vm_mm;
 772        pmd_t entry;
 773        spinlock_t *ptl;
 774
 775        ptl = pmd_lock(mm, pmd);
 776        if (!pmd_none(*pmd)) {
 777                if (write) {
 778                        if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
 779                                WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
 780                                goto out_unlock;
 781                        }
 782                        entry = pmd_mkyoung(*pmd);
 783                        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 784                        if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
 785                                update_mmu_cache_pmd(vma, addr, pmd);
 786                }
 787
 788                goto out_unlock;
 789        }
 790
 791        entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
 792        if (pfn_t_devmap(pfn))
 793                entry = pmd_mkdevmap(entry);
 794        if (write) {
 795                entry = pmd_mkyoung(pmd_mkdirty(entry));
 796                entry = maybe_pmd_mkwrite(entry, vma);
 797        }
 798
 799        if (pgtable) {
 800                pgtable_trans_huge_deposit(mm, pmd, pgtable);
 801                mm_inc_nr_ptes(mm);
 802                pgtable = NULL;
 803        }
 804
 805        set_pmd_at(mm, addr, pmd, entry);
 806        update_mmu_cache_pmd(vma, addr, pmd);
 807
 808out_unlock:
 809        spin_unlock(ptl);
 810        if (pgtable)
 811                pte_free(mm, pgtable);
 812}
 813
 814vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
 815{
 816        unsigned long addr = vmf->address & PMD_MASK;
 817        struct vm_area_struct *vma = vmf->vma;
 818        pgprot_t pgprot = vma->vm_page_prot;
 819        pgtable_t pgtable = NULL;
 820
 821        /*
 822         * If we had pmd_special, we could avoid all these restrictions,
 823         * but we need to be consistent with PTEs and architectures that
 824         * can't support a 'special' bit.
 825         */
 826        BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
 827                        !pfn_t_devmap(pfn));
 828        BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
 829                                                (VM_PFNMAP|VM_MIXEDMAP));
 830        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
 831
 832        if (addr < vma->vm_start || addr >= vma->vm_end)
 833                return VM_FAULT_SIGBUS;
 834
 835        if (arch_needs_pgtable_deposit()) {
 836                pgtable = pte_alloc_one(vma->vm_mm);
 837                if (!pgtable)
 838                        return VM_FAULT_OOM;
 839        }
 840
 841        track_pfn_insert(vma, &pgprot, pfn);
 842
 843        insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
 844        return VM_FAULT_NOPAGE;
 845}
 846EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
 847
 848#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 849static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
 850{
 851        if (likely(vma->vm_flags & VM_WRITE))
 852                pud = pud_mkwrite(pud);
 853        return pud;
 854}
 855
 856static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
 857                pud_t *pud, pfn_t pfn, pgprot_t prot, bool write)
 858{
 859        struct mm_struct *mm = vma->vm_mm;
 860        pud_t entry;
 861        spinlock_t *ptl;
 862
 863        ptl = pud_lock(mm, pud);
 864        if (!pud_none(*pud)) {
 865                if (write) {
 866                        if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
 867                                WARN_ON_ONCE(!is_huge_zero_pud(*pud));
 868                                goto out_unlock;
 869                        }
 870                        entry = pud_mkyoung(*pud);
 871                        entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
 872                        if (pudp_set_access_flags(vma, addr, pud, entry, 1))
 873                                update_mmu_cache_pud(vma, addr, pud);
 874                }
 875                goto out_unlock;
 876        }
 877
 878        entry = pud_mkhuge(pfn_t_pud(pfn, prot));
 879        if (pfn_t_devmap(pfn))
 880                entry = pud_mkdevmap(entry);
 881        if (write) {
 882                entry = pud_mkyoung(pud_mkdirty(entry));
 883                entry = maybe_pud_mkwrite(entry, vma);
 884        }
 885        set_pud_at(mm, addr, pud, entry);
 886        update_mmu_cache_pud(vma, addr, pud);
 887
 888out_unlock:
 889        spin_unlock(ptl);
 890}
 891
 892vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
 893{
 894        unsigned long addr = vmf->address & PUD_MASK;
 895        struct vm_area_struct *vma = vmf->vma;
 896        pgprot_t pgprot = vma->vm_page_prot;
 897
 898        /*
 899         * If we had pud_special, we could avoid all these restrictions,
 900         * but we need to be consistent with PTEs and architectures that
 901         * can't support a 'special' bit.
 902         */
 903        BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
 904                        !pfn_t_devmap(pfn));
 905        BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
 906                                                (VM_PFNMAP|VM_MIXEDMAP));
 907        BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
 908
 909        if (addr < vma->vm_start || addr >= vma->vm_end)
 910                return VM_FAULT_SIGBUS;
 911
 912        track_pfn_insert(vma, &pgprot, pfn);
 913
 914        insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
 915        return VM_FAULT_NOPAGE;
 916}
 917EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
 918#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 919
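     /*
      * Mark the huge pmd young (and dirty for FOLL_WRITE) on a
      * follow_page()-style access, much as the hardware would on a real
      * access.
      */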
 920static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 921                pmd_t *pmd, int flags)
 922{
 923        pmd_t _pmd;
 924
 925        _pmd = pmd_mkyoung(*pmd);
 926        if (flags & FOLL_WRITE)
 927                _pmd = pmd_mkdirty(_pmd);
 928        if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
 929                                pmd, _pmd, flags & FOLL_WRITE))
 930                update_mmu_cache_pmd(vma, addr, pmd);
 931}
 932
 933struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 934                pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
 935{
 936        unsigned long pfn = pmd_pfn(*pmd);
 937        struct mm_struct *mm = vma->vm_mm;
 938        struct page *page;
 939
 940        assert_spin_locked(pmd_lockptr(mm, pmd));
 941
 942        /*
 943         * When we COW a devmap PMD entry, we split it into PTEs, so we should
 944         * not be in this function with `flags & FOLL_COW` set.
 945         */
 946        WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
 947
 948        if (flags & FOLL_WRITE && !pmd_write(*pmd))
 949                return NULL;
 950
 951        if (pmd_present(*pmd) && pmd_devmap(*pmd))
 952                /* pass */;
 953        else
 954                return NULL;
 955
 956        if (flags & FOLL_TOUCH)
 957                touch_pmd(vma, addr, pmd, flags);
 958
 959        /*
 960         * device mapped pages can only be returned if the
 961         * caller will manage the page reference count.
 962         */
 963        if (!(flags & FOLL_GET))
 964                return ERR_PTR(-EEXIST);
 965
 966        pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
 967        *pgmap = get_dev_pagemap(pfn, *pgmap);
 968        if (!*pgmap)
 969                return ERR_PTR(-EFAULT);
 970        page = pfn_to_page(pfn);
 971        get_page(page);
 972
 973        return page;
 974}
 975
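     /*
      * Copy an anonymous huge pmd at fork().  File-backed mappings are
      * skipped and simply refaulted in the child.  PMD migration entries
      * are copied as-is (downgrading a writable migration entry to
      * read-only).  -EAGAIN means the entry is no longer a huge pmd and
      * the caller should fall back to the normal pte copy path.
      */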
 976int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 977                  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
 978                  struct vm_area_struct *vma)
 979{
 980        spinlock_t *dst_ptl, *src_ptl;
 981        struct page *src_page;
 982        pmd_t pmd;
 983        pgtable_t pgtable = NULL;
 984        int ret = -ENOMEM;
 985
  986        /* Skip if it can be refilled on fault */
 987        if (!vma_is_anonymous(vma))
 988                return 0;
 989
 990        pgtable = pte_alloc_one(dst_mm);
 991        if (unlikely(!pgtable))
 992                goto out;
 993
 994        dst_ptl = pmd_lock(dst_mm, dst_pmd);
 995        src_ptl = pmd_lockptr(src_mm, src_pmd);
 996        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
 997
 998        ret = -EAGAIN;
 999        pmd = *src_pmd;
1000
1001#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1002        if (unlikely(is_swap_pmd(pmd))) {
1003                swp_entry_t entry = pmd_to_swp_entry(pmd);
1004
1005                VM_BUG_ON(!is_pmd_migration_entry(pmd));
1006                if (is_write_migration_entry(entry)) {
1007                        make_migration_entry_read(&entry);
1008                        pmd = swp_entry_to_pmd(entry);
1009                        if (pmd_swp_soft_dirty(*src_pmd))
1010                                pmd = pmd_swp_mksoft_dirty(pmd);
1011                        set_pmd_at(src_mm, addr, src_pmd, pmd);
1012                }
1013                add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1014                mm_inc_nr_ptes(dst_mm);
1015                pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1016                set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1017                ret = 0;
1018                goto out_unlock;
1019        }
1020#endif
1021
1022        if (unlikely(!pmd_trans_huge(pmd))) {
1023                pte_free(dst_mm, pgtable);
1024                goto out_unlock;
1025        }
1026        /*
1027         * When page table lock is held, the huge zero pmd should not be
 1028         * under splitting since we don't split the page itself, only the pmd
 1029         * into a page table.
1030         */
1031        if (is_huge_zero_pmd(pmd)) {
1032                struct page *zero_page;
1033                /*
1034                 * get_huge_zero_page() will never allocate a new page here,
1035                 * since we already have a zero page to copy. It just takes a
1036                 * reference.
1037                 */
1038                zero_page = mm_get_huge_zero_page(dst_mm);
1039                set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
1040                                zero_page);
1041                ret = 0;
1042                goto out_unlock;
1043        }
1044
1045        src_page = pmd_page(pmd);
1046        VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
1047        get_page(src_page);
1048        page_dup_rmap(src_page, true);
1049        add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1050        mm_inc_nr_ptes(dst_mm);
1051        pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
1052
1053        pmdp_set_wrprotect(src_mm, addr, src_pmd);
1054        pmd = pmd_mkold(pmd_wrprotect(pmd));
1055        set_pmd_at(dst_mm, addr, dst_pmd, pmd);
1056
1057        ret = 0;
1058out_unlock:
1059        spin_unlock(src_ptl);
1060        spin_unlock(dst_ptl);
1061out:
1062        return ret;
1063}
1064
1065#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
1066static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
1067                pud_t *pud, int flags)
1068{
1069        pud_t _pud;
1070
1071        _pud = pud_mkyoung(*pud);
1072        if (flags & FOLL_WRITE)
1073                _pud = pud_mkdirty(_pud);
1074        if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
1075                                pud, _pud, flags & FOLL_WRITE))
1076                update_mmu_cache_pud(vma, addr, pud);
1077}
1078
1079struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
1080                pud_t *pud, int flags, struct dev_pagemap **pgmap)
1081{
1082        unsigned long pfn = pud_pfn(*pud);
1083        struct mm_struct *mm = vma->vm_mm;
1084        struct page *page;
1085
1086        assert_spin_locked(pud_lockptr(mm, pud));
1087
1088        if (flags & FOLL_WRITE && !pud_write(*pud))
1089                return NULL;
1090
1091        if (pud_present(*pud) && pud_devmap(*pud))
1092                /* pass */;
1093        else
1094                return NULL;
1095
1096        if (flags & FOLL_TOUCH)
1097                touch_pud(vma, addr, pud, flags);
1098
1099        /*
1100         * device mapped pages can only be returned if the
1101         * caller will manage the page reference count.
1102         */
1103        if (!(flags & FOLL_GET))
1104                return ERR_PTR(-EEXIST);
1105
1106        pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
1107        *pgmap = get_dev_pagemap(pfn, *pgmap);
1108        if (!*pgmap)
1109                return ERR_PTR(-EFAULT);
1110        page = pfn_to_page(pfn);
1111        get_page(page);
1112
1113        return page;
1114}
1115
1116int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
1117                  pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
1118                  struct vm_area_struct *vma)
1119{
1120        spinlock_t *dst_ptl, *src_ptl;
1121        pud_t pud;
1122        int ret;
1123
1124        dst_ptl = pud_lock(dst_mm, dst_pud);
1125        src_ptl = pud_lockptr(src_mm, src_pud);
1126        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
1127
1128        ret = -EAGAIN;
1129        pud = *src_pud;
1130        if (unlikely(!pud_trans_huge(pud) && !pud_devmap(pud)))
1131                goto out_unlock;
1132
1133        /*
1134         * When page table lock is held, the huge zero pud should not be
 1135         * under splitting since we don't split the page itself, only the pud
 1136         * into a page table.
1137         */
1138        if (is_huge_zero_pud(pud)) {
1139                /* No huge zero pud yet */
1140        }
1141
1142        pudp_set_wrprotect(src_mm, addr, src_pud);
1143        pud = pud_mkold(pud_wrprotect(pud));
1144        set_pud_at(dst_mm, addr, dst_pud, pud);
1145
1146        ret = 0;
1147out_unlock:
1148        spin_unlock(src_ptl);
1149        spin_unlock(dst_ptl);
1150        return ret;
1151}
1152
1153void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
1154{
1155        pud_t entry;
1156        unsigned long haddr;
1157        bool write = vmf->flags & FAULT_FLAG_WRITE;
1158
1159        vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
1160        if (unlikely(!pud_same(*vmf->pud, orig_pud)))
1161                goto unlock;
1162
1163        entry = pud_mkyoung(orig_pud);
1164        if (write)
1165                entry = pud_mkdirty(entry);
1166        haddr = vmf->address & HPAGE_PUD_MASK;
1167        if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
1168                update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
1169
1170unlock:
1171        spin_unlock(vmf->ptl);
1172}
1173#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
1174
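     /*
      * Fault handler that only needs to mark the huge pmd accessed (and
      * dirty for a write), e.g. on architectures that do not set those
      * bits in hardware.
      */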
1175void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
1176{
1177        pmd_t entry;
1178        unsigned long haddr;
1179        bool write = vmf->flags & FAULT_FLAG_WRITE;
1180
1181        vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
1182        if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
1183                goto unlock;
1184
1185        entry = pmd_mkyoung(orig_pmd);
1186        if (write)
1187                entry = pmd_mkdirty(entry);
1188        haddr = vmf->address & HPAGE_PMD_MASK;
1189        if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
1190                update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
1191
1192unlock:
1193        spin_unlock(vmf->ptl);
1194}
1195
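     /*
      * COW fallback for a huge pmd when a new huge page cannot be
      * allocated: copy the data into HPAGE_PMD_NR small pages and remap
      * the range with a regular page table built from the deposited
      * pgtable.
      */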
1196static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
1197                        pmd_t orig_pmd, struct page *page)
1198{
1199        struct vm_area_struct *vma = vmf->vma;
1200        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1201        struct mem_cgroup *memcg;
1202        pgtable_t pgtable;
1203        pmd_t _pmd;
1204        int i;
1205        vm_fault_t ret = 0;
1206        struct page **pages;
1207        struct mmu_notifier_range range;
1208
1209        pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
1210                              GFP_KERNEL);
1211        if (unlikely(!pages)) {
1212                ret |= VM_FAULT_OOM;
1213                goto out;
1214        }
1215
1216        for (i = 0; i < HPAGE_PMD_NR; i++) {
1217                pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
1218                                               vmf->address, page_to_nid(page));
1219                if (unlikely(!pages[i] ||
1220                             mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
1221                                     GFP_KERNEL, &memcg, false))) {
1222                        if (pages[i])
1223                                put_page(pages[i]);
1224                        while (--i >= 0) {
1225                                memcg = (void *)page_private(pages[i]);
1226                                set_page_private(pages[i], 0);
1227                                mem_cgroup_cancel_charge(pages[i], memcg,
1228                                                false);
1229                                put_page(pages[i]);
1230                        }
1231                        kfree(pages);
1232                        ret |= VM_FAULT_OOM;
1233                        goto out;
1234                }
1235                set_page_private(pages[i], (unsigned long)memcg);
1236        }
1237
1238        for (i = 0; i < HPAGE_PMD_NR; i++) {
1239                copy_user_highpage(pages[i], page + i,
1240                                   haddr + PAGE_SIZE * i, vma);
1241                __SetPageUptodate(pages[i]);
1242                cond_resched();
1243        }
1244
1245        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1246                                haddr, haddr + HPAGE_PMD_SIZE);
1247        mmu_notifier_invalidate_range_start(&range);
1248
1249        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1250        if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
1251                goto out_free_pages;
1252        VM_BUG_ON_PAGE(!PageHead(page), page);
1253
1254        /*
 1255         * Leave pmd empty until the ptes are filled.  Note we must notify here as
1256         * concurrent CPU thread might write to new page before the call to
1257         * mmu_notifier_invalidate_range_end() happens which can lead to a
1258         * device seeing memory write in different order than CPU.
1259         *
1260         * See Documentation/vm/mmu_notifier.rst
1261         */
1262        pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
1263
1264        pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
1265        pmd_populate(vma->vm_mm, &_pmd, pgtable);
1266
1267        for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
1268                pte_t entry;
1269                entry = mk_pte(pages[i], vma->vm_page_prot);
1270                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1271                memcg = (void *)page_private(pages[i]);
1272                set_page_private(pages[i], 0);
1273                page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
1274                mem_cgroup_commit_charge(pages[i], memcg, false, false);
1275                lru_cache_add_active_or_unevictable(pages[i], vma);
1276                vmf->pte = pte_offset_map(&_pmd, haddr);
1277                VM_BUG_ON(!pte_none(*vmf->pte));
1278                set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
1279                pte_unmap(vmf->pte);
1280        }
1281        kfree(pages);
1282
1283        smp_wmb(); /* make pte visible before pmd */
1284        pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
1285        page_remove_rmap(page, true);
1286        spin_unlock(vmf->ptl);
1287
1288        /*
1289         * No need to double call mmu_notifier->invalidate_range() callback as
1290         * the above pmdp_huge_clear_flush_notify() did already call it.
1291         */
1292        mmu_notifier_invalidate_range_only_end(&range);
1293
1294        ret |= VM_FAULT_WRITE;
1295        put_page(page);
1296
1297out:
1298        return ret;
1299
1300out_free_pages:
1301        spin_unlock(vmf->ptl);
1302        mmu_notifier_invalidate_range_end(&range);
1303        for (i = 0; i < HPAGE_PMD_NR; i++) {
1304                memcg = (void *)page_private(pages[i]);
1305                set_page_private(pages[i], 0);
1306                mem_cgroup_cancel_charge(pages[i], memcg, false);
1307                put_page(pages[i]);
1308        }
1309        kfree(pages);
1310        goto out;
1311}
1312
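     /*
      * Write fault on a present but write-protected huge pmd (COW).  The
      * page is reused in place if it is mapped only here; otherwise a new
      * THP is allocated and copied, with do_huge_pmd_wp_page_fallback()
      * or a pmd split as the last resort when that allocation fails.
      */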
1313vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
1314{
1315        struct vm_area_struct *vma = vmf->vma;
1316        struct page *page = NULL, *new_page;
1317        struct mem_cgroup *memcg;
1318        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1319        struct mmu_notifier_range range;
1320        gfp_t huge_gfp;                 /* for allocation and charge */
1321        vm_fault_t ret = 0;
1322
1323        vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
1324        VM_BUG_ON_VMA(!vma->anon_vma, vma);
1325        if (is_huge_zero_pmd(orig_pmd))
1326                goto alloc;
1327        spin_lock(vmf->ptl);
1328        if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
1329                goto out_unlock;
1330
1331        page = pmd_page(orig_pmd);
1332        VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
1333        /*
 1334         * We can only reuse the page if nobody else maps the huge page or its
1335         * part.
1336         */
1337        if (!trylock_page(page)) {
1338                get_page(page);
1339                spin_unlock(vmf->ptl);
1340                lock_page(page);
1341                spin_lock(vmf->ptl);
1342                if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1343                        unlock_page(page);
1344                        put_page(page);
1345                        goto out_unlock;
1346                }
1347                put_page(page);
1348        }
1349        if (reuse_swap_page(page, NULL)) {
1350                pmd_t entry;
1351                entry = pmd_mkyoung(orig_pmd);
1352                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1353                if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry,  1))
1354                        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1355                ret |= VM_FAULT_WRITE;
1356                unlock_page(page);
1357                goto out_unlock;
1358        }
1359        unlock_page(page);
1360        get_page(page);
1361        spin_unlock(vmf->ptl);
1362alloc:
1363        if (__transparent_hugepage_enabled(vma) &&
1364            !transparent_hugepage_debug_cow()) {
1365                huge_gfp = alloc_hugepage_direct_gfpmask(vma);
1366                new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
1367        } else
1368                new_page = NULL;
1369
1370        if (likely(new_page)) {
1371                prep_transhuge_page(new_page);
1372        } else {
1373                if (!page) {
1374                        split_huge_pmd(vma, vmf->pmd, vmf->address);
1375                        ret |= VM_FAULT_FALLBACK;
1376                } else {
1377                        ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
1378                        if (ret & VM_FAULT_OOM) {
1379                                split_huge_pmd(vma, vmf->pmd, vmf->address);
1380                                ret |= VM_FAULT_FALLBACK;
1381                        }
1382                        put_page(page);
1383                }
1384                count_vm_event(THP_FAULT_FALLBACK);
1385                goto out;
1386        }
1387
1388        if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
1389                                        huge_gfp, &memcg, true))) {
1390                put_page(new_page);
1391                split_huge_pmd(vma, vmf->pmd, vmf->address);
1392                if (page)
1393                        put_page(page);
1394                ret |= VM_FAULT_FALLBACK;
1395                count_vm_event(THP_FAULT_FALLBACK);
1396                goto out;
1397        }
1398
1399        count_vm_event(THP_FAULT_ALLOC);
1400        count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
1401
1402        if (!page)
1403                clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
1404        else
1405                copy_user_huge_page(new_page, page, vmf->address,
1406                                    vma, HPAGE_PMD_NR);
1407        __SetPageUptodate(new_page);
1408
1409        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1410                                haddr, haddr + HPAGE_PMD_SIZE);
1411        mmu_notifier_invalidate_range_start(&range);
1412
1413        spin_lock(vmf->ptl);
1414        if (page)
1415                put_page(page);
1416        if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
1417                spin_unlock(vmf->ptl);
1418                mem_cgroup_cancel_charge(new_page, memcg, true);
1419                put_page(new_page);
1420                goto out_mn;
1421        } else {
1422                pmd_t entry;
1423                entry = mk_huge_pmd(new_page, vma->vm_page_prot);
1424                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
1425                pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
1426                page_add_new_anon_rmap(new_page, vma, haddr, true);
1427                mem_cgroup_commit_charge(new_page, memcg, false, true);
1428                lru_cache_add_active_or_unevictable(new_page, vma);
1429                set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
1430                update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1431                if (!page) {
1432                        add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
1433                } else {
1434                        VM_BUG_ON_PAGE(!PageHead(page), page);
1435                        page_remove_rmap(page, true);
1436                        put_page(page);
1437                }
1438                ret |= VM_FAULT_WRITE;
1439        }
1440        spin_unlock(vmf->ptl);
1441out_mn:
1442        /*
1443         * No need to call the mmu_notifier->invalidate_range() callback again,
1444         * as the above pmdp_huge_clear_flush_notify() already called it.
1445         */
1446        mmu_notifier_invalidate_range_only_end(&range);
1447out:
1448        return ret;
1449out_unlock:
1450        spin_unlock(vmf->ptl);
1451        return ret;
1452}
1453
1454/*
1455 * FOLL_FORCE can write to even unwritable pmds, but only
1456 * after we've gone through a COW cycle and they are dirty.
1457 */
1458static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
1459{
1460        return pmd_write(pmd) ||
1461               ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
1462}
1463
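    /*
     * Return the page mapped by a huge pmd at @addr, honouring the gup @flags
     * (write/dump/numa/touch/mlock/get). The caller must hold the pmd lock;
     * NULL or an ERR_PTR is returned when the pmd cannot be followed.
     */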
1464struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1465                                   unsigned long addr,
1466                                   pmd_t *pmd,
1467                                   unsigned int flags)
1468{
1469        struct mm_struct *mm = vma->vm_mm;
1470        struct page *page = NULL;
1471
1472        assert_spin_locked(pmd_lockptr(mm, pmd));
1473
1474        if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
1475                goto out;
1476
1477        /* Avoid dumping huge zero page */
1478        if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
1479                return ERR_PTR(-EFAULT);
1480
1481        /* Full NUMA hinting faults to serialise migration in fault paths */
1482        if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
1483                goto out;
1484
1485        page = pmd_page(*pmd);
1486        VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
1487        if (flags & FOLL_TOUCH)
1488                touch_pmd(vma, addr, pmd, flags);
1489        if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1490                /*
1491                 * We don't mlock() pte-mapped THPs. This way we can avoid
1492                 * leaking mlocked pages into non-VM_LOCKED VMAs.
1493                 *
1494                 * For anon THP:
1495                 *
1496                 * In most cases the pmd is the only mapping of the page as we
1497                 * break COW for the mlock() -- see gup_flags |= FOLL_WRITE for
1498                 * writable private mappings in populate_vma_page_range().
1499                 *
1500                 * The only scenario when we have the page shared here is if we are
1501                 * mlocking a read-only mapping shared over fork(). We skip
1502                 * mlocking such pages.
1503                 *
1504                 * For file THP:
1505                 *
1506                 * We can expect PageDoubleMap() to be stable under page lock:
1507                 * for file pages we set it in page_add_file_rmap(), which
1508                 * requires page to be locked.
1509                 */
1510
1511                if (PageAnon(page) && compound_mapcount(page) != 1)
1512                        goto skip_mlock;
1513                if (PageDoubleMap(page) || !page->mapping)
1514                        goto skip_mlock;
1515                if (!trylock_page(page))
1516                        goto skip_mlock;
1517                lru_add_drain();
1518                if (page->mapping && !PageDoubleMap(page))
1519                        mlock_vma_page(page);
1520                unlock_page(page);
1521        }
1522skip_mlock:
1523        page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
1524        VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
1525        if (flags & FOLL_GET)
1526                get_page(page);
1527
1528out:
1529        return page;
1530}
1531
1532/* NUMA hinting page fault entry point for trans huge pmds */
1533vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
1534{
1535        struct vm_area_struct *vma = vmf->vma;
1536        struct anon_vma *anon_vma = NULL;
1537        struct page *page;
1538        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
1539        int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
1540        int target_nid, last_cpupid = -1;
1541        bool page_locked;
1542        bool migrated = false;
1543        bool was_writable;
1544        int flags = 0;
1545
1546        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
1547        if (unlikely(!pmd_same(pmd, *vmf->pmd)))
1548                goto out_unlock;
1549
1550        /*
1551         * If there are potential migrations, wait for completion and retry
1552         * without disrupting NUMA hinting information. Do not relock and
1553         * check_same as the page may no longer be mapped.
1554         */
1555        if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
1556                page = pmd_page(*vmf->pmd);
1557                if (!get_page_unless_zero(page))
1558                        goto out_unlock;
1559                spin_unlock(vmf->ptl);
1560                put_and_wait_on_page_locked(page);
1561                goto out;
1562        }
1563
1564        page = pmd_page(pmd);
1565        BUG_ON(is_huge_zero_page(page));
1566        page_nid = page_to_nid(page);
1567        last_cpupid = page_cpupid_last(page);
1568        count_vm_numa_event(NUMA_HINT_FAULTS);
1569        if (page_nid == this_nid) {
1570                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
1571                flags |= TNF_FAULT_LOCAL;
1572        }
1573
1574        /* See similar comment in do_numa_page for explanation */
1575        if (!pmd_savedwrite(pmd))
1576                flags |= TNF_NO_GROUP;
1577
1578        /*
1579         * Acquire the page lock to serialise THP migrations but avoid dropping
1580         * page_table_lock if at all possible
1581         */
1582        page_locked = trylock_page(page);
1583        target_nid = mpol_misplaced(page, vma, haddr);
1584        if (target_nid == NUMA_NO_NODE) {
1585                /* If the page was locked, there are no parallel migrations */
1586                if (page_locked)
1587                        goto clear_pmdnuma;
1588        }
1589
1590        /* Migration could have started since the pmd_trans_migrating check */
1591        if (!page_locked) {
1592                page_nid = NUMA_NO_NODE;
1593                if (!get_page_unless_zero(page))
1594                        goto out_unlock;
1595                spin_unlock(vmf->ptl);
1596                put_and_wait_on_page_locked(page);
1597                goto out;
1598        }
1599
1600        /*
1601         * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
1602         * to serialise splits.
1603         */
1604        get_page(page);
1605        spin_unlock(vmf->ptl);
1606        anon_vma = page_lock_anon_vma_read(page);
1607
1608        /* Confirm the PMD did not change while page_table_lock was released */
1609        spin_lock(vmf->ptl);
1610        if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
1611                unlock_page(page);
1612                put_page(page);
1613                page_nid = NUMA_NO_NODE;
1614                goto out_unlock;
1615        }
1616
1617        /* Bail if we fail to protect against THP splits for any reason */
1618        if (unlikely(!anon_vma)) {
1619                put_page(page);
1620                page_nid = NUMA_NO_NODE;
1621                goto clear_pmdnuma;
1622        }
1623
1624        /*
1625         * Since we took the NUMA fault, we must have observed the !accessible
1626         * bit. Make sure all other CPUs agree with that, to avoid them
1627         * modifying the page we're about to migrate.
1628         *
1629         * Must be done under PTL such that we'll observe the relevant
1630         * inc_tlb_flush_pending().
1631         *
1632         * We are not sure whether a pending TLB flush here is for a huge page
1633         * mapping or not, hence use the TLB range variant.
1634         */
1635        if (mm_tlb_flush_pending(vma->vm_mm)) {
1636                flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
1637                /*
1638                 * change_huge_pmd() released the pmd lock before
1639                 * invalidating the secondary MMUs sharing the primary
1640                 * MMU pagetables (with ->invalidate_range()). The
1641                 * mmu_notifier_invalidate_range_end() (which
1642                 * internally calls ->invalidate_range()) in
1643                 * change_pmd_range() will run after us, so we can't
1644                 * rely on it here and we need an explicit invalidate.
1645                 */
1646                mmu_notifier_invalidate_range(vma->vm_mm, haddr,
1647                                              haddr + HPAGE_PMD_SIZE);
1648        }
1649
1650        /*
1651         * Migrate the THP to the requested node, returns with page unlocked
1652         * and access rights restored.
1653         */
1654        spin_unlock(vmf->ptl);
1655
1656        migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
1657                                vmf->pmd, pmd, vmf->address, page, target_nid);
1658        if (migrated) {
1659                flags |= TNF_MIGRATED;
1660                page_nid = target_nid;
1661        } else
1662                flags |= TNF_MIGRATE_FAIL;
1663
1664        goto out;
1665clear_pmdnuma:
1666        BUG_ON(!PageLocked(page));
1667        was_writable = pmd_savedwrite(pmd);
1668        pmd = pmd_modify(pmd, vma->vm_page_prot);
1669        pmd = pmd_mkyoung(pmd);
1670        if (was_writable)
1671                pmd = pmd_mkwrite(pmd);
1672        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
1673        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
1674        unlock_page(page);
1675out_unlock:
1676        spin_unlock(vmf->ptl);
1677
1678out:
1679        if (anon_vma)
1680                page_unlock_anon_vma_read(anon_vma);
1681
1682        if (page_nid != NUMA_NO_NODE)
1683                task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
1684                                flags);
1685
1686        return 0;
1687}
1688
1689/*
1690 * Return true if we do MADV_FREE successfully on entire pmd page.
1691 * Otherwise, return false.
1692 */
1693bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1694                pmd_t *pmd, unsigned long addr, unsigned long next)
1695{
1696        spinlock_t *ptl;
1697        pmd_t orig_pmd;
1698        struct page *page;
1699        struct mm_struct *mm = tlb->mm;
1700        bool ret = false;
1701
1702        tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
1703
1704        ptl = pmd_trans_huge_lock(pmd, vma);
1705        if (!ptl)
1706                goto out_unlocked;
1707
1708        orig_pmd = *pmd;
1709        if (is_huge_zero_pmd(orig_pmd))
1710                goto out;
1711
1712        if (unlikely(!pmd_present(orig_pmd))) {
1713                VM_BUG_ON(thp_migration_supported() &&
1714                                  !is_pmd_migration_entry(orig_pmd));
1715                goto out;
1716        }
1717
1718        page = pmd_page(orig_pmd);
1719        /*
1720         * If other processes are mapping this page, we can't discard
1721         * the page unless they all do MADV_FREE, so let's skip the page.
1722         */
1723        if (page_mapcount(page) != 1)
1724                goto out;
1725
1726        if (!trylock_page(page))
1727                goto out;
1728
1729        /*
1730         * If the user wants to discard part of the THP's pages, split it so MADV_FREE
1731         * will deactivate only those.
1732         */
1733        if (next - addr != HPAGE_PMD_SIZE) {
1734                get_page(page);
1735                spin_unlock(ptl);
1736                split_huge_page(page);
1737                unlock_page(page);
1738                put_page(page);
1739                goto out_unlocked;
1740        }
1741
1742        if (PageDirty(page))
1743                ClearPageDirty(page);
1744        unlock_page(page);
1745
1746        if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) {
1747                pmdp_invalidate(vma, addr, pmd);
1748                orig_pmd = pmd_mkold(orig_pmd);
1749                orig_pmd = pmd_mkclean(orig_pmd);
1750
1751                set_pmd_at(mm, addr, pmd, orig_pmd);
1752                tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1753        }
1754
1755        mark_page_lazyfree(page);
1756        ret = true;
1757out:
1758        spin_unlock(ptl);
1759out_unlocked:
1760        return ret;
1761}
1762
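    /*
     * Release the page table page that was deposited for this huge pmd and
     * drop the mm's page table accounting accordingly.
     */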
1763static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd)
1764{
1765        pgtable_t pgtable;
1766
1767        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
1768        pte_free(mm, pgtable);
1769        mm_dec_nr_ptes(mm);
1770}
1771
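    /*
     * Unmap a huge pmd at @addr. Returns 1 if a huge pmd was zapped, or 0 if
     * the pmd was not trans-huge, in which case the caller falls back to the
     * pte level.
     */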
1772int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
1773                 pmd_t *pmd, unsigned long addr)
1774{
1775        pmd_t orig_pmd;
1776        spinlock_t *ptl;
1777
1778        tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
1779
1780        ptl = __pmd_trans_huge_lock(pmd, vma);
1781        if (!ptl)
1782                return 0;
1783        /*
1784         * For architectures like ppc64 we look at deposited pgtable
1785         * when calling pmdp_huge_get_and_clear. So do the
1786         * pgtable_trans_huge_withdraw after finishing pmdp related
1787         * operations.
1788         */
1789        orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
1790                        tlb->fullmm);
1791        tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
1792        if (vma_is_dax(vma)) {
1793                if (arch_needs_pgtable_deposit())
1794                        zap_deposited_table(tlb->mm, pmd);
1795                spin_unlock(ptl);
1796                if (is_huge_zero_pmd(orig_pmd))
1797                        tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
1798        } else if (is_huge_zero_pmd(orig_pmd)) {
1799                zap_deposited_table(tlb->mm, pmd);
1800                spin_unlock(ptl);
1801                tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
1802        } else {
1803                struct page *page = NULL;
1804                int flush_needed = 1;
1805
1806                if (pmd_present(orig_pmd)) {
1807                        page = pmd_page(orig_pmd);
1808                        page_remove_rmap(page, true);
1809                        VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
1810                        VM_BUG_ON_PAGE(!PageHead(page), page);
1811                } else if (thp_migration_supported()) {
1812                        swp_entry_t entry;
1813
1814                        VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
1815                        entry = pmd_to_swp_entry(orig_pmd);
1816                        page = pfn_to_page(swp_offset(entry));
1817                        flush_needed = 0;
1818                } else
1819                        WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
1820
1821                if (PageAnon(page)) {
1822                        zap_deposited_table(tlb->mm, pmd);
1823                        add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1824                } else {
1825                        if (arch_needs_pgtable_deposit())
1826                                zap_deposited_table(tlb->mm, pmd);
1827                        add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR);
1828                }
1829
1830                spin_unlock(ptl);
1831                if (flush_needed)
1832                        tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE);
1833        }
1834        return 1;
1835}
1836
1837#ifndef pmd_move_must_withdraw
1838static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl,
1839                                         spinlock_t *old_pmd_ptl,
1840                                         struct vm_area_struct *vma)
1841{
1842        /*
1843         * With split pmd locks we also need to move the preallocated
1844         * PTE page table if new_pmd is on a different PMD page table.
1845         *
1846         * We also don't deposit and withdraw tables for file pages.
1847         */
1848        return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma);
1849}
1850#endif
1851
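    /*
     * When a huge pmd is moved by mremap(), mark it soft-dirty (for both
     * present pmds and pmd migration entries) so that userspace soft-dirty
     * tracking does not miss the move.
     */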
1852static pmd_t move_soft_dirty_pmd(pmd_t pmd)
1853{
1854#ifdef CONFIG_MEM_SOFT_DIRTY
1855        if (unlikely(is_pmd_migration_entry(pmd)))
1856                pmd = pmd_swp_mksoft_dirty(pmd);
1857        else if (pmd_present(pmd))
1858                pmd = pmd_mksoft_dirty(pmd);
1859#endif
1860        return pmd;
1861}
1862
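    /*
     * Move a huge pmd from @old_addr to @new_addr for mremap(). Returns true
     * if the pmd was moved as a whole, false if the caller must fall back to
     * moving at the pte level.
     */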
1863bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
1864                  unsigned long new_addr, unsigned long old_end,
1865                  pmd_t *old_pmd, pmd_t *new_pmd)
1866{
1867        spinlock_t *old_ptl, *new_ptl;
1868        pmd_t pmd;
1869        struct mm_struct *mm = vma->vm_mm;
1870        bool force_flush = false;
1871
1872        if ((old_addr & ~HPAGE_PMD_MASK) ||
1873            (new_addr & ~HPAGE_PMD_MASK) ||
1874            old_end - old_addr < HPAGE_PMD_SIZE)
1875                return false;
1876
1877        /*
1878         * The destination pmd shouldn't be established, free_pgtables()
1879         * should have released it.
1880         */
1881        if (WARN_ON(!pmd_none(*new_pmd))) {
1882                VM_BUG_ON(pmd_trans_huge(*new_pmd));
1883                return false;
1884        }
1885
1886        /*
1887         * We don't have to worry about the ordering of src and dst
1888         * ptlocks because exclusive mmap_sem prevents deadlock.
1889         */
1890        old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
1891        if (old_ptl) {
1892                new_ptl = pmd_lockptr(mm, new_pmd);
1893                if (new_ptl != old_ptl)
1894                        spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
1895                pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
1896                if (pmd_present(pmd))
1897                        force_flush = true;
1898                VM_BUG_ON(!pmd_none(*new_pmd));
1899
1900                if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) {
1901                        pgtable_t pgtable;
1902                        pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
1903                        pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
1904                }
1905                pmd = move_soft_dirty_pmd(pmd);
1906                set_pmd_at(mm, new_addr, new_pmd, pmd);
1907                if (force_flush)
1908                        flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
1909                if (new_ptl != old_ptl)
1910                        spin_unlock(new_ptl);
1911                spin_unlock(old_ptl);
1912                return true;
1913        }
1914        return false;
1915}
1916
1917/*
1918 * Returns
1919 *  - 0 if PMD could not be locked
1920 *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
1921 *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
1922 */
1923int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1924                unsigned long addr, pgprot_t newprot, int prot_numa)
1925{
1926        struct mm_struct *mm = vma->vm_mm;
1927        spinlock_t *ptl;
1928        pmd_t entry;
1929        bool preserve_write;
1930        int ret;
1931
1932        ptl = __pmd_trans_huge_lock(pmd, vma);
1933        if (!ptl)
1934                return 0;
1935
1936        preserve_write = prot_numa && pmd_write(*pmd);
1937        ret = 1;
1938
1939#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1940        if (is_swap_pmd(*pmd)) {
1941                swp_entry_t entry = pmd_to_swp_entry(*pmd);
1942
1943                VM_BUG_ON(!is_pmd_migration_entry(*pmd));
1944                if (is_write_migration_entry(entry)) {
1945                        pmd_t newpmd;
1946                        /*
1947                         * A protection check is difficult so
1948                         * just be safe and disable write
1949                         */
1950                        make_migration_entry_read(&entry);
1951                        newpmd = swp_entry_to_pmd(entry);
1952                        if (pmd_swp_soft_dirty(*pmd))
1953                                newpmd = pmd_swp_mksoft_dirty(newpmd);
1954                        set_pmd_at(mm, addr, pmd, newpmd);
1955                }
1956                goto unlock;
1957        }
1958#endif
1959
1960        /*
1961         * Avoid trapping faults against the zero page. The read-only
1962         * data is likely to be read-cached on the local CPU and
1963         * local/remote hits to the zero page are not interesting.
1964         */
1965        if (prot_numa && is_huge_zero_pmd(*pmd))
1966                goto unlock;
1967
1968        if (prot_numa && pmd_protnone(*pmd))
1969                goto unlock;
1970
1971        /*
1972         * In case prot_numa, we are under down_read(mmap_sem). It's critical
1973         * to not clear pmd intermittently to avoid race with MADV_DONTNEED
1974         * which is also under down_read(mmap_sem):
1975         *
1976         *      CPU0:                           CPU1:
1977         *                              change_huge_pmd(prot_numa=1)
1978         *                               pmdp_huge_get_and_clear_notify()
1979         * madvise_dontneed()
1980         *  zap_pmd_range()
1981         *   pmd_trans_huge(*pmd) == 0 (without ptl)
1982         *   // skip the pmd
1983         *                               set_pmd_at();
1984         *                               // pmd is re-established
1985         *
1986         * The race makes MADV_DONTNEED miss the huge pmd and not clear it,
1987         * which may break userspace.
1988         *
1989         * pmdp_invalidate() is required to make sure we don't miss
1990         * dirty/young flags set by hardware.
1991         */
1992        entry = pmdp_invalidate(vma, addr, pmd);
1993
1994        entry = pmd_modify(entry, newprot);
1995        if (preserve_write)
1996                entry = pmd_mk_savedwrite(entry);
1997        ret = HPAGE_PMD_NR;
1998        set_pmd_at(mm, addr, pmd, entry);
1999        BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
2000unlock:
2001        spin_unlock(ptl);
2002        return ret;
2003}
2004
2005/*
2006 * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
2007 *
2008 * Note that if it returns a page table lock pointer, this routine returns without
2009 * unlocking the page table lock, so callers must unlock it.
2010 */
2011spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
2012{
2013        spinlock_t *ptl;
2014        ptl = pmd_lock(vma->vm_mm, pmd);
2015        if (likely(is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) ||
2016                        pmd_devmap(*pmd)))
2017                return ptl;
2018        spin_unlock(ptl);
2019        return NULL;
2020}
2021
2022/*
2023 * Returns true if a given pud maps a thp, false otherwise.
2024 *
2025 * Note that if it returns true, this routine returns without unlocking the page
2026 * table lock, so callers must unlock it.
2027 */
2028spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
2029{
2030        spinlock_t *ptl;
2031
2032        ptl = pud_lock(vma->vm_mm, pud);
2033        if (likely(pud_trans_huge(*pud) || pud_devmap(*pud)))
2034                return ptl;
2035        spin_unlock(ptl);
2036        return NULL;
2037}
2038
2039#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
2040int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
2041                 pud_t *pud, unsigned long addr)
2042{
2043        spinlock_t *ptl;
2044
2045        ptl = __pud_trans_huge_lock(pud, vma);
2046        if (!ptl)
2047                return 0;
2048        /*
2049         * For architectures like ppc64 we look at deposited pgtable
2050         * when calling pudp_huge_get_and_clear. So do the
2051         * pgtable_trans_huge_withdraw after finishing pudp related
2052         * operations.
2053         */
2054        pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
2055        tlb_remove_pud_tlb_entry(tlb, pud, addr);
2056        if (vma_is_dax(vma)) {
2057                spin_unlock(ptl);
2058                /* No zero page support yet */
2059        } else {
2060                /* No support for anonymous PUD pages yet */
2061                BUG();
2062        }
2063        return 1;
2064}
2065
2066static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud,
2067                unsigned long haddr)
2068{
2069        VM_BUG_ON(haddr & ~HPAGE_PUD_MASK);
2070        VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2071        VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PUD_SIZE, vma);
2072        VM_BUG_ON(!pud_trans_huge(*pud) && !pud_devmap(*pud));
2073
2074        count_vm_event(THP_SPLIT_PUD);
2075
2076        pudp_huge_clear_flush_notify(vma, haddr, pud);
2077}
2078
2079void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
2080                unsigned long address)
2081{
2082        spinlock_t *ptl;
2083        struct mmu_notifier_range range;
2084
2085        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
2086                                address & HPAGE_PUD_MASK,
2087                                (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
2088        mmu_notifier_invalidate_range_start(&range);
2089        ptl = pud_lock(vma->vm_mm, pud);
2090        if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
2091                goto out;
2092        __split_huge_pud_locked(vma, pud, range.start);
2093
2094out:
2095        spin_unlock(ptl);
2096        /*
2097         * No need to call the mmu_notifier->invalidate_range() callback again,
2098         * as the above pudp_huge_clear_flush_notify() already called it.
2099         */
2100        mmu_notifier_invalidate_range_only_end(&range);
2101}
2102#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
2103
2104static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
2105                unsigned long haddr, pmd_t *pmd)
2106{
2107        struct mm_struct *mm = vma->vm_mm;
2108        pgtable_t pgtable;
2109        pmd_t _pmd;
2110        int i;
2111
2112        /*
2113         * Leave the pmd empty until the ptes are filled. Note that it is fine to delay
2114         * notification until mmu_notifier_invalidate_range_end() as we are
2115         * replacing a zero pmd write protected page with a zero pte write
2116         * protected page.
2117         *
2118         * See Documentation/vm/mmu_notifier.rst
2119         */
2120        pmdp_huge_clear_flush(vma, haddr, pmd);
2121
2122        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2123        pmd_populate(mm, &_pmd, pgtable);
2124
2125        for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
2126                pte_t *pte, entry;
2127                entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
2128                entry = pte_mkspecial(entry);
2129                pte = pte_offset_map(&_pmd, haddr);
2130                VM_BUG_ON(!pte_none(*pte));
2131                set_pte_at(mm, haddr, pte, entry);
2132                pte_unmap(pte);
2133        }
2134        smp_wmb(); /* make pte visible before pmd */
2135        pmd_populate(mm, pmd, pgtable);
2136}
2137
2138static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
2139                unsigned long haddr, bool freeze)
2140{
2141        struct mm_struct *mm = vma->vm_mm;
2142        struct page *page;
2143        pgtable_t pgtable;
2144        pmd_t old_pmd, _pmd;
2145        bool young, write, soft_dirty, pmd_migration = false;
2146        unsigned long addr;
2147        int i;
2148
2149        VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
2150        VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
2151        VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
2152        VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)
2153                                && !pmd_devmap(*pmd));
2154
2155        count_vm_event(THP_SPLIT_PMD);
2156
2157        if (!vma_is_anonymous(vma)) {
2158                _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
2159                /*
2160                 * We are going to unmap this huge page. So
2161                 * just go ahead and zap it
2162                 */
2163                if (arch_needs_pgtable_deposit())
2164                        zap_deposited_table(mm, pmd);
2165                if (vma_is_dax(vma))
2166                        return;
2167                page = pmd_page(_pmd);
2168                if (!PageDirty(page) && pmd_dirty(_pmd))
2169                        set_page_dirty(page);
2170                if (!PageReferenced(page) && pmd_young(_pmd))
2171                        SetPageReferenced(page);
2172                page_remove_rmap(page, true);
2173                put_page(page);
2174                add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
2175                return;
2176        } else if (is_huge_zero_pmd(*pmd)) {
2177                /*
2178                 * FIXME: Do we want to invalidate secondary mmu by calling
2179                 * mmu_notifier_invalidate_range() see comments below inside
2180                 * __split_huge_pmd() ?
2181                 *
2182                 * We are going from a zero huge page write protected to zero
2183                 * small pages also write protected, so it does not seem useful
2184                 * to invalidate secondary mmu at this time.
2185                 */
2186                return __split_huge_zero_page_pmd(vma, haddr, pmd);
2187        }
2188
2189        /*
2190         * Up to this point the pmd is present and huge and userland has the
2191         * whole access to the hugepage during the split (which happens in
2192         * place). If we overwrite the pmd with the not-huge version pointing
2193         * to the pte here (which of course we could if all CPUs were bug
2194         * free), userland could trigger a small page size TLB miss on the
2195         * small sized TLB while the hugepage TLB entry is still established in
2196         * the huge TLB. Some CPU doesn't like that.
2197         * the huge TLB. Some CPUs don't like that.
2198         * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
2199         * 383 on page 93. Intel should be safe but also warns that it's
2200         * only safe if the permission and cache attributes of the two entries
2201         * loaded in the two TLBs are identical (which should be the case here).
2202         * for the same virtual address to be loaded simultaneously. So instead
2203         * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the
2204         * current pmd notpresent (atomically because here the pmd_trans_huge
2205         * must remain set at all times on the pmd until the split is complete
2206         * for this pmd), then we flush the SMP TLB and finally we write the
2207         * non-huge version of the pmd entry with pmd_populate.
2208         */
2209        old_pmd = pmdp_invalidate(vma, haddr, pmd);
2210
2211        pmd_migration = is_pmd_migration_entry(old_pmd);
2212        if (unlikely(pmd_migration)) {
2213                swp_entry_t entry;
2214
2215                entry = pmd_to_swp_entry(old_pmd);
2216                page = pfn_to_page(swp_offset(entry));
2217                write = is_write_migration_entry(entry);
2218                young = false;
2219                soft_dirty = pmd_swp_soft_dirty(old_pmd);
2220        } else {
2221                page = pmd_page(old_pmd);
2222                if (pmd_dirty(old_pmd))
2223                        SetPageDirty(page);
2224                write = pmd_write(old_pmd);
2225                young = pmd_young(old_pmd);
2226                soft_dirty = pmd_soft_dirty(old_pmd);
2227        }
2228        VM_BUG_ON_PAGE(!page_count(page), page);
2229        page_ref_add(page, HPAGE_PMD_NR - 1);
2230
2231        /*
2232         * Withdraw the table only after we mark the pmd entry invalid.
2233         * This is critical for some architectures (Power).
2234         */
2235        pgtable = pgtable_trans_huge_withdraw(mm, pmd);
2236        pmd_populate(mm, &_pmd, pgtable);
2237
2238        for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
2239                pte_t entry, *pte;
2240                /*
2241                 * Note that NUMA hinting access restrictions are not
2242                 * transferred to avoid any possibility of altering
2243                 * permissions across VMAs.
2244                 */
2245                if (freeze || pmd_migration) {
2246                        swp_entry_t swp_entry;
2247                        swp_entry = make_migration_entry(page + i, write);
2248                        entry = swp_entry_to_pte(swp_entry);
2249                        if (soft_dirty)
2250                                entry = pte_swp_mksoft_dirty(entry);
2251                } else {
2252                        entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
2253                        entry = maybe_mkwrite(entry, vma);
2254                        if (!write)
2255                                entry = pte_wrprotect(entry);
2256                        if (!young)
2257                                entry = pte_mkold(entry);
2258                        if (soft_dirty)
2259                                entry = pte_mksoft_dirty(entry);
2260                }
2261                pte = pte_offset_map(&_pmd, addr);
2262                BUG_ON(!pte_none(*pte));
2263                set_pte_at(mm, addr, pte, entry);
2264                atomic_inc(&page[i]._mapcount);
2265                pte_unmap(pte);
2266        }
2267
2268        /*
2269         * Set PG_double_map before dropping compound_mapcount to avoid
2270         * false-negative page_mapped().
2271         */
2272        if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
2273                for (i = 0; i < HPAGE_PMD_NR; i++)
2274                        atomic_inc(&page[i]._mapcount);
2275        }
2276
2277        if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
2278                /* Last compound_mapcount is gone. */
2279                __dec_node_page_state(page, NR_ANON_THPS);
2280                if (TestClearPageDoubleMap(page)) {
2281                        /* No need for the mapcount references anymore */
2282                        for (i = 0; i < HPAGE_PMD_NR; i++)
2283                                atomic_dec(&page[i]._mapcount);
2284                }
2285        }
2286
2287        smp_wmb(); /* make pte visible before pmd */
2288        pmd_populate(mm, pmd, pgtable);
2289
2290        if (freeze) {
2291                for (i = 0; i < HPAGE_PMD_NR; i++) {
2292                        page_remove_rmap(page + i, false);
2293                        put_page(page + i);
2294                }
2295        }
2296}
2297
2298void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
2299                unsigned long address, bool freeze, struct page *page)
2300{
2301        spinlock_t *ptl;
2302        struct mmu_notifier_range range;
2303
2304        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
2305                                address & HPAGE_PMD_MASK,
2306                                (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
2307        mmu_notifier_invalidate_range_start(&range);
2308        ptl = pmd_lock(vma->vm_mm, pmd);
2309
2310        /*
2311         * If the caller asks to set up migration entries, we need a page to check
2312         * the pmd against. Otherwise we can end up replacing the wrong page.
2313         */
2314        VM_BUG_ON(freeze && !page);
2315        if (page && page != pmd_page(*pmd))
2316                goto out;
2317
2318        if (pmd_trans_huge(*pmd)) {
2319                page = pmd_page(*pmd);
2320                if (PageMlocked(page))
2321                        clear_page_mlock(page);
2322        } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
2323                goto out;
2324        __split_huge_pmd_locked(vma, pmd, range.start, freeze);
2325out:
2326        spin_unlock(ptl);
2327        /*
2328         * No need to call the mmu_notifier->invalidate_range() callback again.
2329         * There are 3 cases to consider inside __split_huge_pmd_locked():
2330         *  1) pmdp_huge_clear_flush_notify() calls invalidate_range(), obviously.
2331         *  2) __split_huge_zero_page_pmd() maps the read-only zero page; any write
2332         *     fault will trigger a flush_notify before pointing to a new page
2333         *     (it is fine if the secondary mmu keeps pointing to the old zero
2334         *     page in the meantime).
2335         *  3) Splitting a huge pmd into ptes pointing to the same page: no need
2336         *     to invalidate secondary tlb entries, they are all still valid, and
2337         *     any further changes to individual ptes will notify. So no need
2338         *     to call mmu_notifier->invalidate_range().
2339         */
2340        mmu_notifier_invalidate_range_only_end(&range);
2341}
2342
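    /*
     * Walk the page tables down to the pmd covering @address and split it if
     * one is mapped there.
     */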
2343void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
2344                bool freeze, struct page *page)
2345{
2346        pgd_t *pgd;
2347        p4d_t *p4d;
2348        pud_t *pud;
2349        pmd_t *pmd;
2350
2351        pgd = pgd_offset(vma->vm_mm, address);
2352        if (!pgd_present(*pgd))
2353                return;
2354
2355        p4d = p4d_offset(pgd, address);
2356        if (!p4d_present(*p4d))
2357                return;
2358
2359        pud = pud_offset(p4d, address);
2360        if (!pud_present(*pud))
2361                return;
2362
2363        pmd = pmd_offset(pud, address);
2364
2365        __split_huge_pmd(vma, pmd, address, freeze, page);
2366}
2367
2368void vma_adjust_trans_huge(struct vm_area_struct *vma,
2369                             unsigned long start,
2370                             unsigned long end,
2371                             long adjust_next)
2372{
2373        /*
2374         * If the new start address isn't hpage aligned and it could
2375         * previously contain a hugepage: check if we need to split
2376         * a huge pmd.
2377         */
2378        if (start & ~HPAGE_PMD_MASK &&
2379            (start & HPAGE_PMD_MASK) >= vma->vm_start &&
2380            (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2381                split_huge_pmd_address(vma, start, false, NULL);
2382
2383        /*
2384         * If the new end address isn't hpage aligned and it could
2385         * previously contain a hugepage: check if we need to split
2386         * a huge pmd.
2387         */
2388        if (end & ~HPAGE_PMD_MASK &&
2389            (end & HPAGE_PMD_MASK) >= vma->vm_start &&
2390            (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2391                split_huge_pmd_address(vma, end, false, NULL);
2392
2393        /*
2394         * If we're also updating the vma->vm_next->vm_start, if the new
2395         * vm_next->vm_start isn't page aligned and it could previously
2396         * contain a hugepage: check if we need to split a huge pmd.
2397         */
2398        if (adjust_next > 0) {
2399                struct vm_area_struct *next = vma->vm_next;
2400                unsigned long nstart = next->vm_start;
2401                nstart += adjust_next << PAGE_SHIFT;
2402                if (nstart & ~HPAGE_PMD_MASK &&
2403                    (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
2404                    (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
2405                        split_huge_pmd_address(next, nstart, false, NULL);
2406        }
2407}
2408
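    /*
     * Unmap a THP from all of its mappings before splitting it. For anonymous
     * pages the mappings are frozen into migration entries (TTU_SPLIT_FREEZE)
     * so that remap_page() can restore them after the split.
     */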
2409static void unmap_page(struct page *page)
2410{
2411        enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
2412                TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
2413        bool unmap_success;
2414
2415        VM_BUG_ON_PAGE(!PageHead(page), page);
2416
2417        if (PageAnon(page))
2418                ttu_flags |= TTU_SPLIT_FREEZE;
2419
2420        unmap_success = try_to_unmap(page, ttu_flags);
2421        VM_BUG_ON_PAGE(!unmap_success, page);
2422}
2423
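    /*
     * Undo unmap_page(): replace the migration entries with real ptes again,
     * either for the whole compound page (split failed) or for each individual
     * subpage after a successful split.
     */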
2424static void remap_page(struct page *page)
2425{
2426        int i;
2427        if (PageTransHuge(page)) {
2428                remove_migration_ptes(page, page, true);
2429        } else {
2430                for (i = 0; i < HPAGE_PMD_NR; i++)
2431                        remove_migration_ptes(page + i, page + i, true);
2432        }
2433}
2434
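    /*
     * Turn tail page number @tail of the compound page @head into an
     * independent page: clone the relevant page flags, mapping and index,
     * clear PageTail and unfreeze its refcount, then put it on the LRU.
     */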
2435static void __split_huge_page_tail(struct page *head, int tail,
2436                struct lruvec *lruvec, struct list_head *list)
2437{
2438        struct page *page_tail = head + tail;
2439
2440        VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
2441
2442        /*
2443         * Clone page flags before unfreezing refcount.
2444         *
2445         * After a successful get_page_unless_zero(), a flags change might follow,
2446         * for example lock_page() which sets PG_waiters.
2447         */
2448        page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
2449        page_tail->flags |= (head->flags &
2450                        ((1L << PG_referenced) |
2451                         (1L << PG_swapbacked) |
2452                         (1L << PG_swapcache) |
2453                         (1L << PG_mlocked) |
2454                         (1L << PG_uptodate) |
2455                         (1L << PG_active) |
2456                         (1L << PG_workingset) |
2457                         (1L << PG_locked) |
2458                         (1L << PG_unevictable) |
2459                         (1L << PG_dirty)));
2460
2461        /* ->mapping in first tail page is compound_mapcount */
2462        VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
2463                        page_tail);
2464        page_tail->mapping = head->mapping;
2465        page_tail->index = head->index + tail;
2466
2467        /* Page flags must be visible before we make the page non-compound. */
2468        smp_wmb();
2469
2470        /*
2471         * Clear PageTail before unfreezing page refcount.
2472         *
2473         * After successful get_page_unless_zero() might follow put_page()
2474         * which needs correct compound_head().
2475         */
2476        clear_compound_head(page_tail);
2477
2478        /* Finally unfreeze refcount. Additional reference from page cache. */
2479        page_ref_unfreeze(page_tail, 1 + (!PageAnon(head) ||
2480                                          PageSwapCache(head)));
2481
2482        if (page_is_young(head))
2483                set_page_young(page_tail);
2484        if (page_is_idle(head))
2485                set_page_idle(page_tail);
2486
2487        page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
2488
2489        /*
2490         * Always add to the tail because some iterators expect new
2491         * pages to show up after the currently processed elements - e.g.
2492         * migrate_pages.
2493         */
2494        lru_add_page_tail(head, page_tail, lruvec, list);
2495}
2496
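    /*
     * Split the compound page at @head into HPAGE_PMD_NR independent pages.
     * Called with the head locked and unmapped, its refcount frozen, and with
     * the lru_lock held; the lru_lock is dropped here.
     */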
2497static void __split_huge_page(struct page *page, struct list_head *list,
2498                pgoff_t end, unsigned long flags)
2499{
2500        struct page *head = compound_head(page);
2501        pg_data_t *pgdat = page_pgdat(head);
2502        struct lruvec *lruvec;
2503        struct address_space *swap_cache = NULL;
2504        unsigned long offset = 0;
2505        int i;
2506
2507        lruvec = mem_cgroup_page_lruvec(head, pgdat);
2508
2509        /* complete memcg works before add pages to LRU */
2510        mem_cgroup_split_huge_fixup(head);
2511
2512        if (PageAnon(head) && PageSwapCache(head)) {
2513                swp_entry_t entry = { .val = page_private(head) };
2514
2515                offset = swp_offset(entry);
2516                swap_cache = swap_address_space(entry);
2517                xa_lock(&swap_cache->i_pages);
2518        }
2519
2520        for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
2521                __split_huge_page_tail(head, i, lruvec, list);
2522                /* Some pages can be beyond i_size: drop them from page cache */
2523                if (head[i].index >= end) {
2524                        ClearPageDirty(head + i);
2525                        __delete_from_page_cache(head + i, NULL);
2526                        if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
2527                                shmem_uncharge(head->mapping->host, 1);
2528                        put_page(head + i);
2529                } else if (!PageAnon(page)) {
2530                        __xa_store(&head->mapping->i_pages, head[i].index,
2531                                        head + i, 0);
2532                } else if (swap_cache) {
2533                        __xa_store(&swap_cache->i_pages, offset + i,
2534                                        head + i, 0);
2535                }
2536        }
2537
2538        ClearPageCompound(head);
2539
2540        split_page_owner(head, HPAGE_PMD_ORDER);
2541
2542        /* See comment in __split_huge_page_tail() */
2543        if (PageAnon(head)) {
2544                /* Additional pin to swap cache */
2545                if (PageSwapCache(head)) {
2546                        page_ref_add(head, 2);
2547                        xa_unlock(&swap_cache->i_pages);
2548                } else {
2549                        page_ref_inc(head);
2550                }
2551        } else {
2552                /* Additional pin to page cache */
2553                page_ref_add(head, 2);
2554                xa_unlock(&head->mapping->i_pages);
2555        }
2556
2557        spin_unlock_irqrestore(&pgdat->lru_lock, flags);
2558
2559        remap_page(head);
2560
2561        for (i = 0; i < HPAGE_PMD_NR; i++) {
2562                struct page *subpage = head + i;
2563                if (subpage == page)
2564                        continue;
2565                unlock_page(subpage);
2566
2567                /*
2568                 * Subpages may be freed if there wasn't any mapping,
2569                 * e.g. if add_to_swap() is running on an lru page that
2570                 * had its mapping zapped. Freeing these pages
2571                 * requires taking the lru_lock, so we do the put_page
2572                 * of the tail pages after the split is complete.
2573                 */
2574                put_page(subpage);
2575        }
2576}
2577
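    /*
     * Return the total number of mappings of the compound page: the compound
     * (pmd) mappings plus all of the per-subpage pte mappings.
     */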
2578int total_mapcount(struct page *page)
2579{
2580        int i, compound, ret;
2581
2582        VM_BUG_ON_PAGE(PageTail(page), page);
2583
2584        if (likely(!PageCompound(page)))
2585                return atomic_read(&page->_mapcount) + 1;
2586
2587        compound = compound_mapcount(page);
2588        if (PageHuge(page))
2589                return compound;
2590        ret = compound;
2591        for (i = 0; i < HPAGE_PMD_NR; i++)
2592                ret += atomic_read(&page[i]._mapcount) + 1;
2593        /* File pages have compound_mapcount included in _mapcount */
2594        if (!PageAnon(page))
2595                return ret - compound * HPAGE_PMD_NR;
2596        if (PageDoubleMap(page))
2597                ret -= HPAGE_PMD_NR;
2598        return ret;
2599}
2600
2601/*
2602 * This calculates accurately how many mappings a transparent hugepage
2603 * has (unlike page_mapcount() which isn't fully accurate). This full
2604 * accuracy is primarily needed to know if copy-on-write faults can
2605 * reuse the page and change the mapping to read-write instead of
2606 * copying them. At the same time this returns the total_mapcount too.
2607 *
2608 * The function returns the highest mapcount any one of the subpages
2609 * has. If the return value is one, even if different processes are
2610 * mapping different subpages of the transparent hugepage, they can
2611 * all reuse it, because each process is reusing a different subpage.
2612 *
2613 * The total_mapcount is instead counting all virtual mappings of the
2614 * subpages. If the total_mapcount is equal to "one", it tells the
2615 * caller all mappings belong to the same "mm" and in turn the
2616 * anon_vma of the transparent hugepage can become the vma->anon_vma
2617 * local one as no other process may be mapping any of the subpages.
2618 *
2619 * It would be more accurate to replace page_mapcount() with
2620 * page_trans_huge_mapcount(), however we only use
2621 * page_trans_huge_mapcount() in the copy-on-write faults where we
2622 * need full accuracy to avoid breaking page pinning, because
2623 * page_trans_huge_mapcount() is slower than page_mapcount().
2624 */
2625int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
2626{
2627        int i, ret, _total_mapcount, mapcount;
2628
2629        /* hugetlbfs shouldn't call it */
2630        VM_BUG_ON_PAGE(PageHuge(page), page);
2631
2632        if (likely(!PageTransCompound(page))) {
2633                mapcount = atomic_read(&page->_mapcount) + 1;
2634                if (total_mapcount)
2635                        *total_mapcount = mapcount;
2636                return mapcount;
2637        }
2638
2639        page = compound_head(page);
2640
2641        _total_mapcount = ret = 0;
2642        for (i = 0; i < HPAGE_PMD_NR; i++) {
2643                mapcount = atomic_read(&page[i]._mapcount) + 1;
2644                ret = max(ret, mapcount);
2645                _total_mapcount += mapcount;
2646        }
2647        if (PageDoubleMap(page)) {
2648                ret -= 1;
2649                _total_mapcount -= HPAGE_PMD_NR;
2650        }
2651        mapcount = compound_mapcount(page);
2652        ret += mapcount;
2653        _total_mapcount += mapcount;
2654        if (total_mapcount)
2655                *total_mapcount = _total_mapcount;
2656        return ret;
2657}
2658
2659/* Racy check whether the huge page can be split */
2660bool can_split_huge_page(struct page *page, int *pextra_pins)
2661{
2662        int extra_pins;
2663
2664        /* Additional pins from page cache */
2665        if (PageAnon(page))
2666                extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
2667        else
2668                extra_pins = HPAGE_PMD_NR;
2669        if (pextra_pins)
2670                *pextra_pins = extra_pins;
2671        return total_mapcount(page) == page_count(page) - extra_pins - 1;
2672}
2673
2674/*
2675 * This function splits huge page into normal pages. @page can point to any
2676 * subpage of huge page to split. Split doesn't change the position of @page.
2677 *
2678 * Only the caller may hold a pin on the @page, otherwise the split fails with -EBUSY.
2679 * The huge page must be locked.
2680 *
2681 * If @list is null, tail pages will be added to LRU list, otherwise, to @list.
2682 *
2683 * Both head page and tail pages will inherit mapping, flags, and so on from
2684 * the hugepage.
2685 *
2686 * The GUP pin and PG_locked are transferred to @page. The rest of the subpages
2687 * can be freed if they are not mapped.
2688 *
2689 * Returns 0 if the hugepage is split successfully.
2690 * Returns -EBUSY if the page is pinned or if anon_vma disappeared from under
2691 * us.
2692 */
2693int split_huge_page_to_list(struct page *page, struct list_head *list)
2694{
2695        struct page *head = compound_head(page);
2696        struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
2697        struct deferred_split *ds_queue = get_deferred_split_queue(page);
2698        struct anon_vma *anon_vma = NULL;
2699        struct address_space *mapping = NULL;
2700        int count, mapcount, extra_pins, ret;
2701        bool mlocked;
2702        unsigned long flags;
2703        pgoff_t end;
2704
2705        VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
2706        VM_BUG_ON_PAGE(!PageLocked(page), page);
2707        VM_BUG_ON_PAGE(!PageCompound(page), page);
2708
2709        if (PageWriteback(page))
2710                return -EBUSY;
2711
2712        if (PageAnon(head)) {
2713                /*
2714                 * The caller does not necessarily hold an mmap_sem that would
2715                 * prevent the anon_vma disappearing, so we first take a
2716                 * reference to it and then lock the anon_vma for write. This
2717                 * is similar to page_lock_anon_vma_read except the write lock
2718                 * is taken to serialise against parallel split or collapse
2719                 * operations.
2720                 */
2721                anon_vma = page_get_anon_vma(head);
2722                if (!anon_vma) {
2723                        ret = -EBUSY;
2724                        goto out;
2725                }
2726                end = -1;
2727                mapping = NULL;
2728                anon_vma_lock_write(anon_vma);
2729        } else {
2730                mapping = head->mapping;
2731
2732                /* Truncated ? */
2733                if (!mapping) {
2734                        ret = -EBUSY;
2735                        goto out;
2736                }
2737
2738                anon_vma = NULL;
2739                i_mmap_lock_read(mapping);
2740
2741                /*
2742                 * __split_huge_page() may need to trim off pages beyond EOF:
2743                 * but on 32-bit, i_size_read() takes an irq-unsafe seqlock,
2744                 * which cannot be nested inside the page tree lock. So note
2745                 * end now: i_size itself may be changed at any moment, but
2746                 * head page lock is good enough to serialize the trimming.
2747                 */
2748                end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
2749        }
2750
2751        /*
2752         * Racy check whether we can split the page, before unmap_page()
2753         * splits PMDs.
2754         */
2755        if (!can_split_huge_page(head, &extra_pins)) {
2756                ret = -EBUSY;
2757                goto out_unlock;
2758        }
2759
2760        mlocked = PageMlocked(page);
2761        unmap_page(head);
2762        VM_BUG_ON_PAGE(compound_mapcount(head), head);
2763
2764        /* Make sure the page is not on a per-CPU pagevec as it takes a pin */
2765        if (mlocked)
2766                lru_add_drain();
2767
2768        /* prevent PageLRU from going away from under us, and freeze lru stats */
2769        spin_lock_irqsave(&pgdata->lru_lock, flags);
2770
2771        if (mapping) {
2772                XA_STATE(xas, &mapping->i_pages, page_index(head));
2773
2774                /*
2775                 * Check if the head page is present in the page cache.
2776                 * We assume all tail pages are present too, if the head is there.
2777                 */
2778                xa_lock(&mapping->i_pages);
2779                if (xas_load(&xas) != head)
2780                        goto fail;
2781        }
2782
2783        /* Prevent deferred_split_scan() touching ->_refcount */
2784        spin_lock(&ds_queue->split_queue_lock);
2785        count = page_count(head);
2786        mapcount = total_mapcount(head);
2787        if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
2788                if (!list_empty(page_deferred_list(head))) {
2789                        ds_queue->split_queue_len--;
2790                        list_del(page_deferred_list(head));
2791                }
2792                if (mapping) {
2793                        if (PageSwapBacked(page))
2794                                __dec_node_page_state(page, NR_SHMEM_THPS);
2795                        else
2796                                __dec_node_page_state(page, NR_FILE_THPS);
2797                }
2798
2799                spin_unlock(&ds_queue->split_queue_lock);
2800                __split_huge_page(page, list, end, flags);
2801                if (PageSwapCache(head)) {
2802                        swp_entry_t entry = { .val = page_private(head) };
2803
2804                        ret = split_swap_cluster(entry);
2805                } else
2806                        ret = 0;
2807        } else {
2808                if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
2809                        pr_alert("total_mapcount: %u, page_count(): %u\n",
2810                                        mapcount, count);
2811                        if (PageTail(page))
2812                                dump_page(head, NULL);
2813                        dump_page(page, "total_mapcount(head) > 0");
2814                        BUG();
2815                }
2816                spin_unlock(&ds_queue->split_queue_lock);
2817fail:           if (mapping)
2818                        xa_unlock(&mapping->i_pages);
2819                spin_unlock_irqrestore(&pgdata->lru_lock, flags);
2820                remap_page(head);
2821                ret = -EBUSY;
2822        }
2823
2824out_unlock:
2825        if (anon_vma) {
2826                anon_vma_unlock_write(anon_vma);
2827                put_anon_vma(anon_vma);
2828        }
2829        if (mapping)
2830                i_mmap_unlock_read(mapping);
2831out:
2832        count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
2833        return ret;
2834}
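
/*
 * Usage sketch: split_huge_page(), defined in <linux/huge_mm.h>, is a thin
 * wrapper around split_huge_page_to_list(page, NULL). A typical caller pins
 * the page, takes the page lock, and treats a non-zero return as "could not
 * split", as deferred_split_scan() and split_huge_pages_set() below do:
 *
 *	if (get_page_unless_zero(page)) {
 *		if (trylock_page(page)) {
 *			if (!split_huge_page(page))
 *				split++;
 *			unlock_page(page);
 *		}
 *		put_page(page);
 *	}
 */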
2835
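/*
 * Used as the compound page destructor for THPs: make sure the page is no
 * longer queued for deferred splitting before handing it to
 * free_compound_page().
 */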
2836void free_transhuge_page(struct page *page)
2837{
2838        struct deferred_split *ds_queue = get_deferred_split_queue(page);
2839        unsigned long flags;
2840
2841        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
2842        if (!list_empty(page_deferred_list(page))) {
2843                ds_queue->split_queue_len--;
2844                list_del(page_deferred_list(page));
2845        }
2846        spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
2847        free_compound_page(page);
2848}
2849
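/*
 * Queue a THP on the deferred split queue (per-node, or per-memcg for
 * memcg-charged pages), so that the shrinker below can split it later under
 * memory pressure. Callers queue a THP here when keeping it intact no longer
 * pays off, e.g. after it has become partially unmapped.
 */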
2850void deferred_split_huge_page(struct page *page)
2851{
2852        struct deferred_split *ds_queue = get_deferred_split_queue(page);
2853#ifdef CONFIG_MEMCG
2854        struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
2855#endif
2856        unsigned long flags;
2857
2858        VM_BUG_ON_PAGE(!PageTransHuge(page), page);
2859
2860        /*
2861         * The try_to_unmap() in the page reclaim path might reach here
2862         * too; racing with it could corrupt the deferred split queue.
2863         * Besides, if page reclaim is already handling the same page,
2864         * there is no need to handle it again in the shrinker.
2865         *
2866         * Check PageSwapCache to determine if the page is being handled
2867         * by page reclaim, since THP swap adds the page to the swap
2868         * cache before calling try_to_unmap().
2869         */
2870        if (PageSwapCache(page))
2871                return;
2872
2873        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
2874        if (list_empty(page_deferred_list(page))) {
2875                count_vm_event(THP_DEFERRED_SPLIT_PAGE);
2876                list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
2877                ds_queue->split_queue_len++;
2878#ifdef CONFIG_MEMCG
2879                if (memcg)
2880                        memcg_set_shrinker_bit(memcg, page_to_nid(page),
2881                                               deferred_split_shrinker.id);
2882#endif
2883        }
2884        spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
2885}
2886
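/*
 * Shrinker ->count_objects callback: report how many THPs are currently
 * queued for deferred splitting on the relevant queue (per-node, or
 * per-memcg for memcg-aware reclaim).
 */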
2887static unsigned long deferred_split_count(struct shrinker *shrink,
2888                struct shrink_control *sc)
2889{
2890        struct pglist_data *pgdata = NODE_DATA(sc->nid);
2891        struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
2892
2893#ifdef CONFIG_MEMCG
2894        if (sc->memcg)
2895                ds_queue = &sc->memcg->deferred_split_queue;
2896#endif
2897        return READ_ONCE(ds_queue->split_queue_len);
2898}
2899
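/*
 * Shrinker ->scan_objects callback, in two passes: first, under the queue
 * lock, pin up to sc->nr_to_scan queued head pages and move them to a
 * private list (dropping entries whose refcount already hit zero); then,
 * with the lock released, trylock and split each page. Whatever could not
 * be split is spliced back onto the queue.
 */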
2900static unsigned long deferred_split_scan(struct shrinker *shrink,
2901                struct shrink_control *sc)
2902{
2903        struct pglist_data *pgdata = NODE_DATA(sc->nid);
2904        struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
2905        unsigned long flags;
2906        LIST_HEAD(list), *pos, *next;
2907        struct page *page;
2908        int split = 0;
2909
2910#ifdef CONFIG_MEMCG
2911        if (sc->memcg)
2912                ds_queue = &sc->memcg->deferred_split_queue;
2913#endif
2914
2915        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
2916        /* Take a pin on all head pages to avoid them being freed under us */
2917        list_for_each_safe(pos, next, &ds_queue->split_queue) {
2918                page = list_entry((void *)pos, struct page, mapping);
2919                page = compound_head(page);
2920                if (get_page_unless_zero(page)) {
2921                        list_move(page_deferred_list(page), &list);
2922                } else {
2923                        /* We lost the race with put_compound_page() */
2924                        list_del_init(page_deferred_list(page));
2925                        ds_queue->split_queue_len--;
2926                }
2927                if (!--sc->nr_to_scan)
2928                        break;
2929        }
2930        spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
2931
2932        list_for_each_safe(pos, next, &list) {
2933                page = list_entry((void *)pos, struct page, mapping);
2934                if (!trylock_page(page))
2935                        goto next;
2936                /* split_huge_page() removes page from list on success */
2937                if (!split_huge_page(page))
2938                        split++;
2939                unlock_page(page);
2940next:
2941                put_page(page);
2942        }
2943
2944        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
2945        list_splice_tail(&list, &ds_queue->split_queue);
2946        spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
2947
2948        /*
2949         * Stop the shrinker if we didn't split any page and the queue is empty.
2950         * This can happen if the pages were freed under us.
2951         */
2952        if (!split && list_empty(&ds_queue->split_queue))
2953                return SHRINK_STOP;
2954        return split;
2955}
2956
2957static struct shrinker deferred_split_shrinker = {
2958        .count_objects = deferred_split_count,
2959        .scan_objects = deferred_split_scan,
2960        .seeks = DEFAULT_SEEKS,
2961        .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
2962                 SHRINKER_NONSLAB,
2963};
2964
2965#ifdef CONFIG_DEBUG_FS
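/*
 * Debugfs test knob: writing 1 to <debugfs>/split_huge_pages walks every
 * populated zone and tries to split each THP found on the LRU; any other
 * value is rejected with -EINVAL. For example, assuming debugfs is mounted
 * at /sys/kernel/debug:
 *
 *	echo 1 > /sys/kernel/debug/split_huge_pages
 *
 * The outcome is reported via pr_info() as "<split> of <total> THP split".
 */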
2966static int split_huge_pages_set(void *data, u64 val)
2967{
2968        struct zone *zone;
2969        struct page *page;
2970        unsigned long pfn, max_zone_pfn;
2971        unsigned long total = 0, split = 0;
2972
2973        if (val != 1)
2974                return -EINVAL;
2975
2976        for_each_populated_zone(zone) {
2977                max_zone_pfn = zone_end_pfn(zone);
2978                for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
2979                        if (!pfn_valid(pfn))
2980                                continue;
2981
2982                        page = pfn_to_page(pfn);
2983                        if (!get_page_unless_zero(page))
2984                                continue;
2985
2986                        if (zone != page_zone(page))
2987                                goto next;
2988
2989                        if (!PageHead(page) || PageHuge(page) || !PageLRU(page))
2990                                goto next;
2991
2992                        total++;
2993                        lock_page(page);
2994                        if (!split_huge_page(page))
2995                                split++;
2996                        unlock_page(page);
2997next:
2998                        put_page(page);
2999                }
3000        }
3001
3002        pr_info("%lu of %lu THP split\n", split, total);
3003
3004        return 0;
3005}
3006DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
3007                "%llu\n");
3008
3009static int __init split_huge_pages_debugfs(void)
3010{
3011        debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
3012                            &split_huge_pages_fops);
3013        return 0;
3014}
3015late_initcall(split_huge_pages_debugfs);
3016#endif
3017
3018#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
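/*
 * Replace a mapped huge PMD with a PMD migration entry: invalidate the old
 * PMD, transfer its dirty bit to the page, and encode the page together
 * with the write permission and soft-dirty state into a swap-style PMD.
 * The rmap and the mapping's page reference are dropped here and are
 * re-established by remove_migration_pmd() once migration completes.
 */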
3019void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
3020                struct page *page)
3021{
3022        struct vm_area_struct *vma = pvmw->vma;
3023        struct mm_struct *mm = vma->vm_mm;
3024        unsigned long address = pvmw->address;
3025        pmd_t pmdval;
3026        swp_entry_t entry;
3027        pmd_t pmdswp;
3028
3029        if (!(pvmw->pmd && !pvmw->pte))
3030                return;
3031
3032        flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
3033        pmdval = *pvmw->pmd;
3034        pmdp_invalidate(vma, address, pvmw->pmd);
3035        if (pmd_dirty(pmdval))
3036                set_page_dirty(page);
3037        entry = make_migration_entry(page, pmd_write(pmdval));
3038        pmdswp = swp_entry_to_pmd(entry);
3039        if (pmd_soft_dirty(pmdval))
3040                pmdswp = pmd_swp_mksoft_dirty(pmdswp);
3041        set_pmd_at(mm, address, pvmw->pmd, pmdswp);
3042        page_remove_rmap(page, true);
3043        put_page(page);
3044}
3045
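/*
 * Counterpart of set_pmd_migration_entry(): once migration completes,
 * rebuild a huge PMD pointing at the new page, restoring write permission,
 * soft-dirty state, the rmap and mlock as appropriate.
 */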
3046void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
3047{
3048        struct vm_area_struct *vma = pvmw->vma;
3049        struct mm_struct *mm = vma->vm_mm;
3050        unsigned long address = pvmw->address;
3051        unsigned long mmun_start = address & HPAGE_PMD_MASK;
3052        pmd_t pmde;
3053        swp_entry_t entry;
3054
3055        if (!(pvmw->pmd && !pvmw->pte))
3056                return;
3057
3058        entry = pmd_to_swp_entry(*pvmw->pmd);
3059        get_page(new);
3060        pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
3061        if (pmd_swp_soft_dirty(*pvmw->pmd))
3062                pmde = pmd_mksoft_dirty(pmde);
3063        if (is_write_migration_entry(entry))
3064                pmde = maybe_pmd_mkwrite(pmde, vma);
3065
3066        flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
3067        if (PageAnon(new))
3068                page_add_anon_rmap(new, vma, mmun_start, true);
3069        else
3070                page_add_file_rmap(new, true);
3071        set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
3072        if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new))
3073                mlock_vma_page(new);
3074        update_mmu_cache_pmd(vma, address, pvmw->pmd);
3075}
3076#endif
3077