linux/mm/hugetlb_vmemmap.c
// SPDX-License-Identifier: GPL-2.0
/*
 * HugeTLB Vmemmap Optimization (HVO)
 *
 * Copyright (c) 2020, ByteDance. All rights reserved.
 *
 *     Author: Muchun Song <songmuchun@bytedance.com>
 *
 * See Documentation/mm/vmemmap_dedup.rst
 */
#define pr_fmt(fmt)     "HugeTLB: " fmt

#include <linux/pgtable.h>
#include <linux/moduleparam.h>
#include <linux/bootmem_info.h>
#include <linux/mmdebug.h>
#include <linux/pagewalk.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "hugetlb_vmemmap.h"

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:          called for each lowest-level entry (PTE).
 * @nr_walked:          the number of PTEs walked.
 * @reuse_page:         the page which is reused for the tail vmemmap pages.
 * @reuse_addr:         the virtual address of the @reuse_page page.
 * @vmemmap_pages:      the list head of the vmemmap pages that can be freed
 *                      (when remapping) or remapped from (when restoring).
 * @flags:              used to modify behavior in vmemmap page table walking
 *                      operations.
 */
struct vmemmap_remap_walk {
        void                    (*remap_pte)(pte_t *pte, unsigned long addr,
                                             struct vmemmap_remap_walk *walk);
        unsigned long           nr_walked;
        struct page             *reuse_page;
        unsigned long           reuse_addr;
        struct list_head        *vmemmap_pages;

/* Skip the TLB flush when we split the PMD */
#define VMEMMAP_SPLIT_NO_TLB_FLUSH      BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH      BIT(1)
/* synchronize_rcu() to avoid writes from page_ref_add_unless() */
#define VMEMMAP_SYNCHRONIZE_RCU         BIT(2)
        unsigned long           flags;
};

static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
                             struct vmemmap_remap_walk *walk)
{
        pmd_t __pmd;
        int i;
        unsigned long addr = start;
        pte_t *pgtable;

        pgtable = pte_alloc_one_kernel(&init_mm);
        if (!pgtable)
                return -ENOMEM;

        pmd_populate_kernel(&init_mm, &__pmd, pgtable);

        for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
                pte_t entry, *pte;
                pgprot_t pgprot = PAGE_KERNEL;

                entry = mk_pte(head + i, pgprot);
                pte = pte_offset_kernel(&__pmd, addr);
                set_pte_at(&init_mm, addr, pte, entry);
        }

        spin_lock(&init_mm.page_table_lock);
        if (likely(pmd_leaf(*pmd))) {
                /*
                 * Higher order allocations from the buddy allocator must be
                 * able to be treated as independent small pages (as they can
                 * be freed individually).
                 */
                if (!PageReserved(head))
                        split_page(head, get_order(PMD_SIZE));

                /* Make pte visible before pmd. See comment in pmd_install(). */
                smp_wmb();
                pmd_populate_kernel(&init_mm, pmd, pgtable);
                if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
                        flush_tlb_kernel_range(start, start + PMD_SIZE);
        } else {
                pte_free_kernel(&init_mm, pgtable);
        }
        spin_unlock(&init_mm.page_table_lock);

        return 0;
}
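
/*
 * Illustrative arithmetic for the split above (a sketch assuming x86-64
 * defaults, not something this file defines): with PAGE_SIZE = 4 KiB,
 * PMD_SIZE = 2 MiB and PTRS_PER_PTE = 512, one leaf PMD covering 2 MiB of
 * vmemmap is replaced by a table of 512 PTEs, where PTE i maps the 4 KiB
 * page "head + i" at the same virtual address as before. The translation
 * is unchanged; only its granularity drops from 2 MiB to 4 KiB, which is
 * what later allows individual tail vmemmap pages to be remapped.
 */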

static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
                             unsigned long next, struct mm_walk *walk)
{
        int ret = 0;
        struct page *head;
        struct vmemmap_remap_walk *vmemmap_walk = walk->private;

        /* Only splitting, not remapping the vmemmap pages. */
        if (!vmemmap_walk->remap_pte)
                walk->action = ACTION_CONTINUE;

        spin_lock(&init_mm.page_table_lock);
        head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
        /*
         * Due to HugeTLB alignment requirements and the vmemmap pages being
         * at the start of the hotplugged memory region in the
         * memory_hotplug.memmap_on_memory case, it is sufficient to check
         * whether the page backing the first vmemmap page is self-hosted.
         *
         * [                  hotplugged memory                  ]
         * [        section        ][...][        section        ]
         * [ vmemmap ][              usable memory               ]
         *   ^  | ^                        |
         *   +--+ |                        |
         *        +------------------------+
         */
        if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
                struct page *page = head ? head + pte_index(addr) :
                                    pte_page(ptep_get(pte_offset_kernel(pmd, addr)));

                if (PageVmemmapSelfHosted(page))
                        ret = -ENOTSUPP;
        }
        spin_unlock(&init_mm.page_table_lock);
        if (!head || ret)
                return ret;

        return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
}

static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
                             unsigned long next, struct mm_walk *walk)
{
        struct vmemmap_remap_walk *vmemmap_walk = walk->private;

        /*
         * The reuse page is encountered first in the page table walk,
         * before any remapping starts.
         */
        if (!vmemmap_walk->reuse_page)
                vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
        else
                vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
        vmemmap_walk->nr_walked++;

        return 0;
}
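
/*
 * A sketch of the walk order relied upon above (illustrative only):
 * callers always walk a range of the form [reuse, end) with
 * reuse == start - PAGE_SIZE, so the very first PTE visited maps the
 * page at @reuse. That first visit only records walk->reuse_page; every
 * subsequent PTE in [start, end) is then handed to walk->remap_pte()
 * with the reuse page already known.
 */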

static const struct mm_walk_ops vmemmap_remap_ops = {
        .pmd_entry      = vmemmap_pmd_entry,
        .pte_entry      = vmemmap_pte_entry,
};

static int vmemmap_remap_range(unsigned long start, unsigned long end,
                               struct vmemmap_remap_walk *walk)
{
        int ret;

        VM_BUG_ON(!PAGE_ALIGNED(start | end));

        mmap_read_lock(&init_mm);
        ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops,
                                    NULL, walk);
        mmap_read_unlock(&init_mm);
        if (ret)
                return ret;

        if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
                flush_tlb_kernel_range(start, end);

        return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, the page
 * was allocated from the memblock allocator; free it via free_bootmem_page().
 * Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
        if (PageReserved(page)) {
                memmap_boot_pages_add(-1);
                free_bootmem_page(page);
        } else {
                memmap_pages_add(-1);
                __free_page(page);
        }
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
        struct page *page, *next;

        list_for_each_entry_safe(page, next, list, lru)
                free_vmemmap_page(page);
}

static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
                              struct vmemmap_remap_walk *walk)
{
        /*
         * Remap the tail pages as read-only to catch illegal write operations
         * to the tail pages.
         */
        pgprot_t pgprot = PAGE_KERNEL_RO;
        struct page *page = pte_page(ptep_get(pte));
        pte_t entry;

        /* Remapping the head page requires r/w */
        if (unlikely(addr == walk->reuse_addr)) {
                pgprot = PAGE_KERNEL;
                list_del(&walk->reuse_page->lru);

                /*
                 * Makes sure that preceding stores to the page contents from
                 * vmemmap_remap_free() become visible before the set_pte_at()
                 * write.
                 */
                smp_wmb();
        }

        entry = mk_pte(walk->reuse_page, pgprot);
        list_add(&page->lru, walk->vmemmap_pages);
        set_pte_at(&init_mm, addr, pte, entry);
}
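
/*
 * Illustrative layout (a sketch assuming x86-64 defaults of 4 KiB pages
 * and a 64-byte struct page, so a 2 MiB HugeTLB folio has 512 * 64 bytes
 * = 8 pages of vmemmap):
 *
 *   before:  vmemmap PTE 0..7  ->  pages 0..7       (8 backing pages)
 *   after:   vmemmap PTE 0     ->  reuse page (RW)
 *            vmemmap PTE 1..7  ->  reuse page (RO)  (7 pages freed)
 *
 * In other words, one backing page is kept per folio and the rest are
 * released: roughly 28 KiB saved per 2 MiB page, and about 16 MiB per
 * 1 GiB page, under these assumptions.
 */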

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid values
 * will be checked in free_tail_page_prepare(). In order to avoid the
 * "corrupted mapping in tail page" message, we need to reset at least 4
 * struct page structs (one head struct page and three tail struct pages).
 */
#define NR_RESET_STRUCT_PAGE            4

static inline void reset_struct_pages(struct page *start)
{
        struct page *from = start + NR_RESET_STRUCT_PAGE;

        BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
        memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
}
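
/*
 * Worked check of the BUILD_BUG_ON above (a sketch, assuming a 64-byte
 * struct page and 4 KiB pages): one page holds 4096 / 64 = 64 struct
 * pages, and the assertion requires 4 * 2 = 8 <= 64. That guarantees the
 * source structs [4..7], which the memcpy() copies over structs [0..3],
 * sit in the same freshly copied page as the destination, so they contain
 * clean tail-page data rather than stale head-page metadata.
 */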

static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
                                struct vmemmap_remap_walk *walk)
{
        pgprot_t pgprot = PAGE_KERNEL;
        struct page *page;
        void *to;

        BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);

        page = list_first_entry(walk->vmemmap_pages, struct page, lru);
        list_del(&page->lru);
        to = page_to_virt(page);
        copy_page(to, (void *)walk->reuse_addr);
        reset_struct_pages(to);

        /*
         * Makes sure that preceding stores to the page contents become visible
         * before the set_pte_at() write.
         */
        smp_wmb();
        set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_split - split the PMDs backing the vmemmap virtual address
 *                      range [@start, @end) into PTE-level mappings
 * @start:     start address of the vmemmap virtual address range that we want
 *             to remap.
 * @end:       end address of the vmemmap virtual address range that we want to
 *             remap.
 * @reuse:     reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_split(unsigned long start, unsigned long end,
                               unsigned long reuse)
{
        struct vmemmap_remap_walk walk = {
                .remap_pte      = NULL,
                .flags          = VMEMMAP_SPLIT_NO_TLB_FLUSH,
        };

        /* See the comment in vmemmap_remap_free(). */
        BUG_ON(start - reuse != PAGE_SIZE);

        return vmemmap_remap_range(reuse, end, &walk);
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *                      to the page which @reuse is mapped to, then free the
 *                      vmemmap pages which the range was mapped to.
 * @start:      start address of the vmemmap virtual address range that we want
 *              to remap.
 * @end:        end address of the vmemmap virtual address range that we want to
 *              remap.
 * @reuse:      reuse address.
 * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is the
 *              caller's responsibility to free the pages.
 * @flags:      modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_free(unsigned long start, unsigned long end,
                              unsigned long reuse,
                              struct list_head *vmemmap_pages,
                              unsigned long flags)
{
        int ret;
        struct vmemmap_remap_walk walk = {
                .remap_pte      = vmemmap_remap_pte,
                .reuse_addr     = reuse,
                .vmemmap_pages  = vmemmap_pages,
                .flags          = flags,
        };
        int nid = page_to_nid((struct page *)reuse);
        gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;

        /*
         * Allocate a new head vmemmap page to avoid breaking a contiguous
         * block of struct page memory when freeing it back to the page
         * allocator in free_vmemmap_page_list(). This keeps the likely
         * contiguous struct page backing memory contiguous, allowing for
         * more allocations of hugepages. Fall back to the currently mapped
         * head page should the allocation fail.
         */
        walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
        if (walk.reuse_page) {
                copy_page(page_to_virt(walk.reuse_page),
                          (void *)walk.reuse_addr);
                list_add(&walk.reuse_page->lru, vmemmap_pages);
                memmap_pages_add(1);
        }

        /*
         * In order to make the remapping routine most efficient for huge
         * pages, the vmemmap page table walking routine obeys the following
         * rules (see more details in vmemmap_pte_entry()):
         *
         * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
         *   must be contiguous.
         * - The @reuse address is part of the range [@reuse, @end) that we are
         *   walking, which is passed to vmemmap_remap_range().
         * - The @reuse address is the first address in the complete range.
         *
         * So we need to make sure that @start and @reuse meet the above rules.
         */
        BUG_ON(start - reuse != PAGE_SIZE);

        ret = vmemmap_remap_range(reuse, end, &walk);
        if (ret && walk.nr_walked) {
                end = reuse + walk.nr_walked * PAGE_SIZE;
                /*
                 * vmemmap_pages contains pages from the previous
                 * vmemmap_remap_range call which failed.  These
                 * are pages which were removed from the vmemmap.
                 * They will be restored in the following call.
                 */
                walk = (struct vmemmap_remap_walk) {
                        .remap_pte      = vmemmap_restore_pte,
                        .reuse_addr     = reuse,
                        .vmemmap_pages  = vmemmap_pages,
                        .flags          = 0,
                };

                vmemmap_remap_range(reuse, end, &walk);
        }

        return ret;
}
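
/*
 * Concrete instance of the rules above (illustrative, assuming 4 KiB
 * pages): if a folio's vmemmap begins at virtual address V, callers pass
 * reuse = V and start = V + PAGE_SIZE, so BUG_ON(start - reuse !=
 * PAGE_SIZE) holds and the walk over [V, end) visits the reuse page
 * first. On a partial failure after nr_walked PTEs, the recovery pass
 * re-walks exactly [V, V + nr_walked * PAGE_SIZE): its first page is
 * again the reuse page, and the remainder are precisely the PTEs that
 * had already been remapped.
 */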

static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
                                   struct list_head *list)
{
        gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
        unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
        int nid = page_to_nid((struct page *)start);
        struct page *page, *next;
        int i;

        for (i = 0; i < nr_pages; i++) {
                page = alloc_pages_node(nid, gfp_mask, 0);
                if (!page)
                        goto out;
                list_add(&page->lru, list);
        }
        memmap_pages_add(nr_pages);

        return 0;
out:
        list_for_each_entry_safe(page, next, list, lru)
                __free_page(page);
        return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *                       to pages taken from a freshly allocated list of
 *                       vmemmap pages, one page per PTE.
 * @start:      start address of the vmemmap virtual address range that we want
 *              to remap.
 * @end:        end address of the vmemmap virtual address range that we want to
 *              remap.
 * @reuse:      reuse address.
 * @flags:      modifications to vmemmap_remap_walk flags
 *
 * Return: %0 on success, negative error code otherwise.
 */
static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
                               unsigned long reuse, unsigned long flags)
{
        LIST_HEAD(vmemmap_pages);
        struct vmemmap_remap_walk walk = {
                .remap_pte      = vmemmap_restore_pte,
                .reuse_addr     = reuse,
                .vmemmap_pages  = &vmemmap_pages,
                .flags          = flags,
        };

        /* See the comment in vmemmap_remap_free(). */
        BUG_ON(start - reuse != PAGE_SIZE);

        if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
                return -ENOMEM;

        return vmemmap_remap_range(reuse, end, &walk);
}

DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);

static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
static int __init hugetlb_vmemmap_optimize_param(char *buf)
{
        return kstrtobool(buf, &vmemmap_optimize_enabled);
}
early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_optimize_param);
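
/*
 * Usage note (illustrative, not part of the code): the early parameter
 * above overrides the Kconfig default on the kernel command line, e.g.
 *
 *     hugetlb_free_vmemmap=on       # enable HVO
 *     hugetlb_free_vmemmap=off      # disable HVO
 *
 * kstrtobool() also accepts the usual 1/0 and y/n spellings.
 */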

static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
                                           struct folio *folio, unsigned long flags)
{
        int ret;
        unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
        unsigned long vmemmap_reuse;

        VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

        if (!folio_test_hugetlb_vmemmap_optimized(folio))
                return 0;

        if (flags & VMEMMAP_SYNCHRONIZE_RCU)
                synchronize_rcu();

        vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
        vmemmap_reuse   = vmemmap_start;
        vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;

        /*
         * The pages which the vmemmap virtual address range [@vmemmap_start,
         * @vmemmap_end) is mapped to were freed to the buddy allocator, and
         * the range was remapped to the page which @vmemmap_reuse is mapped
         * to. When a HugeTLB page is freed to the buddy allocator, the
         * previously discarded vmemmap pages must be allocated and remapped.
         */
        ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
        if (!ret) {
                folio_clear_hugetlb_vmemmap_optimized(folio);
                static_branch_dec(&hugetlb_optimize_vmemmap_key);
        }

        return ret;
}
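
/*
 * Worked example for the restore path (a sketch, assuming x86-64 defaults
 * and HUGETLB_VMEMMAP_RESERVE_SIZE == PAGE_SIZE): for a 2 MiB folio whose
 * vmemmap begins at V, hugetlb_vmemmap_size(h) is 512 * 64 bytes = 32 KiB,
 * so vmemmap_reuse = V, vmemmap_start = V + 4 KiB and vmemmap_end =
 * V + 32 KiB. vmemmap_remap_alloc() then allocates the seven missing
 * pages and points the PTEs for [V + 4 KiB, V + 32 KiB) away from the
 * shared reuse page onto the fresh copies.
 */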

/**
 * hugetlb_vmemmap_restore_folio - restore previously optimized (by
 *                              hugetlb_vmemmap_optimize_folio()) vmemmap pages which
 *                              will be reallocated and remapped.
 * @h:          struct hstate.
 * @folio:      the folio whose vmemmap pages will be restored.
 *
 * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
 * negative error code otherwise.
 */
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
        return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU);
}

/**
 * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
 * @h:                  hstate.
 * @folio_list:         list of folios.
 * @non_hvo_folios:     Output list of folios for which vmemmap exists.
 *
 * Return: number of folios for which vmemmap was restored, or an error code
 *              if an error was encountered restoring vmemmap for a folio.
 *              Folios that have vmemmap are moved to the non_hvo_folios
 *              list.  Processing of entries stops when the first error is
 *              encountered. The folio that experienced the error and all
 *              non-processed folios will remain on folio_list.
 */
long hugetlb_vmemmap_restore_folios(const struct hstate *h,
                                        struct list_head *folio_list,
                                        struct list_head *non_hvo_folios)
{
        struct folio *folio, *t_folio;
        long restored = 0;
        long ret = 0;
        unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;

        list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
                if (folio_test_hugetlb_vmemmap_optimized(folio)) {
                        ret = __hugetlb_vmemmap_restore_folio(h, folio, flags);
                        /* only need to synchronize_rcu() once for each batch */
                        flags &= ~VMEMMAP_SYNCHRONIZE_RCU;

                        if (ret)
                                break;
                        restored++;
                }

                /* Add non-optimized folios to output list */
                list_move(&folio->lru, non_hvo_folios);
        }

        if (restored)
                flush_tlb_all();
        if (!ret)
                ret = restored;
        return ret;
}
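
/*
 * Batching note (illustrative): because every call above passes
 * VMEMMAP_REMAP_NO_TLB_FLUSH, restoring N folios costs one
 * flush_tlb_all() at the end rather than one ranged flush per folio,
 * and clearing VMEMMAP_SYNCHRONIZE_RCU after the first folio limits the
 * batch to a single synchronize_rcu() grace-period wait as well.
 */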

/* Return true iff a HugeTLB folio's vmemmap should and can be optimized. */
static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
{
        if (folio_test_hugetlb_vmemmap_optimized(folio))
                return false;

        if (!READ_ONCE(vmemmap_optimize_enabled))
                return false;

        if (!hugetlb_vmemmap_optimizable(h))
                return false;

        return true;
}

static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
                                            struct folio *folio,
                                            struct list_head *vmemmap_pages,
                                            unsigned long flags)
{
        int ret = 0;
        unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
        unsigned long vmemmap_reuse;

        VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
        VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);

        if (!vmemmap_should_optimize_folio(h, folio))
                return ret;

        static_branch_inc(&hugetlb_optimize_vmemmap_key);

        if (flags & VMEMMAP_SYNCHRONIZE_RCU)
                synchronize_rcu();
        /*
         * Very Subtle
         * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
         * immediately after remapping.  As a result, subsequent accesses
         * and modifications to struct pages associated with the hugetlb
         * page could be to the OLD struct pages.  Set the vmemmap optimized
         * flag here so that it is copied to the new head page.  This keeps
         * the old and new struct pages in sync.
         * If there is an error during optimization, we will immediately FLUSH
         * the TLB and clear the flag below.
         */
        folio_set_hugetlb_vmemmap_optimized(folio);

        vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
        vmemmap_reuse   = vmemmap_start;
        vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;

        /*
         * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
         * to the page which @vmemmap_reuse is mapped to.  Add pages previously
         * mapping the range to vmemmap_pages list so that they can be freed by
         * the caller.
         */
        ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
                                 vmemmap_pages, flags);
        if (ret) {
                static_branch_dec(&hugetlb_optimize_vmemmap_key);
                folio_clear_hugetlb_vmemmap_optimized(folio);
        }

        return ret;
}

/**
 * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
 * @h:          struct hstate.
 * @folio:      the folio whose vmemmap pages will be optimized.
 *
 * This function only tries to optimize @folio's vmemmap pages and does not
 * guarantee that the optimization will succeed after it returns. The caller
 * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
 * vmemmap pages have been optimized.
 */
void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
        LIST_HEAD(vmemmap_pages);

        __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU);
        free_vmemmap_page_list(&vmemmap_pages);
}

static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
{
        unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
        unsigned long vmemmap_reuse;

        if (!vmemmap_should_optimize_folio(h, folio))
                return 0;

        vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
        vmemmap_reuse   = vmemmap_start;
        vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;

        /*
         * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
         * @vmemmap_end).
         */
        return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
}

static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
                                              struct list_head *folio_list,
                                              bool boot)
{
        struct folio *folio;
        int nr_to_optimize;
        LIST_HEAD(vmemmap_pages);
        unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;

        nr_to_optimize = 0;
        list_for_each_entry(folio, folio_list, lru) {
                int ret;
                unsigned long spfn, epfn;

                if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) {
                        /*
                         * Already optimized by pre-HVO, just map the
                         * mirrored tail page structs RO.
                         */
                        spfn = (unsigned long)&folio->page;
                        epfn = spfn + pages_per_huge_page(h);
                        vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio),
                                        HUGETLB_VMEMMAP_RESERVE_SIZE);
                        register_page_bootmem_memmap(pfn_to_section_nr(spfn),
                                        &folio->page,
                                        HUGETLB_VMEMMAP_RESERVE_SIZE);
                        static_branch_inc(&hugetlb_optimize_vmemmap_key);
                        continue;
                }

                nr_to_optimize++;

                ret = hugetlb_vmemmap_split_folio(h, folio);

                /*
                 * Splitting the PMD requires allocating a page, so let's fail
                 * early once we encounter the first OOM. There is no point in
                 * retrying, as it can be done dynamically on remap with the
                 * memory we get back from the vmemmap deduplication.
                 */
                if (ret == -ENOMEM)
                        break;
        }

        if (!nr_to_optimize)
                /*
                 * All pre-HVO folios, nothing left to do. It's ok if
                 * there is a mix of pre-HVO and not yet HVO-ed folios
                 * here, as __hugetlb_vmemmap_optimize_folio() will
                 * skip any folios that already have the optimized flag
                 * set, see vmemmap_should_optimize_folio().
                 */
                goto out;

        flush_tlb_all();

        list_for_each_entry(folio, folio_list, lru) {
                int ret;

                ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
                /* only need to synchronize_rcu() once for each batch */
                flags &= ~VMEMMAP_SYNCHRONIZE_RCU;

                /*
                 * Pages to be freed may have been accumulated.  If we
                 * encounter an ENOMEM, free what we have and try again.
                 * This can occur when splitting failed halfway and the
                 * head page allocation also failed. In that case
                 * __hugetlb_vmemmap_optimize_folio() would free memory,
                 * allowing more vmemmap remaps to occur.
                 */
                if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
                        flush_tlb_all();
                        free_vmemmap_page_list(&vmemmap_pages);
                        INIT_LIST_HEAD(&vmemmap_pages);
                        __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
                }
        }

out:
        flush_tlb_all();
        free_vmemmap_page_list(&vmemmap_pages);
}
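
/*
 * Flow summary for the batch optimizer above (illustrative): pass 1 walks
 * the list and pre-splits vmemmap PMDs with VMEMMAP_SPLIT_NO_TLB_FLUSH set;
 * a single flush_tlb_all() then publishes all the splits at once. Pass 2
 * remaps each folio with VMEMMAP_REMAP_NO_TLB_FLUSH, so the whole batch is
 * covered by the final flush_tlb_all() before the accumulated vmemmap
 * pages are freed.
 */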

void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
{
        __hugetlb_vmemmap_optimize_folios(h, folio_list, false);
}

void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list)
{
        __hugetlb_vmemmap_optimize_folios(h, folio_list, true);
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT

/* Return true if a bootmem-allocated HugeTLB page should be pre-HVO-ed */
static bool vmemmap_should_optimize_bootmem_page(struct huge_bootmem_page *m)
{
        unsigned long section_size, psize, pmd_vmemmap_size;
        phys_addr_t paddr;

        if (!READ_ONCE(vmemmap_optimize_enabled))
                return false;

        if (!hugetlb_vmemmap_optimizable(m->hstate))
                return false;

        psize = huge_page_size(m->hstate);
        paddr = virt_to_phys(m);

        /*
         * Pre-HVO only works if the bootmem huge page
         * is aligned to the section size.
         */
        section_size = (1UL << PA_SECTION_SHIFT);
        if (!IS_ALIGNED(paddr, section_size) ||
            !IS_ALIGNED(psize, section_size))
                return false;

        /*
         * The pre-HVO code does not deal with splitting PMDs,
         * so the bootmem page must be aligned to the number
         * of base pages that can be mapped with one vmemmap PMD.
         */
        pmd_vmemmap_size = (PMD_SIZE / (sizeof(struct page))) << PAGE_SHIFT;
        if (!IS_ALIGNED(paddr, pmd_vmemmap_size) ||
            !IS_ALIGNED(psize, pmd_vmemmap_size))
                return false;

        return true;
}
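
/*
 * Alignment arithmetic for the check above (a sketch, assuming x86-64:
 * PMD_SIZE = 2 MiB, sizeof(struct page) = 64 and PAGE_SHIFT = 12): one
 * vmemmap PMD maps 2 MiB / 64 = 32768 struct pages, which describe
 * 32768 * 4 KiB = 128 MiB of memory. A bootmem page therefore qualifies
 * only if it is 128 MiB aligned and sized, which a 1 GiB gigantic page
 * satisfies and a 2 MiB page does not.
 */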

/*
 * Initialize memmap section for a gigantic page, HVO-style.
 */
void __init hugetlb_vmemmap_init_early(int nid)
{
        unsigned long psize, paddr, section_size;
        unsigned long ns, i, pnum, pfn, nr_pages;
        unsigned long start, end;
        struct huge_bootmem_page *m = NULL;
        void *map;

        /*
         * Nothing to do if bootmem pages were not allocated
         * early in boot, or if HVO wasn't enabled in the
         * first place.
         */
        if (!hugetlb_bootmem_allocated())
                return;

        if (!READ_ONCE(vmemmap_optimize_enabled))
                return;

        section_size = (1UL << PA_SECTION_SHIFT);

        list_for_each_entry(m, &huge_boot_pages[nid], list) {
                if (!vmemmap_should_optimize_bootmem_page(m))
                        continue;

                nr_pages = pages_per_huge_page(m->hstate);
                psize = nr_pages << PAGE_SHIFT;
                paddr = virt_to_phys(m);
                pfn = PHYS_PFN(paddr);
                map = pfn_to_page(pfn);
                start = (unsigned long)map;
                end = start + nr_pages * sizeof(struct page);

                if (vmemmap_populate_hvo(start, end, nid,
                                        HUGETLB_VMEMMAP_RESERVE_SIZE) < 0)
                        continue;

                memmap_boot_pages_add(HUGETLB_VMEMMAP_RESERVE_SIZE / PAGE_SIZE);

                pnum = pfn_to_section_nr(pfn);
                ns = psize / section_size;

                for (i = 0; i < ns; i++) {
                        sparse_init_early_section(nid, map, pnum,
                                        SECTION_IS_VMEMMAP_PREINIT);
                        map += section_map_size();
                        pnum++;
                }

                m->flags |= HUGE_BOOTMEM_HVO;
        }
}

void __init hugetlb_vmemmap_init_late(int nid)
{
        struct huge_bootmem_page *m, *tm;
        unsigned long phys, nr_pages, start, end;
        unsigned long pfn, nr_mmap;
        struct hstate *h;
        void *map;

        if (!hugetlb_bootmem_allocated())
                return;

        if (!READ_ONCE(vmemmap_optimize_enabled))
                return;

        list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) {
                if (!(m->flags & HUGE_BOOTMEM_HVO))
                        continue;

                phys = virt_to_phys(m);
                h = m->hstate;
                pfn = PHYS_PFN(phys);
                nr_pages = pages_per_huge_page(h);

                if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
                        /*
                         * Oops, the hugetlb page spans multiple zones.
                         * Remove it from the list, and undo HVO.
                         */
                        list_del(&m->list);

                        map = pfn_to_page(pfn);

                        start = (unsigned long)map;
                        end = start + nr_pages * sizeof(struct page);

                        vmemmap_undo_hvo(start, end, nid,
                                         HUGETLB_VMEMMAP_RESERVE_SIZE);
                        nr_mmap = end - start - HUGETLB_VMEMMAP_RESERVE_SIZE;
                        memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE));

                        memblock_phys_free(phys, huge_page_size(h));
                        continue;
                } else
                        m->flags |= HUGE_BOOTMEM_ZONES_VALID;
        }
}
#endif

static const struct ctl_table hugetlb_vmemmap_sysctls[] = {
        {
                .procname       = "hugetlb_optimize_vmemmap",
                .data           = &vmemmap_optimize_enabled,
                .maxlen         = sizeof(vmemmap_optimize_enabled),
                .mode           = 0644,
                .proc_handler   = proc_dobool,
        },
};
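
/*
 * Usage note (illustrative, not part of the code): once registered, the
 * knob above is reachable at runtime as, e.g.,
 *
 *     sysctl vm.hugetlb_optimize_vmemmap=1
 *     echo 0 > /proc/sys/vm/hugetlb_optimize_vmemmap
 *
 * Toggling it only affects HugeTLB pages allocated or freed afterwards;
 * already-optimized pages are left as they are.
 */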

static int __init hugetlb_vmemmap_init(void)
{
        const struct hstate *h;

        /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
        BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);

        for_each_hstate(h) {
                if (hugetlb_vmemmap_optimizable(h)) {
                        register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
                        break;
                }
        }
        return 0;
}
late_initcall(hugetlb_vmemmap_init);