linux/mm/sparse-vmemmap.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Virtual Memory Map support
 *
 * (C) 2007 sgi. Christoph Lameter.
 *
 * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
 * virt_to_page, page_address() to be implemented as a base offset
 * calculation without memory access.
 *
 * However, virtual mappings need a page table and TLBs. Many Linux
 * architectures already map their physical space using 1-1 mappings
 * via TLBs. For those arches the virtual memory map is essentially
 * for free if we use the same page size as the 1-1 mappings. In that
 * case the overhead consists of a few additional pages that are
 * allocated to create a view of memory for vmemmap.
 *
 * The architecture is expected to provide a vmemmap_populate() function
 * to instantiate the mapping.
 */
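
/*
 * As a simplified illustration of the base-offset calculation described
 * above (not code from this file; the real definitions are per architecture,
 * see e.g. include/asm-generic/memory_model.h for CONFIG_SPARSEMEM_VMEMMAP):
 *
 *      #define __pfn_to_page(pfn)      (vmemmap + (pfn))
 *      #define __page_to_pfn(page)     (unsigned long)((page) - vmemmap)
 *
 * where vmemmap is the base of the virtual memory map, e.g.
 * (struct page *)VMEMMAP_START on x86_64.
 */
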
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/memremap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
#include <linux/pgtable.h>
#include <linux/bootmem_info.h>

#include <asm/dma.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>

/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:          called for each lowest-level entry (PTE).
 * @nr_walked:          the number of PTEs walked so far.
 * @reuse_page:         the page which is reused for the tail vmemmap pages.
 * @reuse_addr:         the virtual address of the @reuse_page page.
 * @vmemmap_pages:      the list head of the vmemmap pages that can be freed
 *                      (when remapping) or that supply the new mappings
 *                      (when restoring).
 */
struct vmemmap_remap_walk {
        void (*remap_pte)(pte_t *pte, unsigned long addr,
                          struct vmemmap_remap_walk *walk);
        unsigned long nr_walked;
        struct page *reuse_page;
        unsigned long reuse_addr;
        struct list_head *vmemmap_pages;
};

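/*
 * Split a huge vmemmap PMD into a table of base-page PTEs: allocate a new
 * PTE page, point each PTE at the corresponding base page of the existing
 * huge mapping, then install the PTE table and flush the TLB for the
 * PMD-sized range.
 */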
static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
                                  struct vmemmap_remap_walk *walk)
{
        pmd_t __pmd;
        int i;
        unsigned long addr = start;
        struct page *page = pmd_page(*pmd);
        pte_t *pgtable = pte_alloc_one_kernel(&init_mm);

        if (!pgtable)
                return -ENOMEM;

        pmd_populate_kernel(&init_mm, &__pmd, pgtable);

        for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) {
                pte_t entry, *pte;
                pgprot_t pgprot = PAGE_KERNEL;

                entry = mk_pte(page + i, pgprot);
                pte = pte_offset_kernel(&__pmd, addr);
                set_pte_at(&init_mm, addr, pte, entry);
        }

        /* Make pte visible before pmd. See comment in __pte_alloc(). */
        smp_wmb();
        pmd_populate_kernel(&init_mm, pmd, pgtable);

        flush_tlb_kernel_range(start, start + PMD_SIZE);

        return 0;
}

static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
                              unsigned long end,
                              struct vmemmap_remap_walk *walk)
{
        pte_t *pte = pte_offset_kernel(pmd, addr);

        /*
         * The reuse_page is found 'first' in the page table walk, before
         * we start remapping (i.e. before calling @walk->remap_pte).
         */
        if (!walk->reuse_page) {
                walk->reuse_page = pte_page(*pte);
                /*
                 * Because the reuse address is part of the range that we are
                 * walking, skip the reuse address range.
                 */
                addr += PAGE_SIZE;
                pte++;
                walk->nr_walked++;
        }

        for (; addr != end; addr += PAGE_SIZE, pte++) {
                walk->remap_pte(pte, addr, walk);
                walk->nr_walked++;
        }
}

static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
                             unsigned long end,
                             struct vmemmap_remap_walk *walk)
{
        pmd_t *pmd;
        unsigned long next;

        pmd = pmd_offset(pud, addr);
        do {
                if (pmd_leaf(*pmd)) {
                        int ret;

                        ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk);
                        if (ret)
                                return ret;
                }
                next = pmd_addr_end(addr, end);
                vmemmap_pte_range(pmd, addr, next, walk);
        } while (pmd++, addr = next, addr != end);

        return 0;
}

static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
                             unsigned long end,
                             struct vmemmap_remap_walk *walk)
{
        pud_t *pud;
        unsigned long next;

        pud = pud_offset(p4d, addr);
        do {
                int ret;

                next = pud_addr_end(addr, end);
                ret = vmemmap_pmd_range(pud, addr, next, walk);
                if (ret)
                        return ret;
        } while (pud++, addr = next, addr != end);

        return 0;
}

static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
                             unsigned long end,
                             struct vmemmap_remap_walk *walk)
{
        p4d_t *p4d;
        unsigned long next;

        p4d = p4d_offset(pgd, addr);
        do {
                int ret;

                next = p4d_addr_end(addr, end);
                ret = vmemmap_pud_range(p4d, addr, next, walk);
                if (ret)
                        return ret;
        } while (p4d++, addr = next, addr != end);

        return 0;
}

static int vmemmap_remap_range(unsigned long start, unsigned long end,
                               struct vmemmap_remap_walk *walk)
{
        unsigned long addr = start;
        unsigned long next;
        pgd_t *pgd;

        VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
        VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));

        pgd = pgd_offset_k(addr);
        do {
                int ret;

                next = pgd_addr_end(addr, end);
                ret = vmemmap_p4d_range(pgd, addr, next, walk);
                if (ret)
                        return ret;
        } while (pgd++, addr = next, addr != end);

        /*
         * We only change the mapping of the vmemmap virtual address range
         * [@start + PAGE_SIZE, @end), so only the TLB entries covering that
         * range need to be flushed.
         */
        flush_tlb_kernel_range(start + PAGE_SIZE, end);

        return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, the page
 * was allocated from the memblock allocator and must be freed via
 * free_bootmem_page(); otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
        if (PageReserved(page))
                free_bootmem_page(page);
        else
                __free_page(page);
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
        struct page *page, *next;

        list_for_each_entry_safe(page, next, list, lru) {
                list_del(&page->lru);
                free_vmemmap_page(page);
        }
}

static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
                              struct vmemmap_remap_walk *walk)
{
        /*
         * Remap the tail pages as read-only to catch illegal write operations
         * to the tail pages.
         */
        pgprot_t pgprot = PAGE_KERNEL_RO;
        pte_t entry = mk_pte(walk->reuse_page, pgprot);
        struct page *page = pte_page(*pte);

        list_add_tail(&page->lru, walk->vmemmap_pages);
        set_pte_at(&init_mm, addr, pte, entry);
}

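/*
 * Reverse of vmemmap_remap_pte(): take one page from walk->vmemmap_pages,
 * copy the contents of the shared (reused) page into it, and point the PTE
 * at the new page with normal read-write kernel permissions.
 */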
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
                                struct vmemmap_remap_walk *walk)
{
        pgprot_t pgprot = PAGE_KERNEL;
        struct page *page;
        void *to;

        BUG_ON(pte_page(*pte) != walk->reuse_page);

        page = list_first_entry(walk->vmemmap_pages, struct page, lru);
        list_del(&page->lru);
        to = page_to_virt(page);
        copy_page(to, (void *)walk->reuse_addr);

        set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *                      to the page which @reuse is mapped to, then free the
 *                      vmemmap pages which the range was mapped to.
 * @start:      start address of the vmemmap virtual address range that we want
 *              to remap.
 * @end:        end address of the vmemmap virtual address range that we want to
 *              remap.
 * @reuse:      reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vmemmap_remap_free(unsigned long start, unsigned long end,
                       unsigned long reuse)
{
        int ret;
        LIST_HEAD(vmemmap_pages);
        struct vmemmap_remap_walk walk = {
                .remap_pte      = vmemmap_remap_pte,
                .reuse_addr     = reuse,
                .vmemmap_pages  = &vmemmap_pages,
        };

        /*
         * To make the remapping routine as efficient as possible for huge
         * pages, the vmemmap page table walk follows these rules (see
         * vmemmap_pte_range() for details):
         *
         * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
         *   must be contiguous.
         * - The @reuse address is part of the range [@reuse, @end) that we are
         *   walking and that is passed to vmemmap_remap_range().
         * - The @reuse address is the first address in that complete range.
         *
         * So we need to make sure that @start and @reuse meet the above rules;
         * see the example below.
         */
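        /*
         * Example with hypothetical values: if @reuse = V, then @start must
         * be V + PAGE_SIZE; with @end = V + 8 * PAGE_SIZE the walk covers
         * [V, V + 8 * PAGE_SIZE), the page mapped at V is reused for the
         * remaining seven vmemmap pages, and those seven pages are freed.
         */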
        BUG_ON(start - reuse != PAGE_SIZE);

        mmap_write_lock(&init_mm);
        ret = vmemmap_remap_range(reuse, end, &walk);
        mmap_write_downgrade(&init_mm);

        if (ret && walk.nr_walked) {
                end = reuse + walk.nr_walked * PAGE_SIZE;
                /*
                 * vmemmap_pages contains pages from the previous
                 * vmemmap_remap_range call which failed.  These
                 * are pages which were removed from the vmemmap.
                 * They will be restored in the following call.
                 */
                walk = (struct vmemmap_remap_walk) {
                        .remap_pte      = vmemmap_restore_pte,
                        .reuse_addr     = reuse,
                        .vmemmap_pages  = &vmemmap_pages,
                };

                vmemmap_remap_range(reuse, end, &walk);
        }
        mmap_read_unlock(&init_mm);

        free_vmemmap_page_list(&vmemmap_pages);

        return ret;
}

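/*
 * Allocate one page per PTE in the range [@start, @end) from the node that
 * backs @start and queue the pages on @list; on failure, free whatever was
 * already allocated and return -ENOMEM.
 */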
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
                                   gfp_t gfp_mask, struct list_head *list)
{
        unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
        int nid = page_to_nid((struct page *)start);
        struct page *page, *next;

        while (nr_pages--) {
                page = alloc_pages_node(nid, gfp_mask, 0);
                if (!page)
                        goto out;
                list_add_tail(&page->lru, list);
        }

        return 0;
out:
        list_for_each_entry_safe(page, next, list, lru)
                __free_pages(page, 0);
        return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *                       so that each page in the range is backed by a freshly
 *                       allocated vmemmap page.
 * @start:      start address of the vmemmap virtual address range that we want
 *              to remap.
 * @end:        end address of the vmemmap virtual address range that we want to
 *              remap.
 * @reuse:      reuse address.
 * @gfp_mask:   GFP flags for allocating vmemmap pages.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vmemmap_remap_alloc(unsigned long start, unsigned long end,
                        unsigned long reuse, gfp_t gfp_mask)
{
        LIST_HEAD(vmemmap_pages);
        struct vmemmap_remap_walk walk = {
                .remap_pte      = vmemmap_restore_pte,
                .reuse_addr     = reuse,
                .vmemmap_pages  = &vmemmap_pages,
        };

        /* See the comment in vmemmap_remap_free(). */
        BUG_ON(start - reuse != PAGE_SIZE);

        if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
                return -ENOMEM;

        mmap_read_lock(&init_mm);
        vmemmap_remap_range(reuse, end, &walk);
        mmap_read_unlock(&init_mm);

        return 0;
}

/*
 * Allocate a block of memory to be used to back the virtual memory map
 * or to back the page tables that are used to create the mapping.
 * Uses the main allocators if they are available, else bootmem.
 */

static void * __ref __earlyonly_bootmem_alloc(int node,
                                unsigned long size,
                                unsigned long align,
                                unsigned long goal)
{
        return memblock_alloc_try_nid_raw(size, align, goal,
                                               MEMBLOCK_ALLOC_ACCESSIBLE, node);
}

void * __meminit vmemmap_alloc_block(unsigned long size, int node)
{
        /* If the main allocator is up, use that; fall back to bootmem otherwise. */
        if (slab_is_available()) {
                gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
                int order = get_order(size);
                static bool warned;
                struct page *page;

                page = alloc_pages_node(node, gfp_mask, order);
                if (page)
                        return page_address(page);

                if (!warned) {
                        warn_alloc(gfp_mask & ~__GFP_NOWARN, NULL,
                                   "vmemmap alloc failure: order:%u", order);
                        warned = true;
                }
                return NULL;
        } else
                return __earlyonly_bootmem_alloc(node, size, size,
                                __pa(MAX_DMA_ADDRESS));
}

static void * __meminit altmap_alloc_block_buf(unsigned long size,
                                               struct vmem_altmap *altmap);

/* During the early (boot) stage, all callers need to request the same size. */
void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
                                         struct vmem_altmap *altmap)
{
        void *ptr;

        if (altmap)
                return altmap_alloc_block_buf(size, altmap);

        ptr = sparse_buffer_alloc(size);
        if (!ptr)
                ptr = vmemmap_alloc_block(size, node);
        return ptr;
}

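/* Next unused pfn in the altmap's pre-allocated vmemmap storage. */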
static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
{
        return altmap->base_pfn + altmap->reserve + altmap->alloc
                + altmap->align;
}

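/* Number of pfns still available in the altmap, after alloc and alignment. */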
static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
{
        unsigned long allocated = altmap->alloc + altmap->align;

        if (altmap->free > allocated)
                return altmap->free - allocated;
        return 0;
}

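/*
 * Carve @size bytes out of the altmap reservation. The start pfn is aligned
 * to the largest power-of-two factor of the requested number of pfns; pfns
 * skipped for alignment are accounted in @altmap->align.
 */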
static void * __meminit altmap_alloc_block_buf(unsigned long size,
                                               struct vmem_altmap *altmap)
{
        unsigned long pfn, nr_pfns, nr_align;

        if (size & ~PAGE_MASK) {
                pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n",
                                __func__, size);
                return NULL;
        }

        pfn = vmem_altmap_next_pfn(altmap);
        nr_pfns = size >> PAGE_SHIFT;
        nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
        nr_align = ALIGN(pfn, nr_align) - pfn;
        if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
                return NULL;

        altmap->alloc += nr_pfns;
        altmap->align += nr_align;
        pfn += nr_align;

        pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
                        __func__, pfn, altmap->alloc, altmap->align, nr_pfns);
        return __va(__pfn_to_phys(pfn));
}

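/* Warn if the page backing this vmemmap range was allocated far off-node. */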
void __meminit vmemmap_verify(pte_t *pte, int node,
                                unsigned long start, unsigned long end)
{
        unsigned long pfn = pte_pfn(*pte);
        int actual_node = early_pfn_to_nid(pfn);

        if (node_distance(actual_node, node) > LOCAL_DISTANCE)
                pr_warn("[%lx-%lx] potential offnode page_structs\n",
                        start, end - 1);
}

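/*
 * Return the PTE for @addr, allocating and mapping a backing page for it
 * (from @altmap if provided) when the entry is still empty.
 */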
pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
                                       struct vmem_altmap *altmap)
{
        pte_t *pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte)) {
                pte_t entry;
                void *p;

                p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
                if (!p)
                        return NULL;
                entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
                set_pte_at(&init_mm, addr, pte, entry);
        }
        return pte;
}

static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
{
        void *p = vmemmap_alloc_block(size, node);

        if (!p)
                return NULL;
        memset(p, 0, size);

        return p;
}

pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
{
        pmd_t *pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd)) {
                void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
                if (!p)
                        return NULL;
                pmd_populate_kernel(&init_mm, pmd, p);
        }
        return pmd;
}

pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
{
        pud_t *pud = pud_offset(p4d, addr);
        if (pud_none(*pud)) {
                void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
                if (!p)
                        return NULL;
                pud_populate(&init_mm, pud, p);
        }
        return pud;
}

p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
{
        p4d_t *p4d = p4d_offset(pgd, addr);
        if (p4d_none(*p4d)) {
                void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
                if (!p)
                        return NULL;
                p4d_populate(&init_mm, p4d, p);
        }
        return p4d;
}

pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
{
        pgd_t *pgd = pgd_offset_k(addr);
        if (pgd_none(*pgd)) {
                void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
                if (!p)
                        return NULL;
                pgd_populate(&init_mm, pgd, p);
        }
        return pgd;
}

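/*
 * Populate the vmemmap range [@start, @end) with base (PAGE_SIZE) pages,
 * allocating any missing page-table level along the way.
 */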
int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
                                         int node, struct vmem_altmap *altmap)
{
        unsigned long addr = start;
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        for (; addr < end; addr += PAGE_SIZE) {
                pgd = vmemmap_pgd_populate(addr, node);
                if (!pgd)
                        return -ENOMEM;
                p4d = vmemmap_p4d_populate(pgd, addr, node);
                if (!p4d)
                        return -ENOMEM;
                pud = vmemmap_pud_populate(p4d, addr, node);
                if (!pud)
                        return -ENOMEM;
                pmd = vmemmap_pmd_populate(pud, addr, node);
                if (!pmd)
                        return -ENOMEM;
                pte = vmemmap_pte_populate(pmd, addr, node, altmap);
                if (!pte)
                        return -ENOMEM;
                vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
        }

        return 0;
}

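/*
 * Build the struct page array for a subsection-aligned pfn range by asking
 * the architecture's vmemmap_populate() to map the corresponding vmemmap
 * virtual range, and return the first struct page on success.
 */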
struct page * __meminit __populate_section_memmap(unsigned long pfn,
                unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
        unsigned long start = (unsigned long) pfn_to_page(pfn);
        unsigned long end = start + nr_pages * sizeof(struct page);

        if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
                !IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
                return NULL;

        if (vmemmap_populate(start, end, nid, altmap))
                return NULL;

        return pfn_to_page(pfn);
}
