linux/arch/s390/mm/vmem.c
// SPDX-License-Identifier: GPL-2.0
/*
 *    Copyright IBM Corp. 2006
 *    Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
 */

#include <linux/memory_hotplug.h>
#include <linux/memblock.h>
#include <linux/pfn.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <asm/cacheflush.h>
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/set_memory.h>

static DEFINE_MUTEX(vmem_mutex);

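/*
 * Allocate pages for region/segment tables: from the buddy allocator once
 * slab is available, from memblock during early boot.
 */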
static void __ref *vmem_alloc_pages(unsigned int order)
{
        unsigned long size = PAGE_SIZE << order;

        if (slab_is_available())
                return (void *)__get_free_pages(GFP_KERNEL, order);
        return memblock_alloc(size, size);
}

static void vmem_free_pages(unsigned long addr, int order)
{
        /* We don't expect boot memory to be removed ever. */
        if (!slab_is_available() ||
            WARN_ON_ONCE(PageReserved(virt_to_page(addr))))
                return;
        free_pages(addr, order);
}

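/*
 * Allocate a region/segment (CRST) table and initialize all of its entries
 * with the given empty entry value.
 */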
void *vmem_crst_alloc(unsigned long val)
{
        unsigned long *table;

        table = vmem_alloc_pages(CRST_ALLOC_ORDER);
        if (table)
                crst_table_init(table, val);
        return table;
}

pte_t __ref *vmem_pte_alloc(void)
{
        unsigned long size = PTRS_PER_PTE * sizeof(pte_t);
        pte_t *pte;

        if (slab_is_available())
                pte = (pte_t *) page_table_alloc(&init_mm);
        else
                pte = (pte_t *) memblock_alloc(size, size);
        if (!pte)
                return NULL;
        memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
        return pte;
}

static void vmem_pte_free(unsigned long *table)
{
        /* We don't expect boot memory to be removed ever. */
        if (!slab_is_available() ||
            WARN_ON_ONCE(PageReserved(virt_to_page(table))))
                return;
        page_table_free(&init_mm, table);
}

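/*
 * The vmemmap is backed by PMD-sized frames where possible, while memory
 * sections may be added and removed in a smaller granularity. Sub-PMD ranges
 * that are currently unused are filled with the PAGE_UNUSED marker; a
 * PMD-mapped vmemmap frame may only be freed once it consists entirely of
 * PAGE_UNUSED bytes.
 */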
#define PAGE_UNUSED 0xFD

/*
 * The unused vmemmap range, which was not yet memset(PAGE_UNUSED), ranges
 * from unused_sub_pmd_start to the next PMD_SIZE boundary.
 */
static unsigned long unused_sub_pmd_start;

static void vmemmap_flush_unused_sub_pmd(void)
{
        if (!unused_sub_pmd_start)
                return;
        memset((void *)unused_sub_pmd_start, PAGE_UNUSED,
               ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start);
        unused_sub_pmd_start = 0;
}

static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end)
{
        /*
         * As we expect to add memory in the same granularity as we remove it,
         * it is sufficient to mark only a small piece as used to prevent the
         * memmap page from being removed (just in case the memmap never gets
         * initialized, e.g., because the memory block never gets onlined).
         */
        memset((void *)start, 0, sizeof(struct page));
}

static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
{
        /*
         * We only optimize if the new used range directly follows the
         * previously unused range (esp., when populating consecutive sections).
         */
        if (unused_sub_pmd_start == start) {
                unused_sub_pmd_start = end;
                if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE)))
                        unused_sub_pmd_start = 0;
                return;
        }
        vmemmap_flush_unused_sub_pmd();
        vmemmap_mark_sub_pmd_used(start, end);
}

static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
{
        unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

        vmemmap_flush_unused_sub_pmd();

        /* The memmap page might already be filled with PAGE_UNUSED ... */
        vmemmap_mark_sub_pmd_used(start, end);

        /* Mark the unused parts of the new memmap page PAGE_UNUSED. */
        if (!IS_ALIGNED(start, PMD_SIZE))
                memset((void *)page, PAGE_UNUSED, start - page);
        /*
         * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
         * consecutive sections. For the last added PMD, remember the last
         * unused range in the populated PMD.
         */
        if (!IS_ALIGNED(end, PMD_SIZE))
                unused_sub_pmd_start = end;
}

/* Returns true if the PMD is completely unused and can be freed. */
static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
{
        unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

        vmemmap_flush_unused_sub_pmd();
        memset((void *)start, PAGE_UNUSED, end - start);
        return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE);
}

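/*
 * Add or remove the page table entries covering [addr, end) below the given
 * PMD. For the identity mapping (direct) the PTEs map addresses 1:1; for the
 * vmemmap, backing pages are allocated and freed as needed. The 4K direct
 * mapping counter is updated when operating on the identity mapping.
 */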
/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
                                  unsigned long end, bool add, bool direct)
{
        unsigned long prot, pages = 0;
        int ret = -ENOMEM;
        pte_t *pte;

        prot = pgprot_val(PAGE_KERNEL);
        if (!MACHINE_HAS_NX)
                prot &= ~_PAGE_NOEXEC;

        pte = pte_offset_kernel(pmd, addr);
        for (; addr < end; addr += PAGE_SIZE, pte++) {
                if (!add) {
                        if (pte_none(*pte))
                                continue;
                        if (!direct)
                                vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0);
                        pte_clear(&init_mm, addr, pte);
                } else if (pte_none(*pte)) {
                        if (!direct) {
                                void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);

                                if (!new_page)
                                        goto out;
                                pte_val(*pte) = __pa(new_page) | prot;
                        } else {
                                pte_val(*pte) = __pa(addr) | prot;
                        }
                } else {
                        continue;
                }
                pages++;
        }
        ret = 0;
out:
        if (direct)
                update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages);
        return ret;
}

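/*
 * Free the page table referenced by the PMD entry if all of its PTEs are
 * unused, and clear the PMD entry.
 */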
static void try_free_pte_table(pmd_t *pmd, unsigned long start)
{
        pte_t *pte;
        int i;

        /* We can safely assume this is fully in 1:1 mapping & vmemmap area */
        pte = pte_offset_kernel(pmd, start);
        for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
                if (!pte_none(*pte))
                        return;
        }
        vmem_pte_free((unsigned long *) pmd_deref(*pmd));
        pmd_clear(pmd);
}

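/*
 * Add or remove the segment (PMD) table entries covering [addr, end). For the
 * identity mapping, 1MB segment mappings may be used directly when the range
 * is suitably aligned and EDAT1 is available; for the vmemmap, PMD-sized
 * frames are used when possible, with partially used frames tracked via the
 * PAGE_UNUSED mechanism. Everything else is delegated to modify_pte_table().
 * The 1M direct mapping counter is updated when operating on the identity
 * mapping.
 */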
/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
                                  unsigned long end, bool add, bool direct)
{
        unsigned long next, prot, pages = 0;
        int ret = -ENOMEM;
        pmd_t *pmd;
        pte_t *pte;

        prot = pgprot_val(SEGMENT_KERNEL);
        if (!MACHINE_HAS_NX)
                prot &= ~_SEGMENT_ENTRY_NOEXEC;

        pmd = pmd_offset(pud, addr);
        for (; addr < end; addr = next, pmd++) {
                next = pmd_addr_end(addr, end);
                if (!add) {
                        if (pmd_none(*pmd))
                                continue;
                        if (pmd_large(*pmd)) {
                                if (IS_ALIGNED(addr, PMD_SIZE) &&
                                    IS_ALIGNED(next, PMD_SIZE)) {
                                        if (!direct)
                                                vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
                                        pmd_clear(pmd);
                                        pages++;
                                } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
                                        vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
                                        pmd_clear(pmd);
                                }
                                continue;
                        }
                } else if (pmd_none(*pmd)) {
                        if (IS_ALIGNED(addr, PMD_SIZE) &&
                            IS_ALIGNED(next, PMD_SIZE) &&
                            MACHINE_HAS_EDAT1 && addr && direct &&
                            !debug_pagealloc_enabled()) {
                                pmd_val(*pmd) = __pa(addr) | prot;
                                pages++;
                                continue;
                        } else if (!direct && MACHINE_HAS_EDAT1) {
                                void *new_page;

                                /*
                                 * Use 1MB frames for the vmemmap if available.
                                 * We always use large frames, even if they are
                                 * only partially used; otherwise we would also
                                 * end up with page tables, since
                                 * vmemmap_populate() gets called for each
                                 * section separately.
                                 */
                                new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE);
                                if (new_page) {
                                        pmd_val(*pmd) = __pa(new_page) | prot;
                                        if (!IS_ALIGNED(addr, PMD_SIZE) ||
                                            !IS_ALIGNED(next, PMD_SIZE)) {
                                                vmemmap_use_new_sub_pmd(addr, next);
                                        }
                                        continue;
                                }
                        }
                        pte = vmem_pte_alloc();
                        if (!pte)
                                goto out;
                        pmd_populate(&init_mm, pmd, pte);
                } else if (pmd_large(*pmd)) {
                        if (!direct)
                                vmemmap_use_sub_pmd(addr, next);
                        continue;
                }
                ret = modify_pte_table(pmd, addr, next, add, direct);
                if (ret)
                        goto out;
                if (!add)
                        try_free_pte_table(pmd, addr & PMD_MASK);
        }
        ret = 0;
out:
        if (direct)
                update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages);
        return ret;
}

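/*
 * Free the segment (PMD) table referenced by the PUD entry if all of its
 * entries are unused and the table lies entirely within the identity mapping
 * and vmemmap area, then clear the PUD entry.
 */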
static void try_free_pmd_table(pud_t *pud, unsigned long start)
{
        const unsigned long end = start + PUD_SIZE;
        pmd_t *pmd;
        int i;

        /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
        if (end > VMALLOC_START)
                return;
#ifdef CONFIG_KASAN
        if (start < KASAN_SHADOW_END && end > KASAN_SHADOW_START)
                return;
#endif
        pmd = pmd_offset(pud, start);
        for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
                if (!pmd_none(*pmd))
                        return;
        vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER);
        pud_clear(pud);
}

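/*
 * Add or remove the region third (PUD) table entries covering [addr, end).
 * 2GB region mappings may be used for the identity mapping when the range is
 * suitably aligned and EDAT2 is available; everything else is delegated to
 * modify_pmd_table(). The 2G direct mapping counter is updated when operating
 * on the identity mapping.
 */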
static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
                            bool add, bool direct)
{
        unsigned long next, prot, pages = 0;
        int ret = -ENOMEM;
        pud_t *pud;
        pmd_t *pmd;

        prot = pgprot_val(REGION3_KERNEL);
        if (!MACHINE_HAS_NX)
                prot &= ~_REGION_ENTRY_NOEXEC;
        pud = pud_offset(p4d, addr);
        for (; addr < end; addr = next, pud++) {
                next = pud_addr_end(addr, end);
                if (!add) {
                        if (pud_none(*pud))
                                continue;
                        if (pud_large(*pud)) {
                                if (IS_ALIGNED(addr, PUD_SIZE) &&
                                    IS_ALIGNED(next, PUD_SIZE)) {
                                        pud_clear(pud);
                                        pages++;
                                }
                                continue;
                        }
                } else if (pud_none(*pud)) {
                        if (IS_ALIGNED(addr, PUD_SIZE) &&
                            IS_ALIGNED(next, PUD_SIZE) &&
                            MACHINE_HAS_EDAT2 && addr && direct &&
                            !debug_pagealloc_enabled()) {
                                pud_val(*pud) = __pa(addr) | prot;
                                pages++;
                                continue;
                        }
                        pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
                        if (!pmd)
                                goto out;
                        pud_populate(&init_mm, pud, pmd);
                } else if (pud_large(*pud)) {
                        continue;
                }
                ret = modify_pmd_table(pud, addr, next, add, direct);
                if (ret)
                        goto out;
                if (!add)
                        try_free_pmd_table(pud, addr & PUD_MASK);
        }
        ret = 0;
out:
        if (direct)
                update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages);
        return ret;
}

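/*
 * Free the region third (PUD) table referenced by the P4D entry if all of its
 * entries are unused and the table lies entirely within the identity mapping
 * and vmemmap area, then clear the P4D entry.
 */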
static void try_free_pud_table(p4d_t *p4d, unsigned long start)
{
        const unsigned long end = start + P4D_SIZE;
        pud_t *pud;
        int i;

        /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
        if (end > VMALLOC_START)
                return;
#ifdef CONFIG_KASAN
        if (start < KASAN_SHADOW_END && end > KASAN_SHADOW_START)
                return;
#endif

        pud = pud_offset(p4d, start);
        for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
                if (!pud_none(*pud))
                        return;
        }
        vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER);
        p4d_clear(p4d);
}

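/*
 * Add or remove the region second (P4D) table entries covering [addr, end),
 * allocating lower-level tables on demand and delegating the real work to
 * modify_pud_table().
 */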
static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
                            bool add, bool direct)
{
        unsigned long next;
        int ret = -ENOMEM;
        p4d_t *p4d;
        pud_t *pud;

        p4d = p4d_offset(pgd, addr);
        for (; addr < end; addr = next, p4d++) {
                next = p4d_addr_end(addr, end);
                if (!add) {
                        if (p4d_none(*p4d))
                                continue;
                } else if (p4d_none(*p4d)) {
                        pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
                        if (!pud)
                                goto out;
                        p4d_populate(&init_mm, p4d, pud);
                }
                ret = modify_pud_table(p4d, addr, next, add, direct);
                if (ret)
                        goto out;
                if (!add)
                        try_free_pud_table(p4d, addr & P4D_MASK);
        }
        ret = 0;
out:
        return ret;
}

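/*
 * Free the region second (P4D) table referenced by the PGD entry if all of
 * its entries are unused and the table lies entirely within the identity
 * mapping and vmemmap area, then clear the PGD entry.
 */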
static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
{
        const unsigned long end = start + PGDIR_SIZE;
        p4d_t *p4d;
        int i;

        /* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
        if (end > VMALLOC_START)
                return;
#ifdef CONFIG_KASAN
        if (start < KASAN_SHADOW_END && end > KASAN_SHADOW_START)
                return;
#endif

        p4d = p4d_offset(pgd, start);
        for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
                if (!p4d_none(*p4d))
                        return;
        }
        vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER);
        pgd_clear(pgd);
}

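/*
 * Add or remove kernel page table entries for the range [start, end). With
 * direct set, the 1:1 (identity) mapping is modified, otherwise the vmemmap.
 * Page tables that become empty while removing are freed, and the TLB is
 * flushed for the range afterwards.
 */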
static int modify_pagetable(unsigned long start, unsigned long end, bool add,
                            bool direct)
{
        unsigned long addr, next;
        int ret = -ENOMEM;
        pgd_t *pgd;
        p4d_t *p4d;

        if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
                return -EINVAL;
        for (addr = start; addr < end; addr = next) {
                next = pgd_addr_end(addr, end);
                pgd = pgd_offset_k(addr);

                if (!add) {
                        if (pgd_none(*pgd))
                                continue;
                } else if (pgd_none(*pgd)) {
                        p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
                        if (!p4d)
                                goto out;
                        pgd_populate(&init_mm, pgd, p4d);
                }
                ret = modify_p4d_table(pgd, addr, next, add, direct);
                if (ret)
                        goto out;
                if (!add)
                        try_free_p4d_table(pgd, addr & PGDIR_MASK);
        }
        ret = 0;
out:
        if (!add)
                flush_tlb_kernel_range(start, end);
        return ret;
}

static int add_pagetable(unsigned long start, unsigned long end, bool direct)
{
        return modify_pagetable(start, end, true, direct);
}

static int remove_pagetable(unsigned long start, unsigned long end, bool direct)
{
        return modify_pagetable(start, end, false, direct);
}

/*
 * Add a physical memory range to the 1:1 mapping.
 */
static int vmem_add_range(unsigned long start, unsigned long size)
{
        return add_pagetable(start, start + size, true);
}

/*
 * Remove a physical memory range from the 1:1 mapping.
 */
static void vmem_remove_range(unsigned long start, unsigned long size)
{
        remove_pagetable(start, start + size, true);
}

/*
 * Add a backed mem_map array to the virtual mem_map array.
 */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
                               struct vmem_altmap *altmap)
{
        int ret;

        mutex_lock(&vmem_mutex);
        /* We don't care about the node, just use NUMA_NO_NODE on allocations */
        ret = add_pagetable(start, end, false);
        if (ret)
                remove_pagetable(start, end, false);
        mutex_unlock(&vmem_mutex);
        return ret;
}

void vmemmap_free(unsigned long start, unsigned long end,
                  struct vmem_altmap *altmap)
{
        mutex_lock(&vmem_mutex);
        remove_pagetable(start, end, false);
        mutex_unlock(&vmem_mutex);
}

void vmem_remove_mapping(unsigned long start, unsigned long size)
{
        mutex_lock(&vmem_mutex);
        vmem_remove_range(start, size);
        mutex_unlock(&vmem_mutex);
}

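/*
 * Memory can only be added within the range that the identity mapping is
 * able to cover, i.e. [0, VMEM_MAX_PHYS).
 */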
struct range arch_get_mappable_range(void)
{
        struct range mhp_range;

        mhp_range.start = 0;
        mhp_range.end = VMEM_MAX_PHYS - 1;
        return mhp_range;
}

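/*
 * Add a physical memory range to the 1:1 mapping, after checking it against
 * the range reported by arch_get_mappable_range(). The mapping is removed
 * again if adding it fails.
 */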
int vmem_add_mapping(unsigned long start, unsigned long size)
{
        struct range range = arch_get_mappable_range();
        int ret;

        if (start < range.start ||
            start + size > range.end + 1 ||
            start + size < start)
                return -ERANGE;

        mutex_lock(&vmem_mutex);
        ret = vmem_add_range(start, size);
        if (ret)
                vmem_remove_range(start, size);
        mutex_unlock(&vmem_mutex);
        return ret;
}

/*
 * Map the whole physical memory to virtual memory (identity mapping).
 * Enough space is reserved in the vmalloc area for the vmemmap to allow
 * hotplugging additional memory segments.
 */
void __init vmem_map_init(void)
{
        phys_addr_t base, end;
        u64 i;

        for_each_mem_range(i, &base, &end)
                vmem_add_range(base, end - base);
        __set_memory((unsigned long)_stext,
                     (unsigned long)(_etext - _stext) >> PAGE_SHIFT,
                     SET_MEMORY_RO | SET_MEMORY_X);
        __set_memory((unsigned long)_etext,
                     (unsigned long)(__end_rodata - _etext) >> PAGE_SHIFT,
                     SET_MEMORY_RO);
        __set_memory((unsigned long)_sinittext,
                     (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT,
                     SET_MEMORY_RO | SET_MEMORY_X);
        __set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT,
                     SET_MEMORY_RO | SET_MEMORY_X);

        /* we need lowcore executable for our LPSWE instructions */
        set_memory_x(0, 1);

        pr_info("Write protected kernel read-only data: %luk\n",
                (unsigned long)(__end_rodata - _stext) >> 10);
}