linux/arch/s390/mm/pgalloc.c
// SPDX-License-Identifier: GPL-2.0
/*
 *  Page table allocation functions
 *
 *    Copyright IBM Corp. 2016
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sysctl.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <asm/gmap.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

#ifdef CONFIG_PGSTE

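/*
 * The vm.allocate_pgste sysctl forces page tables to be allocated as full
 * 4K pages with PGSTEs; KVM needs page tables with PGSTEs for its guests.
 */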
int page_table_allocate_pgste = 0;
EXPORT_SYMBOL(page_table_allocate_pgste);

static struct ctl_table page_table_sysctl[] = {
        {
                .procname       = "allocate_pgste",
                .data           = &page_table_allocate_pgste,
                .maxlen         = sizeof(int),
                .mode           = S_IRUGO | S_IWUSR,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = SYSCTL_ZERO,
                .extra2         = SYSCTL_ONE,
        },
        { }
};

static struct ctl_table page_table_sysctl_dir[] = {
        {
                .procname       = "vm",
                .maxlen         = 0,
                .mode           = 0555,
                .child          = page_table_sysctl,
        },
        { }
};

static int __init page_table_register_sysctl(void)
{
        return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
}
__initcall(page_table_register_sysctl);

#endif /* CONFIG_PGSTE */

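/*
 * CRST (region and segment) tables have 2048 entries of 8 bytes each and
 * therefore need an order-2 (16KB) allocation.
 */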
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
        struct page *page = alloc_pages(GFP_KERNEL, 2);

        if (!page)
                return NULL;
        arch_set_page_dat(page, 2);
        return (unsigned long *) page_to_virt(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
        free_pages((unsigned long) table, 2);
}

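/*
 * Run on each CPU by crst_table_upgrade(): if the upgraded mm is the active
 * one, reload control register 7 with the new user ASCE, then flush the
 * local TLB.
 */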
static void __crst_table_upgrade(void *arg)
{
        struct mm_struct *mm = arg;

        /* change all active ASCEs to avoid the creation of new TLBs */
        if (current->active_mm == mm) {
                S390_lowcore.user_asce = mm->context.asce;
                __ctl_load(S390_lowcore.user_asce, 7, 7);
        }
        __tlb_flush_local();
}

int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
{
        unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
        unsigned long asce_limit = mm->context.asce_limit;

        /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
        VM_BUG_ON(asce_limit < _REGION2_SIZE);

        if (end <= asce_limit)
                return 0;

        if (asce_limit == _REGION2_SIZE) {
                p4d = crst_table_alloc(mm);
                if (unlikely(!p4d))
                        goto err_p4d;
                crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
        }
        if (end > _REGION1_SIZE) {
                pgd = crst_table_alloc(mm);
                if (unlikely(!pgd))
                        goto err_pgd;
                crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
        }

        spin_lock_bh(&mm->page_table_lock);

        /*
         * This routine gets called with mmap_lock held and there is no
         * reason to optimize for the case where it is not. However, if
         * that would ever change, the check below will let us know.
         */
        VM_BUG_ON(asce_limit != mm->context.asce_limit);

        if (p4d) {
                __pgd = (unsigned long *) mm->pgd;
                p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
                mm->pgd = (pgd_t *) p4d;
                mm->context.asce_limit = _REGION1_SIZE;
                mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
                        _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
                mm_inc_nr_puds(mm);
        }
        if (pgd) {
                __pgd = (unsigned long *) mm->pgd;
                pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd);
                mm->pgd = (pgd_t *) pgd;
                mm->context.asce_limit = TASK_SIZE_MAX;
                mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
                        _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
        }

        spin_unlock_bh(&mm->page_table_lock);

        on_each_cpu(__crst_table_upgrade, mm, 0);

        return 0;

err_pgd:
        crst_table_free(mm, p4d);
err_p4d:
        return -ENOMEM;
}

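/*
 * Atomically toggle the given bits in an atomic_t and return the new value.
 * The page table code below uses this to flip the per-fragment state bits
 * kept in the upper byte of page->_refcount.
 */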
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
        unsigned int old, new;

        do {
                old = atomic_read(v);
                new = old ^ bits;
        } while (atomic_cmpxchg(v, old, new) != old);
        return new;
}

#ifdef CONFIG_PGSTE

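/*
 * Allocate a full 4K page table with PGSTEs (extra per-pte status entries
 * used by KVM): 256 pte entries in the lower 2K, initialized to
 * _PAGE_INVALID, and 256 PGSTEs in the upper 2K, cleared to zero.
 */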
struct page *page_table_alloc_pgste(struct mm_struct *mm)
{
        struct page *page;
        u64 *table;

        page = alloc_page(GFP_KERNEL);
        if (page) {
                table = (u64 *)page_to_virt(page);
                memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
                memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
        }
        return page;
}

void page_table_free_pgste(struct page *page)
{
        __free_page(page);
}

#endif /* CONFIG_PGSTE */

/*
 * page table entry allocation/free routines.
 */
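/*
 * A page table without PGSTEs is only 2K in size (256 entries of 8 bytes),
 * so two of them share one 4K page. The upper byte of page->_refcount keeps
 * track of the two fragments: bits 24/25 mark which halves are in use,
 * bits 28/29 mark halves that have been handed to tlb_remove_table() and
 * are waiting to be freed. Pages that still have an unused half are kept
 * on mm->context.pgtable_list.
 */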
unsigned long *page_table_alloc(struct mm_struct *mm)
{
        unsigned long *table;
        struct page *page;
        unsigned int mask, bit;

        /* Try to get a fragment of a 4K page as a 2K page table */
        if (!mm_alloc_pgste(mm)) {
                table = NULL;
                spin_lock_bh(&mm->context.lock);
                if (!list_empty(&mm->context.pgtable_list)) {
                        page = list_first_entry(&mm->context.pgtable_list,
                                                struct page, lru);
                        mask = atomic_read(&page->_refcount) >> 24;
                        mask = (mask | (mask >> 4)) & 3;
                        if (mask != 3) {
                                table = (unsigned long *) page_to_virt(page);
                                bit = mask & 1;         /* =1 -> second 2K */
                                if (bit)
                                        table += PTRS_PER_PTE;
                                atomic_xor_bits(&page->_refcount,
                                                        1U << (bit + 24));
                                list_del(&page->lru);
                        }
                }
                spin_unlock_bh(&mm->context.lock);
                if (table)
                        return table;
        }
        /* Allocate a fresh page */
        page = alloc_page(GFP_KERNEL);
        if (!page)
                return NULL;
        if (!pgtable_pte_page_ctor(page)) {
                __free_page(page);
                return NULL;
        }
        arch_set_page_dat(page, 0);
        /* Initialize page table */
        table = (unsigned long *) page_to_virt(page);
        if (mm_alloc_pgste(mm)) {
                /* Return 4K page table with PGSTEs */
                atomic_xor_bits(&page->_refcount, 3 << 24);
                memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
                memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
        } else {
                /* Return the first 2K fragment of the page */
                atomic_xor_bits(&page->_refcount, 1 << 24);
                memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
                spin_lock_bh(&mm->context.lock);
                list_add(&page->lru, &mm->context.pgtable_list);
                spin_unlock_bh(&mm->context.lock);
        }
        return table;
}

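/*
 * Free a page table directly, without an RCU grace period. For a 2K
 * fragment only the corresponding allocation bit is cleared; the backing
 * 4K page is freed once both halves are unused and no half is still
 * pending removal.
 */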
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
        struct page *page;
        unsigned int bit, mask;

        page = virt_to_page(table);
        if (!mm_alloc_pgste(mm)) {
                /* Free 2K page table fragment of a 4K page */
                bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
                spin_lock_bh(&mm->context.lock);
                mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24));
                mask >>= 24;
                if (mask & 3)
                        list_add(&page->lru, &mm->context.pgtable_list);
                else
                        list_del(&page->lru);
                spin_unlock_bh(&mm->context.lock);
                if (mask != 0)
                        return;
        } else {
                atomic_xor_bits(&page->_refcount, 3U << 24);
        }

        pgtable_pte_page_dtor(page);
        __free_page(page);
}

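/*
 * RCU-delayed variant used by the mmu_gather code: mark the 2K fragment as
 * pending removal, encode which fragment (or a full pgste table) in the low
 * bits of the table pointer and hand it to tlb_remove_table(). The actual
 * free happens later in __tlb_remove_table().
 */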
void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
                         unsigned long vmaddr)
{
        struct mm_struct *mm;
        struct page *page;
        unsigned int bit, mask;

        mm = tlb->mm;
        page = virt_to_page(table);
        if (mm_alloc_pgste(mm)) {
                gmap_unlink(mm, table, vmaddr);
                table = (unsigned long *) ((unsigned long)table | 3);
                tlb_remove_table(tlb, table);
                return;
        }
        bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
        spin_lock_bh(&mm->context.lock);
        mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
        mask >>= 24;
        if (mask & 3)
                list_add_tail(&page->lru, &mm->context.pgtable_list);
        else
                list_del(&page->lru);
        spin_unlock_bh(&mm->context.lock);
        table = (unsigned long *) ((unsigned long) table | (1U << bit));
        tlb_remove_table(tlb, table);
}

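/*
 * Final step of the delayed free: the two low bits of the tagged pointer
 * tell what is being freed - 0 for a CRST table, 1 or 2 for one half of a
 * split 4K page table, 3 for a full 4K page table with PGSTEs.
 */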
void __tlb_remove_table(void *_table)
{
        unsigned int mask = (unsigned long) _table & 3;
        void *table = (void *)((unsigned long) _table ^ mask);
        struct page *page = virt_to_page(table);

        switch (mask) {
        case 0:         /* pmd, pud, or p4d */
                free_pages((unsigned long) table, 2);
                break;
        case 1:         /* lower 2K of a 4K page table */
        case 2:         /* higher 2K of a 4K page table */
                mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
                mask >>= 24;
                if (mask != 0)
                        break;
                fallthrough;
        case 3:         /* 4K page table with pgstes */
                if (mask & 3)
                        atomic_xor_bits(&page->_refcount, 3 << 24);
                pgtable_pte_page_dtor(page);
                __free_page(page);
                break;
        }
}

/*
 * Base infrastructure required to generate basic asces, region, segment,
 * and page tables that do not make use of enhanced features like EDAT1.
 */

static struct kmem_cache *base_pgt_cache;

static unsigned long base_pgt_alloc(void)
{
        u64 *table;

        table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
        if (table)
                memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
        return (unsigned long) table;
}

static void base_pgt_free(unsigned long table)
{
        kmem_cache_free(base_pgt_cache, (void *) table);
}

static unsigned long base_crst_alloc(unsigned long val)
{
        unsigned long table;

        table =  __get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
        if (table)
                crst_table_init((unsigned long *)table, val);
        return table;
}

static void base_crst_free(unsigned long table)
{
        free_pages(table, CRST_ALLOC_ORDER);
}

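/*
 * Generate base_<level>_addr_end() helpers that advance addr to the next
 * page/segment/region boundary, clamped to end. The -1 arithmetic keeps the
 * comparison correct if next or end wrap around to 0 at the top of the
 * address space.
 */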
#define BASE_ADDR_END_FUNC(NAME, SIZE)                                  \
static inline unsigned long base_##NAME##_addr_end(unsigned long addr,  \
                                                   unsigned long end)   \
{                                                                       \
        unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1);           \
                                                                        \
        return (next - 1) < (end - 1) ? next : end;                     \
}

BASE_ADDR_END_FUNC(page,    _PAGE_SIZE)
BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE)
BASE_ADDR_END_FUNC(region3, _REGION3_SIZE)
BASE_ADDR_END_FUNC(region2, _REGION2_SIZE)
BASE_ADDR_END_FUNC(region1, _REGION1_SIZE)

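/*
 * Translate a virtual address to its real address with the LRA (load real
 * address) instruction, i.e. through the currently attached page tables.
 * base_page_walk() stores the result in the base page tables so that a
 * translation via the base asce ends up at the same real page.
 */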
static inline unsigned long base_lra(unsigned long address)
{
        unsigned long real;

        asm volatile(
                "       lra     %0,0(%1)\n"
                : "=d" (real) : "a" (address) : "cc");
        return real;
}

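/*
 * The base_*_walk() functions below are used both to populate and to tear
 * down the base tables: with alloc != 0 missing lower level tables are
 * allocated and, at the page table level, filled with real addresses; with
 * alloc == 0 the already populated lower level tables are freed instead.
 */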
static int base_page_walk(unsigned long origin, unsigned long addr,
                          unsigned long end, int alloc)
{
        unsigned long *pte, next;

        if (!alloc)
                return 0;
        pte = (unsigned long *) origin;
        pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
        do {
                next = base_page_addr_end(addr, end);
                *pte = base_lra(addr);
        } while (pte++, addr = next, addr < end);
        return 0;
}

static int base_segment_walk(unsigned long origin, unsigned long addr,
                             unsigned long end, int alloc)
{
        unsigned long *ste, next, table;
        int rc;

        ste = (unsigned long *) origin;
        ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
        do {
                next = base_segment_addr_end(addr, end);
                if (*ste & _SEGMENT_ENTRY_INVALID) {
                        if (!alloc)
                                continue;
                        table = base_pgt_alloc();
                        if (!table)
                                return -ENOMEM;
                        *ste = table | _SEGMENT_ENTRY;
                }
                table = *ste & _SEGMENT_ENTRY_ORIGIN;
                rc = base_page_walk(table, addr, next, alloc);
                if (rc)
                        return rc;
                if (!alloc)
                        base_pgt_free(table);
                cond_resched();
        } while (ste++, addr = next, addr < end);
        return 0;
}

static int base_region3_walk(unsigned long origin, unsigned long addr,
                             unsigned long end, int alloc)
{
        unsigned long *rtte, next, table;
        int rc;

        rtte = (unsigned long *) origin;
        rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
        do {
                next = base_region3_addr_end(addr, end);
                if (*rtte & _REGION_ENTRY_INVALID) {
                        if (!alloc)
                                continue;
                        table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
                        if (!table)
                                return -ENOMEM;
                        *rtte = table | _REGION3_ENTRY;
                }
                table = *rtte & _REGION_ENTRY_ORIGIN;
                rc = base_segment_walk(table, addr, next, alloc);
                if (rc)
                        return rc;
                if (!alloc)
                        base_crst_free(table);
        } while (rtte++, addr = next, addr < end);
        return 0;
}

static int base_region2_walk(unsigned long origin, unsigned long addr,
                             unsigned long end, int alloc)
{
        unsigned long *rste, next, table;
        int rc;

        rste = (unsigned long *) origin;
        rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
        do {
                next = base_region2_addr_end(addr, end);
                if (*rste & _REGION_ENTRY_INVALID) {
                        if (!alloc)
                                continue;
                        table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
                        if (!table)
                                return -ENOMEM;
                        *rste = table | _REGION2_ENTRY;
                }
                table = *rste & _REGION_ENTRY_ORIGIN;
                rc = base_region3_walk(table, addr, next, alloc);
                if (rc)
                        return rc;
                if (!alloc)
                        base_crst_free(table);
        } while (rste++, addr = next, addr < end);
        return 0;
}

static int base_region1_walk(unsigned long origin, unsigned long addr,
                             unsigned long end, int alloc)
{
        unsigned long *rfte, next, table;
        int rc;

        rfte = (unsigned long *) origin;
        rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
        do {
                next = base_region1_addr_end(addr, end);
                if (*rfte & _REGION_ENTRY_INVALID) {
                        if (!alloc)
                                continue;
                        table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
                        if (!table)
                                return -ENOMEM;
                        *rfte = table | _REGION1_ENTRY;
                }
                table = *rfte & _REGION_ENTRY_ORIGIN;
                rc = base_region2_walk(table, addr, next, alloc);
                if (rc)
                        return rc;
                if (!alloc)
                        base_crst_free(table);
        } while (rfte++, addr = next, addr < end);
        return 0;
}

/**
 * base_asce_free - free asce and tables returned from base_asce_alloc()
 * @asce: asce to be freed
 *
 * Frees all region, segment, and page tables that were allocated with a
 * corresponding base_asce_alloc() call.
 */
void base_asce_free(unsigned long asce)
{
        unsigned long table = asce & _ASCE_ORIGIN;

        if (!asce)
                return;
        switch (asce & _ASCE_TYPE_MASK) {
        case _ASCE_TYPE_SEGMENT:
                base_segment_walk(table, 0, _REGION3_SIZE, 0);
                break;
        case _ASCE_TYPE_REGION3:
                base_region3_walk(table, 0, _REGION2_SIZE, 0);
                break;
        case _ASCE_TYPE_REGION2:
                base_region2_walk(table, 0, _REGION1_SIZE, 0);
                break;
        case _ASCE_TYPE_REGION1:
                base_region1_walk(table, 0, TASK_SIZE_MAX, 0);
                break;
        }
        base_crst_free(table);
}

static int base_pgt_cache_init(void)
{
        static DEFINE_MUTEX(base_pgt_cache_mutex);
        unsigned long sz = _PAGE_TABLE_SIZE;

        if (base_pgt_cache)
                return 0;
        mutex_lock(&base_pgt_cache_mutex);
        if (!base_pgt_cache)
                base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL);
        mutex_unlock(&base_pgt_cache_mutex);
        return base_pgt_cache ? 0 : -ENOMEM;
}

/**
 * base_asce_alloc - create kernel mapping without enhanced DAT features
 * @addr: virtual start address of kernel mapping
 * @num_pages: number of consecutive pages
 *
 * Generate an asce, including all required region, segment and page tables,
 * that can be used to access the virtual kernel mapping. Unlike a normal
 * asce, the returned asce does not make use of any enhanced DAT features
 * such as large pages. This is required for some I/O functions that pass
 * an asce, such as some service call requests.
 *
 * Note: the returned asce may NEVER be attached to any cpu. It may only be
 *       used for I/O requests. TLB entries that might result because the
 *       asce was attached to a cpu will not be cleared.
 */
unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
{
        unsigned long asce, table, end;
        int rc;

        if (base_pgt_cache_init())
                return 0;
        end = addr + num_pages * PAGE_SIZE;
        if (end <= _REGION3_SIZE) {
                table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
                if (!table)
                        return 0;
                rc = base_segment_walk(table, addr, end, 1);
                asce = table | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
        } else if (end <= _REGION2_SIZE) {
                table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
                if (!table)
                        return 0;
                rc = base_region3_walk(table, addr, end, 1);
                asce = table | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
        } else if (end <= _REGION1_SIZE) {
                table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
                if (!table)
                        return 0;
                rc = base_region2_walk(table, addr, end, 1);
                asce = table | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
        } else {
                table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
                if (!table)
                        return 0;
                rc = base_region1_walk(table, addr, end, 1);
                asce = table | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
        }
        if (rc) {
                base_asce_free(asce);
                asce = 0;
        }
        return asce;
}
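
/*
 * Usage sketch (hypothetical, not part of this file): a caller that needs
 * to hand an asce to an I/O request could do something like
 *
 *      unsigned long asce;
 *
 *      asce = base_asce_alloc((unsigned long) buffer, nr_pages);
 *      if (!asce)
 *              return -ENOMEM;
 *      ... pass asce along with the service call request ...
 *      base_asce_free(asce);
 *
 * where "buffer" and "nr_pages" describe the virtually contiguous kernel
 * memory that the request needs to access.
 */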