linux/arch/s390/mm/pgalloc.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  Page table allocation functions
   4 *
   5 *    Copyright IBM Corp. 2016
   6 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
   7 */
   8
   9#include <linux/sysctl.h>
  10#include <linux/slab.h>
  11#include <linux/mm.h>
  12#include <asm/mmu_context.h>
  13#include <asm/pgalloc.h>
  14#include <asm/gmap.h>
  15#include <asm/tlb.h>
  16#include <asm/tlbflush.h>
  17
  18#ifdef CONFIG_PGSTE
  19
  20int page_table_allocate_pgste = 0;
  21EXPORT_SYMBOL(page_table_allocate_pgste);
  22
  23static struct ctl_table page_table_sysctl[] = {
  24        {
  25                .procname       = "allocate_pgste",
  26                .data           = &page_table_allocate_pgste,
  27                .maxlen         = sizeof(int),
  28                .mode           = S_IRUGO | S_IWUSR,
  29                .proc_handler   = proc_dointvec_minmax,
  30                .extra1         = SYSCTL_ZERO,
  31                .extra2         = SYSCTL_ONE,
  32        },
  33        { }
  34};
  35
  36static struct ctl_table page_table_sysctl_dir[] = {
  37        {
  38                .procname       = "vm",
  39                .maxlen         = 0,
  40                .mode           = 0555,
  41                .child          = page_table_sysctl,
  42        },
  43        { }
  44};
  45
  46static int __init page_table_register_sysctl(void)
  47{
  48        return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
  49}
  50__initcall(page_table_register_sysctl);
  51
  52#endif /* CONFIG_PGSTE */
  53
  54unsigned long *crst_table_alloc(struct mm_struct *mm)
  55{
  56        struct page *page = alloc_pages(GFP_KERNEL, 2);
  57
  58        if (!page)
  59                return NULL;
  60        arch_set_page_dat(page, 2);
  61        return (unsigned long *) page_to_phys(page);
  62}
  63
  64void crst_table_free(struct mm_struct *mm, unsigned long *table)
  65{
  66        free_pages((unsigned long) table, 2);
  67}
  68
  69static void __crst_table_upgrade(void *arg)
  70{
  71        struct mm_struct *mm = arg;
  72
  73        if (current->active_mm == mm)
  74                set_user_asce(mm);
  75        __tlb_flush_local();
  76}
  77
  78int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
  79{
  80        unsigned long *table, *pgd;
  81        int rc, notify;
  82
  83        /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
  84        VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE);
  85        rc = 0;
  86        notify = 0;
  87        while (mm->context.asce_limit < end) {
  88                table = crst_table_alloc(mm);
  89                if (!table) {
  90                        rc = -ENOMEM;
  91                        break;
  92                }
  93                spin_lock_bh(&mm->page_table_lock);
  94                pgd = (unsigned long *) mm->pgd;
  95                if (mm->context.asce_limit == _REGION2_SIZE) {
  96                        crst_table_init(table, _REGION2_ENTRY_EMPTY);
  97                        p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd);
  98                        mm->pgd = (pgd_t *) table;
  99                        mm->context.asce_limit = _REGION1_SIZE;
 100                        mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
 101                                _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
 102                        mm_inc_nr_puds(mm);
 103                } else {
 104                        crst_table_init(table, _REGION1_ENTRY_EMPTY);
 105                        pgd_populate(mm, (pgd_t *) table, (p4d_t *) pgd);
 106                        mm->pgd = (pgd_t *) table;
 107                        mm->context.asce_limit = -PAGE_SIZE;
 108                        mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
 109                                _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
 110                }
 111                notify = 1;
 112                spin_unlock_bh(&mm->page_table_lock);
 113        }
 114        if (notify)
 115                on_each_cpu(__crst_table_upgrade, mm, 0);
 116        return rc;
 117}
 118
 119void crst_table_downgrade(struct mm_struct *mm)
 120{
 121        pgd_t *pgd;
 122
 123        /* downgrade should only happen from 3 to 2 levels (compat only) */
 124        VM_BUG_ON(mm->context.asce_limit != _REGION2_SIZE);
 125
 126        if (current->active_mm == mm) {
 127                clear_user_asce();
 128                __tlb_flush_mm(mm);
 129        }
 130
 131        pgd = mm->pgd;
 132        mm_dec_nr_pmds(mm);
 133        mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
 134        mm->context.asce_limit = _REGION3_SIZE;
 135        mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
 136                           _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
 137        crst_table_free(mm, (unsigned long *) pgd);
 138
 139        if (current->active_mm == mm)
 140                set_user_asce(mm);
 141}
 142
 143static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
 144{
 145        unsigned int old, new;
 146
 147        do {
 148                old = atomic_read(v);
 149                new = old ^ bits;
 150        } while (atomic_cmpxchg(v, old, new) != old);
 151        return new;
 152}
 153
 154#ifdef CONFIG_PGSTE
 155
 156struct page *page_table_alloc_pgste(struct mm_struct *mm)
 157{
 158        struct page *page;
 159        u64 *table;
 160
 161        page = alloc_page(GFP_KERNEL);
 162        if (page) {
 163                table = (u64 *)page_to_phys(page);
 164                memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
 165                memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
 166        }
 167        return page;
 168}
 169
 170void page_table_free_pgste(struct page *page)
 171{
 172        __free_page(page);
 173}
 174
 175#endif /* CONFIG_PGSTE */
 176
 177/*
 178 * page table entry allocation/free routines.
 179 */
 180unsigned long *page_table_alloc(struct mm_struct *mm)
 181{
 182        unsigned long *table;
 183        struct page *page;
 184        unsigned int mask, bit;
 185
 186        /* Try to get a fragment of a 4K page as a 2K page table */
 187        if (!mm_alloc_pgste(mm)) {
 188                table = NULL;
 189                spin_lock_bh(&mm->context.lock);
 190                if (!list_empty(&mm->context.pgtable_list)) {
 191                        page = list_first_entry(&mm->context.pgtable_list,
 192                                                struct page, lru);
 193                        mask = atomic_read(&page->_refcount) >> 24;
 194                        mask = (mask | (mask >> 4)) & 3;
 195                        if (mask != 3) {
 196                                table = (unsigned long *) page_to_phys(page);
 197                                bit = mask & 1;         /* =1 -> second 2K */
 198                                if (bit)
 199                                        table += PTRS_PER_PTE;
 200                                atomic_xor_bits(&page->_refcount,
 201                                                        1U << (bit + 24));
 202                                list_del(&page->lru);
 203                        }
 204                }
 205                spin_unlock_bh(&mm->context.lock);
 206                if (table)
 207                        return table;
 208        }
 209        /* Allocate a fresh page */
 210        page = alloc_page(GFP_KERNEL);
 211        if (!page)
 212                return NULL;
 213        if (!pgtable_page_ctor(page)) {
 214                __free_page(page);
 215                return NULL;
 216        }
 217        arch_set_page_dat(page, 0);
 218        /* Initialize page table */
 219        table = (unsigned long *) page_to_phys(page);
 220        if (mm_alloc_pgste(mm)) {
 221                /* Return 4K page table with PGSTEs */
 222                atomic_xor_bits(&page->_refcount, 3 << 24);
 223                memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
 224                memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
 225        } else {
 226                /* Return the first 2K fragment of the page */
 227                atomic_xor_bits(&page->_refcount, 1 << 24);
 228                memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
 229                spin_lock_bh(&mm->context.lock);
 230                list_add(&page->lru, &mm->context.pgtable_list);
 231                spin_unlock_bh(&mm->context.lock);
 232        }
 233        return table;
 234}
 235
 236void page_table_free(struct mm_struct *mm, unsigned long *table)
 237{
 238        struct page *page;
 239        unsigned int bit, mask;
 240
 241        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
 242        if (!mm_alloc_pgste(mm)) {
 243                /* Free 2K page table fragment of a 4K page */
 244                bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
 245                spin_lock_bh(&mm->context.lock);
 246                mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24));
 247                mask >>= 24;
 248                if (mask & 3)
 249                        list_add(&page->lru, &mm->context.pgtable_list);
 250                else
 251                        list_del(&page->lru);
 252                spin_unlock_bh(&mm->context.lock);
 253                if (mask != 0)
 254                        return;
 255        } else {
 256                atomic_xor_bits(&page->_refcount, 3U << 24);
 257        }
 258
 259        pgtable_page_dtor(page);
 260        __free_page(page);
 261}
 262
 263void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
 264                         unsigned long vmaddr)
 265{
 266        struct mm_struct *mm;
 267        struct page *page;
 268        unsigned int bit, mask;
 269
 270        mm = tlb->mm;
 271        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
 272        if (mm_alloc_pgste(mm)) {
 273                gmap_unlink(mm, table, vmaddr);
 274                table = (unsigned long *) (__pa(table) | 3);
 275                tlb_remove_table(tlb, table);
 276                return;
 277        }
 278        bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
 279        spin_lock_bh(&mm->context.lock);
 280        mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
 281        mask >>= 24;
 282        if (mask & 3)
 283                list_add_tail(&page->lru, &mm->context.pgtable_list);
 284        else
 285                list_del(&page->lru);
 286        spin_unlock_bh(&mm->context.lock);
 287        table = (unsigned long *) (__pa(table) | (1U << bit));
 288        tlb_remove_table(tlb, table);
 289}
 290
 291void __tlb_remove_table(void *_table)
 292{
 293        unsigned int mask = (unsigned long) _table & 3;
 294        void *table = (void *)((unsigned long) _table ^ mask);
 295        struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
 296
 297        switch (mask) {
 298        case 0:         /* pmd, pud, or p4d */
 299                free_pages((unsigned long) table, 2);
 300                break;
 301        case 1:         /* lower 2K of a 4K page table */
 302        case 2:         /* higher 2K of a 4K page table */
 303                mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
 304                mask >>= 24;
 305                if (mask != 0)
 306                        break;
 307                /* fallthrough */
 308        case 3:         /* 4K page table with pgstes */
 309                if (mask & 3)
 310                        atomic_xor_bits(&page->_refcount, 3 << 24);
 311                pgtable_page_dtor(page);
 312                __free_page(page);
 313                break;
 314        }
 315}
 316
 317/*
 318 * Base infrastructure required to generate basic asces, region, segment,
 319 * and page tables that do not make use of enhanced features like EDAT1.
 320 */
 321
 322static struct kmem_cache *base_pgt_cache;
 323
 324static unsigned long base_pgt_alloc(void)
 325{
 326        u64 *table;
 327
 328        table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
 329        if (table)
 330                memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
 331        return (unsigned long) table;
 332}
 333
 334static void base_pgt_free(unsigned long table)
 335{
 336        kmem_cache_free(base_pgt_cache, (void *) table);
 337}
 338
 339static unsigned long base_crst_alloc(unsigned long val)
 340{
 341        unsigned long table;
 342
 343        table =  __get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
 344        if (table)
 345                crst_table_init((unsigned long *)table, val);
 346        return table;
 347}
 348
 349static void base_crst_free(unsigned long table)
 350{
 351        free_pages(table, CRST_ALLOC_ORDER);
 352}
 353
 354#define BASE_ADDR_END_FUNC(NAME, SIZE)                                  \
 355static inline unsigned long base_##NAME##_addr_end(unsigned long addr,  \
 356                                                   unsigned long end)   \
 357{                                                                       \
 358        unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1);           \
 359                                                                        \
 360        return (next - 1) < (end - 1) ? next : end;                     \
 361}
 362
 363BASE_ADDR_END_FUNC(page,    _PAGE_SIZE)
 364BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE)
 365BASE_ADDR_END_FUNC(region3, _REGION3_SIZE)
 366BASE_ADDR_END_FUNC(region2, _REGION2_SIZE)
 367BASE_ADDR_END_FUNC(region1, _REGION1_SIZE)
 368
 369static inline unsigned long base_lra(unsigned long address)
 370{
 371        unsigned long real;
 372
 373        asm volatile(
 374                "       lra     %0,0(%1)\n"
 375                : "=d" (real) : "a" (address) : "cc");
 376        return real;
 377}
 378
 379static int base_page_walk(unsigned long origin, unsigned long addr,
 380                          unsigned long end, int alloc)
 381{
 382        unsigned long *pte, next;
 383
 384        if (!alloc)
 385                return 0;
 386        pte = (unsigned long *) origin;
 387        pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
 388        do {
 389                next = base_page_addr_end(addr, end);
 390                *pte = base_lra(addr);
 391        } while (pte++, addr = next, addr < end);
 392        return 0;
 393}
 394
 395static int base_segment_walk(unsigned long origin, unsigned long addr,
 396                             unsigned long end, int alloc)
 397{
 398        unsigned long *ste, next, table;
 399        int rc;
 400
 401        ste = (unsigned long *) origin;
 402        ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
 403        do {
 404                next = base_segment_addr_end(addr, end);
 405                if (*ste & _SEGMENT_ENTRY_INVALID) {
 406                        if (!alloc)
 407                                continue;
 408                        table = base_pgt_alloc();
 409                        if (!table)
 410                                return -ENOMEM;
 411                        *ste = table | _SEGMENT_ENTRY;
 412                }
 413                table = *ste & _SEGMENT_ENTRY_ORIGIN;
 414                rc = base_page_walk(table, addr, next, alloc);
 415                if (rc)
 416                        return rc;
 417                if (!alloc)
 418                        base_pgt_free(table);
 419                cond_resched();
 420        } while (ste++, addr = next, addr < end);
 421        return 0;
 422}
 423
 424static int base_region3_walk(unsigned long origin, unsigned long addr,
 425                             unsigned long end, int alloc)
 426{
 427        unsigned long *rtte, next, table;
 428        int rc;
 429
 430        rtte = (unsigned long *) origin;
 431        rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
 432        do {
 433                next = base_region3_addr_end(addr, end);
 434                if (*rtte & _REGION_ENTRY_INVALID) {
 435                        if (!alloc)
 436                                continue;
 437                        table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
 438                        if (!table)
 439                                return -ENOMEM;
 440                        *rtte = table | _REGION3_ENTRY;
 441                }
 442                table = *rtte & _REGION_ENTRY_ORIGIN;
 443                rc = base_segment_walk(table, addr, next, alloc);
 444                if (rc)
 445                        return rc;
 446                if (!alloc)
 447                        base_crst_free(table);
 448        } while (rtte++, addr = next, addr < end);
 449        return 0;
 450}
 451
 452static int base_region2_walk(unsigned long origin, unsigned long addr,
 453                             unsigned long end, int alloc)
 454{
 455        unsigned long *rste, next, table;
 456        int rc;
 457
 458        rste = (unsigned long *) origin;
 459        rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
 460        do {
 461                next = base_region2_addr_end(addr, end);
 462                if (*rste & _REGION_ENTRY_INVALID) {
 463                        if (!alloc)
 464                                continue;
 465                        table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
 466                        if (!table)
 467                                return -ENOMEM;
 468                        *rste = table | _REGION2_ENTRY;
 469                }
 470                table = *rste & _REGION_ENTRY_ORIGIN;
 471                rc = base_region3_walk(table, addr, next, alloc);
 472                if (rc)
 473                        return rc;
 474                if (!alloc)
 475                        base_crst_free(table);
 476        } while (rste++, addr = next, addr < end);
 477        return 0;
 478}
 479
 480static int base_region1_walk(unsigned long origin, unsigned long addr,
 481                             unsigned long end, int alloc)
 482{
 483        unsigned long *rfte, next, table;
 484        int rc;
 485
 486        rfte = (unsigned long *) origin;
 487        rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
 488        do {
 489                next = base_region1_addr_end(addr, end);
 490                if (*rfte & _REGION_ENTRY_INVALID) {
 491                        if (!alloc)
 492                                continue;
 493                        table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
 494                        if (!table)
 495                                return -ENOMEM;
 496                        *rfte = table | _REGION1_ENTRY;
 497                }
 498                table = *rfte & _REGION_ENTRY_ORIGIN;
 499                rc = base_region2_walk(table, addr, next, alloc);
 500                if (rc)
 501                        return rc;
 502                if (!alloc)
 503                        base_crst_free(table);
 504        } while (rfte++, addr = next, addr < end);
 505        return 0;
 506}
 507
 508/**
 509 * base_asce_free - free asce and tables returned from base_asce_alloc()
 510 * @asce: asce to be freed
 511 *
 512 * Frees all region, segment, and page tables that were allocated with a
 513 * corresponding base_asce_alloc() call.
 514 */
 515void base_asce_free(unsigned long asce)
 516{
 517        unsigned long table = asce & _ASCE_ORIGIN;
 518
 519        if (!asce)
 520                return;
 521        switch (asce & _ASCE_TYPE_MASK) {
 522        case _ASCE_TYPE_SEGMENT:
 523                base_segment_walk(table, 0, _REGION3_SIZE, 0);
 524                break;
 525        case _ASCE_TYPE_REGION3:
 526                base_region3_walk(table, 0, _REGION2_SIZE, 0);
 527                break;
 528        case _ASCE_TYPE_REGION2:
 529                base_region2_walk(table, 0, _REGION1_SIZE, 0);
 530                break;
 531        case _ASCE_TYPE_REGION1:
 532                base_region1_walk(table, 0, -_PAGE_SIZE, 0);
 533                break;
 534        }
 535        base_crst_free(table);
 536}
 537
 538static int base_pgt_cache_init(void)
 539{
 540        static DEFINE_MUTEX(base_pgt_cache_mutex);
 541        unsigned long sz = _PAGE_TABLE_SIZE;
 542
 543        if (base_pgt_cache)
 544                return 0;
 545        mutex_lock(&base_pgt_cache_mutex);
 546        if (!base_pgt_cache)
 547                base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL);
 548        mutex_unlock(&base_pgt_cache_mutex);
 549        return base_pgt_cache ? 0 : -ENOMEM;
 550}
 551
 552/**
 553 * base_asce_alloc - create kernel mapping without enhanced DAT features
 554 * @addr: virtual start address of kernel mapping
 555 * @num_pages: number of consecutive pages
 556 *
 557 * Generate an asce, including all required region, segment and page tables,
 558 * that can be used to access the virtual kernel mapping. The difference is
 559 * that the returned asce does not make use of any enhanced DAT features like
 560 * e.g. large pages. This is required for some I/O functions that pass an
 561 * asce, like e.g. some service call requests.
 562 *
 563 * Note: the returned asce may NEVER be attached to any cpu. It may only be
 564 *       used for I/O requests. tlb entries that might result because the
 565 *       asce was attached to a cpu won't be cleared.
 566 */
 567unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
 568{
 569        unsigned long asce, table, end;
 570        int rc;
 571
 572        if (base_pgt_cache_init())
 573                return 0;
 574        end = addr + num_pages * PAGE_SIZE;
 575        if (end <= _REGION3_SIZE) {
 576                table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
 577                if (!table)
 578                        return 0;
 579                rc = base_segment_walk(table, addr, end, 1);
 580                asce = table | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
 581        } else if (end <= _REGION2_SIZE) {
 582                table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
 583                if (!table)
 584                        return 0;
 585                rc = base_region3_walk(table, addr, end, 1);
 586                asce = table | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
 587        } else if (end <= _REGION1_SIZE) {
 588                table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
 589                if (!table)
 590                        return 0;
 591                rc = base_region2_walk(table, addr, end, 1);
 592                asce = table | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
 593        } else {
 594                table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
 595                if (!table)
 596                        return 0;
 597                rc = base_region1_walk(table, addr, end, 1);
 598                asce = table | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
 599        }
 600        if (rc) {
 601                base_asce_free(asce);
 602                asce = 0;
 603        }
 604        return asce;
 605}
 606