linux/arch/s390/mm/pgalloc.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  Page table allocation functions
   4 *
   5 *    Copyright IBM Corp. 2016
   6 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
   7 */
   8
   9#include <linux/sysctl.h>
  10#include <linux/slab.h>
  11#include <linux/mm.h>
  12#include <asm/mmu_context.h>
  13#include <asm/pgalloc.h>
  14#include <asm/gmap.h>
  15#include <asm/tlb.h>
  16#include <asm/tlbflush.h>
  17
  18#ifdef CONFIG_PGSTE
  19
  20static int page_table_allocate_pgste_min = 0;
  21static int page_table_allocate_pgste_max = 1;
  22int page_table_allocate_pgste = 0;
  23EXPORT_SYMBOL(page_table_allocate_pgste);
  24
  25static struct ctl_table page_table_sysctl[] = {
  26        {
  27                .procname       = "allocate_pgste",
  28                .data           = &page_table_allocate_pgste,
  29                .maxlen         = sizeof(int),
  30                .mode           = S_IRUGO | S_IWUSR,
  31                .proc_handler   = proc_dointvec_minmax,
  32                .extra1         = &page_table_allocate_pgste_min,
  33                .extra2         = &page_table_allocate_pgste_max,
  34        },
  35        { }
  36};
  37
  38static struct ctl_table page_table_sysctl_dir[] = {
  39        {
  40                .procname       = "vm",
  41                .maxlen         = 0,
  42                .mode           = 0555,
  43                .child          = page_table_sysctl,
  44        },
  45        { }
  46};
  47
  48static int __init page_table_register_sysctl(void)
  49{
  50        return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
  51}
  52__initcall(page_table_register_sysctl);
  53
  54#endif /* CONFIG_PGSTE */
  55
  56unsigned long *crst_table_alloc(struct mm_struct *mm)
  57{
  58        struct page *page = alloc_pages(GFP_KERNEL, 2);
  59
  60        if (!page)
  61                return NULL;
  62        arch_set_page_dat(page, 2);
  63        return (unsigned long *) page_to_phys(page);
  64}
  65
  66void crst_table_free(struct mm_struct *mm, unsigned long *table)
  67{
  68        free_pages((unsigned long) table, 2);
  69}
  70
  71static void __crst_table_upgrade(void *arg)
  72{
  73        struct mm_struct *mm = arg;
  74
  75        if (current->active_mm == mm)
  76                set_user_asce(mm);
  77        __tlb_flush_local();
  78}
  79
  80int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
  81{
  82        unsigned long *table, *pgd;
  83        int rc, notify;
  84
  85        /* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
  86        VM_BUG_ON(mm->context.asce_limit < _REGION2_SIZE);
  87        rc = 0;
  88        notify = 0;
  89        while (mm->context.asce_limit < end) {
  90                table = crst_table_alloc(mm);
  91                if (!table) {
  92                        rc = -ENOMEM;
  93                        break;
  94                }
  95                spin_lock_bh(&mm->page_table_lock);
  96                pgd = (unsigned long *) mm->pgd;
  97                if (mm->context.asce_limit == _REGION2_SIZE) {
  98                        crst_table_init(table, _REGION2_ENTRY_EMPTY);
  99                        p4d_populate(mm, (p4d_t *) table, (pud_t *) pgd);
 100                        mm->pgd = (pgd_t *) table;
 101                        mm->context.asce_limit = _REGION1_SIZE;
 102                        mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
 103                                _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
 104                        mm_inc_nr_puds(mm);
 105                } else {
 106                        crst_table_init(table, _REGION1_ENTRY_EMPTY);
 107                        pgd_populate(mm, (pgd_t *) table, (p4d_t *) pgd);
 108                        mm->pgd = (pgd_t *) table;
 109                        mm->context.asce_limit = -PAGE_SIZE;
 110                        mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
 111                                _ASCE_USER_BITS | _ASCE_TYPE_REGION1;
 112                }
 113                notify = 1;
 114                spin_unlock_bh(&mm->page_table_lock);
 115        }
 116        if (notify)
 117                on_each_cpu(__crst_table_upgrade, mm, 0);
 118        return rc;
 119}
 120
 121void crst_table_downgrade(struct mm_struct *mm)
 122{
 123        pgd_t *pgd;
 124
 125        /* downgrade should only happen from 3 to 2 levels (compat only) */
 126        VM_BUG_ON(mm->context.asce_limit != _REGION2_SIZE);
 127
 128        if (current->active_mm == mm) {
 129                clear_user_asce();
 130                __tlb_flush_mm(mm);
 131        }
 132
 133        pgd = mm->pgd;
 134        mm_dec_nr_pmds(mm);
 135        mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
 136        mm->context.asce_limit = _REGION3_SIZE;
 137        mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
 138                           _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
 139        crst_table_free(mm, (unsigned long *) pgd);
 140
 141        if (current->active_mm == mm)
 142                set_user_asce(mm);
 143}
 144
 145static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
 146{
 147        unsigned int old, new;
 148
 149        do {
 150                old = atomic_read(v);
 151                new = old ^ bits;
 152        } while (atomic_cmpxchg(v, old, new) != old);
 153        return new;
 154}
 155
 156#ifdef CONFIG_PGSTE
 157
 158struct page *page_table_alloc_pgste(struct mm_struct *mm)
 159{
 160        struct page *page;
 161        u64 *table;
 162
 163        page = alloc_page(GFP_KERNEL);
 164        if (page) {
 165                table = (u64 *)page_to_phys(page);
 166                memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
 167                memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
 168        }
 169        return page;
 170}
 171
 172void page_table_free_pgste(struct page *page)
 173{
 174        __free_page(page);
 175}
 176
 177#endif /* CONFIG_PGSTE */
 178
 179/*
 180 * page table entry allocation/free routines.
 181 */
 182unsigned long *page_table_alloc(struct mm_struct *mm)
 183{
 184        unsigned long *table;
 185        struct page *page;
 186        unsigned int mask, bit;
 187
 188        /* Try to get a fragment of a 4K page as a 2K page table */
 189        if (!mm_alloc_pgste(mm)) {
 190                table = NULL;
 191                spin_lock_bh(&mm->context.lock);
 192                if (!list_empty(&mm->context.pgtable_list)) {
 193                        page = list_first_entry(&mm->context.pgtable_list,
 194                                                struct page, lru);
 195                        mask = atomic_read(&page->_refcount) >> 24;
 196                        mask = (mask | (mask >> 4)) & 3;
 197                        if (mask != 3) {
 198                                table = (unsigned long *) page_to_phys(page);
 199                                bit = mask & 1;         /* =1 -> second 2K */
 200                                if (bit)
 201                                        table += PTRS_PER_PTE;
 202                                atomic_xor_bits(&page->_refcount,
 203                                                        1U << (bit + 24));
 204                                list_del(&page->lru);
 205                        }
 206                }
 207                spin_unlock_bh(&mm->context.lock);
 208                if (table)
 209                        return table;
 210        }
 211        /* Allocate a fresh page */
 212        page = alloc_page(GFP_KERNEL);
 213        if (!page)
 214                return NULL;
 215        if (!pgtable_page_ctor(page)) {
 216                __free_page(page);
 217                return NULL;
 218        }
 219        arch_set_page_dat(page, 0);
 220        /* Initialize page table */
 221        table = (unsigned long *) page_to_phys(page);
 222        if (mm_alloc_pgste(mm)) {
 223                /* Return 4K page table with PGSTEs */
 224                atomic_xor_bits(&page->_refcount, 3 << 24);
 225                memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
 226                memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
 227        } else {
 228                /* Return the first 2K fragment of the page */
 229                atomic_xor_bits(&page->_refcount, 1 << 24);
 230                memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
 231                spin_lock_bh(&mm->context.lock);
 232                list_add(&page->lru, &mm->context.pgtable_list);
 233                spin_unlock_bh(&mm->context.lock);
 234        }
 235        return table;
 236}
 237
 238void page_table_free(struct mm_struct *mm, unsigned long *table)
 239{
 240        struct page *page;
 241        unsigned int bit, mask;
 242
 243        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
 244        if (!mm_alloc_pgste(mm)) {
 245                /* Free 2K page table fragment of a 4K page */
 246                bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
 247                spin_lock_bh(&mm->context.lock);
 248                mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24));
 249                mask >>= 24;
 250                if (mask & 3)
 251                        list_add(&page->lru, &mm->context.pgtable_list);
 252                else
 253                        list_del(&page->lru);
 254                spin_unlock_bh(&mm->context.lock);
 255                if (mask != 0)
 256                        return;
 257        } else {
 258                atomic_xor_bits(&page->_refcount, 3U << 24);
 259        }
 260
 261        pgtable_page_dtor(page);
 262        __free_page(page);
 263}
 264
 265void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
 266                         unsigned long vmaddr)
 267{
 268        struct mm_struct *mm;
 269        struct page *page;
 270        unsigned int bit, mask;
 271
 272        mm = tlb->mm;
 273        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
 274        if (mm_alloc_pgste(mm)) {
 275                gmap_unlink(mm, table, vmaddr);
 276                table = (unsigned long *) (__pa(table) | 3);
 277                tlb_remove_table(tlb, table);
 278                return;
 279        }
 280        bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
 281        spin_lock_bh(&mm->context.lock);
 282        mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
 283        mask >>= 24;
 284        if (mask & 3)
 285                list_add_tail(&page->lru, &mm->context.pgtable_list);
 286        else
 287                list_del(&page->lru);
 288        spin_unlock_bh(&mm->context.lock);
 289        table = (unsigned long *) (__pa(table) | (1U << bit));
 290        tlb_remove_table(tlb, table);
 291}
 292
 293void __tlb_remove_table(void *_table)
 294{
 295        unsigned int mask = (unsigned long) _table & 3;
 296        void *table = (void *)((unsigned long) _table ^ mask);
 297        struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
 298
 299        switch (mask) {
 300        case 0:         /* pmd, pud, or p4d */
 301                free_pages((unsigned long) table, 2);
 302                break;
 303        case 1:         /* lower 2K of a 4K page table */
 304        case 2:         /* higher 2K of a 4K page table */
 305                mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
 306                mask >>= 24;
 307                if (mask != 0)
 308                        break;
 309                /* fallthrough */
 310        case 3:         /* 4K page table with pgstes */
 311                if (mask & 3)
 312                        atomic_xor_bits(&page->_refcount, 3 << 24);
 313                pgtable_page_dtor(page);
 314                __free_page(page);
 315                break;
 316        }
 317}
 318
 319/*
 320 * Base infrastructure required to generate basic asces, region, segment,
 321 * and page tables that do not make use of enhanced features like EDAT1.
 322 */
 323
 324static struct kmem_cache *base_pgt_cache;
 325
 326static unsigned long base_pgt_alloc(void)
 327{
 328        u64 *table;
 329
 330        table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
 331        if (table)
 332                memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
 333        return (unsigned long) table;
 334}
 335
 336static void base_pgt_free(unsigned long table)
 337{
 338        kmem_cache_free(base_pgt_cache, (void *) table);
 339}
 340
 341static unsigned long base_crst_alloc(unsigned long val)
 342{
 343        unsigned long table;
 344
 345        table =  __get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
 346        if (table)
 347                crst_table_init((unsigned long *)table, val);
 348        return table;
 349}
 350
 351static void base_crst_free(unsigned long table)
 352{
 353        free_pages(table, CRST_ALLOC_ORDER);
 354}
 355
 356#define BASE_ADDR_END_FUNC(NAME, SIZE)                                  \
 357static inline unsigned long base_##NAME##_addr_end(unsigned long addr,  \
 358                                                   unsigned long end)   \
 359{                                                                       \
 360        unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1);           \
 361                                                                        \
 362        return (next - 1) < (end - 1) ? next : end;                     \
 363}
 364
 365BASE_ADDR_END_FUNC(page,    _PAGE_SIZE)
 366BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE)
 367BASE_ADDR_END_FUNC(region3, _REGION3_SIZE)
 368BASE_ADDR_END_FUNC(region2, _REGION2_SIZE)
 369BASE_ADDR_END_FUNC(region1, _REGION1_SIZE)
 370
 371static inline unsigned long base_lra(unsigned long address)
 372{
 373        unsigned long real;
 374
 375        asm volatile(
 376                "       lra     %0,0(%1)\n"
 377                : "=d" (real) : "a" (address) : "cc");
 378        return real;
 379}
 380
 381static int base_page_walk(unsigned long origin, unsigned long addr,
 382                          unsigned long end, int alloc)
 383{
 384        unsigned long *pte, next;
 385
 386        if (!alloc)
 387                return 0;
 388        pte = (unsigned long *) origin;
 389        pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
 390        do {
 391                next = base_page_addr_end(addr, end);
 392                *pte = base_lra(addr);
 393        } while (pte++, addr = next, addr < end);
 394        return 0;
 395}
 396
 397static int base_segment_walk(unsigned long origin, unsigned long addr,
 398                             unsigned long end, int alloc)
 399{
 400        unsigned long *ste, next, table;
 401        int rc;
 402
 403        ste = (unsigned long *) origin;
 404        ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
 405        do {
 406                next = base_segment_addr_end(addr, end);
 407                if (*ste & _SEGMENT_ENTRY_INVALID) {
 408                        if (!alloc)
 409                                continue;
 410                        table = base_pgt_alloc();
 411                        if (!table)
 412                                return -ENOMEM;
 413                        *ste = table | _SEGMENT_ENTRY;
 414                }
 415                table = *ste & _SEGMENT_ENTRY_ORIGIN;
 416                rc = base_page_walk(table, addr, next, alloc);
 417                if (rc)
 418                        return rc;
 419                if (!alloc)
 420                        base_pgt_free(table);
 421                cond_resched();
 422        } while (ste++, addr = next, addr < end);
 423        return 0;
 424}
 425
 426static int base_region3_walk(unsigned long origin, unsigned long addr,
 427                             unsigned long end, int alloc)
 428{
 429        unsigned long *rtte, next, table;
 430        int rc;
 431
 432        rtte = (unsigned long *) origin;
 433        rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
 434        do {
 435                next = base_region3_addr_end(addr, end);
 436                if (*rtte & _REGION_ENTRY_INVALID) {
 437                        if (!alloc)
 438                                continue;
 439                        table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
 440                        if (!table)
 441                                return -ENOMEM;
 442                        *rtte = table | _REGION3_ENTRY;
 443                }
 444                table = *rtte & _REGION_ENTRY_ORIGIN;
 445                rc = base_segment_walk(table, addr, next, alloc);
 446                if (rc)
 447                        return rc;
 448                if (!alloc)
 449                        base_crst_free(table);
 450        } while (rtte++, addr = next, addr < end);
 451        return 0;
 452}
 453
 454static int base_region2_walk(unsigned long origin, unsigned long addr,
 455                             unsigned long end, int alloc)
 456{
 457        unsigned long *rste, next, table;
 458        int rc;
 459
 460        rste = (unsigned long *) origin;
 461        rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
 462        do {
 463                next = base_region2_addr_end(addr, end);
 464                if (*rste & _REGION_ENTRY_INVALID) {
 465                        if (!alloc)
 466                                continue;
 467                        table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
 468                        if (!table)
 469                                return -ENOMEM;
 470                        *rste = table | _REGION2_ENTRY;
 471                }
 472                table = *rste & _REGION_ENTRY_ORIGIN;
 473                rc = base_region3_walk(table, addr, next, alloc);
 474                if (rc)
 475                        return rc;
 476                if (!alloc)
 477                        base_crst_free(table);
 478        } while (rste++, addr = next, addr < end);
 479        return 0;
 480}
 481
 482static int base_region1_walk(unsigned long origin, unsigned long addr,
 483                             unsigned long end, int alloc)
 484{
 485        unsigned long *rfte, next, table;
 486        int rc;
 487
 488        rfte = (unsigned long *) origin;
 489        rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
 490        do {
 491                next = base_region1_addr_end(addr, end);
 492                if (*rfte & _REGION_ENTRY_INVALID) {
 493                        if (!alloc)
 494                                continue;
 495                        table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
 496                        if (!table)
 497                                return -ENOMEM;
 498                        *rfte = table | _REGION1_ENTRY;
 499                }
 500                table = *rfte & _REGION_ENTRY_ORIGIN;
 501                rc = base_region2_walk(table, addr, next, alloc);
 502                if (rc)
 503                        return rc;
 504                if (!alloc)
 505                        base_crst_free(table);
 506        } while (rfte++, addr = next, addr < end);
 507        return 0;
 508}
 509
 510/**
 511 * base_asce_free - free asce and tables returned from base_asce_alloc()
 512 * @asce: asce to be freed
 513 *
 514 * Frees all region, segment, and page tables that were allocated with a
 515 * corresponding base_asce_alloc() call.
 516 */
 517void base_asce_free(unsigned long asce)
 518{
 519        unsigned long table = asce & _ASCE_ORIGIN;
 520
 521        if (!asce)
 522                return;
 523        switch (asce & _ASCE_TYPE_MASK) {
 524        case _ASCE_TYPE_SEGMENT:
 525                base_segment_walk(table, 0, _REGION3_SIZE, 0);
 526                break;
 527        case _ASCE_TYPE_REGION3:
 528                base_region3_walk(table, 0, _REGION2_SIZE, 0);
 529                break;
 530        case _ASCE_TYPE_REGION2:
 531                base_region2_walk(table, 0, _REGION1_SIZE, 0);
 532                break;
 533        case _ASCE_TYPE_REGION1:
 534                base_region1_walk(table, 0, -_PAGE_SIZE, 0);
 535                break;
 536        }
 537        base_crst_free(table);
 538}
 539
 540static int base_pgt_cache_init(void)
 541{
 542        static DEFINE_MUTEX(base_pgt_cache_mutex);
 543        unsigned long sz = _PAGE_TABLE_SIZE;
 544
 545        if (base_pgt_cache)
 546                return 0;
 547        mutex_lock(&base_pgt_cache_mutex);
 548        if (!base_pgt_cache)
 549                base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL);
 550        mutex_unlock(&base_pgt_cache_mutex);
 551        return base_pgt_cache ? 0 : -ENOMEM;
 552}
 553
 554/**
 555 * base_asce_alloc - create kernel mapping without enhanced DAT features
 556 * @addr: virtual start address of kernel mapping
 557 * @num_pages: number of consecutive pages
 558 *
 559 * Generate an asce, including all required region, segment and page tables,
 560 * that can be used to access the virtual kernel mapping. The difference is
 561 * that the returned asce does not make use of any enhanced DAT features like
 562 * e.g. large pages. This is required for some I/O functions that pass an
 563 * asce, like e.g. some service call requests.
 564 *
 565 * Note: the returned asce may NEVER be attached to any cpu. It may only be
 566 *       used for I/O requests. tlb entries that might result because the
 567 *       asce was attached to a cpu won't be cleared.
 568 */
 569unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
 570{
 571        unsigned long asce, table, end;
 572        int rc;
 573
 574        if (base_pgt_cache_init())
 575                return 0;
 576        end = addr + num_pages * PAGE_SIZE;
 577        if (end <= _REGION3_SIZE) {
 578                table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
 579                if (!table)
 580                        return 0;
 581                rc = base_segment_walk(table, addr, end, 1);
 582                asce = table | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
 583        } else if (end <= _REGION2_SIZE) {
 584                table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
 585                if (!table)
 586                        return 0;
 587                rc = base_region3_walk(table, addr, end, 1);
 588                asce = table | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
 589        } else if (end <= _REGION1_SIZE) {
 590                table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
 591                if (!table)
 592                        return 0;
 593                rc = base_region2_walk(table, addr, end, 1);
 594                asce = table | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
 595        } else {
 596                table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
 597                if (!table)
 598                        return 0;
 599                rc = base_region1_walk(table, addr, end, 1);
 600                asce = table | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
 601        }
 602        if (rc) {
 603                base_asce_free(asce);
 604                asce = 0;
 605        }
 606        return asce;
 607}
 608