linux/arch/s390/mm/pgtable.c
/*
 *    Copyright IBM Corp. 2007, 2011
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER     1
#define FRAG_MASK       0x0f
#else
#define ALLOC_ORDER     2
#define FRAG_MASK       0x03
#endif

unsigned long *crst_table_alloc(struct mm_struct *mm)
{
        struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

        if (!page)
                return NULL;
        return (unsigned long *) page_to_phys(page);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
        free_pages((unsigned long) table, ALLOC_ORDER);
}

#ifdef CONFIG_64BIT
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
        unsigned long *table, *pgd;
        unsigned long entry;

        BUG_ON(limit > (1UL << 53));
repeat:
        table = crst_table_alloc(mm);
        if (!table)
                return -ENOMEM;
        spin_lock_bh(&mm->page_table_lock);
        if (mm->context.asce_limit < limit) {
                pgd = (unsigned long *) mm->pgd;
                if (mm->context.asce_limit <= (1UL << 31)) {
                        entry = _REGION3_ENTRY_EMPTY;
                        mm->context.asce_limit = 1UL << 42;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION3;
                } else {
                        entry = _REGION2_ENTRY_EMPTY;
                        mm->context.asce_limit = 1UL << 53;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION2;
                }
                crst_table_init(table, entry);
                pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
                mm->pgd = (pgd_t *) table;
                mm->task_size = mm->context.asce_limit;
                table = NULL;
        }
        spin_unlock_bh(&mm->page_table_lock);
        if (table)
                crst_table_free(mm, table);
        if (mm->context.asce_limit < limit)
                goto repeat;
        return 0;
}
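
/*
 * crst_table_upgrade() grows the address space by stacking a new
 * top-level table above the current one: a 2 GB space (segment table
 * only) becomes a 4 TB space by adding a region-third table, a 4 TB
 * space becomes an 8 PB space by adding a region-second table.  The new
 * table is initialized empty and its first entry is pointed at the old
 * top-level table by pgd_populate(), so existing translations below the
 * old limit remain valid.  If another thread raised the limit first, the
 * spare table is freed and the loop retries until the requested limit is
 * reached.
 */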

void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
        pgd_t *pgd;

        while (mm->context.asce_limit > limit) {
                pgd = mm->pgd;
                switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
                case _REGION_ENTRY_TYPE_R2:
                        mm->context.asce_limit = 1UL << 42;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_REGION3;
                        break;
                case _REGION_ENTRY_TYPE_R3:
                        mm->context.asce_limit = 1UL << 31;
                        mm->context.asce_bits = _ASCE_TABLE_LENGTH |
                                                _ASCE_USER_BITS |
                                                _ASCE_TYPE_SEGMENT;
                        break;
                default:
                        BUG();
                }
                mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
                mm->task_size = mm->context.asce_limit;
                crst_table_free(mm, (unsigned long *) pgd);
        }
}
#endif

#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm)
{
        struct gmap *gmap;
        struct page *page;
        unsigned long *table;

        gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
        if (!gmap)
                goto out;
        INIT_LIST_HEAD(&gmap->crst_list);
        gmap->mm = mm;
        page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
        if (!page)
                goto out_free;
        list_add(&page->lru, &gmap->crst_list);
        table = (unsigned long *) page_to_phys(page);
        crst_table_init(table, _REGION1_ENTRY_EMPTY);
        gmap->table = table;
        gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
                     _ASCE_USER_BITS | __pa(table);
        list_add(&gmap->list, &mm->context.gmap_list);
        return gmap;

out_free:
        kfree(gmap);
out:
        return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);

static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
{
        struct gmap_pgtable *mp;
        struct gmap_rmap *rmap;
        struct page *page;

        if (*table & _SEGMENT_ENTRY_INV)
                return 0;
        page = pfn_to_page(*table >> PAGE_SHIFT);
        mp = (struct gmap_pgtable *) page->index;
        list_for_each_entry(rmap, &mp->mapper, list) {
                if (rmap->entry != table)
                        continue;
                list_del(&rmap->list);
                kfree(rmap);
                break;
        }
        *table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
        return 1;
}

static void gmap_flush_tlb(struct gmap *gmap)
{
        if (MACHINE_HAS_IDTE)
                __tlb_flush_idte((unsigned long) gmap->table |
                                 _ASCE_TYPE_REGION1);
        else
                __tlb_flush_global();
}

/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
        struct page *page, *next;
        unsigned long *table;
        int i;

        /* Flush tlb. */
        if (MACHINE_HAS_IDTE)
                __tlb_flush_idte((unsigned long) gmap->table |
                                 _ASCE_TYPE_REGION1);
        else
                __tlb_flush_global();

        /* Free all segment & region tables. */
        down_read(&gmap->mm->mmap_sem);
        spin_lock(&gmap->mm->page_table_lock);
        list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
                table = (unsigned long *) page_to_phys(page);
                if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
                        /* Remove gmap rmap structures for segment table. */
                        for (i = 0; i < PTRS_PER_PMD; i++, table++)
                                gmap_unlink_segment(gmap, table);
                __free_pages(page, ALLOC_ORDER);
        }
        spin_unlock(&gmap->mm->page_table_lock);
        up_read(&gmap->mm->mmap_sem);
        list_del(&gmap->list);
        kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);

/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
        S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);

/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
        S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);
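
/*
 * A minimal sketch of how a KVM-like caller might drive the gmap
 * interface (illustrative only, error handling trimmed, all identifiers
 * other than the gmap functions are made up):
 *
 *      struct gmap *gmap = gmap_alloc(current->mm);
 *
 *      if (!gmap)
 *              return -ENOMEM;
 *      if (gmap_map_segment(gmap, userspace_addr, guest_addr, size))
 *              goto out_free;
 *      gmap_enable(gmap);      // switch to the guest space before SIE
 *      ...
 *      gmap_disable(gmap);
 * out_free:
 *      gmap_free(gmap);
 */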

/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap,
                               unsigned long *table, unsigned long init)
{
        struct page *page;
        unsigned long *new;

        /* since we don't free the gmap table until gmap_free we can unlock */
        spin_unlock(&gmap->mm->page_table_lock);
        page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
        spin_lock(&gmap->mm->page_table_lock);
        if (!page)
                return -ENOMEM;
        new = (unsigned long *) page_to_phys(page);
        crst_table_init(new, init);
        if (*table & _REGION_ENTRY_INV) {
                list_add(&page->lru, &gmap->crst_list);
                *table = (unsigned long) new | _REGION_ENTRY_LENGTH |
                        (*table & _REGION_ENTRY_TYPE_MASK);
        } else
                __free_pages(page, ALLOC_ORDER);
        return 0;
}

/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
        unsigned long *table;
        unsigned long off;
        int flush;

        if ((to | len) & (PMD_SIZE - 1))
                return -EINVAL;
        if (len == 0 || to + len < to)
                return -EINVAL;

        flush = 0;
        down_read(&gmap->mm->mmap_sem);
        spin_lock(&gmap->mm->page_table_lock);
        for (off = 0; off < len; off += PMD_SIZE) {
                /* Walk the guest addr space page table */
                table = gmap->table + (((to + off) >> 53) & 0x7ff);
                if (*table & _REGION_ENTRY_INV)
                        goto out;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                table = table + (((to + off) >> 42) & 0x7ff);
                if (*table & _REGION_ENTRY_INV)
                        goto out;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                table = table + (((to + off) >> 31) & 0x7ff);
                if (*table & _REGION_ENTRY_INV)
                        goto out;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                table = table + (((to + off) >> 20) & 0x7ff);

                /* Clear segment table entry in guest address space. */
                flush |= gmap_unlink_segment(gmap, table);
                *table = _SEGMENT_ENTRY_INV;
        }
out:
        spin_unlock(&gmap->mm->page_table_lock);
        up_read(&gmap->mm->mmap_sem);
        if (flush)
                gmap_flush_tlb(gmap);
        return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);

/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the map succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
                     unsigned long to, unsigned long len)
{
        unsigned long *table;
        unsigned long off;
        int flush;

        if ((from | to | len) & (PMD_SIZE - 1))
                return -EINVAL;
        if (len == 0 || from + len > PGDIR_SIZE ||
            from + len < from || to + len < to)
                return -EINVAL;

        flush = 0;
        down_read(&gmap->mm->mmap_sem);
        spin_lock(&gmap->mm->page_table_lock);
        for (off = 0; off < len; off += PMD_SIZE) {
                /* Walk the gmap address space page table */
                table = gmap->table + (((to + off) >> 53) & 0x7ff);
                if ((*table & _REGION_ENTRY_INV) &&
                    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
                        goto out_unmap;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                table = table + (((to + off) >> 42) & 0x7ff);
                if ((*table & _REGION_ENTRY_INV) &&
                    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
                        goto out_unmap;
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                table = table + (((to + off) >> 31) & 0x7ff);
                if ((*table & _REGION_ENTRY_INV) &&
                    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
                        goto out_unmap;
                table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
                table = table + (((to + off) >> 20) & 0x7ff);

                /* Store 'from' address in an invalid segment table entry. */
                flush |= gmap_unlink_segment(gmap, table);
                *table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | (from + off);
        }
        spin_unlock(&gmap->mm->page_table_lock);
        up_read(&gmap->mm->mmap_sem);
        if (flush)
                gmap_flush_tlb(gmap);
        return 0;

out_unmap:
        spin_unlock(&gmap->mm->page_table_lock);
        up_read(&gmap->mm->mmap_sem);
        gmap_unmap_segment(gmap, to, len);
        return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);
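
/*
 * Note that gmap_map_segment() does not copy or pin anything; it merely
 * records the parent ("from") address in otherwise invalid segment table
 * entries of the guest space.  The actual link to the parent page tables
 * is established lazily by gmap_fault().  'from', 'to' and 'len' must all
 * be aligned to PMD_SIZE (1 MB segments).  Illustrative call, with uaddr
 * being a made-up, segment aligned user address:
 *
 *      rc = gmap_map_segment(gmap, uaddr, 0x10000000UL, 16UL << 20);
 */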

static unsigned long *gmap_table_walk(unsigned long address, struct gmap *gmap)
{
        unsigned long *table;

        table = gmap->table + ((address >> 53) & 0x7ff);
        if (unlikely(*table & _REGION_ENTRY_INV))
                return ERR_PTR(-EFAULT);
        table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
        table = table + ((address >> 42) & 0x7ff);
        if (unlikely(*table & _REGION_ENTRY_INV))
                return ERR_PTR(-EFAULT);
        table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
        table = table + ((address >> 31) & 0x7ff);
        if (unlikely(*table & _REGION_ENTRY_INV))
                return ERR_PTR(-EFAULT);
        table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
        table = table + ((address >> 20) & 0x7ff);
        return table;
}
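
/*
 * The shift/mask pairs in gmap_table_walk() mirror the four level layout
 * of the guest address space set up by gmap_alloc(): bits 63-53 index the
 * region-first table, bits 52-42 the region-second table, bits 41-31 the
 * region-third table and bits 30-20 the segment table.  Each table has
 * 2048 entries, hence the 0x7ff mask, and each segment entry covers
 * PMD_SIZE (1 MB) of guest memory.
 */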

/**
 * __gmap_translate - translate a guest address to a user space address
 * @address: guest address
 * @gmap: pointer to guest mapping meta data structure
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
unsigned long __gmap_translate(unsigned long address, struct gmap *gmap)
{
        unsigned long *segment_ptr, vmaddr, segment;
        struct gmap_pgtable *mp;
        struct page *page;

        current->thread.gmap_addr = address;
        segment_ptr = gmap_table_walk(address, gmap);
        if (IS_ERR(segment_ptr))
                return PTR_ERR(segment_ptr);
        /* Convert the gmap address to an mm address. */
        segment = *segment_ptr;
        if (!(segment & _SEGMENT_ENTRY_INV)) {
                page = pfn_to_page(segment >> PAGE_SHIFT);
                mp = (struct gmap_pgtable *) page->index;
                return mp->vmaddr | (address & ~PMD_MASK);
        } else if (segment & _SEGMENT_ENTRY_RO) {
                vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
                return vmaddr | (address & ~PMD_MASK);
        }
        return -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);

/**
 * gmap_translate - translate a guest address to a user space address
 * @address: guest address
 * @gmap: pointer to guest mapping meta data structure
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 */
unsigned long gmap_translate(unsigned long address, struct gmap *gmap)
{
        unsigned long rc;

        down_read(&gmap->mm->mmap_sem);
        rc = __gmap_translate(address, gmap);
        up_read(&gmap->mm->mmap_sem);
        return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);

static int gmap_connect_pgtable(unsigned long address, unsigned long segment,
                                unsigned long *segment_ptr, struct gmap *gmap)
{
        unsigned long vmaddr;
        struct vm_area_struct *vma;
        struct gmap_pgtable *mp;
        struct gmap_rmap *rmap;
        struct mm_struct *mm;
        struct page *page;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;

        mm = gmap->mm;
        vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
        vma = find_vma(mm, vmaddr);
        if (!vma || vma->vm_start > vmaddr)
                return -EFAULT;
        /* Walk the parent mm page table */
        pgd = pgd_offset(mm, vmaddr);
        pud = pud_alloc(mm, pgd, vmaddr);
        if (!pud)
                return -ENOMEM;
        pmd = pmd_alloc(mm, pud, vmaddr);
        if (!pmd)
                return -ENOMEM;
        if (!pmd_present(*pmd) &&
            __pte_alloc(mm, vma, pmd, vmaddr))
                return -ENOMEM;
        /* pmd now points to a valid segment table entry. */
        rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
        if (!rmap)
                return -ENOMEM;
        /* Link gmap segment table entry location to page table. */
        page = pmd_page(*pmd);
        mp = (struct gmap_pgtable *) page->index;
        rmap->gmap = gmap;
        rmap->entry = segment_ptr;
        rmap->vmaddr = address & PMD_MASK;
        spin_lock(&mm->page_table_lock);
        if (*segment_ptr == segment) {
                list_add(&rmap->list, &mp->mapper);
                /* Set gmap segment table entry to page table. */
                *segment_ptr = pmd_val(*pmd) & PAGE_MASK;
                rmap = NULL;
        }
        spin_unlock(&mm->page_table_lock);
        kfree(rmap);
        return 0;
}

static void gmap_disconnect_pgtable(struct mm_struct *mm, unsigned long *table)
{
        struct gmap_rmap *rmap, *next;
        struct gmap_pgtable *mp;
        struct page *page;
        int flush;

        flush = 0;
        spin_lock(&mm->page_table_lock);
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        mp = (struct gmap_pgtable *) page->index;
        list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
                *rmap->entry =
                        _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
                list_del(&rmap->list);
                kfree(rmap);
                flush = 1;
        }
        spin_unlock(&mm->page_table_lock);
        if (flush)
                __tlb_flush_global();
}

/*
 * this function is assumed to be called with mmap_sem held
 */
unsigned long __gmap_fault(unsigned long address, struct gmap *gmap)
{
        unsigned long *segment_ptr, segment;
        struct gmap_pgtable *mp;
        struct page *page;
        int rc;

        current->thread.gmap_addr = address;
        segment_ptr = gmap_table_walk(address, gmap);
        if (IS_ERR(segment_ptr))
                return -EFAULT;
        /* Convert the gmap address to an mm address. */
        while (1) {
                segment = *segment_ptr;
                if (!(segment & _SEGMENT_ENTRY_INV)) {
                        /* Page table is present */
                        page = pfn_to_page(segment >> PAGE_SHIFT);
                        mp = (struct gmap_pgtable *) page->index;
                        return mp->vmaddr | (address & ~PMD_MASK);
                }
                if (!(segment & _SEGMENT_ENTRY_RO))
                        /* Nothing mapped in the gmap address space. */
                        break;
                rc = gmap_connect_pgtable(address, segment, segment_ptr, gmap);
                if (rc)
                        return rc;
        }
        return -EFAULT;
}

unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
{
        unsigned long rc;

        down_read(&gmap->mm->mmap_sem);
        rc = __gmap_fault(address, gmap);
        up_read(&gmap->mm->mmap_sem);

        return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);
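
/*
 * gmap_translate() is a pure lookup while gmap_fault() additionally
 * connects the parent page table to the guest segment entry when the
 * segment was prepared by gmap_map_segment() but not linked yet.  Both
 * return the parent (userspace) address for the guest address, or a
 * negative error code cast to unsigned long; callers are expected to
 * check the result with IS_ERR_VALUE() as gmap_ipte_notify() does below.
 */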

void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
{
        unsigned long *table, address, size;
        struct vm_area_struct *vma;
        struct gmap_pgtable *mp;
        struct page *page;

        down_read(&gmap->mm->mmap_sem);
        address = from;
        while (address < to) {
                /* Walk the gmap address space page table */
                table = gmap->table + ((address >> 53) & 0x7ff);
                if (unlikely(*table & _REGION_ENTRY_INV)) {
                        address = (address + PMD_SIZE) & PMD_MASK;
                        continue;
                }
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                table = table + ((address >> 42) & 0x7ff);
                if (unlikely(*table & _REGION_ENTRY_INV)) {
                        address = (address + PMD_SIZE) & PMD_MASK;
                        continue;
                }
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                table = table + ((address >> 31) & 0x7ff);
                if (unlikely(*table & _REGION_ENTRY_INV)) {
                        address = (address + PMD_SIZE) & PMD_MASK;
                        continue;
                }
                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
                table = table + ((address >> 20) & 0x7ff);
                if (unlikely(*table & _SEGMENT_ENTRY_INV)) {
                        address = (address + PMD_SIZE) & PMD_MASK;
                        continue;
                }
                page = pfn_to_page(*table >> PAGE_SHIFT);
                mp = (struct gmap_pgtable *) page->index;
                vma = find_vma(gmap->mm, mp->vmaddr);
                size = min(to - address, PMD_SIZE - (address & ~PMD_MASK));
                zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK),
                               size, NULL);
                address = (address + PMD_SIZE) & PMD_MASK;
        }
        up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);

static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);

/**
 * gmap_register_ipte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_ipte_notifier(struct gmap_notifier *nb)
{
        spin_lock(&gmap_notifier_lock);
        list_add(&nb->list, &gmap_notifier_list);
        spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);

/**
 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
{
        spin_lock(&gmap_notifier_lock);
        list_del_init(&nb->list);
        spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);

/**
 * gmap_ipte_notify - mark a range of ptes for invalidation notification
 * @gmap: pointer to guest mapping meta data structure
 * @start: virtual address in the guest address space
 * @len: size of area
 *
 * Returns 0 if for each page in the given range a gmap mapping exists and
 * the invalidation notification could be set. If the gmap mapping is missing
 * for one or more pages -EFAULT is returned. If no memory could be allocated
 * -ENOMEM is returned. This function establishes missing page table entries.
 */
int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len)
{
        unsigned long addr;
        spinlock_t *ptl;
        pte_t *ptep, entry;
        pgste_t pgste;
        int rc = 0;

        if ((start & ~PAGE_MASK) || (len & ~PAGE_MASK))
                return -EINVAL;
        down_read(&gmap->mm->mmap_sem);
        while (len) {
                /* Convert gmap address and connect the page tables */
                addr = __gmap_fault(start, gmap);
                if (IS_ERR_VALUE(addr)) {
                        rc = addr;
                        break;
                }
                /* Get the page mapped */
                if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
                        rc = -EFAULT;
                        break;
                }
                /* Walk the process page table, lock and get pte pointer */
                ptep = get_locked_pte(gmap->mm, addr, &ptl);
                if (unlikely(!ptep))
                        continue;
                /* Set notification bit in the pgste of the pte */
                entry = *ptep;
                if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_RO)) == 0) {
                        pgste = pgste_get_lock(ptep);
                        pgste_val(pgste) |= PGSTE_IN_BIT;
                        pgste_set_unlock(ptep, pgste);
                        start += PAGE_SIZE;
                        len -= PAGE_SIZE;
                }
                spin_unlock(ptl);
        }
        up_read(&gmap->mm->mmap_sem);
        return rc;
}
EXPORT_SYMBOL_GPL(gmap_ipte_notify);
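
/*
 * A sketch of an invalidation notification consumer (illustrative only,
 * my_ipte_notifier and my_nb are made-up names):
 *
 *      static void my_ipte_notifier(struct gmap *gmap, unsigned long gaddr)
 *      {
 *              // react to the invalidation of the page at guest
 *              // address gaddr, e.g. kick the vcpu out of SIE
 *      }
 *
 *      static struct gmap_notifier my_nb = {
 *              .notifier_call  = my_ipte_notifier,
 *      };
 *
 *      gmap_register_ipte_notifier(&my_nb);
 *      rc = gmap_ipte_notify(gmap, guest_addr, PAGE_SIZE);
 *
 * gmap_do_ipte_notify() below is the counterpart that invokes the
 * registered callbacks once a marked pte gets invalidated.
 */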

/**
 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @addr: virtual address in the process address space
 * @pte: pointer to the page table entry
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
 */
void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long addr, pte_t *pte)
{
        unsigned long segment_offset;
        struct gmap_notifier *nb;
        struct gmap_pgtable *mp;
        struct gmap_rmap *rmap;
        struct page *page;

        segment_offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
        segment_offset = segment_offset * (4096 / sizeof(pte_t));
        page = pfn_to_page(__pa(pte) >> PAGE_SHIFT);
        mp = (struct gmap_pgtable *) page->index;
        spin_lock(&gmap_notifier_lock);
        list_for_each_entry(rmap, &mp->mapper, list) {
                list_for_each_entry(nb, &gmap_notifier_list, list)
                        nb->notifier_call(rmap->gmap,
                                          rmap->vmaddr + segment_offset);
        }
        spin_unlock(&gmap_notifier_lock);
}

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
                                                    unsigned long vmaddr)
{
        struct page *page;
        unsigned long *table;
        struct gmap_pgtable *mp;

        page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
        if (!page)
                return NULL;
        mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
        if (!mp) {
                __free_page(page);
                return NULL;
        }
        pgtable_page_ctor(page);
        mp->vmaddr = vmaddr & PMD_MASK;
        INIT_LIST_HEAD(&mp->mapper);
        page->index = (unsigned long) mp;
        atomic_set(&page->_mapcount, 3);
        table = (unsigned long *) page_to_phys(page);
        clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
        clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
        return table;
}

static inline void page_table_free_pgste(unsigned long *table)
{
        struct page *page;
        struct gmap_pgtable *mp;

        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        mp = (struct gmap_pgtable *) page->index;
        BUG_ON(!list_empty(&mp->mapper));
        pgtable_page_dtor(page);
        atomic_set(&page->_mapcount, -1);
        kfree(mp);
        __free_page(page);
}

int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
                          unsigned long key, bool nq)
{
        spinlock_t *ptl;
        pgste_t old, new;
        pte_t *ptep;

        down_read(&mm->mmap_sem);
        ptep = get_locked_pte(mm, addr, &ptl);
        if (unlikely(!ptep)) {
                up_read(&mm->mmap_sem);
                return -EFAULT;
        }

        new = old = pgste_get_lock(ptep);
        pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
                            PGSTE_ACC_BITS | PGSTE_FP_BIT);
        pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
        pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
        if (!(pte_val(*ptep) & _PAGE_INVALID)) {
                unsigned long address, bits;
                unsigned char skey;

                address = pte_val(*ptep) & PAGE_MASK;
                skey = page_get_storage_key(address);
                bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
                /* Set storage key ACC and FP */
                page_set_storage_key(address,
                                (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)),
                                !nq);

                /* Merge host changed & referenced into pgste */
                pgste_val(new) |= bits << 52;
                /* Transfer skey changed & referenced bit to kvm user bits */
                pgste_val(new) |= bits << 45;   /* PGSTE_UR_BIT & PGSTE_UC_BIT */
        }
        /* changing the guest storage key is considered a change of the page */
        if ((pgste_val(new) ^ pgste_val(old)) &
            (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
                pgste_val(new) |= PGSTE_UC_BIT;

        pgste_set_unlock(ptep, new);
        pte_unmap_unlock(*ptep, ptl);
        up_read(&mm->mmap_sem);
        return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);
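
/*
 * While a page is not mapped its guest storage key (ACC, FP, R, C) is
 * kept in the pgste; when the page is mapped the real storage key is
 * updated here as well.  An illustrative call from a key-setting path
 * (vmaddr and key are placeholders):
 *
 *      rc = set_guest_storage_key(current->mm, vmaddr, key, false);
 *
 * where 'key' carries the access-control and fetch-protection bits and a
 * true 'nq' requests the non-quiescing form of the key update.
 */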

#else /* CONFIG_PGSTE */

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
                                                    unsigned long vmaddr)
{
        return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_disconnect_pgtable(struct mm_struct *mm,
                                           unsigned long *table)
{
}

#endif /* CONFIG_PGSTE */

static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
        unsigned int old, new;

        do {
                old = atomic_read(v);
                new = old ^ bits;
        } while (atomic_cmpxchg(v, old, new) != old);
        return new;
}

/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
        unsigned long *uninitialized_var(table);
        struct page *uninitialized_var(page);
        unsigned int mask, bit;

        if (mm_has_pgste(mm))
                return page_table_alloc_pgste(mm, vmaddr);
        /* Allocate fragments of a 4K page as 1K/2K page table */
        spin_lock_bh(&mm->context.list_lock);
        mask = FRAG_MASK;
        if (!list_empty(&mm->context.pgtable_list)) {
                page = list_first_entry(&mm->context.pgtable_list,
                                        struct page, lru);
                table = (unsigned long *) page_to_phys(page);
                mask = atomic_read(&page->_mapcount);
                mask = mask | (mask >> 4);
        }
        if ((mask & FRAG_MASK) == FRAG_MASK) {
                spin_unlock_bh(&mm->context.list_lock);
                page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
                if (!page)
                        return NULL;
                pgtable_page_ctor(page);
                atomic_set(&page->_mapcount, 1);
                table = (unsigned long *) page_to_phys(page);
                clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
                spin_lock_bh(&mm->context.list_lock);
                list_add(&page->lru, &mm->context.pgtable_list);
        } else {
                for (bit = 1; mask & bit; bit <<= 1)
                        table += PTRS_PER_PTE;
                mask = atomic_xor_bits(&page->_mapcount, bit);
                if ((mask & FRAG_MASK) == FRAG_MASK)
                        list_del(&page->lru);
        }
        spin_unlock_bh(&mm->context.list_lock);
        return table;
}
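
/*
 * On 64 bit a page table (256 ptes of 8 bytes) occupies 2K, so two of
 * them share one 4K page and FRAG_MASK is 0x03; on 31 bit a page table
 * is 1K and four fit into a page, hence FRAG_MASK 0x0f.  The lower
 * nibble of page->_mapcount tracks which fragments are handed out, the
 * upper nibble marks fragments whose free is still pending in RCU (see
 * page_table_free_rcu() below).  Pages with at least one free fragment
 * stay on mm->context.pgtable_list.
 */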

void page_table_free(struct mm_struct *mm, unsigned long *table)
{
        struct page *page;
        unsigned int bit, mask;

        if (mm_has_pgste(mm)) {
                gmap_disconnect_pgtable(mm, table);
                return page_table_free_pgste(table);
        }
        /* Free 1K/2K page table fragment of a 4K page */
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
        spin_lock_bh(&mm->context.list_lock);
        if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
                list_del(&page->lru);
        mask = atomic_xor_bits(&page->_mapcount, bit);
        if (mask & FRAG_MASK)
                list_add(&page->lru, &mm->context.pgtable_list);
        spin_unlock_bh(&mm->context.list_lock);
        if (mask == 0) {
                pgtable_page_dtor(page);
                atomic_set(&page->_mapcount, -1);
                __free_page(page);
        }
}

static void __page_table_free_rcu(void *table, unsigned bit)
{
        struct page *page;

        if (bit == FRAG_MASK)
                return page_table_free_pgste(table);
        /* Free 1K/2K page table fragment of a 4K page */
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
                pgtable_page_dtor(page);
                atomic_set(&page->_mapcount, -1);
                __free_page(page);
        }
}

void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
        struct mm_struct *mm;
        struct page *page;
        unsigned int bit, mask;

        mm = tlb->mm;
        if (mm_has_pgste(mm)) {
                gmap_disconnect_pgtable(mm, table);
                table = (unsigned long *) (__pa(table) | FRAG_MASK);
                tlb_remove_table(tlb, table);
                return;
        }
        bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        spin_lock_bh(&mm->context.list_lock);
        if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
                list_del(&page->lru);
        mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
        if (mask & FRAG_MASK)
                list_add_tail(&page->lru, &mm->context.pgtable_list);
        spin_unlock_bh(&mm->context.list_lock);
        table = (unsigned long *) (__pa(table) | (bit << 4));
        tlb_remove_table(tlb, table);
}

void __tlb_remove_table(void *_table)
{
        const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
        void *table = (void *)((unsigned long) _table & ~mask);
        unsigned type = (unsigned long) _table & mask;

        if (type)
                __page_table_free_rcu(table, type);
        else
                free_pages((unsigned long) table, ALLOC_ORDER);
}
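
/*
 * Callers of tlb_remove_table() encode a type into the low bits of the
 * table pointer: zero stands for a full CRST table (freed with
 * free_pages), FRAG_MASK for a pgste page table, and a fragment bit
 * shifted left by four for a 1K/2K fragment whose pending-free bit still
 * needs to be cleared.  __tlb_remove_table() above decodes this once the
 * table may actually be released.
 */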

static void tlb_remove_table_smp_sync(void *arg)
{
        /* Simply deliver the interrupt */
}

static void tlb_remove_table_one(void *table)
{
        /*
         * This isn't an RCU grace period and hence the page-tables cannot be
         * assumed to be actually RCU-freed.
         *
         * It is however sufficient for software page-table walkers that rely
         * on IRQ disabling. See the comment near struct mmu_table_batch.
         */
        smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
        __tlb_remove_table(table);
}

static void tlb_remove_table_rcu(struct rcu_head *head)
{
        struct mmu_table_batch *batch;
        int i;

        batch = container_of(head, struct mmu_table_batch, rcu);

        for (i = 0; i < batch->nr; i++)
                __tlb_remove_table(batch->tables[i]);

        free_page((unsigned long)batch);
}

void tlb_table_flush(struct mmu_gather *tlb)
{
        struct mmu_table_batch **batch = &tlb->batch;

        if (*batch) {
                __tlb_flush_mm(tlb->mm);
                call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
                *batch = NULL;
        }
}

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
        struct mmu_table_batch **batch = &tlb->batch;

        if (*batch == NULL) {
                *batch = (struct mmu_table_batch *)
                        __get_free_page(GFP_NOWAIT | __GFP_NOWARN);
                if (*batch == NULL) {
                        __tlb_flush_mm(tlb->mm);
                        tlb_remove_table_one(table);
                        return;
                }
                (*batch)->nr = 0;
        }
        (*batch)->tables[(*batch)->nr++] = table;
        if ((*batch)->nr == MAX_TABLE_BATCH)
                tlb_table_flush(tlb);
}
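
/*
 * Freed page tables are collected in a per-gather mmu_table_batch and
 * handed to call_rcu_sched() by tlb_table_flush(), so that lockless page
 * table walkers running with interrupts disabled (gup-fast) never see a
 * table being reused underneath them.  If no batch page is available,
 * tlb_remove_table_one() falls back to an IPI broadcast to get the same
 * guarantee synchronously.
 */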

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void thp_split_vma(struct vm_area_struct *vma)
{
        unsigned long addr;
        struct page *page;

        for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
                page = follow_page(vma, addr, FOLL_SPLIT);
        }
}

void thp_split_mm(struct mm_struct *mm)
{
        struct vm_area_struct *vma = mm->mmap;

        while (vma != NULL) {
                thp_split_vma(vma);
                vma->vm_flags &= ~VM_HUGEPAGE;
                vma->vm_flags |= VM_NOHUGEPAGE;
                vma = vma->vm_next;
        }
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

/*
 * Switch on pgstes for the current userspace process (used by KVM).
 */
int s390_enable_sie(void)
{
        struct task_struct *tsk = current;
        struct mm_struct *mm, *old_mm;

        /* Has the address space mode been switched? If not, we cannot do SIE */
        if (s390_user_mode == HOME_SPACE_MODE)
                return -EINVAL;

        /* Do we have pgstes? If yes, we are done */
        if (mm_has_pgste(tsk->mm))
                return 0;

        /* Let's check if we are allowed to replace the mm */
        task_lock(tsk);
        if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
            !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
            tsk->mm != tsk->active_mm) {
                task_unlock(tsk);
                return -EINVAL;
        }
        task_unlock(tsk);

        /* We copy the mm and let dup_mm create the page tables with pgstes */
        tsk->mm->context.alloc_pgste = 1;
        /* make sure that both mms have a correct rss state */
        sync_mm_rss(tsk->mm);
        mm = dup_mm(tsk);
        tsk->mm->context.alloc_pgste = 0;
        if (!mm)
                return -ENOMEM;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        /* split thp mappings and disable thp for future mappings */
        thp_split_mm(mm);
        mm->def_flags |= VM_NOHUGEPAGE;
#endif

        /* Now let's check again if something happened */
        task_lock(tsk);
        if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
            !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
            tsk->mm != tsk->active_mm) {
                mmput(mm);
                task_unlock(tsk);
                return -EINVAL;
        }

        /* ok, we are alone. No ptrace, no threads, etc. */
        old_mm = tsk->mm;
        tsk->mm = tsk->active_mm = mm;
        preempt_disable();
        update_mm(mm, tsk);
        atomic_inc(&mm->context.attach_count);
        atomic_dec(&old_mm->context.attach_count);
        cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
        preempt_enable();
        task_unlock(tsk);
        mmput(old_mm);
        return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
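
/*
 * s390_enable_sie() is meant to be called by the KVM module before the
 * guest's first page tables are set up; it swaps the caller's mm for a
 * pgste-enabled copy, which is why it bails out as soon as the mm has
 * other users (additional threads, outstanding AIO contexts, or a
 * borrowed active_mm).
 */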

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
                           pmd_t *pmdp)
{
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        /*
         * No need to flush the TLB: on s390 the reference bits are kept
         * in the storage key, never in the TLB.
         */
        return pmdp_test_and_clear_young(vma, address, pmdp);
}

int pmdp_set_access_flags(struct vm_area_struct *vma,
                          unsigned long address, pmd_t *pmdp,
                          pmd_t entry, int dirty)
{
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);

        if (pmd_same(*pmdp, entry))
                return 0;
        pmdp_invalidate(vma, address, pmdp);
        set_pmd_at(vma->vm_mm, address, pmdp, entry);
        return 1;
}

static void pmdp_splitting_flush_sync(void *arg)
{
        /* Simply deliver the interrupt */
}

void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
                          pmd_t *pmdp)
{
        VM_BUG_ON(address & ~HPAGE_PMD_MASK);
        if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
                              (unsigned long *) pmdp)) {
                /* need to serialize against gup-fast (IRQ disabled) */
                smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
        }
}

void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
                                pgtable_t pgtable)
{
        struct list_head *lh = (struct list_head *) pgtable;

        assert_spin_locked(&mm->page_table_lock);

        /* FIFO */
        if (!mm->pmd_huge_pte)
                INIT_LIST_HEAD(lh);
        else
                list_add(lh, (struct list_head *) mm->pmd_huge_pte);
        mm->pmd_huge_pte = pgtable;
}

pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
        struct list_head *lh;
        pgtable_t pgtable;
        pte_t *ptep;

        assert_spin_locked(&mm->page_table_lock);

        /* FIFO */
        pgtable = mm->pmd_huge_pte;
        lh = (struct list_head *) pgtable;
        if (list_empty(lh))
                mm->pmd_huge_pte = NULL;
        else {
                mm->pmd_huge_pte = (pgtable_t) lh->next;
                list_del(lh);
        }
        ptep = (pte_t *) pgtable;
        pte_val(*ptep) = _PAGE_TYPE_EMPTY;
        ptep++;
        pte_val(*ptep) = _PAGE_TYPE_EMPTY;
        return pgtable;
}
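
/*
 * The deposit/withdraw pair parks the preallocated pte page table for a
 * transparent huge pmd on a list headed at mm->pmd_huge_pte; withdraw
 * also resets the two words that were used as the list_head back to
 * empty pte values before the table is handed out again.
 */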
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */