linux/arch/s390/mm/gmap.c
   1/*
   2 *  KVM guest address space mapping code
   3 *
   4 *    Copyright IBM Corp. 2007, 2016
   5 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
   6 */
   7
   8#include <linux/kernel.h>
   9#include <linux/mm.h>
  10#include <linux/swap.h>
  11#include <linux/smp.h>
  12#include <linux/spinlock.h>
  13#include <linux/slab.h>
  14#include <linux/swapops.h>
  15#include <linux/ksm.h>
  16#include <linux/mman.h>
  17
  18#include <asm/pgtable.h>
  19#include <asm/pgalloc.h>
  20#include <asm/gmap.h>
  21#include <asm/tlb.h>
  22
  23#define GMAP_SHADOW_FAKE_TABLE 1ULL
  24
  25/**
  26 * gmap_alloc - allocate and initialize a guest address space
  28 * @limit: maximum address of the gmap address space
  29 *
  30 * Returns a guest address space structure.
  31 */
  32static struct gmap *gmap_alloc(unsigned long limit)
  33{
  34        struct gmap *gmap;
  35        struct page *page;
  36        unsigned long *table;
  37        unsigned long etype, atype;
  38
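             /*
              * The requested limit determines the type of the topmost
              * translation table: a segment table covers up to 2 GB, a
              * region-3 table up to 4 TB, a region-2 table up to 8 PB and
              * a region-1 table the full 64-bit space. Each table has
              * 2048 eight-byte entries and thus spans four pages, hence
              * the order-2 allocations below.
              */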
  39        if (limit < (1UL << 31)) {
  40                limit = (1UL << 31) - 1;
  41                atype = _ASCE_TYPE_SEGMENT;
  42                etype = _SEGMENT_ENTRY_EMPTY;
  43        } else if (limit < (1UL << 42)) {
  44                limit = (1UL << 42) - 1;
  45                atype = _ASCE_TYPE_REGION3;
  46                etype = _REGION3_ENTRY_EMPTY;
  47        } else if (limit < (1UL << 53)) {
  48                limit = (1UL << 53) - 1;
  49                atype = _ASCE_TYPE_REGION2;
  50                etype = _REGION2_ENTRY_EMPTY;
  51        } else {
  52                limit = -1UL;
  53                atype = _ASCE_TYPE_REGION1;
  54                etype = _REGION1_ENTRY_EMPTY;
  55        }
  56        gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
  57        if (!gmap)
  58                goto out;
  59        INIT_LIST_HEAD(&gmap->crst_list);
  60        INIT_LIST_HEAD(&gmap->children);
  61        INIT_LIST_HEAD(&gmap->pt_list);
  62        INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
  63        INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
  64        INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
  65        spin_lock_init(&gmap->guest_table_lock);
  66        spin_lock_init(&gmap->shadow_lock);
  67        atomic_set(&gmap->ref_count, 1);
  68        page = alloc_pages(GFP_KERNEL, 2);
  69        if (!page)
  70                goto out_free;
  71        page->index = 0;
  72        list_add(&page->lru, &gmap->crst_list);
  73        table = (unsigned long *) page_to_phys(page);
  74        crst_table_init(table, etype);
  75        gmap->table = table;
  76        gmap->asce = atype | _ASCE_TABLE_LENGTH |
  77                _ASCE_USER_BITS | __pa(table);
  78        gmap->asce_end = limit;
  79        return gmap;
  80
  81out_free:
  82        kfree(gmap);
  83out:
  84        return NULL;
  85}
  86
  87/**
  88 * gmap_create - create a guest address space
  89 * @mm: pointer to the parent mm_struct
   90 * @limit: maximum address of the gmap address space
  91 *
  92 * Returns a guest address space structure.
  93 */
  94struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
  95{
  96        struct gmap *gmap;
  97        unsigned long gmap_asce;
  98
  99        gmap = gmap_alloc(limit);
 100        if (!gmap)
 101                return NULL;
 102        gmap->mm = mm;
 103        spin_lock(&mm->context.gmap_lock);
 104        list_add_rcu(&gmap->list, &mm->context.gmap_list);
 105        if (list_is_singular(&mm->context.gmap_list))
 106                gmap_asce = gmap->asce;
 107        else
 108                gmap_asce = -1UL;
 109        WRITE_ONCE(mm->context.gmap_asce, gmap_asce);
 110        spin_unlock(&mm->context.gmap_lock);
 111        return gmap;
 112}
 113EXPORT_SYMBOL_GPL(gmap_create);
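
     /*
      * Illustrative life cycle of a gmap (a sketch, not taken from any
      * specific caller): a hypervisor creates one gmap per guest, enables
      * it while the guest runs and removes it on teardown:
      *
      *         gmap = gmap_create(current->mm, (1UL << 44) - 1);
      *         if (!gmap)
      *                 return -ENOMEM;
      *         gmap_enable(gmap);
      *         ...
      *         gmap_disable(gmap);
      *         gmap_remove(gmap);
      */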
 114
 115static void gmap_flush_tlb(struct gmap *gmap)
 116{
 117        if (MACHINE_HAS_IDTE)
 118                __tlb_flush_idte(gmap->asce);
 119        else
 120                __tlb_flush_global();
 121}
 122
 123static void gmap_radix_tree_free(struct radix_tree_root *root)
 124{
 125        struct radix_tree_iter iter;
 126        unsigned long indices[16];
 127        unsigned long index;
 128        void **slot;
 129        int i, nr;
 130
 131        /* A radix tree is freed by deleting all of its entries */
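             /*
              * Collect up to 16 indices per pass and delete them outside
              * of the iteration so the tree is not modified while it is
              * being walked.
              */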
 132        index = 0;
 133        do {
 134                nr = 0;
 135                radix_tree_for_each_slot(slot, root, &iter, index) {
 136                        indices[nr] = iter.index;
 137                        if (++nr == 16)
 138                                break;
 139                }
 140                for (i = 0; i < nr; i++) {
 141                        index = indices[i];
 142                        radix_tree_delete(root, index);
 143                }
 144        } while (nr > 0);
 145}
 146
 147static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
 148{
 149        struct gmap_rmap *rmap, *rnext, *head;
 150        struct radix_tree_iter iter;
 151        unsigned long indices[16];
 152        unsigned long index;
 153        void **slot;
 154        int i, nr;
 155
 156        /* A radix tree is freed by deleting all of its entries */
 157        index = 0;
 158        do {
 159                nr = 0;
 160                radix_tree_for_each_slot(slot, root, &iter, index) {
 161                        indices[nr] = iter.index;
 162                        if (++nr == 16)
 163                                break;
 164                }
 165                for (i = 0; i < nr; i++) {
 166                        index = indices[i];
 167                        head = radix_tree_delete(root, index);
 168                        gmap_for_each_rmap_safe(rmap, rnext, head)
 169                                kfree(rmap);
 170                }
 171        } while (nr > 0);
 172}
 173
 174/**
 175 * gmap_free - free a guest address space
 176 * @gmap: pointer to the guest address space structure
 177 *
 178 * No locks required. There are no references to this gmap anymore.
 179 */
 180static void gmap_free(struct gmap *gmap)
 181{
 182        struct page *page, *next;
 183
 184        /* Flush tlb of all gmaps (if not already done for shadows) */
 185        if (!(gmap_is_shadow(gmap) && gmap->removed))
 186                gmap_flush_tlb(gmap);
 187        /* Free all segment & region tables. */
 188        list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
 189                __free_pages(page, 2);
 190        gmap_radix_tree_free(&gmap->guest_to_host);
 191        gmap_radix_tree_free(&gmap->host_to_guest);
 192
 193        /* Free additional data for a shadow gmap */
 194        if (gmap_is_shadow(gmap)) {
 195                /* Free all page tables. */
 196                list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
 197                        page_table_free_pgste(page);
 198                gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
 199                /* Release reference to the parent */
 200                gmap_put(gmap->parent);
 201        }
 202
 203        kfree(gmap);
 204}
 205
 206/**
 207 * gmap_get - increase reference counter for guest address space
 208 * @gmap: pointer to the guest address space structure
 209 *
 210 * Returns the gmap pointer
 211 */
 212struct gmap *gmap_get(struct gmap *gmap)
 213{
 214        atomic_inc(&gmap->ref_count);
 215        return gmap;
 216}
 217EXPORT_SYMBOL_GPL(gmap_get);
 218
 219/**
 220 * gmap_put - decrease reference counter for guest address space
 221 * @gmap: pointer to the guest address space structure
 222 *
 223 * If the reference counter reaches zero the guest address space is freed.
 224 */
 225void gmap_put(struct gmap *gmap)
 226{
 227        if (atomic_dec_return(&gmap->ref_count) == 0)
 228                gmap_free(gmap);
 229}
 230EXPORT_SYMBOL_GPL(gmap_put);
 231
 232/**
 233 * gmap_remove - remove a guest address space but do not free it yet
 234 * @gmap: pointer to the guest address space structure
 235 */
 236void gmap_remove(struct gmap *gmap)
 237{
 238        struct gmap *sg, *next;
 239        unsigned long gmap_asce;
 240
 241        /* Remove all shadow gmaps linked to this gmap */
 242        if (!list_empty(&gmap->children)) {
 243                spin_lock(&gmap->shadow_lock);
 244                list_for_each_entry_safe(sg, next, &gmap->children, list) {
 245                        list_del(&sg->list);
 246                        gmap_put(sg);
 247                }
 248                spin_unlock(&gmap->shadow_lock);
 249        }
  250        /* Remove gmap from the per-mm list */
 251        spin_lock(&gmap->mm->context.gmap_lock);
 252        list_del_rcu(&gmap->list);
 253        if (list_empty(&gmap->mm->context.gmap_list))
 254                gmap_asce = 0;
 255        else if (list_is_singular(&gmap->mm->context.gmap_list))
 256                gmap_asce = list_first_entry(&gmap->mm->context.gmap_list,
 257                                             struct gmap, list)->asce;
 258        else
 259                gmap_asce = -1UL;
 260        WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce);
 261        spin_unlock(&gmap->mm->context.gmap_lock);
 262        synchronize_rcu();
 263        /* Put reference */
 264        gmap_put(gmap);
 265}
 266EXPORT_SYMBOL_GPL(gmap_remove);
 267
 268/**
 269 * gmap_enable - switch primary space to the guest address space
 270 * @gmap: pointer to the guest address space structure
 271 */
 272void gmap_enable(struct gmap *gmap)
 273{
 274        S390_lowcore.gmap = (unsigned long) gmap;
 275}
 276EXPORT_SYMBOL_GPL(gmap_enable);
 277
 278/**
 279 * gmap_disable - switch back to the standard primary address space
 280 * @gmap: pointer to the guest address space structure
 281 */
 282void gmap_disable(struct gmap *gmap)
 283{
 284        S390_lowcore.gmap = 0UL;
 285}
 286EXPORT_SYMBOL_GPL(gmap_disable);
 287
 288/**
 289 * gmap_get_enabled - get a pointer to the currently enabled gmap
 290 *
  291 * Returns a pointer to the currently enabled gmap, or NULL if none is enabled.
 292 */
 293struct gmap *gmap_get_enabled(void)
 294{
 295        return (struct gmap *) S390_lowcore.gmap;
 296}
 297EXPORT_SYMBOL_GPL(gmap_get_enabled);
 298
 299/*
 300 * gmap_alloc_table is assumed to be called with mmap_sem held
 301 */
 302static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
 303                            unsigned long init, unsigned long gaddr)
 304{
 305        struct page *page;
 306        unsigned long *new;
 307
  308        /* since we don't free the gmap table until gmap_free we can allocate outside the lock */
 309        page = alloc_pages(GFP_KERNEL, 2);
 310        if (!page)
 311                return -ENOMEM;
 312        new = (unsigned long *) page_to_phys(page);
 313        crst_table_init(new, init);
 314        spin_lock(&gmap->guest_table_lock);
 315        if (*table & _REGION_ENTRY_INVALID) {
 316                list_add(&page->lru, &gmap->crst_list);
 317                *table = (unsigned long) new | _REGION_ENTRY_LENGTH |
 318                        (*table & _REGION_ENTRY_TYPE_MASK);
 319                page->index = gaddr;
 320                page = NULL;
 321        }
 322        spin_unlock(&gmap->guest_table_lock);
 323        if (page)
 324                __free_pages(page, 2);
 325        return 0;
 326}
 327
 328/**
 329 * __gmap_segment_gaddr - find virtual address from segment pointer
 330 * @entry: pointer to a segment table entry in the guest address space
 331 *
 332 * Returns the virtual address in the guest address space for the segment
 333 */
 334static unsigned long __gmap_segment_gaddr(unsigned long *entry)
 335{
 336        struct page *page;
 337        unsigned long offset, mask;
 338
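             /*
              * page->index of a segment-table page holds the guest address
              * of the table's first entry (set in gmap_alloc_table); the
              * entry's offset within the 2048-entry table supplies the
              * remaining bits.
              */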
 339        offset = (unsigned long) entry / sizeof(unsigned long);
 340        offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
 341        mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
 342        page = virt_to_page((void *)((unsigned long) entry & mask));
 343        return page->index + offset;
 344}
 345
 346/**
 347 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
 348 * @gmap: pointer to the guest address space structure
 349 * @vmaddr: address in the host process address space
 350 *
 351 * Returns 1 if a TLB flush is required
 352 */
 353static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
 354{
 355        unsigned long *entry;
 356        int flush = 0;
 357
 358        BUG_ON(gmap_is_shadow(gmap));
 359        spin_lock(&gmap->guest_table_lock);
 360        entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
 361        if (entry) {
 362                flush = (*entry != _SEGMENT_ENTRY_INVALID);
 363                *entry = _SEGMENT_ENTRY_INVALID;
 364        }
 365        spin_unlock(&gmap->guest_table_lock);
 366        return flush;
 367}
 368
 369/**
 370 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
 371 * @gmap: pointer to the guest address space structure
 372 * @gaddr: address in the guest address space
 373 *
 374 * Returns 1 if a TLB flush is required
 375 */
 376static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
 377{
 378        unsigned long vmaddr;
 379
 380        vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
 381                                                   gaddr >> PMD_SHIFT);
 382        return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
 383}
 384
 385/**
 386 * gmap_unmap_segment - unmap segment from the guest address space
 387 * @gmap: pointer to the guest address space structure
 388 * @to: address in the guest address space
 389 * @len: length of the memory area to unmap
 390 *
 391 * Returns 0 if the unmap succeeded, -EINVAL if not.
 392 */
 393int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
 394{
 395        unsigned long off;
 396        int flush;
 397
 398        BUG_ON(gmap_is_shadow(gmap));
 399        if ((to | len) & (PMD_SIZE - 1))
 400                return -EINVAL;
 401        if (len == 0 || to + len < to)
 402                return -EINVAL;
 403
 404        flush = 0;
 405        down_write(&gmap->mm->mmap_sem);
 406        for (off = 0; off < len; off += PMD_SIZE)
 407                flush |= __gmap_unmap_by_gaddr(gmap, to + off);
 408        up_write(&gmap->mm->mmap_sem);
 409        if (flush)
 410                gmap_flush_tlb(gmap);
 411        return 0;
 412}
 413EXPORT_SYMBOL_GPL(gmap_unmap_segment);
 414
 415/**
 416 * gmap_map_segment - map a segment to the guest address space
 417 * @gmap: pointer to the guest address space structure
 418 * @from: source address in the parent address space
 419 * @to: target address in the guest address space
 420 * @len: length of the memory area to map
 421 *
 422 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 423 */
 424int gmap_map_segment(struct gmap *gmap, unsigned long from,
 425                     unsigned long to, unsigned long len)
 426{
 427        unsigned long off;
 428        int flush;
 429
 430        BUG_ON(gmap_is_shadow(gmap));
 431        if ((from | to | len) & (PMD_SIZE - 1))
 432                return -EINVAL;
 433        if (len == 0 || from + len < from || to + len < to ||
 434            from + len - 1 > TASK_MAX_SIZE || to + len - 1 > gmap->asce_end)
 435                return -EINVAL;
 436
 437        flush = 0;
 438        down_write(&gmap->mm->mmap_sem);
 439        for (off = 0; off < len; off += PMD_SIZE) {
 440                /* Remove old translation */
 441                flush |= __gmap_unmap_by_gaddr(gmap, to + off);
 442                /* Store new translation */
 443                if (radix_tree_insert(&gmap->guest_to_host,
 444                                      (to + off) >> PMD_SHIFT,
 445                                      (void *) from + off))
 446                        break;
 447        }
 448        up_write(&gmap->mm->mmap_sem);
 449        if (flush)
 450                gmap_flush_tlb(gmap);
 451        if (off >= len)
 452                return 0;
 453        gmap_unmap_segment(gmap, to, len);
 454        return -ENOMEM;
 455}
 456EXPORT_SYMBOL_GPL(gmap_map_segment);
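
     /*
      * Illustrative example (a sketch): backing a guest memory slot that
      * starts at guest address 0 with a range of the host address space,
      * where both userspace_addr and slot_size are 1 MB aligned:
      *
      *         rc = gmap_map_segment(gmap, userspace_addr, 0, slot_size);
      */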
 457
 458/**
 459 * __gmap_translate - translate a guest address to a user space address
 460 * @gmap: pointer to guest mapping meta data structure
 461 * @gaddr: guest address
 462 *
 463 * Returns user space address which corresponds to the guest address or
 464 * -EFAULT if no such mapping exists.
 465 * This function does not establish potentially missing page table entries.
 466 * The mmap_sem of the mm that belongs to the address space must be held
 467 * when this function gets called.
 468 *
 469 * Note: Can also be called for shadow gmaps.
 470 */
 471unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
 472{
 473        unsigned long vmaddr;
 474
 475        vmaddr = (unsigned long)
 476                radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
 477        /* Note: guest_to_host is empty for a shadow gmap */
 478        return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
 479}
 480EXPORT_SYMBOL_GPL(__gmap_translate);
 481
 482/**
 483 * gmap_translate - translate a guest address to a user space address
 484 * @gmap: pointer to guest mapping meta data structure
 485 * @gaddr: guest address
 486 *
 487 * Returns user space address which corresponds to the guest address or
 488 * -EFAULT if no such mapping exists.
 489 * This function does not establish potentially missing page table entries.
 490 */
 491unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
 492{
 493        unsigned long rc;
 494
 495        down_read(&gmap->mm->mmap_sem);
 496        rc = __gmap_translate(gmap, gaddr);
 497        up_read(&gmap->mm->mmap_sem);
 498        return rc;
 499}
 500EXPORT_SYMBOL_GPL(gmap_translate);
 501
 502/**
 503 * gmap_unlink - disconnect a page table from the gmap shadow tables
  504 * @mm: pointer to the parent mm_struct
 505 * @table: pointer to the host page table
 506 * @vmaddr: vm address associated with the host page table
 507 */
 508void gmap_unlink(struct mm_struct *mm, unsigned long *table,
 509                 unsigned long vmaddr)
 510{
 511        struct gmap *gmap;
 512        int flush;
 513
 514        rcu_read_lock();
 515        list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
 516                flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
 517                if (flush)
 518                        gmap_flush_tlb(gmap);
 519        }
 520        rcu_read_unlock();
 521}
 522
 523/**
  524 * __gmap_link - set up shadow page tables to connect a host to a guest address
 525 * @gmap: pointer to guest mapping meta data structure
 526 * @gaddr: guest address
 527 * @vmaddr: vm address
 528 *
 529 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 530 * if the vm address is already mapped to a different guest segment.
 531 * The mmap_sem of the mm that belongs to the address space must be held
 532 * when this function gets called.
 533 */
 534int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
 535{
 536        struct mm_struct *mm;
 537        unsigned long *table;
 538        spinlock_t *ptl;
 539        pgd_t *pgd;
 540        pud_t *pud;
 541        pmd_t *pmd;
 542        int rc;
 543
 544        BUG_ON(gmap_is_shadow(gmap));
 545        /* Create higher level tables in the gmap page table */
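             /*
              * A guest address is split into 11-bit table indices: bits
              * 63-53 index the region-1 table, 52-42 the region-2 table,
              * 41-31 the region-3 table and 30-20 the segment table, each
              * with 2048 entries.
              */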
 546        table = gmap->table;
 547        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
 548                table += (gaddr >> 53) & 0x7ff;
 549                if ((*table & _REGION_ENTRY_INVALID) &&
 550                    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
 551                                     gaddr & 0xffe0000000000000UL))
 552                        return -ENOMEM;
 553                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 554        }
 555        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
 556                table += (gaddr >> 42) & 0x7ff;
 557                if ((*table & _REGION_ENTRY_INVALID) &&
 558                    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
 559                                     gaddr & 0xfffffc0000000000UL))
 560                        return -ENOMEM;
 561                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 562        }
 563        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
 564                table += (gaddr >> 31) & 0x7ff;
 565                if ((*table & _REGION_ENTRY_INVALID) &&
 566                    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
 567                                     gaddr & 0xffffffff80000000UL))
 568                        return -ENOMEM;
 569                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 570        }
 571        table += (gaddr >> 20) & 0x7ff;
 572        /* Walk the parent mm page table */
 573        mm = gmap->mm;
 574        pgd = pgd_offset(mm, vmaddr);
 575        VM_BUG_ON(pgd_none(*pgd));
 576        pud = pud_offset(pgd, vmaddr);
 577        VM_BUG_ON(pud_none(*pud));
 578        /* large puds cannot yet be handled */
 579        if (pud_large(*pud))
 580                return -EFAULT;
 581        pmd = pmd_offset(pud, vmaddr);
 582        VM_BUG_ON(pmd_none(*pmd));
 583        /* large pmds cannot yet be handled */
 584        if (pmd_large(*pmd))
 585                return -EFAULT;
 586        /* Link gmap segment table entry location to page table. */
 587        rc = radix_tree_preload(GFP_KERNEL);
 588        if (rc)
 589                return rc;
 590        ptl = pmd_lock(mm, pmd);
 591        spin_lock(&gmap->guest_table_lock);
 592        if (*table == _SEGMENT_ENTRY_INVALID) {
 593                rc = radix_tree_insert(&gmap->host_to_guest,
 594                                       vmaddr >> PMD_SHIFT, table);
 595                if (!rc)
 596                        *table = pmd_val(*pmd);
 597        } else
 598                rc = 0;
 599        spin_unlock(&gmap->guest_table_lock);
 600        spin_unlock(ptl);
 601        radix_tree_preload_end();
 602        return rc;
 603}
 604
 605/**
 606 * gmap_fault - resolve a fault on a guest address
 607 * @gmap: pointer to guest mapping meta data structure
 608 * @gaddr: guest address
 609 * @fault_flags: flags to pass down to handle_mm_fault()
 610 *
 611 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 612 * if the vm address is already mapped to a different guest segment.
 613 */
 614int gmap_fault(struct gmap *gmap, unsigned long gaddr,
 615               unsigned int fault_flags)
 616{
 617        unsigned long vmaddr;
 618        int rc;
 619        bool unlocked;
 620
 621        down_read(&gmap->mm->mmap_sem);
 622
 623retry:
 624        unlocked = false;
 625        vmaddr = __gmap_translate(gmap, gaddr);
 626        if (IS_ERR_VALUE(vmaddr)) {
 627                rc = vmaddr;
 628                goto out_up;
 629        }
 630        if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags,
 631                             &unlocked)) {
 632                rc = -EFAULT;
 633                goto out_up;
 634        }
 635        /*
  636         * In case fixup_user_fault unlocked the mmap_sem during fault-in,
  637         * redo __gmap_translate so that we do not race with a map/unmap_segment.
 638         */
 639        if (unlocked)
 640                goto retry;
 641
 642        rc = __gmap_link(gmap, gaddr, vmaddr);
 643out_up:
 644        up_read(&gmap->mm->mmap_sem);
 645        return rc;
 646}
 647EXPORT_SYMBOL_GPL(gmap_fault);
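
     /*
      * Illustrative use (a sketch): resolving a guest write fault before
      * the faulting instruction is retried:
      *
      *         rc = gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE);
      */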
 648
 649/*
 650 * this function is assumed to be called with mmap_sem held
 651 */
 652void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
 653{
 654        unsigned long vmaddr;
 655        spinlock_t *ptl;
 656        pte_t *ptep;
 657
 658        /* Find the vm address for the guest address */
 659        vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
 660                                                   gaddr >> PMD_SHIFT);
 661        if (vmaddr) {
 662                vmaddr |= gaddr & ~PMD_MASK;
 663                /* Get pointer to the page table entry */
 664                ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
  665                if (likely(ptep)) {
  666                        ptep_zap_unused(gmap->mm, vmaddr, ptep, 0);
  667                        pte_unmap_unlock(ptep, ptl);
                     }
 668        }
 669}
 670EXPORT_SYMBOL_GPL(__gmap_zap);
 671
 672void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
 673{
 674        unsigned long gaddr, vmaddr, size;
 675        struct vm_area_struct *vma;
 676
 677        down_read(&gmap->mm->mmap_sem);
 678        for (gaddr = from; gaddr < to;
 679             gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
 680                /* Find the vm address for the guest address */
 681                vmaddr = (unsigned long)
 682                        radix_tree_lookup(&gmap->guest_to_host,
 683                                          gaddr >> PMD_SHIFT);
 684                if (!vmaddr)
 685                        continue;
 686                vmaddr |= gaddr & ~PMD_MASK;
 687                /* Find vma in the parent mm */
 688                vma = find_vma(gmap->mm, vmaddr);
 689                size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
 690                zap_page_range(vma, vmaddr, size, NULL);
 691        }
 692        up_read(&gmap->mm->mmap_sem);
 693}
 694EXPORT_SYMBOL_GPL(gmap_discard);
 695
 696static LIST_HEAD(gmap_notifier_list);
 697static DEFINE_SPINLOCK(gmap_notifier_lock);
 698
 699/**
 700 * gmap_register_pte_notifier - register a pte invalidation callback
 701 * @nb: pointer to the gmap notifier block
 702 */
 703void gmap_register_pte_notifier(struct gmap_notifier *nb)
 704{
 705        spin_lock(&gmap_notifier_lock);
 706        list_add_rcu(&nb->list, &gmap_notifier_list);
 707        spin_unlock(&gmap_notifier_lock);
 708}
 709EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
 710
 711/**
 712 * gmap_unregister_pte_notifier - remove a pte invalidation callback
 713 * @nb: pointer to the gmap notifier block
 714 */
 715void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
 716{
 717        spin_lock(&gmap_notifier_lock);
 718        list_del_rcu(&nb->list);
 719        spin_unlock(&gmap_notifier_lock);
 720        synchronize_rcu();
 721}
 722EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
 723
 724/**
 725 * gmap_call_notifier - call all registered invalidation callbacks
 726 * @gmap: pointer to guest mapping meta data structure
 727 * @start: start virtual address in the guest address space
 728 * @end: end virtual address in the guest address space
 729 */
 730static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
 731                               unsigned long end)
 732{
 733        struct gmap_notifier *nb;
 734
 735        list_for_each_entry(nb, &gmap_notifier_list, list)
 736                nb->notifier_call(gmap, start, end);
 737}
 738
 739/**
 740 * gmap_table_walk - walk the gmap page tables
 741 * @gmap: pointer to guest mapping meta data structure
 742 * @gaddr: virtual address in the guest address space
 743 * @level: page table level to stop at
 744 *
 745 * Returns a table entry pointer for the given guest address and @level
  746 * @level=0 : returns a pointer to a page table entry (or NULL)
 747 * @level=1 : returns a pointer to a segment table entry (or NULL)
 748 * @level=2 : returns a pointer to a region-3 table entry (or NULL)
 749 * @level=3 : returns a pointer to a region-2 table entry (or NULL)
 750 * @level=4 : returns a pointer to a region-1 table entry (or NULL)
 751 *
 752 * Returns NULL if the gmap page tables could not be walked to the
 753 * requested level.
 754 *
 755 * Note: Can also be called for shadow gmaps.
 756 */
 757static inline unsigned long *gmap_table_walk(struct gmap *gmap,
 758                                             unsigned long gaddr, int level)
 759{
 760        unsigned long *table;
 761
 762        if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4))
 763                return NULL;
 764        if (gmap_is_shadow(gmap) && gmap->removed)
 765                return NULL;
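             /*
              * Reject addresses above the range covered by the topmost
              * table: the ASCE type encodes 0 (segment) to 3 (region-1),
              * corresponding to 31, 42, 53 or 64 addressable bits.
              */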
 766        if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11)))
 767                return NULL;
 768        table = gmap->table;
 769        switch (gmap->asce & _ASCE_TYPE_MASK) {
 770        case _ASCE_TYPE_REGION1:
 771                table += (gaddr >> 53) & 0x7ff;
 772                if (level == 4)
 773                        break;
 774                if (*table & _REGION_ENTRY_INVALID)
 775                        return NULL;
 776                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 777                /* Fallthrough */
 778        case _ASCE_TYPE_REGION2:
 779                table += (gaddr >> 42) & 0x7ff;
 780                if (level == 3)
 781                        break;
 782                if (*table & _REGION_ENTRY_INVALID)
 783                        return NULL;
 784                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 785                /* Fallthrough */
 786        case _ASCE_TYPE_REGION3:
 787                table += (gaddr >> 31) & 0x7ff;
 788                if (level == 2)
 789                        break;
 790                if (*table & _REGION_ENTRY_INVALID)
 791                        return NULL;
 792                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 793                /* Fallthrough */
 794        case _ASCE_TYPE_SEGMENT:
 795                table += (gaddr >> 20) & 0x7ff;
 796                if (level == 1)
 797                        break;
 798                if (*table & _REGION_ENTRY_INVALID)
 799                        return NULL;
 800                table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
 801                table += (gaddr >> 12) & 0xff;
 802        }
 803        return table;
 804}
 805
 806/**
 807 * gmap_pte_op_walk - walk the gmap page table, get the page table lock
 808 *                    and return the pte pointer
 809 * @gmap: pointer to guest mapping meta data structure
 810 * @gaddr: virtual address in the guest address space
 811 * @ptl: pointer to the spinlock pointer
 812 *
 813 * Returns a pointer to the locked pte for a guest address, or NULL
 814 *
 815 * Note: Can also be called for shadow gmaps.
 816 */
 817static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
 818                               spinlock_t **ptl)
 819{
 820        unsigned long *table;
 821
 822        if (gmap_is_shadow(gmap))
 823                spin_lock(&gmap->guest_table_lock);
 824        /* Walk the gmap page table, lock and get pte pointer */
 825        table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
 826        if (!table || *table & _SEGMENT_ENTRY_INVALID) {
 827                if (gmap_is_shadow(gmap))
 828                        spin_unlock(&gmap->guest_table_lock);
 829                return NULL;
 830        }
 831        if (gmap_is_shadow(gmap)) {
 832                *ptl = &gmap->guest_table_lock;
 833                return pte_offset_map((pmd_t *) table, gaddr);
 834        }
 835        return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
 836}
 837
 838/**
 839 * gmap_pte_op_fixup - force a page in and connect the gmap page table
 840 * @gmap: pointer to guest mapping meta data structure
 841 * @gaddr: virtual address in the guest address space
 842 * @vmaddr: address in the host process address space
 843 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 844 *
 845 * Returns 0 if the caller can retry __gmap_translate (might fail again),
 846 * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
 847 * up or connecting the gmap page table.
 848 */
 849static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
 850                             unsigned long vmaddr, int prot)
 851{
 852        struct mm_struct *mm = gmap->mm;
 853        unsigned int fault_flags;
 854        bool unlocked = false;
 855
 856        BUG_ON(gmap_is_shadow(gmap));
 857        fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
 858        if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked))
 859                return -EFAULT;
 860        if (unlocked)
 861                /* lost mmap_sem, caller has to retry __gmap_translate */
 862                return 0;
 863        /* Connect the page tables */
 864        return __gmap_link(gmap, gaddr, vmaddr);
 865}
 866
 867/**
 868 * gmap_pte_op_end - release the page table lock
 869 * @ptl: pointer to the spinlock pointer
 870 */
 871static void gmap_pte_op_end(spinlock_t *ptl)
 872{
 873        spin_unlock(ptl);
 874}
 875
 876/*
 877 * gmap_protect_range - remove access rights to memory and set pgste bits
 878 * @gmap: pointer to guest mapping meta data structure
 879 * @gaddr: virtual address in the guest address space
 880 * @len: size of area
 881 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 882 * @bits: pgste notification bits to set
 883 *
 884 * Returns 0 if successfully protected, -ENOMEM if out of memory and
 885 * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
 886 *
 887 * Called with sg->mm->mmap_sem in read.
 888 *
 889 * Note: Can also be called for shadow gmaps.
 890 */
 891static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
 892                              unsigned long len, int prot, unsigned long bits)
 893{
 894        unsigned long vmaddr;
 895        spinlock_t *ptl;
 896        pte_t *ptep;
 897        int rc;
 898
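             /*
              * Protect each page in place if possible; if the gmap or the
              * host page table entry is not present yet, fault it in via
              * gmap_pte_op_fixup() and retry the same address.
              */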
 899        while (len) {
 900                rc = -EAGAIN;
 901                ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
 902                if (ptep) {
 903                        rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits);
 904                        gmap_pte_op_end(ptl);
 905                }
 906                if (rc) {
 907                        vmaddr = __gmap_translate(gmap, gaddr);
 908                        if (IS_ERR_VALUE(vmaddr))
 909                                return vmaddr;
 910                        rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
 911                        if (rc)
 912                                return rc;
 913                        continue;
 914                }
 915                gaddr += PAGE_SIZE;
 916                len -= PAGE_SIZE;
 917        }
 918        return 0;
 919}
 920
 921/**
 922 * gmap_mprotect_notify - change access rights for a range of ptes and
 923 *                        call the notifier if any pte changes again
 924 * @gmap: pointer to guest mapping meta data structure
 925 * @gaddr: virtual address in the guest address space
 926 * @len: size of area
 927 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 928 *
 929 * Returns 0 if for each page in the given range a gmap mapping exists,
 930 * the new access rights could be set and the notifier could be armed.
 931 * If the gmap mapping is missing for one or more pages -EFAULT is
 932 * returned. If no memory could be allocated -ENOMEM is returned.
 933 * This function establishes missing page table entries.
 934 */
 935int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
 936                         unsigned long len, int prot)
 937{
 938        int rc;
 939
 940        if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
 941                return -EINVAL;
 942        if (!MACHINE_HAS_ESOP && prot == PROT_READ)
 943                return -EINVAL;
 944        down_read(&gmap->mm->mmap_sem);
 945        rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT);
 946        up_read(&gmap->mm->mmap_sem);
 947        return rc;
 948}
 949EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
 950
 951/**
 952 * gmap_read_table - get an unsigned long value from a guest page table using
 953 *                   absolute addressing, without marking the page referenced.
 954 * @gmap: pointer to guest mapping meta data structure
 955 * @gaddr: virtual address in the guest address space
 956 * @val: pointer to the unsigned long value to return
 957 *
 958 * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
 959 * if reading using the virtual address failed.
 960 *
 961 * Called with gmap->mm->mmap_sem in read.
 962 */
 963int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
 964{
 965        unsigned long address, vmaddr;
 966        spinlock_t *ptl;
 967        pte_t *ptep, pte;
 968        int rc;
 969
 970        while (1) {
 971                rc = -EAGAIN;
 972                ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
 973                if (ptep) {
 974                        pte = *ptep;
 975                        if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
 976                                address = pte_val(pte) & PAGE_MASK;
 977                                address += gaddr & ~PAGE_MASK;
 978                                *val = *(unsigned long *) address;
 979                                pte_val(*ptep) |= _PAGE_YOUNG;
 980                                /* Do *NOT* clear the _PAGE_INVALID bit! */
 981                                rc = 0;
 982                        }
 983                        gmap_pte_op_end(ptl);
 984                }
 985                if (!rc)
 986                        break;
 987                vmaddr = __gmap_translate(gmap, gaddr);
 988                if (IS_ERR_VALUE(vmaddr)) {
 989                        rc = vmaddr;
 990                        break;
 991                }
 992                rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
 993                if (rc)
 994                        break;
 995        }
 996        return rc;
 997}
 998EXPORT_SYMBOL_GPL(gmap_read_table);
 999
1000/**
1001 * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
1002 * @sg: pointer to the shadow guest address space structure
1003 * @vmaddr: vm address associated with the rmap
1004 * @rmap: pointer to the rmap structure
1005 *
1006 * Called with the sg->guest_table_lock
1007 */
1008static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
1009                                    struct gmap_rmap *rmap)
1010{
1011        void **slot;
1012
1013        BUG_ON(!gmap_is_shadow(sg));
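             /* rmaps for the same host page are chained, newest entry first */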
1014        slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
1015        if (slot) {
1016                rmap->next = radix_tree_deref_slot_protected(slot,
1017                                                        &sg->guest_table_lock);
1018                radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
1019        } else {
1020                rmap->next = NULL;
1021                radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
1022                                  rmap);
1023        }
1024}
1025
1026/**
1027 * gmap_protect_rmap - modify access rights to memory and create an rmap
1028 * @sg: pointer to the shadow guest address space structure
1029 * @raddr: rmap address in the shadow gmap
1030 * @paddr: address in the parent guest address space
1031 * @len: length of the memory area to protect
1032 * @prot: indicates access rights: none, read-only or read-write
1033 *
1034 * Returns 0 if successfully protected and the rmap was created, -ENOMEM
1035 * if out of memory and -EFAULT if paddr is invalid.
1036 */
1037static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
1038                             unsigned long paddr, unsigned long len, int prot)
1039{
1040        struct gmap *parent;
1041        struct gmap_rmap *rmap;
1042        unsigned long vmaddr;
1043        spinlock_t *ptl;
1044        pte_t *ptep;
1045        int rc;
1046
1047        BUG_ON(!gmap_is_shadow(sg));
1048        parent = sg->parent;
1049        while (len) {
1050                vmaddr = __gmap_translate(parent, paddr);
1051                if (IS_ERR_VALUE(vmaddr))
1052                        return vmaddr;
1053                rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
1054                if (!rmap)
1055                        return -ENOMEM;
1056                rmap->raddr = raddr;
1057                rc = radix_tree_preload(GFP_KERNEL);
1058                if (rc) {
1059                        kfree(rmap);
1060                        return rc;
1061                }
1062                rc = -EAGAIN;
1063                ptep = gmap_pte_op_walk(parent, paddr, &ptl);
1064                if (ptep) {
1065                        spin_lock(&sg->guest_table_lock);
1066                        rc = ptep_force_prot(parent->mm, paddr, ptep, prot,
1067                                             PGSTE_VSIE_BIT);
1068                        if (!rc)
1069                                gmap_insert_rmap(sg, vmaddr, rmap);
1070                        spin_unlock(&sg->guest_table_lock);
1071                        gmap_pte_op_end(ptl);
1072                }
1073                radix_tree_preload_end();
1074                if (rc) {
1075                        kfree(rmap);
1076                        rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
1077                        if (rc)
1078                                return rc;
1079                        continue;
1080                }
1081                paddr += PAGE_SIZE;
1082                len -= PAGE_SIZE;
1083        }
1084        return 0;
1085}
1086
1087#define _SHADOW_RMAP_MASK       0x7
1088#define _SHADOW_RMAP_REGION1    0x5
1089#define _SHADOW_RMAP_REGION2    0x4
1090#define _SHADOW_RMAP_REGION3    0x3
1091#define _SHADOW_RMAP_SEGMENT    0x2
1092#define _SHADOW_RMAP_PGTABLE    0x1
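     /*
      * These tags are stored in the low bits of gmap_rmap->raddr to record
      * which shadow table level has to be invalidated once the protection
      * of the parent page is broken.
      */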
1093
1094/**
1095 * gmap_idte_one - invalidate a single region or segment table entry
1096 * @asce: region or segment table *origin* + table-type bits
1097 * @vaddr: virtual address to identify the table entry to flush
1098 *
1099 * The invalid bit of a single region or segment table entry is set
1100 * and the associated TLB entries depending on the entry are flushed.
1101 * The table-type of the @asce identifies the portion of the @vaddr
1102 * that is used as the invalidation index.
1103 */
1104static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
1105{
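             /* IDTE: invalidate DAT table entry, opcode 0xb98e */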
1106        asm volatile(
1107                "       .insn   rrf,0xb98e0000,%0,%1,0,0"
1108                : : "a" (asce), "a" (vaddr) : "cc", "memory");
1109}
1110
1111/**
1112 * gmap_unshadow_page - remove a page from a shadow page table
1113 * @sg: pointer to the shadow guest address space structure
1114 * @raddr: rmap address in the shadow guest address space
1115 *
1116 * Called with the sg->guest_table_lock
1117 */
1118static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
1119{
1120        unsigned long *table;
1121
1122        BUG_ON(!gmap_is_shadow(sg));
1123        table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
1124        if (!table || *table & _PAGE_INVALID)
1125                return;
1126        gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1);
1127        ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
1128}
1129
1130/**
1131 * __gmap_unshadow_pgt - remove all entries from a shadow page table
1132 * @sg: pointer to the shadow guest address space structure
1133 * @raddr: rmap address in the shadow guest address space
1134 * @pgt: pointer to the start of a shadow page table
1135 *
1136 * Called with the sg->guest_table_lock
1137 */
1138static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
1139                                unsigned long *pgt)
1140{
1141        int i;
1142
1143        BUG_ON(!gmap_is_shadow(sg));
1144        for (i = 0; i < 256; i++, raddr += 1UL << 12)
1145                pgt[i] = _PAGE_INVALID;
1146}
1147
1148/**
1149 * gmap_unshadow_pgt - remove a shadow page table from a segment entry
1150 * @sg: pointer to the shadow guest address space structure
1151 * @raddr: address in the shadow guest address space
1152 *
1153 * Called with the sg->guest_table_lock
1154 */
1155static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
1156{
1157        unsigned long sto, *ste, *pgt;
1158        struct page *page;
1159
1160        BUG_ON(!gmap_is_shadow(sg));
1161        ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
1162        if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
1163                return;
1164        gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1);
1165        sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff));
1166        gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
1167        pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN);
1168        *ste = _SEGMENT_ENTRY_EMPTY;
1169        __gmap_unshadow_pgt(sg, raddr, pgt);
1170        /* Free page table */
1171        page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
1172        list_del(&page->lru);
1173        page_table_free_pgste(page);
1174}
1175
1176/**
1177 * __gmap_unshadow_sgt - remove all entries from a shadow segment table
1178 * @sg: pointer to the shadow guest address space structure
1179 * @raddr: rmap address in the shadow guest address space
1180 * @sgt: pointer to the start of a shadow segment table
1181 *
1182 * Called with the sg->guest_table_lock
1183 */
1184static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
1185                                unsigned long *sgt)
1186{
1187        unsigned long asce, *pgt;
1188        struct page *page;
1189        int i;
1190
1191        BUG_ON(!gmap_is_shadow(sg));
1192        asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT;
1193        for (i = 0; i < 2048; i++, raddr += 1UL << 20) {
1194                if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
1195                        continue;
1196                pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
1197                sgt[i] = _SEGMENT_ENTRY_EMPTY;
1198                __gmap_unshadow_pgt(sg, raddr, pgt);
1199                /* Free page table */
1200                page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
1201                list_del(&page->lru);
1202                page_table_free_pgste(page);
1203        }
1204}
1205
1206/**
1207 * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
1208 * @sg: pointer to the shadow guest address space structure
1209 * @raddr: rmap address in the shadow guest address space
1210 *
1211 * Called with the shadow->guest_table_lock
1212 */
1213static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
1214{
1215        unsigned long r3o, *r3e, *sgt;
1216        struct page *page;
1217
1218        BUG_ON(!gmap_is_shadow(sg));
1219        r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
1220        if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
1221                return;
1222        gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1);
1223        r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff));
1224        gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
1225        sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN);
1226        *r3e = _REGION3_ENTRY_EMPTY;
1227        __gmap_unshadow_sgt(sg, raddr, sgt);
1228        /* Free segment table */
1229        page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
1230        list_del(&page->lru);
1231        __free_pages(page, 2);
1232}
1233
1234/**
1235 * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
1236 * @sg: pointer to the shadow guest address space structure
1237 * @raddr: address in the shadow guest address space
1238 * @r3t: pointer to the start of a shadow region-3 table
1239 *
1240 * Called with the sg->guest_table_lock
1241 */
1242static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
1243                                unsigned long *r3t)
1244{
1245        unsigned long asce, *sgt;
1246        struct page *page;
1247        int i;
1248
1249        BUG_ON(!gmap_is_shadow(sg));
1250        asce = (unsigned long) r3t | _ASCE_TYPE_REGION3;
1251        for (i = 0; i < 2048; i++, raddr += 1UL << 31) {
1252                if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
1253                        continue;
1254                sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
1255                r3t[i] = _REGION3_ENTRY_EMPTY;
1256                __gmap_unshadow_sgt(sg, raddr, sgt);
1257                /* Free segment table */
1258                page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
1259                list_del(&page->lru);
1260                __free_pages(page, 2);
1261        }
1262}
1263
1264/**
1265 * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
1266 * @sg: pointer to the shadow guest address space structure
1267 * @raddr: rmap address in the shadow guest address space
1268 *
1269 * Called with the sg->guest_table_lock
1270 */
1271static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
1272{
1273        unsigned long r2o, *r2e, *r3t;
1274        struct page *page;
1275
1276        BUG_ON(!gmap_is_shadow(sg));
1277        r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
1278        if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
1279                return;
1280        gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1);
1281        r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff));
1282        gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
1283        r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN);
1284        *r2e = _REGION2_ENTRY_EMPTY;
1285        __gmap_unshadow_r3t(sg, raddr, r3t);
1286        /* Free region 3 table */
1287        page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
1288        list_del(&page->lru);
1289        __free_pages(page, 2);
1290}
1291
1292/**
1293 * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
1294 * @sg: pointer to the shadow guest address space structure
1295 * @raddr: rmap address in the shadow guest address space
1296 * @r2t: pointer to the start of a shadow region-2 table
1297 *
1298 * Called with the sg->guest_table_lock
1299 */
1300static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
1301                                unsigned long *r2t)
1302{
1303        unsigned long asce, *r3t;
1304        struct page *page;
1305        int i;
1306
1307        BUG_ON(!gmap_is_shadow(sg));
1308        asce = (unsigned long) r2t | _ASCE_TYPE_REGION2;
1309        for (i = 0; i < 2048; i++, raddr += 1UL << 42) {
1310                if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
1311                        continue;
1312                r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
1313                r2t[i] = _REGION2_ENTRY_EMPTY;
1314                __gmap_unshadow_r3t(sg, raddr, r3t);
1315                /* Free region 3 table */
1316                page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
1317                list_del(&page->lru);
1318                __free_pages(page, 2);
1319        }
1320}
1321
1322/**
1323 * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
1324 * @sg: pointer to the shadow guest address space structure
1325 * @raddr: rmap address in the shadow guest address space
1326 *
1327 * Called with the sg->guest_table_lock
1328 */
1329static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
1330{
1331        unsigned long r1o, *r1e, *r2t;
1332        struct page *page;
1333
1334        BUG_ON(!gmap_is_shadow(sg));
1335        r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
1336        if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
1337                return;
1338        gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1);
1339        r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff));
1340        gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
1341        r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN);
1342        *r1e = _REGION1_ENTRY_EMPTY;
1343        __gmap_unshadow_r2t(sg, raddr, r2t);
1344        /* Free region 2 table */
1345        page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
1346        list_del(&page->lru);
1347        __free_pages(page, 2);
1348}
1349
1350/**
1351 * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
1352 * @sg: pointer to the shadow guest address space structure
1353 * @raddr: rmap address in the shadow guest address space
1354 * @r1t: pointer to the start of a shadow region-1 table
1355 *
1356 * Called with the shadow->guest_table_lock
1357 */
1358static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
1359                                unsigned long *r1t)
1360{
1361        unsigned long asce, *r2t;
1362        struct page *page;
1363        int i;
1364
1365        BUG_ON(!gmap_is_shadow(sg));
1366        asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
1367        for (i = 0; i < 2048; i++, raddr += 1UL << 53) {
1368                if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
1369                        continue;
1370                r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
1371                __gmap_unshadow_r2t(sg, raddr, r2t);
1372                /* Clear entry and flush translation r1t -> r2t */
1373                gmap_idte_one(asce, raddr);
1374                r1t[i] = _REGION1_ENTRY_EMPTY;
1375                /* Free region 2 table */
1376                page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
1377                list_del(&page->lru);
1378                __free_pages(page, 2);
1379        }
1380}
1381
1382/**
1383 * gmap_unshadow - remove a shadow page table completely
1384 * @sg: pointer to the shadow guest address space structure
1385 *
1386 * Called with sg->guest_table_lock
1387 */
1388static void gmap_unshadow(struct gmap *sg)
1389{
1390        unsigned long *table;
1391
1392        BUG_ON(!gmap_is_shadow(sg));
1393        if (sg->removed)
1394                return;
1395        sg->removed = 1;
1396        gmap_call_notifier(sg, 0, -1UL);
1397        gmap_flush_tlb(sg);
1398        table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
1399        switch (sg->asce & _ASCE_TYPE_MASK) {
1400        case _ASCE_TYPE_REGION1:
1401                __gmap_unshadow_r1t(sg, 0, table);
1402                break;
1403        case _ASCE_TYPE_REGION2:
1404                __gmap_unshadow_r2t(sg, 0, table);
1405                break;
1406        case _ASCE_TYPE_REGION3:
1407                __gmap_unshadow_r3t(sg, 0, table);
1408                break;
1409        case _ASCE_TYPE_SEGMENT:
1410                __gmap_unshadow_sgt(sg, 0, table);
1411                break;
1412        }
1413}
1414
1415/**
1416 * gmap_find_shadow - find a specific asce in the list of shadow tables
1417 * @parent: pointer to the parent gmap
1418 * @asce: ASCE for which the shadow table is created
1419 * @edat_level: edat level to be used for the shadow translation
1420 *
1421 * Returns the pointer to a gmap if a shadow table with the given asce is
1422 * already available, ERR_PTR(-EAGAIN) if another one is just being created,
1423 * otherwise NULL
1424 */
1425static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
1426                                     int edat_level)
1427{
1428        struct gmap *sg;
1429
1430        list_for_each_entry(sg, &parent->children, list) {
1431                if (sg->orig_asce != asce || sg->edat_level != edat_level ||
1432                    sg->removed)
1433                        continue;
1434                if (!sg->initialized)
1435                        return ERR_PTR(-EAGAIN);
1436                atomic_inc(&sg->ref_count);
1437                return sg;
1438        }
1439        return NULL;
1440}
1441
1442/**
1443 * gmap_shadow_valid - check if a shadow guest address space matches the
1444 *                     given properties and is still valid
1445 * @sg: pointer to the shadow guest address space structure
1446 * @asce: ASCE for which the shadow table is requested
1447 * @edat_level: edat level to be used for the shadow translation
1448 *
1449 * Returns 1 if the gmap shadow is still valid and matches the given
1450 * properties, the caller can continue using it. Returns 0 otherwise, the
1451 * caller has to request a new shadow gmap in this case.
1452 *
1453 */
1454int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
1455{
1456        if (sg->removed)
1457                return 0;
1458        return sg->orig_asce == asce && sg->edat_level == edat_level;
1459}
1460EXPORT_SYMBOL_GPL(gmap_shadow_valid);
1461
1462/**
1463 * gmap_shadow - create/find a shadow guest address space
1464 * @parent: pointer to the parent gmap
1465 * @asce: ASCE for which the shadow table is created
1466 * @edat_level: edat level to be used for the shadow translation
1467 *
1468 * The pages of the top level page table referred to by the asce parameter
1469 * will be set to read-only and marked in the PGSTEs of the kvm process.
1470 * The shadow table will be removed automatically on any change to the
1471 * PTE mapping for the source table.
1472 *
1473 * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
1474 * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
1475 * parent gmap table could not be protected.
1476 */
1477struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
1478                         int edat_level)
1479{
1480        struct gmap *sg, *new;
1481        unsigned long limit;
1482        int rc;
1483
1484        BUG_ON(gmap_is_shadow(parent));
1485        spin_lock(&parent->shadow_lock);
1486        sg = gmap_find_shadow(parent, asce, edat_level);
1487        spin_unlock(&parent->shadow_lock);
1488        if (sg)
1489                return sg;
1490        /* Create a new shadow gmap */
1491        limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
1492        if (asce & _ASCE_REAL_SPACE)
1493                limit = -1UL;
1494        new = gmap_alloc(limit);
1495        if (!new)
1496                return ERR_PTR(-ENOMEM);
1497        new->mm = parent->mm;
1498        new->parent = gmap_get(parent);
1499        new->orig_asce = asce;
1500        new->edat_level = edat_level;
1501        new->initialized = false;
1502        spin_lock(&parent->shadow_lock);
1503        /* Recheck if another CPU created the same shadow */
1504        sg = gmap_find_shadow(parent, asce, edat_level);
1505        if (sg) {
1506                spin_unlock(&parent->shadow_lock);
1507                gmap_free(new);
1508                return sg;
1509        }
1510        if (asce & _ASCE_REAL_SPACE) {
1511                /* only allow one real-space gmap shadow */
1512                list_for_each_entry(sg, &parent->children, list) {
1513                        if (sg->orig_asce & _ASCE_REAL_SPACE) {
1514                                spin_lock(&sg->guest_table_lock);
1515                                gmap_unshadow(sg);
1516                                spin_unlock(&sg->guest_table_lock);
1517                                list_del(&sg->list);
1518                                gmap_put(sg);
1519                                break;
1520                        }
1521                }
1522        }
1523        atomic_set(&new->ref_count, 2);
1524        list_add(&new->list, &parent->children);
1525        if (asce & _ASCE_REAL_SPACE) {
1526                /* nothing to protect, return right away */
1527                new->initialized = true;
1528                spin_unlock(&parent->shadow_lock);
1529                return new;
1530        }
1531        spin_unlock(&parent->shadow_lock);
1532        /* protect after insertion, so it will get properly invalidated */
1533        down_read(&parent->mm->mmap_sem);
1534        rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
1535                                ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096,
1536                                PROT_READ, PGSTE_VSIE_BIT);
1537        up_read(&parent->mm->mmap_sem);
1538        spin_lock(&parent->shadow_lock);
1539        new->initialized = true;
1540        if (rc) {
1541                list_del(&new->list);
1542                gmap_free(new);
1543                new = ERR_PTR(rc);
1544        }
1545        spin_unlock(&parent->shadow_lock);
1546        return new;
1547}
1548EXPORT_SYMBOL_GPL(gmap_shadow);
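
/*
 * A hedged usage sketch (hypothetical caller): gmap_shadow() returns either
 * a referenced shadow gmap or an ERR_PTR as documented above, so the caller
 * maps the error codes and releases its reference with gmap_put() when done.
 */
static int example_with_shadow(struct gmap *parent, unsigned long asce,
			       int edat_level)
{
	struct gmap *sg;

	sg = gmap_shadow(parent, asce, edat_level);
	if (IS_ERR(sg))
		return PTR_ERR(sg);	/* -EAGAIN: retry later, -ENOMEM/-EFAULT: fail */
	/* ... walk and populate the shadow tables here ... */
	gmap_put(sg);			/* drop the reference taken by gmap_shadow() */
	return 0;
}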
1549
1550/**
1551 * gmap_shadow_r2t - create an empty shadow region 2 table
1552 * @sg: pointer to the shadow guest address space structure
1553 * @saddr: faulting address in the shadow gmap
1554 * @r2t: parent gmap address of the region 2 table to get shadowed
1555 * @fake: r2t references contiguous guest memory block, not a r2t
1556 *
1557 * The r2t parameter specifies the address of the source table. The
1558 * four pages of the source table are made read-only in the parent gmap
1559 * address space. A write to the source table area @r2t will automatically
1560 * remove the shadow r2 table and all of its descendants.
1561 *
1562 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1563 * shadow table structure is incomplete, -ENOMEM if out of memory and
1564 * -EFAULT if an address in the parent gmap could not be resolved.
1565 *
1566 * Called with sg->mm->mmap_sem in read.
1567 */
1568int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
1569                    int fake)
1570{
1571        unsigned long raddr, origin, offset, len;
1572        unsigned long *s_r2t, *table;
1573        struct page *page;
1574        int rc;
1575
1576        BUG_ON(!gmap_is_shadow(sg));
1577        /* Allocate a shadow region second table */
1578        page = alloc_pages(GFP_KERNEL, 2);
1579        if (!page)
1580                return -ENOMEM;
1581        page->index = r2t & _REGION_ENTRY_ORIGIN;
1582        if (fake)
1583                page->index |= GMAP_SHADOW_FAKE_TABLE;
1584        s_r2t = (unsigned long *) page_to_phys(page);
1585        /* Install shadow region second table */
1586        spin_lock(&sg->guest_table_lock);
1587        table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
1588        if (!table) {
1589                rc = -EAGAIN;           /* Race with unshadow */
1590                goto out_free;
1591        }
1592        if (!(*table & _REGION_ENTRY_INVALID)) {
1593                rc = 0;                 /* Already established */
1594                goto out_free;
1595        } else if (*table & _REGION_ENTRY_ORIGIN) {
1596                rc = -EAGAIN;           /* Race with shadow */
1597                goto out_free;
1598        }
1599        crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
1600        /* mark as invalid as long as the parent table is not protected */
1601        *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
1602                 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
1603        if (sg->edat_level >= 1)
1604                *table |= (r2t & _REGION_ENTRY_PROTECT);
1605        list_add(&page->lru, &sg->crst_list);
1606        if (fake) {
1607                /* nothing to protect for fake tables */
1608                *table &= ~_REGION_ENTRY_INVALID;
1609                spin_unlock(&sg->guest_table_lock);
1610                return 0;
1611        }
1612        spin_unlock(&sg->guest_table_lock);
1613        /* Make r2t read-only in parent gmap page table */
1614        raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1;
1615        origin = r2t & _REGION_ENTRY_ORIGIN;
1616        offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
1617        len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
1618        rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
1619        spin_lock(&sg->guest_table_lock);
1620        if (!rc) {
1621                table = gmap_table_walk(sg, saddr, 4);
1622                if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
1623                              (unsigned long) s_r2t)
1624                        rc = -EAGAIN;           /* Race with unshadow */
1625                else
1626                        *table &= ~_REGION_ENTRY_INVALID;
1627        } else {
1628                gmap_unshadow_r2t(sg, raddr);
1629        }
1630        spin_unlock(&sg->guest_table_lock);
1631        return rc;
1632out_free:
1633        spin_unlock(&sg->guest_table_lock);
1634        __free_pages(page, 2);
1635        return rc;
1636}
1637EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
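
/*
 * A minimal sketch of the offset/length arithmetic used above, mirrored
 * into a helper for illustration only (the helper name is an assumption).
 * The table-offset (TF) and table-length (TL) bits of the parent entry
 * select which of the four 4 KB pages of the crst table actually exist:
 * for TF = 1 and TL = 3 this yields offset = 4096 and len = 12288, i.e.
 * only the last three pages are made read-only.
 */
static inline void example_crst_prot_range(unsigned long entry,
					   unsigned long *offset,
					   unsigned long *len)
{
	*offset = ((entry & _REGION_ENTRY_OFFSET) >> 6) * 4096;
	*len = ((entry & _REGION_ENTRY_LENGTH) + 1) * 4096 - *offset;
}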
1638
1639/**
1640 * gmap_shadow_r3t - create a shadow region 3 table
1641 * @sg: pointer to the shadow guest address space structure
1642 * @saddr: faulting address in the shadow gmap
1643 * @r3t: parent gmap address of the region 3 table to get shadowed
1644 * @fake: r3t references contiguous guest memory block, not a r3t
1645 *
1646 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1647 * shadow table structure is incomplete, -ENOMEM if out of memory and
1648 * -EFAULT if an address in the parent gmap could not be resolved.
1649 *
1650 * Called with sg->mm->mmap_sem in read.
1651 */
1652int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
1653                    int fake)
1654{
1655        unsigned long raddr, origin, offset, len;
1656        unsigned long *s_r3t, *table;
1657        struct page *page;
1658        int rc;
1659
1660        BUG_ON(!gmap_is_shadow(sg));
1661        /* Allocate a shadow region third table */
1662        page = alloc_pages(GFP_KERNEL, 2);
1663        if (!page)
1664                return -ENOMEM;
1665        page->index = r3t & _REGION_ENTRY_ORIGIN;
1666        if (fake)
1667                page->index |= GMAP_SHADOW_FAKE_TABLE;
1668        s_r3t = (unsigned long *) page_to_phys(page);
1669        /* Install shadow region third table */
1670        spin_lock(&sg->guest_table_lock);
1671        table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
1672        if (!table) {
1673                rc = -EAGAIN;           /* Race with unshadow */
1674                goto out_free;
1675        }
1676        if (!(*table & _REGION_ENTRY_INVALID)) {
1677                rc = 0;                 /* Already established */
1678                goto out_free;
1679        } else if (*table & _REGION_ENTRY_ORIGIN) {
1680                rc = -EAGAIN;           /* Race with shadow */
                    goto out_free;
1681        }
1682        crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
1683        /* mark as invalid as long as the parent table is not protected */
1684        *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
1685                 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
1686        if (sg->edat_level >= 1)
1687                *table |= (r3t & _REGION_ENTRY_PROTECT);
1688        list_add(&page->lru, &sg->crst_list);
1689        if (fake) {
1690                /* nothing to protect for fake tables */
1691                *table &= ~_REGION_ENTRY_INVALID;
1692                spin_unlock(&sg->guest_table_lock);
1693                return 0;
1694        }
1695        spin_unlock(&sg->guest_table_lock);
1696        /* Make r3t read-only in parent gmap page table */
1697        raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2;
1698        origin = r3t & _REGION_ENTRY_ORIGIN;
1699        offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
1700        len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
1701        rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
1702        spin_lock(&sg->guest_table_lock);
1703        if (!rc) {
1704                table = gmap_table_walk(sg, saddr, 3);
1705                if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
1706                              (unsigned long) s_r3t)
1707                        rc = -EAGAIN;           /* Race with unshadow */
1708                else
1709                        *table &= ~_REGION_ENTRY_INVALID;
1710        } else {
1711                gmap_unshadow_r3t(sg, raddr);
1712        }
1713        spin_unlock(&sg->guest_table_lock);
1714        return rc;
1715out_free:
1716        spin_unlock(&sg->guest_table_lock);
1717        __free_pages(page, 2);
1718        return rc;
1719}
1720EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
1721
1722/**
1723 * gmap_shadow_sgt - create a shadow segment table
1724 * @sg: pointer to the shadow guest address space structure
1725 * @saddr: faulting address in the shadow gmap
1726 * @sgt: parent gmap address of the segment table to get shadowed
1727 * @fake: sgt references contiguous guest memory block, not a sgt
1728 *
1729 * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
1730 * shadow table structure is incomplete, -ENOMEM if out of memory and
1731 * -EFAULT if an address in the parent gmap could not be resolved.
1732 *
1733 * Called with sg->mm->mmap_sem in read.
1734 */
1735int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
1736                    int fake)
1737{
1738        unsigned long raddr, origin, offset, len;
1739        unsigned long *s_sgt, *table;
1740        struct page *page;
1741        int rc;
1742
1743        BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
1744        /* Allocate a shadow segment table */
1745        page = alloc_pages(GFP_KERNEL, 2);
1746        if (!page)
1747                return -ENOMEM;
1748        page->index = sgt & _REGION_ENTRY_ORIGIN;
1749        if (fake)
1750                page->index |= GMAP_SHADOW_FAKE_TABLE;
1751        s_sgt = (unsigned long *) page_to_phys(page);
1752        /* Install shadow segment table */
1753        spin_lock(&sg->guest_table_lock);
1754        table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
1755        if (!table) {
1756                rc = -EAGAIN;           /* Race with unshadow */
1757                goto out_free;
1758        }
1759        if (!(*table & _REGION_ENTRY_INVALID)) {
1760                rc = 0;                 /* Already established */
1761                goto out_free;
1762        } else if (*table & _REGION_ENTRY_ORIGIN) {
1763                rc = -EAGAIN;           /* Race with shadow */
1764                goto out_free;
1765        }
1766        crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
1767        /* mark as invalid as long as the parent table is not protected */
1768        *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
1769                 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
1770        if (sg->edat_level >= 1)
1771                *table |= sgt & _REGION_ENTRY_PROTECT;
1772        list_add(&page->lru, &sg->crst_list);
1773        if (fake) {
1774                /* nothing to protect for fake tables */
1775                *table &= ~_REGION_ENTRY_INVALID;
1776                spin_unlock(&sg->guest_table_lock);
1777                return 0;
1778        }
1779        spin_unlock(&sg->guest_table_lock);
1780        /* Make sgt read-only in parent gmap page table */
1781        raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3;
1782        origin = sgt & _REGION_ENTRY_ORIGIN;
1783        offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096;
1784        len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
1785        rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
1786        spin_lock(&sg->guest_table_lock);
1787        if (!rc) {
1788                table = gmap_table_walk(sg, saddr, 2);
1789                if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
1790                              (unsigned long) s_sgt)
1791                        rc = -EAGAIN;           /* Race with unshadow */
1792                else
1793                        *table &= ~_REGION_ENTRY_INVALID;
1794        } else {
1795                gmap_unshadow_sgt(sg, raddr);
1796        }
1797        spin_unlock(&sg->guest_table_lock);
1798        return rc;
1799out_free:
1800        spin_unlock(&sg->guest_table_lock);
1801        __free_pages(page, 2);
1802        return rc;
1803}
1804EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
1805
1806/**
1807 * gmap_shadow_pgt_lookup - find a shadow page table
1808 * @sg: pointer to the shadow guest address space structure
1809 * @saddr: the address in the shadow guest address space
1810 * @pgt: parent gmap address of the page table to get shadowed
1811 * @dat_protection: if the pgtable is marked as protected by dat
1812 * @fake: pgt references contiguous guest memory block, not a pgtable
1813 *
1814 * Returns 0 if the shadow page table was found and -EAGAIN if the page
1815 * table was not found.
1816 *
1817 * Called with sg->mm->mmap_sem in read.
1818 */
1819int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
1820                           unsigned long *pgt, int *dat_protection,
1821                           int *fake)
1822{
1823        unsigned long *table;
1824        struct page *page;
1825        int rc;
1826
1827        BUG_ON(!gmap_is_shadow(sg));
1828        spin_lock(&sg->guest_table_lock);
1829        table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
1830        if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
1831                /* Shadow page tables are full pages (pte+pgste) */
1832                page = pfn_to_page(*table >> PAGE_SHIFT);
1833                *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
1834                *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
1835                *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
1836                rc = 0;
1837        } else {
1838                rc = -EAGAIN;
1839        }
1840        spin_unlock(&sg->guest_table_lock);
1841        return rc;
1843}
1844EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
1845
1846/**
1847 * gmap_shadow_pgt - instantiate a shadow page table
1848 * @sg: pointer to the shadow guest address space structure
1849 * @saddr: faulting address in the shadow gmap
1850 * @pgt: parent gmap address of the page table to get shadowed
1851 * @fake: pgt references contiguous guest memory block, not a pgtable
1852 *
1853 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1854 * shadow table structure is incomplete, -ENOMEM if out of memory,
1855 * -EFAULT if an address in the parent gmap could not be resolved.
1856 *
1857 * Called with sg->mm->mmap_sem in read.
1858 */
1859int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
1860                    int fake)
1861{
1862        unsigned long raddr, origin;
1863        unsigned long *s_pgt, *table;
1864        struct page *page;
1865        int rc;
1866
1867        BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
1868        /* Allocate a shadow page table */
1869        page = page_table_alloc_pgste(sg->mm);
1870        if (!page)
1871                return -ENOMEM;
1872        page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
1873        if (fake)
1874                page->index |= GMAP_SHADOW_FAKE_TABLE;
1875        s_pgt = (unsigned long *) page_to_phys(page);
1876        /* Install shadow page table */
1877        spin_lock(&sg->guest_table_lock);
1878        table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
1879        if (!table) {
1880                rc = -EAGAIN;           /* Race with unshadow */
1881                goto out_free;
1882        }
1883        if (!(*table & _SEGMENT_ENTRY_INVALID)) {
1884                rc = 0;                 /* Already established */
1885                goto out_free;
1886        } else if (*table & _SEGMENT_ENTRY_ORIGIN) {
1887                rc = -EAGAIN;           /* Race with shadow */
1888                goto out_free;
1889        }
1890        /* mark as invalid as long as the parent table is not protected */
1891        *table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
1892                 (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
1893        list_add(&page->lru, &sg->pt_list);
1894        if (fake) {
1895                /* nothing to protect for fake tables */
1896                *table &= ~_SEGMENT_ENTRY_INVALID;
1897                spin_unlock(&sg->guest_table_lock);
1898                return 0;
1899        }
1900        spin_unlock(&sg->guest_table_lock);
1901        /* Make pgt read-only in parent gmap page table (not the pgste) */
1902        raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT;
1903        origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
1904        rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ);
1905        spin_lock(&sg->guest_table_lock);
1906        if (!rc) {
1907                table = gmap_table_walk(sg, saddr, 1);
1908                if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) !=
1909                              (unsigned long) s_pgt)
1910                        rc = -EAGAIN;           /* Race with unshadow */
1911                else
1912                        *table &= ~_SEGMENT_ENTRY_INVALID;
1913        } else {
1914                gmap_unshadow_pgt(sg, raddr);
1915        }
1916        spin_unlock(&sg->guest_table_lock);
1917        return rc;
1918out_free:
1919        spin_unlock(&sg->guest_table_lock);
1920        page_table_free_pgste(page);
1921        return rc;
1923}
1924EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
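
/*
 * A hedged sketch combining gmap_shadow_pgt_lookup() and gmap_shadow_pgt():
 * if no shadow page table exists yet for @saddr, instantiate one from the
 * parent gmap address @pgt_gaddr and look it up again.  The helper name,
 * the @fake = 0 choice and the surrounding fault-handling context are
 * assumptions, not taken from this file.
 */
static int example_get_shadow_pgt(struct gmap *sg, unsigned long saddr,
				  unsigned long pgt_gaddr, unsigned long *pgt,
				  int *dat_protection, int *fake)
{
	int rc;

	rc = gmap_shadow_pgt_lookup(sg, saddr, pgt, dat_protection, fake);
	if (rc != -EAGAIN)
		return rc;
	/* not shadowed yet: create the shadow page table, then look it up */
	rc = gmap_shadow_pgt(sg, saddr, pgt_gaddr, 0);
	if (rc)
		return rc;
	return gmap_shadow_pgt_lookup(sg, saddr, pgt, dat_protection, fake);
}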
1925
1926/**
1927 * gmap_shadow_page - create a shadow page mapping
1928 * @sg: pointer to the shadow guest address space structure
1929 * @saddr: faulting address in the shadow gmap
1930 * @pte: pte in parent gmap address space to get shadowed
1931 *
1932 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1933 * shadow table structure is incomplete, -ENOMEM if out of memory and
1934 * -EFAULT if an address in the parent gmap could not be resolved.
1935 *
1936 * Called with sg->mm->mmap_sem in read.
1937 */
1938int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
1939{
1940        struct gmap *parent;
1941        struct gmap_rmap *rmap;
1942        unsigned long vmaddr, paddr;
1943        spinlock_t *ptl;
1944        pte_t *sptep, *tptep;
1945        int prot;
1946        int rc;
1947
1948        BUG_ON(!gmap_is_shadow(sg));
1949        parent = sg->parent;
1950        prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
1951
1952        rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
1953        if (!rmap)
1954                return -ENOMEM;
1955        rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
1956
1957        while (1) {
1958                paddr = pte_val(pte) & PAGE_MASK;
1959                vmaddr = __gmap_translate(parent, paddr);
1960                if (IS_ERR_VALUE(vmaddr)) {
1961                        rc = vmaddr;
1962                        break;
1963                }
1964                rc = radix_tree_preload(GFP_KERNEL);
1965                if (rc)
1966                        break;
1967                rc = -EAGAIN;
1968                sptep = gmap_pte_op_walk(parent, paddr, &ptl);
1969                if (sptep) {
1970                        spin_lock(&sg->guest_table_lock);
1971                        /* Get page table pointer */
1972                        tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
1973                        if (!tptep) {
1974                                spin_unlock(&sg->guest_table_lock);
1975                                gmap_pte_op_end(ptl);
1976                                radix_tree_preload_end();
1977                                break;
1978                        }
1979                        rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
1980                        if (rc > 0) {
1981                                /* Success and a new mapping */
1982                                gmap_insert_rmap(sg, vmaddr, rmap);
1983                                rmap = NULL;
1984                                rc = 0;
1985                        }
1986                        gmap_pte_op_end(ptl);
1987                        spin_unlock(&sg->guest_table_lock);
1988                }
1989                radix_tree_preload_end();
1990                if (!rc)
1991                        break;
1992                rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
1993                if (rc)
1994                        break;
1995        }
1996        kfree(rmap);
1997        return rc;
1998}
1999EXPORT_SYMBOL_GPL(gmap_shadow_page);
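
/*
 * A hedged retry sketch (hypothetical caller): -EAGAIN from
 * gmap_shadow_page() means the shadow table hierarchy for @saddr is not
 * (or no longer) complete, so the missing levels have to be rebuilt with
 * the gmap_shadow_*() functions above before trying again.  The rebuild
 * step is passed in as a callback because it is not part of this file.
 */
static int example_map_shadow_page(struct gmap *sg, unsigned long saddr,
				   pte_t pte,
				   int (*fill_tables)(struct gmap *sg,
						      unsigned long saddr))
{
	int rc;

	for (;;) {
		rc = gmap_shadow_page(sg, saddr, pte);
		if (rc != -EAGAIN)
			return rc;
		rc = fill_tables(sg, saddr);
		if (rc)
			return rc;
	}
}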
2000
2001/**
2002 * gmap_shadow_notify - handle notifications for shadow gmap
 * @sg: pointer to the shadow guest address space structure
 * @vmaddr: affected host virtual address
 * @offset: guest address offset of the changed page within its segment
 * @pte: pointer to the invalidated page table entry
2003 *
2004 * Called with sg->parent->shadow_lock.
2005 */
2006static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
2007                               unsigned long offset, pte_t *pte)
2008{
2009        struct gmap_rmap *rmap, *rnext, *head;
2010        unsigned long gaddr, start, end, bits, raddr;
2011        unsigned long *table;
2012
2013        BUG_ON(!gmap_is_shadow(sg));
2014        spin_lock(&sg->parent->guest_table_lock);
2015        table = radix_tree_lookup(&sg->parent->host_to_guest,
2016                                  vmaddr >> PMD_SHIFT);
2017        gaddr = table ? __gmap_segment_gaddr(table) + offset : 0;
2018        spin_unlock(&sg->parent->guest_table_lock);
2019        if (!table)
2020                return;
2021
2022        spin_lock(&sg->guest_table_lock);
2023        if (sg->removed) {
2024                spin_unlock(&sg->guest_table_lock);
2025                return;
2026        }
2027        /* Check for top level table */
2028        start = sg->orig_asce & _ASCE_ORIGIN;
2029        end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096;
2030        if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
2031            gaddr < end) {
2032                /* The complete shadow table has to go */
2033                gmap_unshadow(sg);
2034                spin_unlock(&sg->guest_table_lock);
2035                list_del(&sg->list);
2036                gmap_put(sg);
2037                return;
2038        }
2039        /* Remove the page table tree starting from one specific entry */
2040        head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12);
2041        gmap_for_each_rmap_safe(rmap, rnext, head) {
2042                bits = rmap->raddr & _SHADOW_RMAP_MASK;
2043                raddr = rmap->raddr ^ bits;
2044                switch (bits) {
2045                case _SHADOW_RMAP_REGION1:
2046                        gmap_unshadow_r2t(sg, raddr);
2047                        break;
2048                case _SHADOW_RMAP_REGION2:
2049                        gmap_unshadow_r3t(sg, raddr);
2050                        break;
2051                case _SHADOW_RMAP_REGION3:
2052                        gmap_unshadow_sgt(sg, raddr);
2053                        break;
2054                case _SHADOW_RMAP_SEGMENT:
2055                        gmap_unshadow_pgt(sg, raddr);
2056                        break;
2057                case _SHADOW_RMAP_PGTABLE:
2058                        gmap_unshadow_page(sg, raddr);
2059                        break;
2060                }
2061                kfree(rmap);
2062        }
2063        spin_unlock(&sg->guest_table_lock);
2064}
2065
2066/**
2067 * ptep_notify - call all invalidation callbacks for a specific pte.
2068 * @mm: pointer to the process mm_struct
2069 * @vmaddr: virtual address in the process address space
2070 * @pte: pointer to the page table entry
2071 * @bits: bits from the pgste that caused the notify call
2072 *
2073 * This function is assumed to be called with the page table lock held
2074 * for the pte to notify.
2075 */
2076void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
2077                 pte_t *pte, unsigned long bits)
2078{
2079        unsigned long offset, gaddr;
2080        unsigned long *table;
2081        struct gmap *gmap, *sg, *next;
2082
2083        offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
2084        offset = offset * (4096 / sizeof(pte_t));
2085        rcu_read_lock();
2086        list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2087                if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
2088                        spin_lock(&gmap->shadow_lock);
2089                        list_for_each_entry_safe(sg, next,
2090                                                 &gmap->children, list)
2091                                gmap_shadow_notify(sg, vmaddr, offset, pte);
2092                        spin_unlock(&gmap->shadow_lock);
2093                }
2094                if (!(bits & PGSTE_IN_BIT))
2095                        continue;
2096                spin_lock(&gmap->guest_table_lock);
2097                table = radix_tree_lookup(&gmap->host_to_guest,
2098                                          vmaddr >> PMD_SHIFT);
2099                if (table)
2100                        gaddr = __gmap_segment_gaddr(table) + offset;
2101                spin_unlock(&gmap->guest_table_lock);
2102                if (table)
2103                        gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
2104        }
2105        rcu_read_unlock();
2106}
2107EXPORT_SYMBOL_GPL(ptep_notify);
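
/*
 * Worked example for the offset arithmetic in ptep_notify() above (a
 * sketch, not part of the original file): a page table holds 256 pte
 * entries of 8 bytes, so the first mask extracts the byte offset of @pte
 * within its table.  Multiplying by 4096 / sizeof(pte_t) = 512 turns
 * pte_index * 8 into pte_index * PAGE_SIZE, i.e. the guest address offset
 * of the changed page within its 1 MB segment.  For the fifth entry
 * (index 4): 4 * 8 = 32 bytes, and 32 * 512 = 0x4000 = 4 * PAGE_SIZE.
 */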
2108
2109static inline void thp_split_mm(struct mm_struct *mm)
2110{
2111#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2112        struct vm_area_struct *vma;
2113        unsigned long addr;
2114
2115        for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
2116                for (addr = vma->vm_start;
2117                     addr < vma->vm_end;
2118                     addr += PAGE_SIZE)
2119                        follow_page(vma, addr, FOLL_SPLIT);
2120                vma->vm_flags &= ~VM_HUGEPAGE;
2121                vma->vm_flags |= VM_NOHUGEPAGE;
2122        }
2123        mm->def_flags |= VM_NOHUGEPAGE;
2124#endif
2125}
2126
2127/*
2128 * switch on pgstes for the current userspace process (for kvm)
2129 */
2130int s390_enable_sie(void)
2131{
2132        struct mm_struct *mm = current->mm;
2133
2134        /* Do we have pgstes? if yes, we are done */
2135        if (mm_has_pgste(mm))
2136                return 0;
2137        /* Fail if the page tables are 2K */
2138        if (!mm_alloc_pgste(mm))
2139                return -EINVAL;
2140        down_write(&mm->mmap_sem);
2141        mm->context.has_pgste = 1;
2142        /* split thp mappings and disable thp for future mappings */
2143        thp_split_mm(mm);
2144        up_write(&mm->mmap_sem);
2145        return 0;
2146}
2147EXPORT_SYMBOL_GPL(s390_enable_sie);
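
/*
 * A minimal ordering sketch (hypothetical caller, e.g. a VM creation path):
 * pgstes have to be enabled for the process before guest mappings are set
 * up, and only then is the guest address space created.  The helper name
 * and the chosen limit are purely illustrative.
 */
static int example_vm_setup(struct gmap **gmapp)
{
	int rc;

	rc = s390_enable_sie();
	if (rc)
		return rc;
	*gmapp = gmap_create(current->mm, (1UL << 44) - 1);
	return *gmapp ? 0 : -ENOMEM;
}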
2148
2149/*
2150 * Enable storage key handling from now on and initialize the storage
2151 * keys with the default key.
2152 */
2153static int __s390_enable_skey(pte_t *pte, unsigned long addr,
2154                              unsigned long next, struct mm_walk *walk)
2155{
2156        /*
2157         * Remove all zero page mappings; after establishing a policy to
2158         * forbid zero page mappings, subsequent faults for such pages
2159         * will get fresh anonymous pages.
2160         */
2161        if (is_zero_pfn(pte_pfn(*pte)))
2162                ptep_xchg_direct(walk->mm, addr, pte, __pte(_PAGE_INVALID));
2163        /* Clear storage key */
2164        ptep_zap_key(walk->mm, addr, pte);
2165        return 0;
2166}
2167
2168int s390_enable_skey(void)
2169{
2170        struct mm_walk walk = { .pte_entry = __s390_enable_skey };
2171        struct mm_struct *mm = current->mm;
2172        struct vm_area_struct *vma;
2173        int rc = 0;
2174
2175        down_write(&mm->mmap_sem);
2176        if (mm_use_skey(mm))
2177                goto out_up;
2178
2179        mm->context.use_skey = 1;
2180        for (vma = mm->mmap; vma; vma = vma->vm_next) {
2181                if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
2182                                MADV_UNMERGEABLE, &vma->vm_flags)) {
2183                        mm->context.use_skey = 0;
2184                        rc = -ENOMEM;
2185                        goto out_up;
2186                }
2187        }
2188        mm->def_flags &= ~VM_MERGEABLE;
2189
2190        walk.mm = mm;
2191        walk_page_range(0, TASK_SIZE, &walk);
2192
2193out_up:
2194        up_write(&mm->mmap_sem);
2195        return rc;
2196}
2197EXPORT_SYMBOL_GPL(s390_enable_skey);
2198
2199/*
2200 * Reset CMMA state, make all pages stable again.
2201 */
2202static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
2203                             unsigned long next, struct mm_walk *walk)
2204{
2205        ptep_zap_unused(walk->mm, addr, pte, 1);
2206        return 0;
2207}
2208
2209void s390_reset_cmma(struct mm_struct *mm)
2210{
2211        struct mm_walk walk = { .pte_entry = __s390_reset_cmma };
2212
2213        down_write(&mm->mmap_sem);
2214        walk.mm = mm;
2215        walk_page_range(0, TASK_SIZE, &walk);
2216        up_write(&mm->mmap_sem);
2217}
2218EXPORT_SYMBOL_GPL(s390_reset_cmma);
2219