linux/arch/s390/mm/gmap.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  KVM guest address space mapping code
   4 *
   5 *    Copyright IBM Corp. 2007, 2020
   6 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
   7 *               David Hildenbrand <david@redhat.com>
   8 *               Janosch Frank <frankja@linux.vnet.ibm.com>
   9 */
  10
  11#include <linux/kernel.h>
  12#include <linux/pagewalk.h>
  13#include <linux/swap.h>
  14#include <linux/smp.h>
  15#include <linux/spinlock.h>
  16#include <linux/slab.h>
  17#include <linux/swapops.h>
  18#include <linux/ksm.h>
  19#include <linux/mman.h>
  20#include <linux/pgtable.h>
  21
  22#include <asm/pgalloc.h>
  23#include <asm/gmap.h>
  24#include <asm/tlb.h>
  25
  26#define GMAP_SHADOW_FAKE_TABLE 1ULL
  27
  28/**
  29 * gmap_alloc - allocate and initialize a guest address space
  30 * @limit: maximum address of the gmap address space
  31 *
  32 * Returns a guest address space structure.
  33 */
  34static struct gmap *gmap_alloc(unsigned long limit)
  35{
  36        struct gmap *gmap;
  37        struct page *page;
  38        unsigned long *table;
  39        unsigned long etype, atype;
  40
  41        if (limit < _REGION3_SIZE) {
  42                limit = _REGION3_SIZE - 1;
  43                atype = _ASCE_TYPE_SEGMENT;
  44                etype = _SEGMENT_ENTRY_EMPTY;
  45        } else if (limit < _REGION2_SIZE) {
  46                limit = _REGION2_SIZE - 1;
  47                atype = _ASCE_TYPE_REGION3;
  48                etype = _REGION3_ENTRY_EMPTY;
  49        } else if (limit < _REGION1_SIZE) {
  50                limit = _REGION1_SIZE - 1;
  51                atype = _ASCE_TYPE_REGION2;
  52                etype = _REGION2_ENTRY_EMPTY;
  53        } else {
  54                limit = -1UL;
  55                atype = _ASCE_TYPE_REGION1;
  56                etype = _REGION1_ENTRY_EMPTY;
  57        }
  58        gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL_ACCOUNT);
  59        if (!gmap)
  60                goto out;
  61        INIT_LIST_HEAD(&gmap->crst_list);
  62        INIT_LIST_HEAD(&gmap->children);
  63        INIT_LIST_HEAD(&gmap->pt_list);
  64        INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL_ACCOUNT);
  65        INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC | __GFP_ACCOUNT);
  66        INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC | __GFP_ACCOUNT);
  67        spin_lock_init(&gmap->guest_table_lock);
  68        spin_lock_init(&gmap->shadow_lock);
  69        refcount_set(&gmap->ref_count, 1);
  70        page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
  71        if (!page)
  72                goto out_free;
  73        page->index = 0;
  74        list_add(&page->lru, &gmap->crst_list);
  75        table = (unsigned long *) page_to_phys(page);
  76        crst_table_init(table, etype);
  77        gmap->table = table;
  78        gmap->asce = atype | _ASCE_TABLE_LENGTH |
  79                _ASCE_USER_BITS | __pa(table);
  80        gmap->asce_end = limit;
  81        return gmap;
  82
  83out_free:
  84        kfree(gmap);
  85out:
  86        return NULL;
  87}
  88
  89/**
  90 * gmap_create - create a guest address space
  91 * @mm: pointer to the parent mm_struct
   92 * @limit: maximum address of the gmap address space
  93 *
  94 * Returns a guest address space structure.
  95 */
  96struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
  97{
  98        struct gmap *gmap;
  99        unsigned long gmap_asce;
 100
 101        gmap = gmap_alloc(limit);
 102        if (!gmap)
 103                return NULL;
 104        gmap->mm = mm;
 105        spin_lock(&mm->context.lock);
 106        list_add_rcu(&gmap->list, &mm->context.gmap_list);
 107        if (list_is_singular(&mm->context.gmap_list))
 108                gmap_asce = gmap->asce;
 109        else
 110                gmap_asce = -1UL;
 111        WRITE_ONCE(mm->context.gmap_asce, gmap_asce);
 112        spin_unlock(&mm->context.lock);
 113        return gmap;
 114}
 115EXPORT_SYMBOL_GPL(gmap_create);
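
/*
 * A minimal lifecycle sketch, assuming a KVM-like caller that owns
 * current->mm; the limit value below is only an example:
 *
 *        struct gmap *g;
 *
 *        g = gmap_create(current->mm, (1UL << 44) - 1);
 *        if (!g)
 *                return -ENOMEM;
 *        gmap_enable(g);
 *        ... run the guest, resolve faults with gmap_fault() ...
 *        gmap_disable(g);
 *        gmap_remove(g);
 *
 * gmap_remove() unlinks the gmap from the mm and drops the initial
 * reference taken in gmap_alloc(); additional users pair gmap_get()
 * with gmap_put().
 */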
 116
 117static void gmap_flush_tlb(struct gmap *gmap)
 118{
 119        if (MACHINE_HAS_IDTE)
 120                __tlb_flush_idte(gmap->asce);
 121        else
 122                __tlb_flush_global();
 123}
 124
 125static void gmap_radix_tree_free(struct radix_tree_root *root)
 126{
 127        struct radix_tree_iter iter;
 128        unsigned long indices[16];
 129        unsigned long index;
 130        void __rcu **slot;
 131        int i, nr;
 132
 133        /* A radix tree is freed by deleting all of its entries */
 134        index = 0;
 135        do {
 136                nr = 0;
 137                radix_tree_for_each_slot(slot, root, &iter, index) {
 138                        indices[nr] = iter.index;
 139                        if (++nr == 16)
 140                                break;
 141                }
 142                for (i = 0; i < nr; i++) {
 143                        index = indices[i];
 144                        radix_tree_delete(root, index);
 145                }
 146        } while (nr > 0);
 147}
 148
 149static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
 150{
 151        struct gmap_rmap *rmap, *rnext, *head;
 152        struct radix_tree_iter iter;
 153        unsigned long indices[16];
 154        unsigned long index;
 155        void __rcu **slot;
 156        int i, nr;
 157
 158        /* A radix tree is freed by deleting all of its entries */
 159        index = 0;
 160        do {
 161                nr = 0;
 162                radix_tree_for_each_slot(slot, root, &iter, index) {
 163                        indices[nr] = iter.index;
 164                        if (++nr == 16)
 165                                break;
 166                }
 167                for (i = 0; i < nr; i++) {
 168                        index = indices[i];
 169                        head = radix_tree_delete(root, index);
 170                        gmap_for_each_rmap_safe(rmap, rnext, head)
 171                                kfree(rmap);
 172                }
 173        } while (nr > 0);
 174}
 175
 176/**
 177 * gmap_free - free a guest address space
 178 * @gmap: pointer to the guest address space structure
 179 *
 180 * No locks required. There are no references to this gmap anymore.
 181 */
 182static void gmap_free(struct gmap *gmap)
 183{
 184        struct page *page, *next;
 185
 186        /* Flush tlb of all gmaps (if not already done for shadows) */
 187        if (!(gmap_is_shadow(gmap) && gmap->removed))
 188                gmap_flush_tlb(gmap);
 189        /* Free all segment & region tables. */
 190        list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
 191                __free_pages(page, CRST_ALLOC_ORDER);
 192        gmap_radix_tree_free(&gmap->guest_to_host);
 193        gmap_radix_tree_free(&gmap->host_to_guest);
 194
 195        /* Free additional data for a shadow gmap */
 196        if (gmap_is_shadow(gmap)) {
 197                /* Free all page tables. */
 198                list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
 199                        page_table_free_pgste(page);
 200                gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
 201                /* Release reference to the parent */
 202                gmap_put(gmap->parent);
 203        }
 204
 205        kfree(gmap);
 206}
 207
 208/**
 209 * gmap_get - increase reference counter for guest address space
 210 * @gmap: pointer to the guest address space structure
 211 *
 212 * Returns the gmap pointer
 213 */
 214struct gmap *gmap_get(struct gmap *gmap)
 215{
 216        refcount_inc(&gmap->ref_count);
 217        return gmap;
 218}
 219EXPORT_SYMBOL_GPL(gmap_get);
 220
 221/**
 222 * gmap_put - decrease reference counter for guest address space
 223 * @gmap: pointer to the guest address space structure
 224 *
 225 * If the reference counter reaches zero the guest address space is freed.
 226 */
 227void gmap_put(struct gmap *gmap)
 228{
 229        if (refcount_dec_and_test(&gmap->ref_count))
 230                gmap_free(gmap);
 231}
 232EXPORT_SYMBOL_GPL(gmap_put);
 233
 234/**
 235 * gmap_remove - remove a guest address space but do not free it yet
 236 * @gmap: pointer to the guest address space structure
 237 */
 238void gmap_remove(struct gmap *gmap)
 239{
 240        struct gmap *sg, *next;
 241        unsigned long gmap_asce;
 242
 243        /* Remove all shadow gmaps linked to this gmap */
 244        if (!list_empty(&gmap->children)) {
 245                spin_lock(&gmap->shadow_lock);
 246                list_for_each_entry_safe(sg, next, &gmap->children, list) {
 247                        list_del(&sg->list);
 248                        gmap_put(sg);
 249                }
 250                spin_unlock(&gmap->shadow_lock);
 251        }
  252        /* Remove gmap from the per-mm list */
 253        spin_lock(&gmap->mm->context.lock);
 254        list_del_rcu(&gmap->list);
 255        if (list_empty(&gmap->mm->context.gmap_list))
 256                gmap_asce = 0;
 257        else if (list_is_singular(&gmap->mm->context.gmap_list))
 258                gmap_asce = list_first_entry(&gmap->mm->context.gmap_list,
 259                                             struct gmap, list)->asce;
 260        else
 261                gmap_asce = -1UL;
 262        WRITE_ONCE(gmap->mm->context.gmap_asce, gmap_asce);
 263        spin_unlock(&gmap->mm->context.lock);
 264        synchronize_rcu();
 265        /* Put reference */
 266        gmap_put(gmap);
 267}
 268EXPORT_SYMBOL_GPL(gmap_remove);
 269
 270/**
 271 * gmap_enable - switch primary space to the guest address space
 272 * @gmap: pointer to the guest address space structure
 273 */
 274void gmap_enable(struct gmap *gmap)
 275{
 276        S390_lowcore.gmap = (unsigned long) gmap;
 277}
 278EXPORT_SYMBOL_GPL(gmap_enable);
 279
 280/**
 281 * gmap_disable - switch back to the standard primary address space
 282 * @gmap: pointer to the guest address space structure
 283 */
 284void gmap_disable(struct gmap *gmap)
 285{
 286        S390_lowcore.gmap = 0UL;
 287}
 288EXPORT_SYMBOL_GPL(gmap_disable);
 289
 290/**
 291 * gmap_get_enabled - get a pointer to the currently enabled gmap
 292 *
   293 * Returns a pointer to the currently enabled gmap, or NULL if none is enabled.
 294 */
 295struct gmap *gmap_get_enabled(void)
 296{
 297        return (struct gmap *) S390_lowcore.gmap;
 298}
 299EXPORT_SYMBOL_GPL(gmap_get_enabled);
 300
 301/*
 302 * gmap_alloc_table is assumed to be called with mmap_lock held
 303 */
 304static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
 305                            unsigned long init, unsigned long gaddr)
 306{
 307        struct page *page;
 308        unsigned long *new;
 309
  310        /* since we don't free the gmap table until gmap_free we can unlock */
 311        page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
 312        if (!page)
 313                return -ENOMEM;
 314        new = (unsigned long *) page_to_phys(page);
 315        crst_table_init(new, init);
 316        spin_lock(&gmap->guest_table_lock);
 317        if (*table & _REGION_ENTRY_INVALID) {
 318                list_add(&page->lru, &gmap->crst_list);
 319                *table = (unsigned long) new | _REGION_ENTRY_LENGTH |
 320                        (*table & _REGION_ENTRY_TYPE_MASK);
 321                page->index = gaddr;
 322                page = NULL;
 323        }
 324        spin_unlock(&gmap->guest_table_lock);
 325        if (page)
 326                __free_pages(page, CRST_ALLOC_ORDER);
 327        return 0;
 328}
 329
 330/**
 331 * __gmap_segment_gaddr - find virtual address from segment pointer
 332 * @entry: pointer to a segment table entry in the guest address space
 333 *
 334 * Returns the virtual address in the guest address space for the segment
 335 */
 336static unsigned long __gmap_segment_gaddr(unsigned long *entry)
 337{
 338        struct page *page;
 339        unsigned long offset, mask;
 340
 341        offset = (unsigned long) entry / sizeof(unsigned long);
 342        offset = (offset & (PTRS_PER_PMD - 1)) * PMD_SIZE;
 343        mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
 344        page = virt_to_page((void *)((unsigned long) entry & mask));
 345        return page->index + offset;
 346}
 347
 348/**
 349 * __gmap_unlink_by_vmaddr - unlink a single segment via a host address
 350 * @gmap: pointer to the guest address space structure
 351 * @vmaddr: address in the host process address space
 352 *
 353 * Returns 1 if a TLB flush is required
 354 */
 355static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
 356{
 357        unsigned long *entry;
 358        int flush = 0;
 359
 360        BUG_ON(gmap_is_shadow(gmap));
 361        spin_lock(&gmap->guest_table_lock);
 362        entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
 363        if (entry) {
 364                flush = (*entry != _SEGMENT_ENTRY_EMPTY);
 365                *entry = _SEGMENT_ENTRY_EMPTY;
 366        }
 367        spin_unlock(&gmap->guest_table_lock);
 368        return flush;
 369}
 370
 371/**
 372 * __gmap_unmap_by_gaddr - unmap a single segment via a guest address
 373 * @gmap: pointer to the guest address space structure
 374 * @gaddr: address in the guest address space
 375 *
 376 * Returns 1 if a TLB flush is required
 377 */
 378static int __gmap_unmap_by_gaddr(struct gmap *gmap, unsigned long gaddr)
 379{
 380        unsigned long vmaddr;
 381
 382        vmaddr = (unsigned long) radix_tree_delete(&gmap->guest_to_host,
 383                                                   gaddr >> PMD_SHIFT);
 384        return vmaddr ? __gmap_unlink_by_vmaddr(gmap, vmaddr) : 0;
 385}
 386
 387/**
 388 * gmap_unmap_segment - unmap segment from the guest address space
 389 * @gmap: pointer to the guest address space structure
 390 * @to: address in the guest address space
 391 * @len: length of the memory area to unmap
 392 *
 393 * Returns 0 if the unmap succeeded, -EINVAL if not.
 394 */
 395int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
 396{
 397        unsigned long off;
 398        int flush;
 399
 400        BUG_ON(gmap_is_shadow(gmap));
 401        if ((to | len) & (PMD_SIZE - 1))
 402                return -EINVAL;
 403        if (len == 0 || to + len < to)
 404                return -EINVAL;
 405
 406        flush = 0;
 407        mmap_write_lock(gmap->mm);
 408        for (off = 0; off < len; off += PMD_SIZE)
 409                flush |= __gmap_unmap_by_gaddr(gmap, to + off);
 410        mmap_write_unlock(gmap->mm);
 411        if (flush)
 412                gmap_flush_tlb(gmap);
 413        return 0;
 414}
 415EXPORT_SYMBOL_GPL(gmap_unmap_segment);
 416
 417/**
 418 * gmap_map_segment - map a segment to the guest address space
 419 * @gmap: pointer to the guest address space structure
 420 * @from: source address in the parent address space
 421 * @to: target address in the guest address space
 422 * @len: length of the memory area to map
 423 *
 424 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 425 */
 426int gmap_map_segment(struct gmap *gmap, unsigned long from,
 427                     unsigned long to, unsigned long len)
 428{
 429        unsigned long off;
 430        int flush;
 431
 432        BUG_ON(gmap_is_shadow(gmap));
 433        if ((from | to | len) & (PMD_SIZE - 1))
 434                return -EINVAL;
 435        if (len == 0 || from + len < from || to + len < to ||
 436            from + len - 1 > TASK_SIZE_MAX || to + len - 1 > gmap->asce_end)
 437                return -EINVAL;
 438
 439        flush = 0;
 440        mmap_write_lock(gmap->mm);
 441        for (off = 0; off < len; off += PMD_SIZE) {
 442                /* Remove old translation */
 443                flush |= __gmap_unmap_by_gaddr(gmap, to + off);
 444                /* Store new translation */
 445                if (radix_tree_insert(&gmap->guest_to_host,
 446                                      (to + off) >> PMD_SHIFT,
 447                                      (void *) from + off))
 448                        break;
 449        }
 450        mmap_write_unlock(gmap->mm);
 451        if (flush)
 452                gmap_flush_tlb(gmap);
 453        if (off >= len)
 454                return 0;
 455        gmap_unmap_segment(gmap, to, len);
 456        return -ENOMEM;
 457}
 458EXPORT_SYMBOL_GPL(gmap_map_segment);
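
/*
 * Usage sketch; g, vma_start and guest_base are placeholders. All of
 * from, to and len must be PMD_SIZE (1 MB segment) aligned, and a
 * partially failed map is rolled back by gmap_map_segment() itself:
 *
 *        rc = gmap_map_segment(g, vma_start, guest_base, SZ_256M);
 *        if (rc)
 *                return rc;
 *        ...
 *        gmap_unmap_segment(g, guest_base, SZ_256M);
 */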
 459
 460/**
 461 * __gmap_translate - translate a guest address to a user space address
 462 * @gmap: pointer to guest mapping meta data structure
 463 * @gaddr: guest address
 464 *
 465 * Returns user space address which corresponds to the guest address or
 466 * -EFAULT if no such mapping exists.
 467 * This function does not establish potentially missing page table entries.
 468 * The mmap_lock of the mm that belongs to the address space must be held
 469 * when this function gets called.
 470 *
 471 * Note: Can also be called for shadow gmaps.
 472 */
 473unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
 474{
 475        unsigned long vmaddr;
 476
 477        vmaddr = (unsigned long)
 478                radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
 479        /* Note: guest_to_host is empty for a shadow gmap */
 480        return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
 481}
 482EXPORT_SYMBOL_GPL(__gmap_translate);
 483
 484/**
 485 * gmap_translate - translate a guest address to a user space address
 486 * @gmap: pointer to guest mapping meta data structure
 487 * @gaddr: guest address
 488 *
 489 * Returns user space address which corresponds to the guest address or
 490 * -EFAULT if no such mapping exists.
 491 * This function does not establish potentially missing page table entries.
 492 */
 493unsigned long gmap_translate(struct gmap *gmap, unsigned long gaddr)
 494{
 495        unsigned long rc;
 496
 497        mmap_read_lock(gmap->mm);
 498        rc = __gmap_translate(gmap, gaddr);
 499        mmap_read_unlock(gmap->mm);
 500        return rc;
 501}
 502EXPORT_SYMBOL_GPL(gmap_translate);
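
/*
 * The return value is either a host address or a negative error code in
 * an unsigned long, so callers test it with IS_ERR_VALUE(), e.g.:
 *
 *        unsigned long vmaddr;
 *
 *        vmaddr = gmap_translate(g, gaddr);
 *        if (IS_ERR_VALUE(vmaddr))
 *                return vmaddr;
 */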
 503
 504/**
 505 * gmap_unlink - disconnect a page table from the gmap shadow tables
 506 * @mm: pointer to the parent mm_struct
 507 * @table: pointer to the host page table
 508 * @vmaddr: vm address associated with the host page table
 509 */
 510void gmap_unlink(struct mm_struct *mm, unsigned long *table,
 511                 unsigned long vmaddr)
 512{
 513        struct gmap *gmap;
 514        int flush;
 515
 516        rcu_read_lock();
 517        list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
 518                flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
 519                if (flush)
 520                        gmap_flush_tlb(gmap);
 521        }
 522        rcu_read_unlock();
 523}
 524
 525static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *old, pmd_t new,
 526                           unsigned long gaddr);
 527
 528/**
 529 * __gmap_link - set up shadow page tables to connect a host to a guest address
 530 * @gmap: pointer to guest mapping meta data structure
 531 * @gaddr: guest address
 532 * @vmaddr: vm address
 533 *
 534 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 535 * if the vm address is already mapped to a different guest segment.
 536 * The mmap_lock of the mm that belongs to the address space must be held
 537 * when this function gets called.
 538 */
 539int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
 540{
 541        struct mm_struct *mm;
 542        unsigned long *table;
 543        spinlock_t *ptl;
 544        pgd_t *pgd;
 545        p4d_t *p4d;
 546        pud_t *pud;
 547        pmd_t *pmd;
 548        u64 unprot;
 549        int rc;
 550
 551        BUG_ON(gmap_is_shadow(gmap));
 552        /* Create higher level tables in the gmap page table */
 553        table = gmap->table;
 554        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
 555                table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
 556                if ((*table & _REGION_ENTRY_INVALID) &&
 557                    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY,
 558                                     gaddr & _REGION1_MASK))
 559                        return -ENOMEM;
 560                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 561        }
 562        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION2) {
 563                table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
 564                if ((*table & _REGION_ENTRY_INVALID) &&
 565                    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY,
 566                                     gaddr & _REGION2_MASK))
 567                        return -ENOMEM;
 568                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 569        }
 570        if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION3) {
 571                table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
 572                if ((*table & _REGION_ENTRY_INVALID) &&
 573                    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY,
 574                                     gaddr & _REGION3_MASK))
 575                        return -ENOMEM;
 576                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 577        }
 578        table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
 579        /* Walk the parent mm page table */
 580        mm = gmap->mm;
 581        pgd = pgd_offset(mm, vmaddr);
 582        VM_BUG_ON(pgd_none(*pgd));
 583        p4d = p4d_offset(pgd, vmaddr);
 584        VM_BUG_ON(p4d_none(*p4d));
 585        pud = pud_offset(p4d, vmaddr);
 586        VM_BUG_ON(pud_none(*pud));
 587        /* large puds cannot yet be handled */
 588        if (pud_large(*pud))
 589                return -EFAULT;
 590        pmd = pmd_offset(pud, vmaddr);
 591        VM_BUG_ON(pmd_none(*pmd));
 592        /* Are we allowed to use huge pages? */
 593        if (pmd_large(*pmd) && !gmap->mm->context.allow_gmap_hpage_1m)
 594                return -EFAULT;
 595        /* Link gmap segment table entry location to page table. */
 596        rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
 597        if (rc)
 598                return rc;
 599        ptl = pmd_lock(mm, pmd);
 600        spin_lock(&gmap->guest_table_lock);
 601        if (*table == _SEGMENT_ENTRY_EMPTY) {
 602                rc = radix_tree_insert(&gmap->host_to_guest,
 603                                       vmaddr >> PMD_SHIFT, table);
 604                if (!rc) {
 605                        if (pmd_large(*pmd)) {
 606                                *table = (pmd_val(*pmd) &
 607                                          _SEGMENT_ENTRY_HARDWARE_BITS_LARGE)
 608                                        | _SEGMENT_ENTRY_GMAP_UC;
 609                        } else
 610                                *table = pmd_val(*pmd) &
 611                                        _SEGMENT_ENTRY_HARDWARE_BITS;
 612                }
 613        } else if (*table & _SEGMENT_ENTRY_PROTECT &&
 614                   !(pmd_val(*pmd) & _SEGMENT_ENTRY_PROTECT)) {
 615                unprot = (u64)*table;
 616                unprot &= ~_SEGMENT_ENTRY_PROTECT;
 617                unprot |= _SEGMENT_ENTRY_GMAP_UC;
 618                gmap_pmdp_xchg(gmap, (pmd_t *)table, __pmd(unprot), gaddr);
 619        }
 620        spin_unlock(&gmap->guest_table_lock);
 621        spin_unlock(ptl);
 622        radix_tree_preload_end();
 623        return rc;
 624}
 625
 626/**
 627 * gmap_fault - resolve a fault on a guest address
 628 * @gmap: pointer to guest mapping meta data structure
 629 * @gaddr: guest address
 630 * @fault_flags: flags to pass down to handle_mm_fault()
 631 *
 632 * Returns 0 on success, -ENOMEM for out of memory conditions, and -EFAULT
 633 * if the vm address is already mapped to a different guest segment.
 634 */
 635int gmap_fault(struct gmap *gmap, unsigned long gaddr,
 636               unsigned int fault_flags)
 637{
 638        unsigned long vmaddr;
 639        int rc;
 640        bool unlocked;
 641
 642        mmap_read_lock(gmap->mm);
 643
 644retry:
 645        unlocked = false;
 646        vmaddr = __gmap_translate(gmap, gaddr);
 647        if (IS_ERR_VALUE(vmaddr)) {
 648                rc = vmaddr;
 649                goto out_up;
 650        }
 651        if (fixup_user_fault(gmap->mm, vmaddr, fault_flags,
 652                             &unlocked)) {
 653                rc = -EFAULT;
 654                goto out_up;
 655        }
 656        /*
 657         * In the case that fixup_user_fault unlocked the mmap_lock during
  658         * fault-in, redo __gmap_translate to not race with a map/unmap_segment.
 659         */
 660        if (unlocked)
 661                goto retry;
 662
 663        rc = __gmap_link(gmap, gaddr, vmaddr);
 664out_up:
 665        mmap_read_unlock(gmap->mm);
 666        return rc;
 667}
 668EXPORT_SYMBOL_GPL(gmap_fault);
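
/*
 * Sketch of resolving a guest write fault; fault_flags take the same
 * FAULT_FLAG_* values as handle_mm_fault():
 *
 *        rc = gmap_fault(g, gaddr, FAULT_FLAG_WRITE);
 *        if (rc == -EFAULT)
 *                ... inject an addressing exception into the guest ...
 *        else if (rc)
 *                ... out of memory, back off and retry later ...
 */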
 669
 670/*
 671 * this function is assumed to be called with mmap_lock held
 672 */
 673void __gmap_zap(struct gmap *gmap, unsigned long gaddr)
 674{
 675        struct vm_area_struct *vma;
 676        unsigned long vmaddr;
 677        spinlock_t *ptl;
 678        pte_t *ptep;
 679
 680        /* Find the vm address for the guest address */
 681        vmaddr = (unsigned long) radix_tree_lookup(&gmap->guest_to_host,
 682                                                   gaddr >> PMD_SHIFT);
 683        if (vmaddr) {
 684                vmaddr |= gaddr & ~PMD_MASK;
 685
 686                vma = vma_lookup(gmap->mm, vmaddr);
 687                if (!vma || is_vm_hugetlb_page(vma))
 688                        return;
 689
 690                /* Get pointer to the page table entry */
 691                ptep = get_locked_pte(gmap->mm, vmaddr, &ptl);
 692                if (likely(ptep)) {
 693                        ptep_zap_unused(gmap->mm, vmaddr, ptep, 0);
 694                        pte_unmap_unlock(ptep, ptl);
 695                }
 696        }
 697}
 698EXPORT_SYMBOL_GPL(__gmap_zap);
 699
 700void gmap_discard(struct gmap *gmap, unsigned long from, unsigned long to)
 701{
 702        unsigned long gaddr, vmaddr, size;
 703        struct vm_area_struct *vma;
 704
 705        mmap_read_lock(gmap->mm);
 706        for (gaddr = from; gaddr < to;
 707             gaddr = (gaddr + PMD_SIZE) & PMD_MASK) {
 708                /* Find the vm address for the guest address */
 709                vmaddr = (unsigned long)
 710                        radix_tree_lookup(&gmap->guest_to_host,
 711                                          gaddr >> PMD_SHIFT);
 712                if (!vmaddr)
 713                        continue;
 714                vmaddr |= gaddr & ~PMD_MASK;
 715                /* Find vma in the parent mm */
 716                vma = find_vma(gmap->mm, vmaddr);
 717                if (!vma)
 718                        continue;
 719                /*
 720                 * We do not discard pages that are backed by
 721                 * hugetlbfs, so we don't have to refault them.
 722                 */
 723                if (is_vm_hugetlb_page(vma))
 724                        continue;
 725                size = min(to - gaddr, PMD_SIZE - (gaddr & ~PMD_MASK));
 726                zap_page_range(vma, vmaddr, size);
 727        }
 728        mmap_read_unlock(gmap->mm);
 729}
 730EXPORT_SYMBOL_GPL(gmap_discard);
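
/*
 * Sketch: drop the host backing of a guest address range, e.g. when the
 * guest releases storage (the addresses below are examples only):
 *
 *        gmap_discard(g, 0x10000000UL, 0x20000000UL);
 *
 * Ranges backed by hugetlbfs are skipped; everything else is zapped from
 * the host page tables and gets refaulted on the next guest access.
 */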
 731
 732static LIST_HEAD(gmap_notifier_list);
 733static DEFINE_SPINLOCK(gmap_notifier_lock);
 734
 735/**
 736 * gmap_register_pte_notifier - register a pte invalidation callback
 737 * @nb: pointer to the gmap notifier block
 738 */
 739void gmap_register_pte_notifier(struct gmap_notifier *nb)
 740{
 741        spin_lock(&gmap_notifier_lock);
 742        list_add_rcu(&nb->list, &gmap_notifier_list);
 743        spin_unlock(&gmap_notifier_lock);
 744}
 745EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
 746
 747/**
 748 * gmap_unregister_pte_notifier - remove a pte invalidation callback
 749 * @nb: pointer to the gmap notifier block
 750 */
 751void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
 752{
 753        spin_lock(&gmap_notifier_lock);
 754        list_del_rcu(&nb->list);
 755        spin_unlock(&gmap_notifier_lock);
 756        synchronize_rcu();
 757}
 758EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
 759
 760/**
 761 * gmap_call_notifier - call all registered invalidation callbacks
 762 * @gmap: pointer to guest mapping meta data structure
 763 * @start: start virtual address in the guest address space
 764 * @end: end virtual address in the guest address space
 765 */
 766static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
 767                               unsigned long end)
 768{
 769        struct gmap_notifier *nb;
 770
 771        list_for_each_entry(nb, &gmap_notifier_list, list)
 772                nb->notifier_call(gmap, start, end);
 773}
 774
 775/**
 776 * gmap_table_walk - walk the gmap page tables
 777 * @gmap: pointer to guest mapping meta data structure
 778 * @gaddr: virtual address in the guest address space
 779 * @level: page table level to stop at
 780 *
 781 * Returns a table entry pointer for the given guest address and @level
   782 * @level=0 : returns a pointer to a page table entry (or NULL)
 783 * @level=1 : returns a pointer to a segment table entry (or NULL)
 784 * @level=2 : returns a pointer to a region-3 table entry (or NULL)
 785 * @level=3 : returns a pointer to a region-2 table entry (or NULL)
 786 * @level=4 : returns a pointer to a region-1 table entry (or NULL)
 787 *
 788 * Returns NULL if the gmap page tables could not be walked to the
 789 * requested level.
 790 *
 791 * Note: Can also be called for shadow gmaps.
 792 */
 793static inline unsigned long *gmap_table_walk(struct gmap *gmap,
 794                                             unsigned long gaddr, int level)
 795{
 796        const int asce_type = gmap->asce & _ASCE_TYPE_MASK;
 797        unsigned long *table = gmap->table;
 798
 799        if (gmap_is_shadow(gmap) && gmap->removed)
 800                return NULL;
 801
 802        if (WARN_ON_ONCE(level > (asce_type >> 2) + 1))
 803                return NULL;
 804
 805        if (asce_type != _ASCE_TYPE_REGION1 &&
 806            gaddr & (-1UL << (31 + (asce_type >> 2) * 11)))
 807                return NULL;
 808
 809        switch (asce_type) {
 810        case _ASCE_TYPE_REGION1:
 811                table += (gaddr & _REGION1_INDEX) >> _REGION1_SHIFT;
 812                if (level == 4)
 813                        break;
 814                if (*table & _REGION_ENTRY_INVALID)
 815                        return NULL;
 816                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 817                fallthrough;
 818        case _ASCE_TYPE_REGION2:
 819                table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT;
 820                if (level == 3)
 821                        break;
 822                if (*table & _REGION_ENTRY_INVALID)
 823                        return NULL;
 824                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 825                fallthrough;
 826        case _ASCE_TYPE_REGION3:
 827                table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT;
 828                if (level == 2)
 829                        break;
 830                if (*table & _REGION_ENTRY_INVALID)
 831                        return NULL;
 832                table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
 833                fallthrough;
 834        case _ASCE_TYPE_SEGMENT:
 835                table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
 836                if (level == 1)
 837                        break;
 838                if (*table & _REGION_ENTRY_INVALID)
 839                        return NULL;
 840                table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
 841                table += (gaddr & _PAGE_INDEX) >> _PAGE_SHIFT;
 842        }
 843        return table;
 844}
 845
 846/**
 847 * gmap_pte_op_walk - walk the gmap page table, get the page table lock
 848 *                    and return the pte pointer
 849 * @gmap: pointer to guest mapping meta data structure
 850 * @gaddr: virtual address in the guest address space
 851 * @ptl: pointer to the spinlock pointer
 852 *
 853 * Returns a pointer to the locked pte for a guest address, or NULL
 854 */
 855static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
 856                               spinlock_t **ptl)
 857{
 858        unsigned long *table;
 859
 860        BUG_ON(gmap_is_shadow(gmap));
 861        /* Walk the gmap page table, lock and get pte pointer */
 862        table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
 863        if (!table || *table & _SEGMENT_ENTRY_INVALID)
 864                return NULL;
 865        return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
 866}
 867
 868/**
 869 * gmap_pte_op_fixup - force a page in and connect the gmap page table
 870 * @gmap: pointer to guest mapping meta data structure
 871 * @gaddr: virtual address in the guest address space
 872 * @vmaddr: address in the host process address space
 873 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 874 *
 875 * Returns 0 if the caller can retry __gmap_translate (might fail again),
 876 * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
 877 * up or connecting the gmap page table.
 878 */
 879static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
 880                             unsigned long vmaddr, int prot)
 881{
 882        struct mm_struct *mm = gmap->mm;
 883        unsigned int fault_flags;
 884        bool unlocked = false;
 885
 886        BUG_ON(gmap_is_shadow(gmap));
 887        fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
 888        if (fixup_user_fault(mm, vmaddr, fault_flags, &unlocked))
 889                return -EFAULT;
 890        if (unlocked)
 891                /* lost mmap_lock, caller has to retry __gmap_translate */
 892                return 0;
 893        /* Connect the page tables */
 894        return __gmap_link(gmap, gaddr, vmaddr);
 895}
 896
 897/**
 898 * gmap_pte_op_end - release the page table lock
 899 * @ptl: pointer to the spinlock pointer
 900 */
 901static void gmap_pte_op_end(spinlock_t *ptl)
 902{
 903        if (ptl)
 904                spin_unlock(ptl);
 905}
 906
 907/**
 908 * gmap_pmd_op_walk - walk the gmap tables, get the guest table lock
 909 *                    and return the pmd pointer
 910 * @gmap: pointer to guest mapping meta data structure
 911 * @gaddr: virtual address in the guest address space
 912 *
 913 * Returns a pointer to the pmd for a guest address, or NULL
 914 */
 915static inline pmd_t *gmap_pmd_op_walk(struct gmap *gmap, unsigned long gaddr)
 916{
 917        pmd_t *pmdp;
 918
 919        BUG_ON(gmap_is_shadow(gmap));
 920        pmdp = (pmd_t *) gmap_table_walk(gmap, gaddr, 1);
 921        if (!pmdp)
 922                return NULL;
 923
 924        /* without huge pages, there is no need to take the table lock */
 925        if (!gmap->mm->context.allow_gmap_hpage_1m)
 926                return pmd_none(*pmdp) ? NULL : pmdp;
 927
 928        spin_lock(&gmap->guest_table_lock);
 929        if (pmd_none(*pmdp)) {
 930                spin_unlock(&gmap->guest_table_lock);
 931                return NULL;
 932        }
 933
 934        /* 4k page table entries are locked via the pte (pte_alloc_map_lock). */
 935        if (!pmd_large(*pmdp))
 936                spin_unlock(&gmap->guest_table_lock);
 937        return pmdp;
 938}
 939
 940/**
 941 * gmap_pmd_op_end - release the guest_table_lock if needed
 942 * @gmap: pointer to the guest mapping meta data structure
 943 * @pmdp: pointer to the pmd
 944 */
 945static inline void gmap_pmd_op_end(struct gmap *gmap, pmd_t *pmdp)
 946{
 947        if (pmd_large(*pmdp))
 948                spin_unlock(&gmap->guest_table_lock);
 949}
 950
 951/*
 952 * gmap_protect_pmd - remove access rights to memory and set pmd notification bits
 953 * @pmdp: pointer to the pmd to be protected
 954 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
 955 * @bits: notification bits to set
 956 *
 957 * Returns:
 958 * 0 if successfully protected
 959 * -EAGAIN if a fixup is needed
 960 * -EINVAL if unsupported notifier bits have been specified
 961 *
 962 * Expected to be called with sg->mm->mmap_lock in read and
 963 * guest_table_lock held.
 964 */
 965static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
 966                            pmd_t *pmdp, int prot, unsigned long bits)
 967{
 968        int pmd_i = pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID;
 969        int pmd_p = pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT;
 970        pmd_t new = *pmdp;
 971
 972        /* Fixup needed */
 973        if ((pmd_i && (prot != PROT_NONE)) || (pmd_p && (prot == PROT_WRITE)))
 974                return -EAGAIN;
 975
 976        if (prot == PROT_NONE && !pmd_i) {
 977                pmd_val(new) |= _SEGMENT_ENTRY_INVALID;
 978                gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
 979        }
 980
 981        if (prot == PROT_READ && !pmd_p) {
 982                pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID;
 983                pmd_val(new) |= _SEGMENT_ENTRY_PROTECT;
 984                gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
 985        }
 986
 987        if (bits & GMAP_NOTIFY_MPROT)
 988                pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN;
 989
 990        /* Shadow GMAP protection needs split PMDs */
 991        if (bits & GMAP_NOTIFY_SHADOW)
 992                return -EINVAL;
 993
 994        return 0;
 995}
 996
 997/*
 998 * gmap_protect_pte - remove access rights to memory and set pgste bits
 999 * @gmap: pointer to guest mapping meta data structure
1000 * @gaddr: virtual address in the guest address space
1001 * @pmdp: pointer to the pmd associated with the pte
1002 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
1003 * @bits: notification bits to set
1004 *
1005 * Returns 0 if successfully protected, -ENOMEM if out of memory and
1006 * -EAGAIN if a fixup is needed.
1007 *
1008 * Expected to be called with sg->mm->mmap_lock in read
1009 */
1010static int gmap_protect_pte(struct gmap *gmap, unsigned long gaddr,
1011                            pmd_t *pmdp, int prot, unsigned long bits)
1012{
1013        int rc;
1014        pte_t *ptep;
1015        spinlock_t *ptl = NULL;
1016        unsigned long pbits = 0;
1017
1018        if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
1019                return -EAGAIN;
1020
1021        ptep = pte_alloc_map_lock(gmap->mm, pmdp, gaddr, &ptl);
1022        if (!ptep)
1023                return -ENOMEM;
1024
1025        pbits |= (bits & GMAP_NOTIFY_MPROT) ? PGSTE_IN_BIT : 0;
1026        pbits |= (bits & GMAP_NOTIFY_SHADOW) ? PGSTE_VSIE_BIT : 0;
1027        /* Protect and unlock. */
1028        rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, pbits);
1029        gmap_pte_op_end(ptl);
1030        return rc;
1031}
1032
1033/*
1034 * gmap_protect_range - remove access rights to memory and set pgste bits
1035 * @gmap: pointer to guest mapping meta data structure
1036 * @gaddr: virtual address in the guest address space
1037 * @len: size of area
1038 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
1039 * @bits: pgste notification bits to set
1040 *
1041 * Returns 0 if successfully protected, -ENOMEM if out of memory and
1042 * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
1043 *
1044 * Called with sg->mm->mmap_lock in read.
1045 */
1046static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
1047                              unsigned long len, int prot, unsigned long bits)
1048{
1049        unsigned long vmaddr, dist;
1050        pmd_t *pmdp;
1051        int rc;
1052
1053        BUG_ON(gmap_is_shadow(gmap));
1054        while (len) {
1055                rc = -EAGAIN;
1056                pmdp = gmap_pmd_op_walk(gmap, gaddr);
1057                if (pmdp) {
1058                        if (!pmd_large(*pmdp)) {
1059                                rc = gmap_protect_pte(gmap, gaddr, pmdp, prot,
1060                                                      bits);
1061                                if (!rc) {
1062                                        len -= PAGE_SIZE;
1063                                        gaddr += PAGE_SIZE;
1064                                }
1065                        } else {
1066                                rc = gmap_protect_pmd(gmap, gaddr, pmdp, prot,
1067                                                      bits);
1068                                if (!rc) {
1069                                        dist = HPAGE_SIZE - (gaddr & ~HPAGE_MASK);
1070                                        len = len < dist ? 0 : len - dist;
1071                                        gaddr = (gaddr & HPAGE_MASK) + HPAGE_SIZE;
1072                                }
1073                        }
1074                        gmap_pmd_op_end(gmap, pmdp);
1075                }
1076                if (rc) {
1077                        if (rc == -EINVAL)
1078                                return rc;
1079
1080                        /* -EAGAIN, fixup of userspace mm and gmap */
1081                        vmaddr = __gmap_translate(gmap, gaddr);
1082                        if (IS_ERR_VALUE(vmaddr))
1083                                return vmaddr;
1084                        rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
1085                        if (rc)
1086                                return rc;
1087                }
1088        }
1089        return 0;
1090}
1091
1092/**
1093 * gmap_mprotect_notify - change access rights for a range of ptes and
1094 *                        call the notifier if any pte changes again
1095 * @gmap: pointer to guest mapping meta data structure
1096 * @gaddr: virtual address in the guest address space
1097 * @len: size of area
1098 * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
1099 *
1100 * Returns 0 if for each page in the given range a gmap mapping exists,
1101 * the new access rights could be set and the notifier could be armed.
1102 * If the gmap mapping is missing for one or more pages -EFAULT is
1103 * returned. If no memory could be allocated -ENOMEM is returned.
1104 * This function establishes missing page table entries.
1105 */
1106int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
1107                         unsigned long len, int prot)
1108{
1109        int rc;
1110
1111        if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
1112                return -EINVAL;
1113        if (!MACHINE_HAS_ESOP && prot == PROT_READ)
1114                return -EINVAL;
1115        mmap_read_lock(gmap->mm);
1116        rc = gmap_protect_range(gmap, gaddr, len, prot, GMAP_NOTIFY_MPROT);
1117        mmap_read_unlock(gmap->mm);
1118        return rc;
1119}
1120EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
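
/*
 * Sketch: write-protect one guest page and get a callback the next time
 * that pte is changed or invalidated. The callback name is a placeholder:
 *
 *        static void my_gmap_notifier(struct gmap *gmap, unsigned long start,
 *                                     unsigned long end) { ... }
 *
 *        static struct gmap_notifier nb = { .notifier_call = my_gmap_notifier };
 *
 *        gmap_register_pte_notifier(&nb);
 *        rc = gmap_mprotect_notify(g, gaddr & PAGE_MASK, PAGE_SIZE, PROT_READ);
 */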
1121
1122/**
1123 * gmap_read_table - get an unsigned long value from a guest page table using
1124 *                   absolute addressing, without marking the page referenced.
1125 * @gmap: pointer to guest mapping meta data structure
1126 * @gaddr: virtual address in the guest address space
1127 * @val: pointer to the unsigned long value to return
1128 *
1129 * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
1130 * if reading using the virtual address failed. -EINVAL if called on a gmap
1131 * shadow.
1132 *
1133 * Called with gmap->mm->mmap_lock in read.
1134 */
1135int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
1136{
1137        unsigned long address, vmaddr;
1138        spinlock_t *ptl;
1139        pte_t *ptep, pte;
1140        int rc;
1141
1142        if (gmap_is_shadow(gmap))
1143                return -EINVAL;
1144
1145        while (1) {
1146                rc = -EAGAIN;
1147                ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
1148                if (ptep) {
1149                        pte = *ptep;
1150                        if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
1151                                address = pte_val(pte) & PAGE_MASK;
1152                                address += gaddr & ~PAGE_MASK;
1153                                *val = *(unsigned long *) address;
1154                                pte_val(*ptep) |= _PAGE_YOUNG;
1155                                /* Do *NOT* clear the _PAGE_INVALID bit! */
1156                                rc = 0;
1157                        }
1158                        gmap_pte_op_end(ptl);
1159                }
1160                if (!rc)
1161                        break;
1162                vmaddr = __gmap_translate(gmap, gaddr);
1163                if (IS_ERR_VALUE(vmaddr)) {
1164                        rc = vmaddr;
1165                        break;
1166                }
1167                rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
1168                if (rc)
1169                        break;
1170        }
1171        return rc;
1172}
1173EXPORT_SYMBOL_GPL(gmap_read_table);
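
/*
 * Sketch: read one 8-byte word from guest memory, e.g. a guest DAT table
 * entry, without faulting it in writable:
 *
 *        unsigned long entry;
 *
 *        rc = gmap_read_table(g, gaddr & ~7UL, &entry);
 *        if (!rc)
 *                ... entry holds the value at the guest address ...
 */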
1174
1175/**
1176 * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
1177 * @sg: pointer to the shadow guest address space structure
1178 * @vmaddr: vm address associated with the rmap
1179 * @rmap: pointer to the rmap structure
1180 *
1181 * Called with the sg->guest_table_lock
1182 */
1183static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
1184                                    struct gmap_rmap *rmap)
1185{
1186        void __rcu **slot;
1187
1188        BUG_ON(!gmap_is_shadow(sg));
1189        slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
1190        if (slot) {
1191                rmap->next = radix_tree_deref_slot_protected(slot,
1192                                                        &sg->guest_table_lock);
1193                radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
1194        } else {
1195                rmap->next = NULL;
1196                radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
1197                                  rmap);
1198        }
1199}
1200
1201/**
1202 * gmap_protect_rmap - restrict access rights to memory (RO) and create an rmap
1203 * @sg: pointer to the shadow guest address space structure
1204 * @raddr: rmap address in the shadow gmap
1205 * @paddr: address in the parent guest address space
1206 * @len: length of the memory area to protect
1207 *
1208 * Returns 0 if successfully protected and the rmap was created, -ENOMEM
1209 * if out of memory and -EFAULT if paddr is invalid.
1210 */
1211static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
1212                             unsigned long paddr, unsigned long len)
1213{
1214        struct gmap *parent;
1215        struct gmap_rmap *rmap;
1216        unsigned long vmaddr;
1217        spinlock_t *ptl;
1218        pte_t *ptep;
1219        int rc;
1220
1221        BUG_ON(!gmap_is_shadow(sg));
1222        parent = sg->parent;
1223        while (len) {
1224                vmaddr = __gmap_translate(parent, paddr);
1225                if (IS_ERR_VALUE(vmaddr))
1226                        return vmaddr;
1227                rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
1228                if (!rmap)
1229                        return -ENOMEM;
1230                rmap->raddr = raddr;
1231                rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
1232                if (rc) {
1233                        kfree(rmap);
1234                        return rc;
1235                }
1236                rc = -EAGAIN;
1237                ptep = gmap_pte_op_walk(parent, paddr, &ptl);
1238                if (ptep) {
1239                        spin_lock(&sg->guest_table_lock);
1240                        rc = ptep_force_prot(parent->mm, paddr, ptep, PROT_READ,
1241                                             PGSTE_VSIE_BIT);
1242                        if (!rc)
1243                                gmap_insert_rmap(sg, vmaddr, rmap);
1244                        spin_unlock(&sg->guest_table_lock);
1245                        gmap_pte_op_end(ptl);
1246                }
1247                radix_tree_preload_end();
1248                if (rc) {
1249                        kfree(rmap);
1250                        rc = gmap_pte_op_fixup(parent, paddr, vmaddr, PROT_READ);
1251                        if (rc)
1252                                return rc;
1253                        continue;
1254                }
1255                paddr += PAGE_SIZE;
1256                len -= PAGE_SIZE;
1257        }
1258        return 0;
1259}
1260
1261#define _SHADOW_RMAP_MASK       0x7
1262#define _SHADOW_RMAP_REGION1    0x5
1263#define _SHADOW_RMAP_REGION2    0x4
1264#define _SHADOW_RMAP_REGION3    0x3
1265#define _SHADOW_RMAP_SEGMENT    0x2
1266#define _SHADOW_RMAP_PGTABLE    0x1
1267
1268/**
1269 * gmap_idte_one - invalidate a single region or segment table entry
1270 * @asce: region or segment table *origin* + table-type bits
1271 * @vaddr: virtual address to identify the table entry to flush
1272 *
1273 * The invalid bit of a single region or segment table entry is set
1274 * and the associated TLB entries depending on the entry are flushed.
1275 * The table-type of the @asce identifies the portion of the @vaddr
1276 * that is used as the invalidation index.
1277 */
1278static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
1279{
1280        asm volatile(
1281                "       .insn   rrf,0xb98e0000,%0,%1,0,0"
1282                : : "a" (asce), "a" (vaddr) : "cc", "memory");
1283}
1284
1285/**
1286 * gmap_unshadow_page - remove a page from a shadow page table
1287 * @sg: pointer to the shadow guest address space structure
1288 * @raddr: rmap address in the shadow guest address space
1289 *
1290 * Called with the sg->guest_table_lock
1291 */
1292static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
1293{
1294        unsigned long *table;
1295
1296        BUG_ON(!gmap_is_shadow(sg));
1297        table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
1298        if (!table || *table & _PAGE_INVALID)
1299                return;
1300        gmap_call_notifier(sg, raddr, raddr + _PAGE_SIZE - 1);
1301        ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
1302}
1303
1304/**
1305 * __gmap_unshadow_pgt - remove all entries from a shadow page table
1306 * @sg: pointer to the shadow guest address space structure
1307 * @raddr: rmap address in the shadow guest address space
1308 * @pgt: pointer to the start of a shadow page table
1309 *
1310 * Called with the sg->guest_table_lock
1311 */
1312static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
1313                                unsigned long *pgt)
1314{
1315        int i;
1316
1317        BUG_ON(!gmap_is_shadow(sg));
1318        for (i = 0; i < _PAGE_ENTRIES; i++, raddr += _PAGE_SIZE)
1319                pgt[i] = _PAGE_INVALID;
1320}
1321
1322/**
1323 * gmap_unshadow_pgt - remove a shadow page table from a segment entry
1324 * @sg: pointer to the shadow guest address space structure
1325 * @raddr: address in the shadow guest address space
1326 *
1327 * Called with the sg->guest_table_lock
1328 */
1329static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
1330{
1331        unsigned long sto, *ste, *pgt;
1332        struct page *page;
1333
1334        BUG_ON(!gmap_is_shadow(sg));
1335        ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
1336        if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
1337                return;
1338        gmap_call_notifier(sg, raddr, raddr + _SEGMENT_SIZE - 1);
1339        sto = (unsigned long) (ste - ((raddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT));
1340        gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
1341        pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN);
1342        *ste = _SEGMENT_ENTRY_EMPTY;
1343        __gmap_unshadow_pgt(sg, raddr, pgt);
1344        /* Free page table */
1345        page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
1346        list_del(&page->lru);
1347        page_table_free_pgste(page);
1348}
1349
1350/**
1351 * __gmap_unshadow_sgt - remove all entries from a shadow segment table
1352 * @sg: pointer to the shadow guest address space structure
1353 * @raddr: rmap address in the shadow guest address space
1354 * @sgt: pointer to the start of a shadow segment table
1355 *
1356 * Called with the sg->guest_table_lock
1357 */
1358static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
1359                                unsigned long *sgt)
1360{
1361        unsigned long *pgt;
1362        struct page *page;
1363        int i;
1364
1365        BUG_ON(!gmap_is_shadow(sg));
1366        for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) {
1367                if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
1368                        continue;
1369                pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
1370                sgt[i] = _SEGMENT_ENTRY_EMPTY;
1371                __gmap_unshadow_pgt(sg, raddr, pgt);
1372                /* Free page table */
1373                page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
1374                list_del(&page->lru);
1375                page_table_free_pgste(page);
1376        }
1377}
1378
1379/**
1380 * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
1381 * @sg: pointer to the shadow guest address space structure
1382 * @raddr: rmap address in the shadow guest address space
1383 *
1384 * Called with the shadow->guest_table_lock
1385 */
1386static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
1387{
1388        unsigned long r3o, *r3e, *sgt;
1389        struct page *page;
1390
1391        BUG_ON(!gmap_is_shadow(sg));
1392        r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
1393        if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
1394                return;
1395        gmap_call_notifier(sg, raddr, raddr + _REGION3_SIZE - 1);
1396        r3o = (unsigned long) (r3e - ((raddr & _REGION3_INDEX) >> _REGION3_SHIFT));
1397        gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
1398        sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN);
1399        *r3e = _REGION3_ENTRY_EMPTY;
1400        __gmap_unshadow_sgt(sg, raddr, sgt);
1401        /* Free segment table */
1402        page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
1403        list_del(&page->lru);
1404        __free_pages(page, CRST_ALLOC_ORDER);
1405}
1406
1407/**
1408 * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
1409 * @sg: pointer to the shadow guest address space structure
1410 * @raddr: address in the shadow guest address space
1411 * @r3t: pointer to the start of a shadow region-3 table
1412 *
1413 * Called with the sg->guest_table_lock
1414 */
1415static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
1416                                unsigned long *r3t)
1417{
1418        unsigned long *sgt;
1419        struct page *page;
1420        int i;
1421
1422        BUG_ON(!gmap_is_shadow(sg));
1423        for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) {
1424                if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
1425                        continue;
1426                sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
1427                r3t[i] = _REGION3_ENTRY_EMPTY;
1428                __gmap_unshadow_sgt(sg, raddr, sgt);
1429                /* Free segment table */
1430                page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
1431                list_del(&page->lru);
1432                __free_pages(page, CRST_ALLOC_ORDER);
1433        }
1434}
1435
1436/**
1437 * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
1438 * @sg: pointer to the shadow guest address space structure
1439 * @raddr: rmap address in the shadow guest address space
1440 *
1441 * Called with the sg->guest_table_lock
1442 */
1443static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
1444{
1445        unsigned long r2o, *r2e, *r3t;
1446        struct page *page;
1447
1448        BUG_ON(!gmap_is_shadow(sg));
1449        r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
1450        if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
1451                return;
1452        gmap_call_notifier(sg, raddr, raddr + _REGION2_SIZE - 1);
1453        r2o = (unsigned long) (r2e - ((raddr & _REGION2_INDEX) >> _REGION2_SHIFT));
1454        gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
1455        r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN);
1456        *r2e = _REGION2_ENTRY_EMPTY;
1457        __gmap_unshadow_r3t(sg, raddr, r3t);
1458        /* Free region 3 table */
1459        page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
1460        list_del(&page->lru);
1461        __free_pages(page, CRST_ALLOC_ORDER);
1462}
1463
1464/**
1465 * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
1466 * @sg: pointer to the shadow guest address space structure
1467 * @raddr: rmap address in the shadow guest address space
1468 * @r2t: pointer to the start of a shadow region-2 table
1469 *
1470 * Called with the sg->guest_table_lock
1471 */
1472static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
1473                                unsigned long *r2t)
1474{
1475        unsigned long *r3t;
1476        struct page *page;
1477        int i;
1478
1479        BUG_ON(!gmap_is_shadow(sg));
1480        for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) {
1481                if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
1482                        continue;
1483                r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
1484                r2t[i] = _REGION2_ENTRY_EMPTY;
1485                __gmap_unshadow_r3t(sg, raddr, r3t);
1486                /* Free region 3 table */
1487                page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
1488                list_del(&page->lru);
1489                __free_pages(page, CRST_ALLOC_ORDER);
1490        }
1491}
1492
1493/**
1494 * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
1495 * @sg: pointer to the shadow guest address space structure
1496 * @raddr: rmap address in the shadow guest address space
1497 *
1498 * Called with the sg->guest_table_lock
1499 */
1500static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
1501{
1502        unsigned long r1o, *r1e, *r2t;
1503        struct page *page;
1504
1505        BUG_ON(!gmap_is_shadow(sg));
1506        r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
1507        if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
1508                return;
1509        gmap_call_notifier(sg, raddr, raddr + _REGION1_SIZE - 1);
1510        r1o = (unsigned long) (r1e - ((raddr & _REGION1_INDEX) >> _REGION1_SHIFT));
1511        gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
1512        r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN);
1513        *r1e = _REGION1_ENTRY_EMPTY;
1514        __gmap_unshadow_r2t(sg, raddr, r2t);
1515        /* Free region 2 table */
1516        page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
1517        list_del(&page->lru);
1518        __free_pages(page, CRST_ALLOC_ORDER);
1519}
1520
1521/**
1522 * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
1523 * @sg: pointer to the shadow guest address space structure
1524 * @raddr: rmap address in the shadow guest address space
1525 * @r1t: pointer to the start of a shadow region-1 table
1526 *
1527 * Called with the sg->guest_table_lock
1528 */
1529static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
1530                                unsigned long *r1t)
1531{
1532        unsigned long asce, *r2t;
1533        struct page *page;
1534        int i;
1535
1536        BUG_ON(!gmap_is_shadow(sg));
1537        asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
1538        for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION1_SIZE) {
1539                if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
1540                        continue;
1541                r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
1542                __gmap_unshadow_r2t(sg, raddr, r2t);
1543                /* Clear entry and flush translation r1t -> r2t */
1544                gmap_idte_one(asce, raddr);
1545                r1t[i] = _REGION1_ENTRY_EMPTY;
1546                /* Free region 2 table */
1547                page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
1548                list_del(&page->lru);
1549                __free_pages(page, CRST_ALLOC_ORDER);
1550        }
1551}
1552
1553/**
1554 * gmap_unshadow - remove a shadow page table completely
1555 * @sg: pointer to the shadow guest address space structure
1556 *
1557 * Called with sg->guest_table_lock
1558 */
1559static void gmap_unshadow(struct gmap *sg)
1560{
1561        unsigned long *table;
1562
1563        BUG_ON(!gmap_is_shadow(sg));
1564        if (sg->removed)
1565                return;
1566        sg->removed = 1;
1567        gmap_call_notifier(sg, 0, -1UL);
1568        gmap_flush_tlb(sg);
1569        table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
1570        switch (sg->asce & _ASCE_TYPE_MASK) {
1571        case _ASCE_TYPE_REGION1:
1572                __gmap_unshadow_r1t(sg, 0, table);
1573                break;
1574        case _ASCE_TYPE_REGION2:
1575                __gmap_unshadow_r2t(sg, 0, table);
1576                break;
1577        case _ASCE_TYPE_REGION3:
1578                __gmap_unshadow_r3t(sg, 0, table);
1579                break;
1580        case _ASCE_TYPE_SEGMENT:
1581                __gmap_unshadow_sgt(sg, 0, table);
1582                break;
1583        }
1584}
1585
1586/**
1587 * gmap_find_shadow - find a specific asce in the list of shadow tables
1588 * @parent: pointer to the parent gmap
1589 * @asce: ASCE for which the shadow table is created
1590 * @edat_level: edat level to be used for the shadow translation
1591 *
1592 * Returns the pointer to a gmap if a shadow table with the given asce is
1593 * already available, ERR_PTR(-EAGAIN) if another one is just being created,
1594 * otherwise NULL
1595 */
1596static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
1597                                     int edat_level)
1598{
1599        struct gmap *sg;
1600
1601        list_for_each_entry(sg, &parent->children, list) {
1602                if (sg->orig_asce != asce || sg->edat_level != edat_level ||
1603                    sg->removed)
1604                        continue;
1605                if (!sg->initialized)
1606                        return ERR_PTR(-EAGAIN);
1607                refcount_inc(&sg->ref_count);
1608                return sg;
1609        }
1610        return NULL;
1611}
1612
1613/**
1614 * gmap_shadow_valid - check if a shadow guest address space matches the
1615 *                     given properties and is still valid
1616 * @sg: pointer to the shadow guest address space structure
1617 * @asce: ASCE for which the shadow table is requested
1618 * @edat_level: edat level to be used for the shadow translation
1619 *
1620 * Returns 1 if the gmap shadow is still valid and matches the given
1621 * properties; the caller can continue using it. Returns 0 otherwise; the
1622 * caller has to request a new shadow gmap in this case.
1623 *
1624 */
1625int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
1626{
1627        if (sg->removed)
1628                return 0;
1629        return sg->orig_asce == asce && sg->edat_level == edat_level;
1630}
1631EXPORT_SYMBOL_GPL(gmap_shadow_valid);
1632
1633/**
1634 * gmap_shadow - create/find a shadow guest address space
1635 * @parent: pointer to the parent gmap
1636 * @asce: ASCE for which the shadow table is created
1637 * @edat_level: edat level to be used for the shadow translation
1638 *
1639 * The pages of the top level page table referred to by the asce parameter
1640 * will be set to read-only and marked in the PGSTEs of the kvm process.
1641 * The shadow table will be removed automatically on any change to the
1642 * PTE mapping for the source table.
1643 *
1644 * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
1645 * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
1646 * parent gmap table could not be protected.
1647 */
1648struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
1649                         int edat_level)
1650{
1651        struct gmap *sg, *new;
1652        unsigned long limit;
1653        int rc;
1654
1655        BUG_ON(parent->mm->context.allow_gmap_hpage_1m);
1656        BUG_ON(gmap_is_shadow(parent));
1657        spin_lock(&parent->shadow_lock);
1658        sg = gmap_find_shadow(parent, asce, edat_level);
1659        spin_unlock(&parent->shadow_lock);
1660        if (sg)
1661                return sg;
1662        /* Create a new shadow gmap */
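        /*
         * The expression below derives the maximum address the new shadow
         * can cover from the ASCE type: (asce & _ASCE_TYPE_MASK) >> 2 yields
         * 0..3 for segment, region-3, region-2 and region-1 table types, and
         * each additional table level contributes 11 address bits, giving
         * limits of 2G-1, 4T-1, 8P-1 and -1UL respectively.
         */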
1663        limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
1664        if (asce & _ASCE_REAL_SPACE)
1665                limit = -1UL;
1666        new = gmap_alloc(limit);
1667        if (!new)
1668                return ERR_PTR(-ENOMEM);
1669        new->mm = parent->mm;
1670        new->parent = gmap_get(parent);
1671        new->orig_asce = asce;
1672        new->edat_level = edat_level;
1673        new->initialized = false;
1674        spin_lock(&parent->shadow_lock);
1675        /* Recheck if another CPU created the same shadow */
1676        sg = gmap_find_shadow(parent, asce, edat_level);
1677        if (sg) {
1678                spin_unlock(&parent->shadow_lock);
1679                gmap_free(new);
1680                return sg;
1681        }
1682        if (asce & _ASCE_REAL_SPACE) {
1683                /* only allow one real-space gmap shadow */
1684                list_for_each_entry(sg, &parent->children, list) {
1685                        if (sg->orig_asce & _ASCE_REAL_SPACE) {
1686                                spin_lock(&sg->guest_table_lock);
1687                                gmap_unshadow(sg);
1688                                spin_unlock(&sg->guest_table_lock);
1689                                list_del(&sg->list);
1690                                gmap_put(sg);
1691                                break;
1692                        }
1693                }
1694        }
1695        refcount_set(&new->ref_count, 2);
1696        list_add(&new->list, &parent->children);
1697        if (asce & _ASCE_REAL_SPACE) {
1698                /* nothing to protect, return right away */
1699                new->initialized = true;
1700                spin_unlock(&parent->shadow_lock);
1701                return new;
1702        }
1703        spin_unlock(&parent->shadow_lock);
1704        /* protect after insertion, so it will get properly invalidated */
1705        mmap_read_lock(parent->mm);
1706        rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
1707                                ((asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE,
1708                                PROT_READ, GMAP_NOTIFY_SHADOW);
1709        mmap_read_unlock(parent->mm);
1710        spin_lock(&parent->shadow_lock);
1711        new->initialized = true;
1712        if (rc) {
1713                list_del(&new->list);
1714                gmap_free(new);
1715                new = ERR_PTR(rc);
1716        }
1717        spin_unlock(&parent->shadow_lock);
1718        return new;
1719}
1720EXPORT_SYMBOL_GPL(gmap_shadow);
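
/*
 * Illustrative sketch, not part of the original source: one way a nested
 * virtualization caller could combine gmap_shadow_valid() and gmap_shadow().
 * The function name and the reuse policy are assumptions made only for this
 * example; reference handling of a replaced shadow is left to the caller.
 */
static inline struct gmap *example_get_or_create_shadow(struct gmap *parent,
                                                         struct gmap *old_sg,
                                                         unsigned long asce,
                                                         int edat_level)
{
        /* Reuse a previously obtained shadow if it still matches the ASCE. */
        if (old_sg && gmap_shadow_valid(old_sg, asce, edat_level))
                return old_sg;
        /*
         * Otherwise find or create one; the result may also be
         * ERR_PTR(-EAGAIN), ERR_PTR(-ENOMEM) or ERR_PTR(-EFAULT).
         */
        return gmap_shadow(parent, asce, edat_level);
}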
1721
1722/**
1723 * gmap_shadow_r2t - create an empty shadow region 2 table
1724 * @sg: pointer to the shadow guest address space structure
1725 * @saddr: faulting address in the shadow gmap
1726 * @r2t: parent gmap address of the region 2 table to get shadowed
1727 * @fake: r2t references contiguous guest memory block, not an r2t
1728 *
1729 * The r2t parameter specifies the address of the source table. The
1730 * four pages of the source table are made read-only in the parent gmap
1731 * address space. A write to the source table area @r2t will automatically
1732 * remove the shadow r2 table and all of its descendants.
1733 *
1734 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1735 * shadow table structure is incomplete, -ENOMEM if out of memory and
1736 * -EFAULT if an address in the parent gmap could not be resolved.
1737 *
1738 * Called with sg->mm->mmap_lock in read.
1739 */
1740int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
1741                    int fake)
1742{
1743        unsigned long raddr, origin, offset, len;
1744        unsigned long *s_r2t, *table;
1745        struct page *page;
1746        int rc;
1747
1748        BUG_ON(!gmap_is_shadow(sg));
1749        /* Allocate a shadow region second table */
1750        page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
1751        if (!page)
1752                return -ENOMEM;
1753        page->index = r2t & _REGION_ENTRY_ORIGIN;
1754        if (fake)
1755                page->index |= GMAP_SHADOW_FAKE_TABLE;
1756        s_r2t = (unsigned long *) page_to_phys(page);
1757        /* Install shadow region second table */
1758        spin_lock(&sg->guest_table_lock);
1759        table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
1760        if (!table) {
1761                rc = -EAGAIN;           /* Race with unshadow */
1762                goto out_free;
1763        }
1764        if (!(*table & _REGION_ENTRY_INVALID)) {
1765                rc = 0;                 /* Already established */
1766                goto out_free;
1767        } else if (*table & _REGION_ENTRY_ORIGIN) {
1768                rc = -EAGAIN;           /* Race with shadow */
1769                goto out_free;
1770        }
1771        crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
1772        /* mark as invalid as long as the parent table is not protected */
1773        *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
1774                 _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
1775        if (sg->edat_level >= 1)
1776                *table |= (r2t & _REGION_ENTRY_PROTECT);
1777        list_add(&page->lru, &sg->crst_list);
1778        if (fake) {
1779                /* nothing to protect for fake tables */
1780                *table &= ~_REGION_ENTRY_INVALID;
1781                spin_unlock(&sg->guest_table_lock);
1782                return 0;
1783        }
1784        spin_unlock(&sg->guest_table_lock);
1785        /* Make r2t read-only in parent gmap page table */
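        /*
         * A region table occupies up to four consecutive 4K pages; the table
         * offset and table length bits of the parent's entry describe which
         * part of the source table is valid, so only that range has to be
         * made read-only here.
         */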
1786        raddr = (saddr & _REGION1_MASK) | _SHADOW_RMAP_REGION1;
1787        origin = r2t & _REGION_ENTRY_ORIGIN;
1788        offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1789        len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1790        rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1791        spin_lock(&sg->guest_table_lock);
1792        if (!rc) {
1793                table = gmap_table_walk(sg, saddr, 4);
1794                if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
1795                              (unsigned long) s_r2t)
1796                        rc = -EAGAIN;           /* Race with unshadow */
1797                else
1798                        *table &= ~_REGION_ENTRY_INVALID;
1799        } else {
1800                gmap_unshadow_r2t(sg, raddr);
1801        }
1802        spin_unlock(&sg->guest_table_lock);
1803        return rc;
1804out_free:
1805        spin_unlock(&sg->guest_table_lock);
1806        __free_pages(page, CRST_ALLOC_ORDER);
1807        return rc;
1808}
1809EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
1810
1811/**
1812 * gmap_shadow_r3t - create a shadow region 3 table
1813 * @sg: pointer to the shadow guest address space structure
1814 * @saddr: faulting address in the shadow gmap
1815 * @r3t: parent gmap address of the region 3 table to get shadowed
1816 * @fake: r3t references contiguous guest memory block, not an r3t
1817 *
1818 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
1819 * shadow table structure is incomplete, -ENOMEM if out of memory and
1820 * -EFAULT if an address in the parent gmap could not be resolved.
1821 *
1822 * Called with sg->mm->mmap_lock in read.
1823 */
1824int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
1825                    int fake)
1826{
1827        unsigned long raddr, origin, offset, len;
1828        unsigned long *s_r3t, *table;
1829        struct page *page;
1830        int rc;
1831
1832        BUG_ON(!gmap_is_shadow(sg));
1833        /* Allocate a shadow region third table */
1834        page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
1835        if (!page)
1836                return -ENOMEM;
1837        page->index = r3t & _REGION_ENTRY_ORIGIN;
1838        if (fake)
1839                page->index |= GMAP_SHADOW_FAKE_TABLE;
1840        s_r3t = (unsigned long *) page_to_phys(page);
1841        /* Install shadow region third table */
1842        spin_lock(&sg->guest_table_lock);
1843        table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
1844        if (!table) {
1845                rc = -EAGAIN;           /* Race with unshadow */
1846                goto out_free;
1847        }
1848        if (!(*table & _REGION_ENTRY_INVALID)) {
1849                rc = 0;                 /* Already established */
1850                goto out_free;
1851        } else if (*table & _REGION_ENTRY_ORIGIN) {
1852                rc = -EAGAIN;           /* Race with shadow */
1853                goto out_free;
1854        }
1855        crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
1856        /* mark as invalid as long as the parent table is not protected */
1857        *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
1858                 _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
1859        if (sg->edat_level >= 1)
1860                *table |= (r3t & _REGION_ENTRY_PROTECT);
1861        list_add(&page->lru, &sg->crst_list);
1862        if (fake) {
1863                /* nothing to protect for fake tables */
1864                *table &= ~_REGION_ENTRY_INVALID;
1865                spin_unlock(&sg->guest_table_lock);
1866                return 0;
1867        }
1868        spin_unlock(&sg->guest_table_lock);
1869        /* Make r3t read-only in parent gmap page table */
1870        raddr = (saddr & _REGION2_MASK) | _SHADOW_RMAP_REGION2;
1871        origin = r3t & _REGION_ENTRY_ORIGIN;
1872        offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1873        len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1874        rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1875        spin_lock(&sg->guest_table_lock);
1876        if (!rc) {
1877                table = gmap_table_walk(sg, saddr, 3);
1878                if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
1879                              (unsigned long) s_r3t)
1880                        rc = -EAGAIN;           /* Race with unshadow */
1881                else
1882                        *table &= ~_REGION_ENTRY_INVALID;
1883        } else {
1884                gmap_unshadow_r3t(sg, raddr);
1885        }
1886        spin_unlock(&sg->guest_table_lock);
1887        return rc;
1888out_free:
1889        spin_unlock(&sg->guest_table_lock);
1890        __free_pages(page, CRST_ALLOC_ORDER);
1891        return rc;
1892}
1893EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
1894
1895/**
1896 * gmap_shadow_sgt - create a shadow segment table
1897 * @sg: pointer to the shadow guest address space structure
1898 * @saddr: faulting address in the shadow gmap
1899 * @sgt: parent gmap address of the segment table to get shadowed
1900 * @fake: sgt references contiguous guest memory block, not an sgt
1901 *
1902 * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
1903 * shadow table structure is incomplete, -ENOMEM if out of memory and
1904 * -EFAULT if an address in the parent gmap could not be resolved.
1905 *
1906 * Called with sg->mm->mmap_lock in read.
1907 */
1908int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
1909                    int fake)
1910{
1911        unsigned long raddr, origin, offset, len;
1912        unsigned long *s_sgt, *table;
1913        struct page *page;
1914        int rc;
1915
1916        BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
1917        /* Allocate a shadow segment table */
1918        page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
1919        if (!page)
1920                return -ENOMEM;
1921        page->index = sgt & _REGION_ENTRY_ORIGIN;
1922        if (fake)
1923                page->index |= GMAP_SHADOW_FAKE_TABLE;
1924        s_sgt = (unsigned long *) page_to_phys(page);
1925        /* Install shadow segment table */
1926        spin_lock(&sg->guest_table_lock);
1927        table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
1928        if (!table) {
1929                rc = -EAGAIN;           /* Race with unshadow */
1930                goto out_free;
1931        }
1932        if (!(*table & _REGION_ENTRY_INVALID)) {
1933                rc = 0;                 /* Already established */
1934                goto out_free;
1935        } else if (*table & _REGION_ENTRY_ORIGIN) {
1936                rc = -EAGAIN;           /* Race with shadow */
1937                goto out_free;
1938        }
1939        crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
1940        /* mark as invalid as long as the parent table is not protected */
1941        *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
1942                 _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
1943        if (sg->edat_level >= 1)
1944                *table |= sgt & _REGION_ENTRY_PROTECT;
1945        list_add(&page->lru, &sg->crst_list);
1946        if (fake) {
1947                /* nothing to protect for fake tables */
1948                *table &= ~_REGION_ENTRY_INVALID;
1949                spin_unlock(&sg->guest_table_lock);
1950                return 0;
1951        }
1952        spin_unlock(&sg->guest_table_lock);
1953        /* Make sgt read-only in parent gmap page table */
1954        raddr = (saddr & _REGION3_MASK) | _SHADOW_RMAP_REGION3;
1955        origin = sgt & _REGION_ENTRY_ORIGIN;
1956        offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * PAGE_SIZE;
1957        len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * PAGE_SIZE - offset;
1958        rc = gmap_protect_rmap(sg, raddr, origin + offset, len);
1959        spin_lock(&sg->guest_table_lock);
1960        if (!rc) {
1961                table = gmap_table_walk(sg, saddr, 2);
1962                if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
1963                              (unsigned long) s_sgt)
1964                        rc = -EAGAIN;           /* Race with unshadow */
1965                else
1966                        *table &= ~_REGION_ENTRY_INVALID;
1967        } else {
1968                gmap_unshadow_sgt(sg, raddr);
1969        }
1970        spin_unlock(&sg->guest_table_lock);
1971        return rc;
1972out_free:
1973        spin_unlock(&sg->guest_table_lock);
1974        __free_pages(page, CRST_ALLOC_ORDER);
1975        return rc;
1976}
1977EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
1978
1979/**
1980 * gmap_shadow_pgt_lookup - find a shadow page table
1981 * @sg: pointer to the shadow guest address space structure
1982 * @saddr: the address in the shadow guest address space
1983 * @pgt: parent gmap address of the page table to get shadowed
1984 * @dat_protection: if the pgtable is marked as protected by dat
1985 * @fake: pgt references contiguous guest memory block, not a pgtable
1986 *
1987 * Returns 0 if the shadow page table was found and -EAGAIN if the page
1988 * table was not found.
1989 *
1990 * Called with sg->mm->mmap_lock in read.
1991 */
1992int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
1993                           unsigned long *pgt, int *dat_protection,
1994                           int *fake)
1995{
1996        unsigned long *table;
1997        struct page *page;
1998        int rc;
1999
2000        BUG_ON(!gmap_is_shadow(sg));
2001        spin_lock(&sg->guest_table_lock);
2002        table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
2003        if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
2004                /* Shadow page tables are full pages (pte+pgste) */
2005                page = pfn_to_page(*table >> PAGE_SHIFT);
2006                *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
2007                *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
2008                *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
2009                rc = 0;
2010        } else {
2011                rc = -EAGAIN;
2012        }
2013        spin_unlock(&sg->guest_table_lock);
2014        return rc;
2015
2016}
2017EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
2018
2019/**
2020 * gmap_shadow_pgt - instantiate a shadow page table
2021 * @sg: pointer to the shadow guest address space structure
2022 * @saddr: faulting address in the shadow gmap
2023 * @pgt: parent gmap address of the page table to get shadowed
2024 * @fake: pgt references contiguous guest memory block, not a pgtable
2025 *
2026 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
2027 * shadow table structure is incomplete, -ENOMEM if out of memory and
2028 * -EFAULT if an address in the parent gmap could not be resolved.
2029 *
2030 * Called with sg->mm->mmap_lock in read.
2031 */
2032int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
2033                    int fake)
2034{
2035        unsigned long raddr, origin;
2036        unsigned long *s_pgt, *table;
2037        struct page *page;
2038        int rc;
2039
2040        BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
2041        /* Allocate a shadow page table */
2042        page = page_table_alloc_pgste(sg->mm);
2043        if (!page)
2044                return -ENOMEM;
2045        page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
2046        if (fake)
2047                page->index |= GMAP_SHADOW_FAKE_TABLE;
2048        s_pgt = (unsigned long *) page_to_phys(page);
2049        /* Install shadow page table */
2050        spin_lock(&sg->guest_table_lock);
2051        table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
2052        if (!table) {
2053                rc = -EAGAIN;           /* Race with unshadow */
2054                goto out_free;
2055        }
2056        if (!(*table & _SEGMENT_ENTRY_INVALID)) {
2057                rc = 0;                 /* Already established */
2058                goto out_free;
2059        } else if (*table & _SEGMENT_ENTRY_ORIGIN) {
2060                rc = -EAGAIN;           /* Race with shadow */
2061                goto out_free;
2062        }
2063        /* mark as invalid as long as the parent table is not protected */
2064        *table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
2065                 (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
2066        list_add(&page->lru, &sg->pt_list);
2067        if (fake) {
2068                /* nothing to protect for fake tables */
2069                *table &= ~_SEGMENT_ENTRY_INVALID;
2070                spin_unlock(&sg->guest_table_lock);
2071                return 0;
2072        }
2073        spin_unlock(&sg->guest_table_lock);
2074        /* Make pgt read-only in parent gmap page table (not the pgste) */
2075        raddr = (saddr & _SEGMENT_MASK) | _SHADOW_RMAP_SEGMENT;
2076        origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
2077        rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE);
2078        spin_lock(&sg->guest_table_lock);
2079        if (!rc) {
2080                table = gmap_table_walk(sg, saddr, 1);
2081                if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) !=
2082                              (unsigned long) s_pgt)
2083                        rc = -EAGAIN;           /* Race with unshadow */
2084                else
2085                        *table &= ~_SEGMENT_ENTRY_INVALID;
2086        } else {
2087                gmap_unshadow_pgt(sg, raddr);
2088        }
2089        spin_unlock(&sg->guest_table_lock);
2090        return rc;
2091out_free:
2092        spin_unlock(&sg->guest_table_lock);
2093        page_table_free_pgste(page);
2094        return rc;
2095
2096}
2097EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
2098
2099/**
2100 * gmap_shadow_page - create a shadow page mapping
2101 * @sg: pointer to the shadow guest address space structure
2102 * @saddr: faulting address in the shadow gmap
2103 * @pte: pte in parent gmap address space to get shadowed
2104 *
2105 * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
2106 * shadow table structure is incomplete, -ENOMEM if out of memory and
2107 * -EFAULT if an address in the parent gmap could not be resolved.
2108 *
2109 * Called with sg->mm->mmap_lock in read.
2110 */
2111int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
2112{
2113        struct gmap *parent;
2114        struct gmap_rmap *rmap;
2115        unsigned long vmaddr, paddr;
2116        spinlock_t *ptl;
2117        pte_t *sptep, *tptep;
2118        int prot;
2119        int rc;
2120
2121        BUG_ON(!gmap_is_shadow(sg));
2122        parent = sg->parent;
2123        prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
2124
2125        rmap = kzalloc(sizeof(*rmap), GFP_KERNEL_ACCOUNT);
2126        if (!rmap)
2127                return -ENOMEM;
2128        rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
2129
2130        while (1) {
2131                paddr = pte_val(pte) & PAGE_MASK;
2132                vmaddr = __gmap_translate(parent, paddr);
2133                if (IS_ERR_VALUE(vmaddr)) {
2134                        rc = vmaddr;
2135                        break;
2136                }
2137                rc = radix_tree_preload(GFP_KERNEL_ACCOUNT);
2138                if (rc)
2139                        break;
2140                rc = -EAGAIN;
2141                sptep = gmap_pte_op_walk(parent, paddr, &ptl);
2142                if (sptep) {
2143                        spin_lock(&sg->guest_table_lock);
2144                        /* Get page table pointer */
2145                        tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
2146                        if (!tptep) {
2147                                spin_unlock(&sg->guest_table_lock);
2148                                gmap_pte_op_end(ptl);
2149                                radix_tree_preload_end();
2150                                break;
2151                        }
2152                        rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
2153                        if (rc > 0) {
2154                                /* Success and a new mapping */
2155                                gmap_insert_rmap(sg, vmaddr, rmap);
2156                                rmap = NULL;
2157                                rc = 0;
2158                        }
2159                        gmap_pte_op_end(ptl);
2160                        spin_unlock(&sg->guest_table_lock);
2161                }
2162                radix_tree_preload_end();
2163                if (!rc)
2164                        break;
2165                rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
2166                if (rc)
2167                        break;
2168        }
2169        kfree(rmap);
2170        return rc;
2171}
2172EXPORT_SYMBOL_GPL(gmap_shadow_page);
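
/*
 * Note added for clarity: the gmap_shadow_*() functions above build a shadow
 * translation one table level at a time. A caller resolving a fault is
 * expected to walk from the top level table down to gmap_shadow_page() and
 * to restart the walk whenever a level returns -EAGAIN, because a concurrent
 * invalidation may have removed tables that were already shadowed.
 */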
2173
2174/*
2175 * gmap_shadow_notify - handle notifications for shadow gmap
2176 *
2177 * Called with sg->parent->shadow_lock.
2178 */
2179static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
2180                               unsigned long gaddr)
2181{
2182        struct gmap_rmap *rmap, *rnext, *head;
2183        unsigned long start, end, bits, raddr;
2184
2185        BUG_ON(!gmap_is_shadow(sg));
2186
2187        spin_lock(&sg->guest_table_lock);
2188        if (sg->removed) {
2189                spin_unlock(&sg->guest_table_lock);
2190                return;
2191        }
2192        /* Check for top level table */
2193        start = sg->orig_asce & _ASCE_ORIGIN;
2194        end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * PAGE_SIZE;
2195        if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
2196            gaddr < end) {
2197                /* The complete shadow table has to go */
2198                gmap_unshadow(sg);
2199                spin_unlock(&sg->guest_table_lock);
2200                list_del(&sg->list);
2201                gmap_put(sg);
2202                return;
2203        }
2204        /* Remove the page table tree from one specific entry */
2205        head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
2206        gmap_for_each_rmap_safe(rmap, rnext, head) {
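                /*
                 * The low bits of rmap->raddr encode at which table level the
                 * rmap was registered (_SHADOW_RMAP_*); strip them to recover
                 * the shadow address and unshadow the matching level.
                 */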
2207                bits = rmap->raddr & _SHADOW_RMAP_MASK;
2208                raddr = rmap->raddr ^ bits;
2209                switch (bits) {
2210                case _SHADOW_RMAP_REGION1:
2211                        gmap_unshadow_r2t(sg, raddr);
2212                        break;
2213                case _SHADOW_RMAP_REGION2:
2214                        gmap_unshadow_r3t(sg, raddr);
2215                        break;
2216                case _SHADOW_RMAP_REGION3:
2217                        gmap_unshadow_sgt(sg, raddr);
2218                        break;
2219                case _SHADOW_RMAP_SEGMENT:
2220                        gmap_unshadow_pgt(sg, raddr);
2221                        break;
2222                case _SHADOW_RMAP_PGTABLE:
2223                        gmap_unshadow_page(sg, raddr);
2224                        break;
2225                }
2226                kfree(rmap);
2227        }
2228        spin_unlock(&sg->guest_table_lock);
2229}
2230
2231/**
2232 * ptep_notify - call all invalidation callbacks for a specific pte.
2233 * @mm: pointer to the process mm_struct
2234 * @vmaddr: virtual address in the process address space
2235 * @pte: pointer to the page table entry
2236 * @bits: bits from the pgste that caused the notify call
2237 *
2238 * This function is assumed to be called with the page table lock held
2239 * for the pte to notify.
2240 */
2241void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
2242                 pte_t *pte, unsigned long bits)
2243{
2244        unsigned long offset, gaddr = 0;
2245        unsigned long *table;
2246        struct gmap *gmap, *sg, *next;
2247
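        /*
         * Derive the offset of the notified page within its 1 MB segment:
         * the low bits of the pte pointer are the index into the 256-entry
         * page table, and each entry maps one 4 KB page.
         */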
2248        offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
2249        offset = offset * (PAGE_SIZE / sizeof(pte_t));
2250        rcu_read_lock();
2251        list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2252                spin_lock(&gmap->guest_table_lock);
2253                table = radix_tree_lookup(&gmap->host_to_guest,
2254                                          vmaddr >> PMD_SHIFT);
2255                if (table)
2256                        gaddr = __gmap_segment_gaddr(table) + offset;
2257                spin_unlock(&gmap->guest_table_lock);
2258                if (!table)
2259                        continue;
2260
2261                if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
2262                        spin_lock(&gmap->shadow_lock);
2263                        list_for_each_entry_safe(sg, next,
2264                                                 &gmap->children, list)
2265                                gmap_shadow_notify(sg, vmaddr, gaddr);
2266                        spin_unlock(&gmap->shadow_lock);
2267                }
2268                if (bits & PGSTE_IN_BIT)
2269                        gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
2270        }
2271        rcu_read_unlock();
2272}
2273EXPORT_SYMBOL_GPL(ptep_notify);
2274
2275static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
2276                             unsigned long gaddr)
2277{
2278        pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_IN;
2279        gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1);
2280}
2281
2282/**
2283 * gmap_pmdp_xchg - exchange a gmap pmd with another
2284 * @gmap: pointer to the guest address space structure
2285 * @pmdp: pointer to the pmd entry
2286 * @new: replacement entry
2287 * @gaddr: the affected guest address
2288 *
2289 * This function is assumed to be called with the guest_table_lock
2290 * held.
2291 */
2292static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
2293                           unsigned long gaddr)
2294{
2295        gaddr &= HPAGE_MASK;
2296        pmdp_notify_gmap(gmap, pmdp, gaddr);
2297        pmd_val(new) &= ~_SEGMENT_ENTRY_GMAP_IN;
2298        if (MACHINE_HAS_TLB_GUEST)
2299                __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
2300                            IDTE_GLOBAL);
2301        else if (MACHINE_HAS_IDTE)
2302                __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
2303        else
2304                __pmdp_csp(pmdp);
2305        *pmdp = new;
2306}
2307
2308static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
2309                            int purge)
2310{
2311        pmd_t *pmdp;
2312        struct gmap *gmap;
2313        unsigned long gaddr;
2314
2315        rcu_read_lock();
2316        list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2317                spin_lock(&gmap->guest_table_lock);
2318                pmdp = (pmd_t *)radix_tree_delete(&gmap->host_to_guest,
2319                                                  vmaddr >> PMD_SHIFT);
2320                if (pmdp) {
2321                        gaddr = __gmap_segment_gaddr((unsigned long *)pmdp);
2322                        pmdp_notify_gmap(gmap, pmdp, gaddr);
2323                        WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2324                                                   _SEGMENT_ENTRY_GMAP_UC));
2325                        if (purge)
2326                                __pmdp_csp(pmdp);
2327                        pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY;
2328                }
2329                spin_unlock(&gmap->guest_table_lock);
2330        }
2331        rcu_read_unlock();
2332}
2333
2334/**
2335 * gmap_pmdp_invalidate - invalidate all affected guest pmd entries without
2336 *                        flushing
2337 * @mm: pointer to the process mm_struct
2338 * @vmaddr: virtual address in the process address space
2339 */
2340void gmap_pmdp_invalidate(struct mm_struct *mm, unsigned long vmaddr)
2341{
2342        gmap_pmdp_clear(mm, vmaddr, 0);
2343}
2344EXPORT_SYMBOL_GPL(gmap_pmdp_invalidate);
2345
2346/**
2347 * gmap_pmdp_csp - csp all affected guest pmd entries
2348 * @mm: pointer to the process mm_struct
2349 * @vmaddr: virtual address in the process address space
2350 */
2351void gmap_pmdp_csp(struct mm_struct *mm, unsigned long vmaddr)
2352{
2353        gmap_pmdp_clear(mm, vmaddr, 1);
2354}
2355EXPORT_SYMBOL_GPL(gmap_pmdp_csp);
2356
2357/**
2358 * gmap_pmdp_idte_local - invalidate and clear a guest pmd entry
2359 * @mm: pointer to the process mm_struct
2360 * @vmaddr: virtual address in the process address space
2361 */
2362void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr)
2363{
2364        unsigned long *entry, gaddr;
2365        struct gmap *gmap;
2366        pmd_t *pmdp;
2367
2368        rcu_read_lock();
2369        list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2370                spin_lock(&gmap->guest_table_lock);
2371                entry = radix_tree_delete(&gmap->host_to_guest,
2372                                          vmaddr >> PMD_SHIFT);
2373                if (entry) {
2374                        pmdp = (pmd_t *)entry;
2375                        gaddr = __gmap_segment_gaddr(entry);
2376                        pmdp_notify_gmap(gmap, pmdp, gaddr);
2377                        WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2378                                           _SEGMENT_ENTRY_GMAP_UC));
2379                        if (MACHINE_HAS_TLB_GUEST)
2380                                __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
2381                                            gmap->asce, IDTE_LOCAL);
2382                        else if (MACHINE_HAS_IDTE)
2383                                __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_LOCAL);
2384                        *entry = _SEGMENT_ENTRY_EMPTY;
2385                }
2386                spin_unlock(&gmap->guest_table_lock);
2387        }
2388        rcu_read_unlock();
2389}
2390EXPORT_SYMBOL_GPL(gmap_pmdp_idte_local);
2391
2392/**
2393 * gmap_pmdp_idte_global - invalidate and clear a guest pmd entry
2394 * @mm: pointer to the process mm_struct
2395 * @vmaddr: virtual address in the process address space
2396 */
2397void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr)
2398{
2399        unsigned long *entry, gaddr;
2400        struct gmap *gmap;
2401        pmd_t *pmdp;
2402
2403        rcu_read_lock();
2404        list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
2405                spin_lock(&gmap->guest_table_lock);
2406                entry = radix_tree_delete(&gmap->host_to_guest,
2407                                          vmaddr >> PMD_SHIFT);
2408                if (entry) {
2409                        pmdp = (pmd_t *)entry;
2410                        gaddr = __gmap_segment_gaddr(entry);
2411                        pmdp_notify_gmap(gmap, pmdp, gaddr);
2412                        WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE |
2413                                           _SEGMENT_ENTRY_GMAP_UC));
2414                        if (MACHINE_HAS_TLB_GUEST)
2415                                __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE,
2416                                            gmap->asce, IDTE_GLOBAL);
2417                        else if (MACHINE_HAS_IDTE)
2418                                __pmdp_idte(gaddr, pmdp, 0, 0, IDTE_GLOBAL);
2419                        else
2420                                __pmdp_csp(pmdp);
2421                        *entry = _SEGMENT_ENTRY_EMPTY;
2422                }
2423                spin_unlock(&gmap->guest_table_lock);
2424        }
2425        rcu_read_unlock();
2426}
2427EXPORT_SYMBOL_GPL(gmap_pmdp_idte_global);
2428
2429/**
2430 * gmap_test_and_clear_dirty_pmd - test and reset segment dirty status
2431 * @gmap: pointer to guest address space
2432 * @pmdp: pointer to the pmd to be tested
2433 * @gaddr: virtual address in the guest address space
2434 *
2435 * This function is assumed to be called with the guest_table_lock
2436 * held.
2437 */
2438static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
2439                                          unsigned long gaddr)
2440{
2441        if (pmd_val(*pmdp) & _SEGMENT_ENTRY_INVALID)
2442                return false;
2443
2444        /* Already protected memory, which did not change, is clean */
2445        if (pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT &&
2446            !(pmd_val(*pmdp) & _SEGMENT_ENTRY_GMAP_UC))
2447                return false;
2448
2449        /* Clear UC indication and reset protection */
2450        pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC;
2451        gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);
2452        return true;
2453}
2454
2455/**
2456 * gmap_sync_dirty_log_pmd - set bitmap based on dirty status of segment
2457 * @gmap: pointer to guest address space
2458 * @bitmap: dirty bitmap for this pmd
2459 * @gaddr: virtual address in the guest address space
2460 * @vmaddr: virtual address in the host address space
2461 *
2462 * This function is assumed to be called with the guest_table_lock
2463 * held.
2464 */
2465void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long bitmap[4],
2466                             unsigned long gaddr, unsigned long vmaddr)
2467{
2468        int i;
2469        pmd_t *pmdp;
2470        pte_t *ptep;
2471        spinlock_t *ptl;
2472
2473        pmdp = gmap_pmd_op_walk(gmap, gaddr);
2474        if (!pmdp)
2475                return;
2476
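        /*
         * One bit in @bitmap per 4 KB page of the segment: a dirty large
         * mapping sets all _PAGE_ENTRIES bits at once, otherwise every pte
         * is tested and cleared individually.
         */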
2477        if (pmd_large(*pmdp)) {
2478                if (gmap_test_and_clear_dirty_pmd(gmap, pmdp, gaddr))
2479                        bitmap_fill(bitmap, _PAGE_ENTRIES);
2480        } else {
2481                for (i = 0; i < _PAGE_ENTRIES; i++, vmaddr += PAGE_SIZE) {
2482                        ptep = pte_alloc_map_lock(gmap->mm, pmdp, vmaddr, &ptl);
2483                        if (!ptep)
2484                                continue;
2485                        if (ptep_test_and_clear_uc(gmap->mm, vmaddr, ptep))
2486                                set_bit(i, bitmap);
2487                        spin_unlock(ptl);
2488                }
2489        }
2490        gmap_pmd_op_end(gmap, pmdp);
2491}
2492EXPORT_SYMBOL_GPL(gmap_sync_dirty_log_pmd);
2493
2494#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2495static int thp_split_walk_pmd_entry(pmd_t *pmd, unsigned long addr,
2496                                    unsigned long end, struct mm_walk *walk)
2497{
2498        struct vm_area_struct *vma = walk->vma;
2499
2500        split_huge_pmd(vma, pmd, addr);
2501        return 0;
2502}
2503
2504static const struct mm_walk_ops thp_split_walk_ops = {
2505        .pmd_entry      = thp_split_walk_pmd_entry,
2506};
2507
2508static inline void thp_split_mm(struct mm_struct *mm)
2509{
2510        struct vm_area_struct *vma;
2511
2512        for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
2513                vma->vm_flags &= ~VM_HUGEPAGE;
2514                vma->vm_flags |= VM_NOHUGEPAGE;
2515                walk_page_vma(vma, &thp_split_walk_ops, NULL);
2516        }
2517        mm->def_flags |= VM_NOHUGEPAGE;
2518}
2519#else
2520static inline void thp_split_mm(struct mm_struct *mm)
2521{
2522}
2523#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2524
2525/*
2526 * Remove all empty zero pages from the mapping for lazy refaulting
2527 * - This must be called after mm->context.has_pgste is set, to avoid
2528 *   future creation of zero pages
2529 * - This must be called after THP was enabled
2530 */
2531static int __zap_zero_pages(pmd_t *pmd, unsigned long start,
2532                           unsigned long end, struct mm_walk *walk)
2533{
2534        unsigned long addr;
2535
2536        for (addr = start; addr != end; addr += PAGE_SIZE) {
2537                pte_t *ptep;
2538                spinlock_t *ptl;
2539
2540                ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
2541                if (is_zero_pfn(pte_pfn(*ptep)))
2542                        ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID));
2543                pte_unmap_unlock(ptep, ptl);
2544        }
2545        return 0;
2546}
2547
2548static const struct mm_walk_ops zap_zero_walk_ops = {
2549        .pmd_entry      = __zap_zero_pages,
2550};
2551
2552/*
2553 * switch on pgstes for the current userspace process (for kvm)
2554 */
2555int s390_enable_sie(void)
2556{
2557        struct mm_struct *mm = current->mm;
2558
2559        /* Do we have pgstes? If yes, we are done */
2560        if (mm_has_pgste(mm))
2561                return 0;
2562        /* Fail if the page tables are 2K */
2563        if (!mm_alloc_pgste(mm))
2564                return -EINVAL;
2565        mmap_write_lock(mm);
2566        mm->context.has_pgste = 1;
2567        /* split thp mappings and disable thp for future mappings */
2568        thp_split_mm(mm);
2569        walk_page_range(mm, 0, TASK_SIZE, &zap_zero_walk_ops, NULL);
2570        mmap_write_unlock(mm);
2571        return 0;
2572}
2573EXPORT_SYMBOL_GPL(s390_enable_sie);
2574
2575int gmap_mark_unmergeable(void)
2576{
2577        struct mm_struct *mm = current->mm;
2578        struct vm_area_struct *vma;
2579        int ret;
2580
2581        for (vma = mm->mmap; vma; vma = vma->vm_next) {
2582                ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
2583                                  MADV_UNMERGEABLE, &vma->vm_flags);
2584                if (ret)
2585                        return ret;
2586        }
2587        mm->def_flags &= ~VM_MERGEABLE;
2588        return 0;
2589}
2590EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
2591
2592/*
2593 * Enable storage key handling from now on and initialize the storage
2594 * keys with the default key.
2595 */
2596static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr,
2597                                  unsigned long next, struct mm_walk *walk)
2598{
2599        /* Clear storage key */
2600        ptep_zap_key(walk->mm, addr, pte);
2601        return 0;
2602}
2603
2604static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
2605                                      unsigned long hmask, unsigned long next,
2606                                      struct mm_walk *walk)
2607{
2608        pmd_t *pmd = (pmd_t *)pte;
2609        unsigned long start, end;
2610        struct page *page = pmd_page(*pmd);
2611
2612        /*
2613         * The write check makes sure we do not set a key on shared
2614         * memory. This is needed as the walker does not differentiate
2615         * between actual guest memory and the process executable or
2616         * shared libraries.
2617         */
2618        if (pmd_val(*pmd) & _SEGMENT_ENTRY_INVALID ||
2619            !(pmd_val(*pmd) & _SEGMENT_ENTRY_WRITE))
2620                return 0;
2621
2622        start = pmd_val(*pmd) & HPAGE_MASK;
2623        end = start + HPAGE_SIZE - 1;
2624        __storage_key_init_range(start, end);
2625        set_bit(PG_arch_1, &page->flags);
2626        return 0;
2627}
2628
2629static const struct mm_walk_ops enable_skey_walk_ops = {
2630        .hugetlb_entry          = __s390_enable_skey_hugetlb,
2631        .pte_entry              = __s390_enable_skey_pte,
2632};
2633
2634int s390_enable_skey(void)
2635{
2636        struct mm_struct *mm = current->mm;
2637        int rc = 0;
2638
2639        mmap_write_lock(mm);
2640        if (mm_uses_skeys(mm))
2641                goto out_up;
2642
2643        mm->context.uses_skeys = 1;
2644        rc = gmap_mark_unmergeable();
2645        if (rc) {
2646                mm->context.uses_skeys = 0;
2647                goto out_up;
2648        }
2649        walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL);
2650
2651out_up:
2652        mmap_write_unlock(mm);
2653        return rc;
2654}
2655EXPORT_SYMBOL_GPL(s390_enable_skey);
2656
2657/*
2658 * Reset CMMA state, make all pages stable again.
2659 */
2660static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
2661                             unsigned long next, struct mm_walk *walk)
2662{
2663        ptep_zap_unused(walk->mm, addr, pte, 1);
2664        return 0;
2665}
2666
2667static const struct mm_walk_ops reset_cmma_walk_ops = {
2668        .pte_entry              = __s390_reset_cmma,
2669};
2670
2671void s390_reset_cmma(struct mm_struct *mm)
2672{
2673        mmap_write_lock(mm);
2674        walk_page_range(mm, 0, TASK_SIZE, &reset_cmma_walk_ops, NULL);
2675        mmap_write_unlock(mm);
2676}
2677EXPORT_SYMBOL_GPL(s390_reset_cmma);
2678
2679/*
2680 * make inaccessible pages accessible again
2681 */
2682static int __s390_reset_acc(pte_t *ptep, unsigned long addr,
2683                            unsigned long next, struct mm_walk *walk)
2684{
2685        pte_t pte = READ_ONCE(*ptep);
2686
2687        /* There is a reference through the mapping */
2688        if (pte_present(pte))
2689                WARN_ON_ONCE(uv_destroy_owned_page(pte_val(pte) & PAGE_MASK));
2690
2691        return 0;
2692}
2693
2694static const struct mm_walk_ops reset_acc_walk_ops = {
2695        .pte_entry              = __s390_reset_acc,
2696};
2697
2698#include <linux/sched/mm.h>
2699void s390_reset_acc(struct mm_struct *mm)
2700{
2701        if (!mm_is_protected(mm))
2702                return;
2703        /*
2704         * we might be called during
2705         * reset:                             we walk the pages and clear
2706         * close of all kvm file descriptors: we walk the pages and clear
2707         * exit of process on fd closure:     vma already gone, do nothing
2708         */
2709        if (!mmget_not_zero(mm))
2710                return;
2711        mmap_read_lock(mm);
2712        walk_page_range(mm, 0, TASK_SIZE, &reset_acc_walk_ops, NULL);
2713        mmap_read_unlock(mm);
2714        mmput(mm);
2715}
2716EXPORT_SYMBOL_GPL(s390_reset_acc);
2717