linux/arch/powerpc/kvm/book3s_64_mmu_radix.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *
   4 * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   5 */
   6
   7#include <linux/types.h>
   8#include <linux/string.h>
   9#include <linux/kvm.h>
  10#include <linux/kvm_host.h>
  11#include <linux/anon_inodes.h>
  12#include <linux/file.h>
  13#include <linux/debugfs.h>
  14#include <linux/pgtable.h>
  15
  16#include <asm/kvm_ppc.h>
  17#include <asm/kvm_book3s.h>
  18#include <asm/page.h>
  19#include <asm/mmu.h>
  20#include <asm/pgalloc.h>
  21#include <asm/pte-walk.h>
  22#include <asm/ultravisor.h>
  23#include <asm/kvm_book3s_uvmem.h>
  24#include <asm/plpar_wrappers.h>
  25
  26/*
  27 * Supported radix tree geometry.
   28 * As on POWER9, we support either 5 or 9 bits at the first (lowest) level,
  29 * for a page size of 64k or 4k.
  30 */
  31static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
  32
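/*
 * Raw copy to/from a guest effective address.  With preemption disabled,
 * the host's LPID/PID registers are temporarily switched to the guest's
 * context so that eaddr can be accessed through quadrant 1 (pid != 0) or
 * quadrant 2 (pid == 0) with an ordinary nofault user copy.  When running
 * as a nested hypervisor on pseries, the H_COPY_TOFROM_GUEST hcall is
 * used instead.
 */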
  33unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
  34                                              gva_t eaddr, void *to, void *from,
  35                                              unsigned long n)
  36{
  37        int old_pid, old_lpid;
  38        unsigned long quadrant, ret = n;
  39        bool is_load = !!to;
  40
  41        /* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
  42        if (kvmhv_on_pseries())
  43                return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
  44                                          (to != NULL) ? __pa(to): 0,
  45                                          (from != NULL) ? __pa(from): 0, n);
  46
  47        quadrant = 1;
  48        if (!pid)
  49                quadrant = 2;
  50        if (is_load)
  51                from = (void *) (eaddr | (quadrant << 62));
  52        else
  53                to = (void *) (eaddr | (quadrant << 62));
  54
  55        preempt_disable();
  56
  57        /* switch the lpid first to avoid running host with unallocated pid */
  58        old_lpid = mfspr(SPRN_LPID);
  59        if (old_lpid != lpid)
  60                mtspr(SPRN_LPID, lpid);
  61        if (quadrant == 1) {
  62                old_pid = mfspr(SPRN_PID);
  63                if (old_pid != pid)
  64                        mtspr(SPRN_PID, pid);
  65        }
  66        isync();
  67
  68        if (is_load)
  69                ret = copy_from_user_nofault(to, (const void __user *)from, n);
  70        else
  71                ret = copy_to_user_nofault((void __user *)to, from, n);
  72
  73        /* switch the pid first to avoid running host with unallocated pid */
  74        if (quadrant == 1 && pid != old_pid)
  75                mtspr(SPRN_PID, old_pid);
  76        if (lpid != old_lpid)
  77                mtspr(SPRN_LPID, old_lpid);
  78        isync();
  79
  80        preempt_enable();
  81
  82        return ret;
  83}
  84EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix);
  85
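/*
 * Wrapper around __kvmhv_copy_tofrom_guest_radix() that supplies the
 * vcpu's lpid and pid (using the nested shadow lpid when the vcpu is in
 * a nested guest), rejects effective addresses that would raise a data
 * segment interrupt, and uses pid 0 for quadrant 3 accesses.
 */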
  86static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
  87                                          void *to, void *from, unsigned long n)
  88{
  89        int lpid = vcpu->kvm->arch.lpid;
  90        int pid = vcpu->arch.pid;
  91
   92        /* This would cause a data segment interrupt, so don't allow the access */
  93        if (eaddr & (0x3FFUL << 52))
  94                return -EINVAL;
  95
   96        /* Should we be using the nested lpid? */
  97        if (vcpu->arch.nested)
  98                lpid = vcpu->arch.nested->shadow_lpid;
  99
 100        /* If accessing quadrant 3 then pid is expected to be 0 */
 101        if (((eaddr >> 62) & 0x3) == 0x3)
 102                pid = 0;
 103
 104        eaddr &= ~(0xFFFUL << 52);
 105
 106        return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
 107}
 108
 109long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
 110                                 unsigned long n)
 111{
 112        long ret;
 113
 114        ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
 115        if (ret > 0)
 116                memset(to + (n - ret), 0, ret);
 117
 118        return ret;
 119}
 120EXPORT_SYMBOL_GPL(kvmhv_copy_from_guest_radix);
 121
 122long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
 123                               unsigned long n)
 124{
 125        return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
 126}
 127EXPORT_SYMBOL_GPL(kvmhv_copy_to_guest_radix);
 128
 129int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
 130                               struct kvmppc_pte *gpte, u64 root,
 131                               u64 *pte_ret_p)
 132{
 133        struct kvm *kvm = vcpu->kvm;
 134        int ret, level, ps;
 135        unsigned long rts, bits, offset, index;
 136        u64 pte, base, gpa;
 137        __be64 rpte;
 138
 139        rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
 140                ((root & RTS2_MASK) >> RTS2_SHIFT);
 141        bits = root & RPDS_MASK;
 142        base = root & RPDB_MASK;
 143
 144        offset = rts + 31;
 145
 146        /* Current implementations only support 52-bit space */
 147        if (offset != 52)
 148                return -EINVAL;
 149
 150        /* Walk each level of the radix tree */
 151        for (level = 3; level >= 0; --level) {
 152                u64 addr;
  153                /* Check that this level has a valid size */
 154                if (level && bits != p9_supported_radix_bits[level])
 155                        return -EINVAL;
 156                if (level == 0 && !(bits == 5 || bits == 9))
 157                        return -EINVAL;
 158                offset -= bits;
 159                index = (eaddr >> offset) & ((1UL << bits) - 1);
 160                /* Check that low bits of page table base are zero */
 161                if (base & ((1UL << (bits + 3)) - 1))
 162                        return -EINVAL;
 163                /* Read the entry from guest memory */
 164                addr = base + (index * sizeof(rpte));
 165                vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 166                ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
 167                srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 168                if (ret) {
 169                        if (pte_ret_p)
 170                                *pte_ret_p = addr;
 171                        return ret;
 172                }
 173                pte = __be64_to_cpu(rpte);
 174                if (!(pte & _PAGE_PRESENT))
 175                        return -ENOENT;
 176                /* Check if a leaf entry */
 177                if (pte & _PAGE_PTE)
 178                        break;
 179                /* Get ready to walk the next level */
 180                base = pte & RPDB_MASK;
 181                bits = pte & RPDS_MASK;
 182        }
 183
 184        /* Need a leaf at lowest level; 512GB pages not supported */
 185        if (level < 0 || level == 3)
 186                return -EINVAL;
 187
 188        /* We found a valid leaf PTE */
 189        /* Offset is now log base 2 of the page size */
 190        gpa = pte & 0x01fffffffffff000ul;
 191        if (gpa & ((1ul << offset) - 1))
 192                return -EINVAL;
 193        gpa |= eaddr & ((1ul << offset) - 1);
 194        for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
 195                if (offset == mmu_psize_defs[ps].shift)
 196                        break;
 197        gpte->page_size = ps;
 198        gpte->page_shift = offset;
 199
 200        gpte->eaddr = eaddr;
 201        gpte->raddr = gpa;
 202
 203        /* Work out permissions */
 204        gpte->may_read = !!(pte & _PAGE_READ);
 205        gpte->may_write = !!(pte & _PAGE_WRITE);
 206        gpte->may_execute = !!(pte & _PAGE_EXEC);
 207
 208        gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
 209
 210        if (pte_ret_p)
 211                *pte_ret_p = pte;
 212
 213        return 0;
 214}
 215
 216/*
 217 * Used to walk a partition or process table radix tree in guest memory
 218 * Note: We exploit the fact that a partition table and a process
 219 * table have the same layout, a partition-scoped page table and a
 220 * process-scoped page table have the same layout, and the 2nd
 221 * doubleword of a partition table entry has the same layout as
 222 * the PTCR register.
 223 */
 224int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
 225                                     struct kvmppc_pte *gpte, u64 table,
 226                                     int table_index, u64 *pte_ret_p)
 227{
 228        struct kvm *kvm = vcpu->kvm;
 229        int ret;
 230        unsigned long size, ptbl, root;
 231        struct prtb_entry entry;
 232
 233        if ((table & PRTS_MASK) > 24)
 234                return -EINVAL;
 235        size = 1ul << ((table & PRTS_MASK) + 12);
 236
 237        /* Is the table big enough to contain this entry? */
 238        if ((table_index * sizeof(entry)) >= size)
 239                return -EINVAL;
 240
 241        /* Read the table to find the root of the radix tree */
 242        ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
 243        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 244        ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
 245        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 246        if (ret)
 247                return ret;
 248
 249        /* Root is stored in the first double word */
 250        root = be64_to_cpu(entry.prtb0);
 251
 252        return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
 253}
 254
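/*
 * Translate an effective address through the guest's process table:
 * quadrant 0 uses the vcpu's pid, quadrant 3 uses pid 0, and quadrants
 * 1 and 2 are rejected.  Privilege and AMR/IAMR checks are then applied
 * to the permissions returned in *gpte.
 */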
 255int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 256                           struct kvmppc_pte *gpte, bool data, bool iswrite)
 257{
 258        u32 pid;
 259        u64 pte;
 260        int ret;
 261
 262        /* Work out effective PID */
 263        switch (eaddr >> 62) {
 264        case 0:
 265                pid = vcpu->arch.pid;
 266                break;
 267        case 3:
 268                pid = 0;
 269                break;
 270        default:
 271                return -EINVAL;
 272        }
 273
 274        ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
 275                                vcpu->kvm->arch.process_table, pid, &pte);
 276        if (ret)
 277                return ret;
 278
 279        /* Check privilege (applies only to process scoped translations) */
 280        if (kvmppc_get_msr(vcpu) & MSR_PR) {
 281                if (pte & _PAGE_PRIVILEGED) {
 282                        gpte->may_read = 0;
 283                        gpte->may_write = 0;
 284                        gpte->may_execute = 0;
 285                }
 286        } else {
 287                if (!(pte & _PAGE_PRIVILEGED)) {
 288                        /* Check AMR/IAMR to see if strict mode is in force */
 289                        if (vcpu->arch.amr & (1ul << 62))
 290                                gpte->may_read = 0;
 291                        if (vcpu->arch.amr & (1ul << 63))
 292                                gpte->may_write = 0;
 293                        if (vcpu->arch.iamr & (1ul << 62))
 294                                gpte->may_execute = 0;
 295                }
 296        }
 297
 298        return 0;
 299}
 300
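/*
 * Invalidate the TLB entries covering one page of the partition-scoped
 * tree for 'lpid'.  In HV mode this uses radix__flush_tlb_lpid_page();
 * when running as a nested hypervisor on pseries it uses the
 * H_TLB_INVALIDATE or H_RPT_INVALIDATE hcalls.
 */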
 301void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
 302                             unsigned int pshift, unsigned int lpid)
 303{
 304        unsigned long psize = PAGE_SIZE;
 305        int psi;
 306        long rc;
 307        unsigned long rb;
 308
 309        if (pshift)
 310                psize = 1UL << pshift;
 311        else
 312                pshift = PAGE_SHIFT;
 313
 314        addr &= ~(psize - 1);
 315
 316        if (!kvmhv_on_pseries()) {
 317                radix__flush_tlb_lpid_page(lpid, addr, psize);
 318                return;
 319        }
 320
 321        psi = shift_to_mmu_psize(pshift);
 322
 323        if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE)) {
 324                rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
 325                rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
 326                                        lpid, rb);
 327        } else {
 328                rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
 329                                            H_RPTI_TYPE_NESTED |
 330                                            H_RPTI_TYPE_TLB,
 331                                            psize_to_rpti_pgsize(psi),
 332                                            addr, addr + psize);
 333        }
 334
 335        if (rc)
 336                pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
 337}
 338
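/* Flush the page walk cache for the partition-scoped tree of 'lpid'. */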
 339static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
 340{
 341        long rc;
 342
 343        if (!kvmhv_on_pseries()) {
 344                radix__flush_pwc_lpid(lpid);
 345                return;
 346        }
 347
 348        if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
 349                rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
 350                                        lpid, TLBIEL_INVAL_SET_LPID);
 351        else
 352                rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
 353                                            H_RPTI_TYPE_NESTED |
 354                                            H_RPTI_TYPE_PWC, H_RPTI_PAGE_ALL,
 355                                            0, -1UL);
 356        if (rc)
 357                pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
 358}
 359
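/*
 * Atomically clear and set bits in a partition-scoped PTE and return the
 * old PTE value.
 */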
 360static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
 361                                      unsigned long clr, unsigned long set,
 362                                      unsigned long addr, unsigned int shift)
 363{
 364        return __radix_pte_update(ptep, clr, set);
 365}
 366
 367static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
 368                             pte_t *ptep, pte_t pte)
 369{
 370        radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
 371}
 372
 373static struct kmem_cache *kvm_pte_cache;
 374static struct kmem_cache *kvm_pmd_cache;
 375
 376static pte_t *kvmppc_pte_alloc(void)
 377{
 378        pte_t *pte;
 379
 380        pte = kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
  381        /* pmd_populate() will only reference __pa(pte). */
 382        kmemleak_ignore(pte);
 383
 384        return pte;
 385}
 386
 387static void kvmppc_pte_free(pte_t *ptep)
 388{
 389        kmem_cache_free(kvm_pte_cache, ptep);
 390}
 391
 392static pmd_t *kvmppc_pmd_alloc(void)
 393{
 394        pmd_t *pmd;
 395
 396        pmd = kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
  397        /* pud_populate() will only reference __pa(pmd). */
 398        kmemleak_ignore(pmd);
 399
 400        return pmd;
 401}
 402
 403static void kvmppc_pmd_free(pmd_t *pmdp)
 404{
 405        kmem_cache_free(kvm_pmd_cache, pmdp);
 406}
 407
 408/* Called with kvm->mmu_lock held */
 409void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
 410                      unsigned int shift,
 411                      const struct kvm_memory_slot *memslot,
 412                      unsigned int lpid)
 413
 414{
 415        unsigned long old;
 416        unsigned long gfn = gpa >> PAGE_SHIFT;
 417        unsigned long page_size = PAGE_SIZE;
 418        unsigned long hpa;
 419
 420        old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
 421        kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
 422
 423        /* The following only applies to L1 entries */
 424        if (lpid != kvm->arch.lpid)
 425                return;
 426
 427        if (!memslot) {
 428                memslot = gfn_to_memslot(kvm, gfn);
 429                if (!memslot)
 430                        return;
 431        }
 432        if (shift) { /* 1GB or 2MB page */
 433                page_size = 1ul << shift;
 434                if (shift == PMD_SHIFT)
 435                        kvm->stat.num_2M_pages--;
 436                else if (shift == PUD_SHIFT)
 437                        kvm->stat.num_1G_pages--;
 438        }
 439
 440        gpa &= ~(page_size - 1);
 441        hpa = old & PTE_RPN_MASK;
 442        kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
 443
 444        if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
 445                kvmppc_update_dirty_map(memslot, gfn, page_size);
 446}
 447
 448/*
 449 * kvmppc_free_p?d are used to free existing page tables, and recursively
 450 * descend and clear and free children.
 451 * Callers are responsible for flushing the PWC.
 452 *
  453 * When page tables are being unmapped/freed as part of the page fault
  454 * path (full == false), valid ptes are generally not expected; however,
  455 * there is one situation where they arise: when dirty page logging is
 456 * turned off for a memslot while the VM is running.  The new memslot
 457 * becomes visible to page faults before the memslot commit function
 458 * gets to flush the memslot, which can lead to a 2MB page mapping being
 459 * installed for a guest physical address where there are already 64kB
 460 * (or 4kB) mappings (of sub-pages of the same 2MB page).
 461 */
 462static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
 463                                  unsigned int lpid)
 464{
 465        if (full) {
 466                memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
 467        } else {
 468                pte_t *p = pte;
 469                unsigned long it;
 470
 471                for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
 472                        if (pte_val(*p) == 0)
 473                                continue;
 474                        kvmppc_unmap_pte(kvm, p,
 475                                         pte_pfn(*p) << PAGE_SHIFT,
 476                                         PAGE_SHIFT, NULL, lpid);
 477                }
 478        }
 479
 480        kvmppc_pte_free(pte);
 481}
 482
 483static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
 484                                  unsigned int lpid)
 485{
 486        unsigned long im;
 487        pmd_t *p = pmd;
 488
 489        for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
 490                if (!pmd_present(*p))
 491                        continue;
 492                if (pmd_is_leaf(*p)) {
 493                        if (full) {
 494                                pmd_clear(p);
 495                        } else {
 496                                WARN_ON_ONCE(1);
 497                                kvmppc_unmap_pte(kvm, (pte_t *)p,
 498                                         pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
 499                                         PMD_SHIFT, NULL, lpid);
 500                        }
 501                } else {
 502                        pte_t *pte;
 503
 504                        pte = pte_offset_map(p, 0);
 505                        kvmppc_unmap_free_pte(kvm, pte, full, lpid);
 506                        pmd_clear(p);
 507                }
 508        }
 509        kvmppc_pmd_free(pmd);
 510}
 511
 512static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
 513                                  unsigned int lpid)
 514{
 515        unsigned long iu;
 516        pud_t *p = pud;
 517
 518        for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
 519                if (!pud_present(*p))
 520                        continue;
 521                if (pud_is_leaf(*p)) {
 522                        pud_clear(p);
 523                } else {
 524                        pmd_t *pmd;
 525
 526                        pmd = pmd_offset(p, 0);
 527                        kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
 528                        pud_clear(p);
 529                }
 530        }
 531        pud_free(kvm->mm, pud);
 532}
 533
 534void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
 535{
 536        unsigned long ig;
 537
 538        for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
 539                p4d_t *p4d = p4d_offset(pgd, 0);
 540                pud_t *pud;
 541
 542                if (!p4d_present(*p4d))
 543                        continue;
 544                pud = pud_offset(p4d, 0);
 545                kvmppc_unmap_free_pud(kvm, pud, lpid);
 546                p4d_clear(p4d);
 547        }
 548}
 549
 550void kvmppc_free_radix(struct kvm *kvm)
 551{
 552        if (kvm->arch.pgtable) {
 553                kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
 554                                          kvm->arch.lpid);
 555                pgd_free(kvm->mm, kvm->arch.pgtable);
 556                kvm->arch.pgtable = NULL;
 557        }
 558}
 559
 560static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
 561                                        unsigned long gpa, unsigned int lpid)
 562{
 563        pte_t *pte = pte_offset_kernel(pmd, 0);
 564
 565        /*
  566         * Clearing the pmd entry and then flushing the PWC ensures that the
  567         * pte page will no longer be cached by the MMU, so it can be freed
  568         * without flushing the PWC again.
 569         */
 570        pmd_clear(pmd);
 571        kvmppc_radix_flush_pwc(kvm, lpid);
 572
 573        kvmppc_unmap_free_pte(kvm, pte, false, lpid);
 574}
 575
 576static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
 577                                        unsigned long gpa, unsigned int lpid)
 578{
 579        pmd_t *pmd = pmd_offset(pud, 0);
 580
 581        /*
  582         * Clearing the pud entry and then flushing the PWC ensures that the pmd
  583         * page and any child pte pages will no longer be cached by the MMU,
  584         * so they can be freed without flushing the PWC again.
 585         */
 586        pud_clear(pud);
 587        kvmppc_radix_flush_pwc(kvm, lpid);
 588
 589        kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
 590}
 591
 592/*
  593 * A number of bits may differ between different faults to the same
  594 * partition-scoped entry: the RC bits, in the course of cleaning and
  595 * aging, and the write bit, which can change either because the access
  596 * was upgraded or because a read fault raced with a write fault that
  597 * set those bits first.
 598 */
 599#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
 600
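/*
 * Insert 'pte' into the partition-scoped tree 'pgtable' at 'gpa' and the
 * given level (0 = base page, 1 = 2MB, 2 = 1GB), allocating any missing
 * intermediate page-table pages.  Returns -EAGAIN if an MMU notifier
 * invalidation or a conflicting large-page insertion raced with us, in
 * which case the caller lets the guest retry the access.
 */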
 601int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 602                      unsigned long gpa, unsigned int level,
 603                      unsigned long mmu_seq, unsigned int lpid,
 604                      unsigned long *rmapp, struct rmap_nested **n_rmap)
 605{
 606        pgd_t *pgd;
 607        p4d_t *p4d;
 608        pud_t *pud, *new_pud = NULL;
 609        pmd_t *pmd, *new_pmd = NULL;
 610        pte_t *ptep, *new_ptep = NULL;
 611        int ret;
 612
 613        /* Traverse the guest's 2nd-level tree, allocate new levels needed */
 614        pgd = pgtable + pgd_index(gpa);
 615        p4d = p4d_offset(pgd, gpa);
 616
 617        pud = NULL;
 618        if (p4d_present(*p4d))
 619                pud = pud_offset(p4d, gpa);
 620        else
 621                new_pud = pud_alloc_one(kvm->mm, gpa);
 622
 623        pmd = NULL;
 624        if (pud && pud_present(*pud) && !pud_is_leaf(*pud))
 625                pmd = pmd_offset(pud, gpa);
 626        else if (level <= 1)
 627                new_pmd = kvmppc_pmd_alloc();
 628
 629        if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
 630                new_ptep = kvmppc_pte_alloc();
 631
 632        /* Check if we might have been invalidated; let the guest retry if so */
 633        spin_lock(&kvm->mmu_lock);
 634        ret = -EAGAIN;
 635        if (mmu_notifier_retry(kvm, mmu_seq))
 636                goto out_unlock;
 637
 638        /* Now traverse again under the lock and change the tree */
 639        ret = -ENOMEM;
 640        if (p4d_none(*p4d)) {
 641                if (!new_pud)
 642                        goto out_unlock;
 643                p4d_populate(kvm->mm, p4d, new_pud);
 644                new_pud = NULL;
 645        }
 646        pud = pud_offset(p4d, gpa);
 647        if (pud_is_leaf(*pud)) {
 648                unsigned long hgpa = gpa & PUD_MASK;
 649
 650                /* Check if we raced and someone else has set the same thing */
 651                if (level == 2) {
 652                        if (pud_raw(*pud) == pte_raw(pte)) {
 653                                ret = 0;
 654                                goto out_unlock;
 655                        }
 656                        /* Valid 1GB page here already, add our extra bits */
 657                        WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
 658                                                        PTE_BITS_MUST_MATCH);
 659                        kvmppc_radix_update_pte(kvm, (pte_t *)pud,
 660                                              0, pte_val(pte), hgpa, PUD_SHIFT);
 661                        ret = 0;
 662                        goto out_unlock;
 663                }
 664                /*
 665                 * If we raced with another CPU which has just put
 666                 * a 1GB pte in after we saw a pmd page, try again.
 667                 */
 668                if (!new_pmd) {
 669                        ret = -EAGAIN;
 670                        goto out_unlock;
 671                }
 672                /* Valid 1GB page here already, remove it */
 673                kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
 674                                 lpid);
 675        }
 676        if (level == 2) {
 677                if (!pud_none(*pud)) {
 678                        /*
 679                         * There's a page table page here, but we wanted to
 680                         * install a large page, so remove and free the page
 681                         * table page.
 682                         */
 683                        kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
 684                }
 685                kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
 686                if (rmapp && n_rmap)
 687                        kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
 688                ret = 0;
 689                goto out_unlock;
 690        }
 691        if (pud_none(*pud)) {
 692                if (!new_pmd)
 693                        goto out_unlock;
 694                pud_populate(kvm->mm, pud, new_pmd);
 695                new_pmd = NULL;
 696        }
 697        pmd = pmd_offset(pud, gpa);
 698        if (pmd_is_leaf(*pmd)) {
 699                unsigned long lgpa = gpa & PMD_MASK;
 700
 701                /* Check if we raced and someone else has set the same thing */
 702                if (level == 1) {
 703                        if (pmd_raw(*pmd) == pte_raw(pte)) {
 704                                ret = 0;
 705                                goto out_unlock;
 706                        }
 707                        /* Valid 2MB page here already, add our extra bits */
 708                        WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
 709                                                        PTE_BITS_MUST_MATCH);
 710                        kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
 711                                        0, pte_val(pte), lgpa, PMD_SHIFT);
 712                        ret = 0;
 713                        goto out_unlock;
 714                }
 715
 716                /*
 717                 * If we raced with another CPU which has just put
 718                 * a 2MB pte in after we saw a pte page, try again.
 719                 */
 720                if (!new_ptep) {
 721                        ret = -EAGAIN;
 722                        goto out_unlock;
 723                }
 724                /* Valid 2MB page here already, remove it */
 725                kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
 726                                 lpid);
 727        }
 728        if (level == 1) {
 729                if (!pmd_none(*pmd)) {
 730                        /*
 731                         * There's a page table page here, but we wanted to
 732                         * install a large page, so remove and free the page
 733                         * table page.
 734                         */
 735                        kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
 736                }
 737                kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
 738                if (rmapp && n_rmap)
 739                        kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
 740                ret = 0;
 741                goto out_unlock;
 742        }
 743        if (pmd_none(*pmd)) {
 744                if (!new_ptep)
 745                        goto out_unlock;
 746                pmd_populate(kvm->mm, pmd, new_ptep);
 747                new_ptep = NULL;
 748        }
 749        ptep = pte_offset_kernel(pmd, gpa);
 750        if (pte_present(*ptep)) {
 751                /* Check if someone else set the same thing */
 752                if (pte_raw(*ptep) == pte_raw(pte)) {
 753                        ret = 0;
 754                        goto out_unlock;
 755                }
 756                /* Valid page here already, add our extra bits */
 757                WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
 758                                                        PTE_BITS_MUST_MATCH);
 759                kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
 760                ret = 0;
 761                goto out_unlock;
 762        }
 763        kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
 764        if (rmapp && n_rmap)
 765                kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
 766        ret = 0;
 767
 768 out_unlock:
 769        spin_unlock(&kvm->mmu_lock);
 770        if (new_pud)
 771                pud_free(kvm->mm, new_pud);
 772        if (new_pmd)
 773                kvmppc_pmd_free(new_pmd);
 774        if (new_ptep)
 775                kvmppc_pte_free(new_ptep);
 776        return ret;
 777}
 778
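/*
 * Set the accessed bit, and the dirty bit if 'writing', in the relevant
 * partition-scoped PTE (in the shadow tree of a nested guest if 'nested').
 * Returns true if a suitable present PTE was found and updated.
 */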
 779bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing,
 780                             unsigned long gpa, unsigned int lpid)
 781{
 782        unsigned long pgflags;
 783        unsigned int shift;
 784        pte_t *ptep;
 785
 786        /*
 787         * Need to set an R or C bit in the 2nd-level tables;
 788         * since we are just helping out the hardware here,
 789         * it is sufficient to do what the hardware does.
 790         */
 791        pgflags = _PAGE_ACCESSED;
 792        if (writing)
 793                pgflags |= _PAGE_DIRTY;
 794
 795        if (nested)
 796                ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
 797        else
 798                ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
 799
 800        if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
 801                kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
 802                return true;
 803        }
 804        return false;
 805}
 806
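/*
 * Fault in the host page backing 'gpa' and mirror the corresponding
 * Linux PTE into the partition-scoped tree, using a 2MB or 1GB mapping
 * when the backing mapping and its alignment allow it.  The inserted PTE
 * and its level are optionally returned via inserted_pte and levelp.
 */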
 807int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 808                                   unsigned long gpa,
 809                                   struct kvm_memory_slot *memslot,
 810                                   bool writing, bool kvm_ro,
 811                                   pte_t *inserted_pte, unsigned int *levelp)
 812{
 813        struct kvm *kvm = vcpu->kvm;
 814        struct page *page = NULL;
 815        unsigned long mmu_seq;
 816        unsigned long hva, gfn = gpa >> PAGE_SHIFT;
 817        bool upgrade_write = false;
 818        bool *upgrade_p = &upgrade_write;
 819        pte_t pte, *ptep;
 820        unsigned int shift, level;
 821        int ret;
 822        bool large_enable;
 823
 824        /* used to check for invalidations in progress */
 825        mmu_seq = kvm->mmu_notifier_seq;
 826        smp_rmb();
 827
 828        /*
  829         * Do a fast check first, since __gfn_to_pfn_memslot doesn't do
  830         * one when called with !atomic && !async, which is how we call it.
 831         * We always ask for write permission since the common case
 832         * is that the page is writable.
 833         */
 834        hva = gfn_to_hva_memslot(memslot, gfn);
 835        if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) {
 836                upgrade_write = true;
 837        } else {
 838                unsigned long pfn;
 839
 840                /* Call KVM generic code to do the slow-path check */
 841                pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
 842                                           writing, upgrade_p, NULL);
 843                if (is_error_noslot_pfn(pfn))
 844                        return -EFAULT;
 845                page = NULL;
 846                if (pfn_valid(pfn)) {
 847                        page = pfn_to_page(pfn);
 848                        if (PageReserved(page))
 849                                page = NULL;
 850                }
 851        }
 852
 853        /*
 854         * Read the PTE from the process' radix tree and use that
 855         * so we get the shift and attribute bits.
 856         */
 857        spin_lock(&kvm->mmu_lock);
 858        ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
 859        pte = __pte(0);
 860        if (ptep)
 861                pte = READ_ONCE(*ptep);
 862        spin_unlock(&kvm->mmu_lock);
 863        /*
 864         * If the PTE disappeared temporarily due to a THP
 865         * collapse, just return and let the guest try again.
 866         */
 867        if (!pte_present(pte)) {
 868                if (page)
 869                        put_page(page);
 870                return RESUME_GUEST;
 871        }
 872
 873        /* If we're logging dirty pages, always map single pages */
 874        large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);
 875
 876        /* Get pte level from shift/size */
 877        if (large_enable && shift == PUD_SHIFT &&
 878            (gpa & (PUD_SIZE - PAGE_SIZE)) ==
 879            (hva & (PUD_SIZE - PAGE_SIZE))) {
 880                level = 2;
 881        } else if (large_enable && shift == PMD_SHIFT &&
 882                   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
 883                   (hva & (PMD_SIZE - PAGE_SIZE))) {
 884                level = 1;
 885        } else {
 886                level = 0;
 887                if (shift > PAGE_SHIFT) {
 888                        /*
 889                         * If the pte maps more than one page, bring over
 890                         * bits from the virtual address to get the real
 891                         * address of the specific single page we want.
 892                         */
 893                        unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
 894                        pte = __pte(pte_val(pte) | (hva & rpnmask));
 895                }
 896        }
 897
 898        pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
 899        if (writing || upgrade_write) {
 900                if (pte_val(pte) & _PAGE_WRITE)
 901                        pte = __pte(pte_val(pte) | _PAGE_DIRTY);
 902        } else {
 903                pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
 904        }
 905
 906        /* Allocate space in the tree and write the PTE */
 907        ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
 908                                mmu_seq, kvm->arch.lpid, NULL, NULL);
 909        if (inserted_pte)
 910                *inserted_pte = pte;
 911        if (levelp)
 912                *levelp = level;
 913
 914        if (page) {
 915                if (!ret && (pte_val(pte) & _PAGE_WRITE))
 916                        set_page_dirty_lock(page);
 917                put_page(page);
 918        }
 919
 920        /* Increment number of large pages if we (successfully) inserted one */
 921        if (!ret) {
 922                if (level == 1)
 923                        kvm->stat.num_2M_pages++;
 924                else if (level == 2)
 925                        kvm->stat.num_1G_pages++;
 926        }
 927
 928        return ret;
 929}
 930
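/*
 * Handle a hypervisor page fault for a radix guest: reflect bad accesses
 * back to the guest as a DSI, emulate MMIO for addresses with no memslot,
 * fix up reference/change bits, or instantiate a missing mapping.
 */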
 931int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
 932                                   unsigned long ea, unsigned long dsisr)
 933{
 934        struct kvm *kvm = vcpu->kvm;
 935        unsigned long gpa, gfn;
 936        struct kvm_memory_slot *memslot;
 937        long ret;
 938        bool writing = !!(dsisr & DSISR_ISSTORE);
 939        bool kvm_ro = false;
 940
 941        /* Check for unusual errors */
 942        if (dsisr & DSISR_UNSUPP_MMU) {
 943                pr_err("KVM: Got unsupported MMU fault\n");
 944                return -EFAULT;
 945        }
 946        if (dsisr & DSISR_BADACCESS) {
 947                /* Reflect to the guest as DSI */
 948                pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
 949                kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
 950                return RESUME_GUEST;
 951        }
 952
 953        /* Translate the logical address */
 954        gpa = vcpu->arch.fault_gpa & ~0xfffUL;
 955        gpa &= ~0xF000000000000000ul;
 956        gfn = gpa >> PAGE_SHIFT;
 957        if (!(dsisr & DSISR_PRTABLE_FAULT))
 958                gpa |= ea & 0xfff;
 959
 960        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
 961                return kvmppc_send_page_to_uv(kvm, gfn);
 962
 963        /* Get the corresponding memslot */
 964        memslot = gfn_to_memslot(kvm, gfn);
 965
 966        /* No memslot means it's an emulated MMIO region */
 967        if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
 968                if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
 969                             DSISR_SET_RC)) {
 970                        /*
 971                         * Bad address in guest page table tree, or other
 972                         * unusual error - reflect it to the guest as DSI.
 973                         */
 974                        kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
 975                        return RESUME_GUEST;
 976                }
 977                return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
 978        }
 979
 980        if (memslot->flags & KVM_MEM_READONLY) {
 981                if (writing) {
 982                        /* give the guest a DSI */
 983                        kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
 984                                                       DSISR_PROTFAULT);
 985                        return RESUME_GUEST;
 986                }
 987                kvm_ro = true;
 988        }
 989
 990        /* Failed to set the reference/change bits */
 991        if (dsisr & DSISR_SET_RC) {
 992                spin_lock(&kvm->mmu_lock);
 993                if (kvmppc_hv_handle_set_rc(kvm, false, writing,
 994                                            gpa, kvm->arch.lpid))
 995                        dsisr &= ~DSISR_SET_RC;
 996                spin_unlock(&kvm->mmu_lock);
 997
 998                if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
 999                               DSISR_PROTFAULT | DSISR_SET_RC)))
1000                        return RESUME_GUEST;
1001        }
1002
1003        /* Try to insert a pte */
1004        ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
1005                                             kvm_ro, NULL, NULL);
1006
1007        if (ret == 0 || ret == -EAGAIN)
1008                ret = RESUME_GUEST;
1009        return ret;
1010}
1011
1012/* Called with kvm->mmu_lock held */
1013void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
1014                     unsigned long gfn)
1015{
1016        pte_t *ptep;
1017        unsigned long gpa = gfn << PAGE_SHIFT;
1018        unsigned int shift;
1019
1020        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
1021                uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
1022                return;
1023        }
1024
1025        ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1026        if (ptep && pte_present(*ptep))
1027                kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
1028                                 kvm->arch.lpid);
1029}
1030
1031/* Called with kvm->mmu_lock held */
1032bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
1033                   unsigned long gfn)
1034{
1035        pte_t *ptep;
1036        unsigned long gpa = gfn << PAGE_SHIFT;
1037        unsigned int shift;
1038        bool ref = false;
1039        unsigned long old, *rmapp;
1040
1041        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1042                return ref;
1043
1044        ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1045        if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
1046                old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
1047                                              gpa, shift);
1048                /* XXX need to flush tlb here? */
1049                /* Also clear bit in ptes in shadow pgtable for nested guests */
1050                rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1051                kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
1052                                               old & PTE_RPN_MASK,
1053                                               1UL << shift);
1054                ref = true;
1055        }
1056        return ref;
1057}
1058
1059/* Called with kvm->mmu_lock held */
1060bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
1061                        unsigned long gfn)
1062
1063{
1064        pte_t *ptep;
1065        unsigned long gpa = gfn << PAGE_SHIFT;
1066        unsigned int shift;
1067        bool ref = false;
1068
1069        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1070                return ref;
1071
1072        ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1073        if (ptep && pte_present(*ptep) && pte_young(*ptep))
1074                ref = true;
1075        return ref;
1076}
1077
1078/* Returns the number of PAGE_SIZE pages that are dirty */
1079static int kvm_radix_test_clear_dirty(struct kvm *kvm,
1080                                struct kvm_memory_slot *memslot, int pagenum)
1081{
1082        unsigned long gfn = memslot->base_gfn + pagenum;
1083        unsigned long gpa = gfn << PAGE_SHIFT;
1084        pte_t *ptep, pte;
1085        unsigned int shift;
1086        int ret = 0;
1087        unsigned long old, *rmapp;
1088
1089        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1090                return ret;
1091
1092        /*
1093         * For performance reasons we don't hold kvm->mmu_lock while walking the
1094         * partition scoped table.
1095         */
1096        ptep = find_kvm_secondary_pte_unlocked(kvm, gpa, &shift);
1097        if (!ptep)
1098                return 0;
1099
1100        pte = READ_ONCE(*ptep);
1101        if (pte_present(pte) && pte_dirty(pte)) {
1102                spin_lock(&kvm->mmu_lock);
1103                /*
 1104                 * Recheck the pte now that we hold the mmu_lock
1105                 */
1106                if (pte_val(pte) != pte_val(*ptep)) {
1107                        /*
1108                         * We have KVM_MEM_LOG_DIRTY_PAGES enabled. Hence we can
1109                         * only find PAGE_SIZE pte entries here. We can continue
1110                         * to use the pte addr returned by above page table
1111                         * walk.
1112                         */
1113                        if (!pte_present(*ptep) || !pte_dirty(*ptep)) {
1114                                spin_unlock(&kvm->mmu_lock);
1115                                return 0;
1116                        }
1117                }
1118
1119                ret = 1;
1120                VM_BUG_ON(shift);
1121                old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
1122                                              gpa, shift);
1123                kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
1124                /* Also clear bit in ptes in shadow pgtable for nested guests */
1125                rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1126                kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
1127                                               old & PTE_RPN_MASK,
1128                                               1UL << shift);
1129                spin_unlock(&kvm->mmu_lock);
1130        }
1131        return ret;
1132}
1133
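/*
 * Harvest the dirty state of a memslot into the bitmap 'map', testing and
 * clearing the dirty bit of each mapped page in the partition-scoped tree.
 */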
1134long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
1135                        struct kvm_memory_slot *memslot, unsigned long *map)
1136{
1137        unsigned long i, j;
1138        int npages;
1139
1140        for (i = 0; i < memslot->npages; i = j) {
1141                npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
1142
1143                /*
1144                 * Note that if npages > 0 then i must be a multiple of npages,
1145                 * since huge pages are only used to back the guest at guest
1146                 * real addresses that are a multiple of their size.
1147                 * Since we have at most one PTE covering any given guest
1148                 * real address, if npages > 1 we can skip to i + npages.
1149                 */
1150                j = i + 1;
1151                if (npages) {
1152                        set_dirty_bits(map, i, npages);
1153                        j = i + npages;
1154                }
1155        }
1156        return 0;
1157}
1158
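/*
 * Unmap every page of a memslot from the partition-scoped tree and bump
 * mmu_notifier_seq so that a page fault which sampled the old memslot
 * cannot install a PTE afterwards.
 */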
1159void kvmppc_radix_flush_memslot(struct kvm *kvm,
1160                                const struct kvm_memory_slot *memslot)
1161{
1162        unsigned long n;
1163        pte_t *ptep;
1164        unsigned long gpa;
1165        unsigned int shift;
1166
1167        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
1168                kvmppc_uvmem_drop_pages(memslot, kvm, true);
1169
1170        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1171                return;
1172
1173        gpa = memslot->base_gfn << PAGE_SHIFT;
1174        spin_lock(&kvm->mmu_lock);
1175        for (n = memslot->npages; n; --n) {
1176                ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1177                if (ptep && pte_present(*ptep))
1178                        kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
1179                                         kvm->arch.lpid);
1180                gpa += PAGE_SIZE;
1181        }
1182        /*
1183         * Increase the mmu notifier sequence number to prevent any page
1184         * fault that read the memslot earlier from writing a PTE.
1185         */
1186        kvm->mmu_notifier_seq++;
1187        spin_unlock(&kvm->mmu_lock);
1188}
1189
1190static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
1191                                 int psize, int *indexp)
1192{
1193        if (!mmu_psize_defs[psize].shift)
1194                return;
1195        info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
1196                (mmu_psize_defs[psize].ap << 29);
1197        ++(*indexp);
1198}
1199
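/*
 * Report the supported radix tree geometries and AP encodings to
 * userspace in struct kvm_ppc_rmmu_info.
 */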
1200int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
1201{
1202        int i;
1203
1204        if (!radix_enabled())
1205                return -EINVAL;
1206        memset(info, 0, sizeof(*info));
1207
1208        /* 4k page size */
1209        info->geometries[0].page_shift = 12;
1210        info->geometries[0].level_bits[0] = 9;
1211        for (i = 1; i < 4; ++i)
1212                info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
1213        /* 64k page size */
1214        info->geometries[1].page_shift = 16;
1215        for (i = 0; i < 4; ++i)
1216                info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];
1217
1218        i = 0;
1219        add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
1220        add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
1221        add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
1222        add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);
1223
1224        return 0;
1225}
1226
1227int kvmppc_init_vm_radix(struct kvm *kvm)
1228{
1229        kvm->arch.pgtable = pgd_alloc(kvm->mm);
1230        if (!kvm->arch.pgtable)
1231                return -ENOMEM;
1232        return 0;
1233}
1234
1235static void pte_ctor(void *addr)
1236{
1237        memset(addr, 0, RADIX_PTE_TABLE_SIZE);
1238}
1239
1240static void pmd_ctor(void *addr)
1241{
1242        memset(addr, 0, RADIX_PMD_TABLE_SIZE);
1243}
1244
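/*
 * Per-open state for the per-VM "radix" debugfs file, which dumps the
 * partition-scoped page tables of the VM and of any nested guests.
 */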
1245struct debugfs_radix_state {
1246        struct kvm      *kvm;
1247        struct mutex    mutex;
1248        unsigned long   gpa;
1249        int             lpid;
1250        int             chars_left;
1251        int             buf_index;
1252        char            buf[128];
1253        u8              hdr;
1254};
1255
1256static int debugfs_radix_open(struct inode *inode, struct file *file)
1257{
1258        struct kvm *kvm = inode->i_private;
1259        struct debugfs_radix_state *p;
1260
1261        p = kzalloc(sizeof(*p), GFP_KERNEL);
1262        if (!p)
1263                return -ENOMEM;
1264
1265        kvm_get_kvm(kvm);
1266        p->kvm = kvm;
1267        mutex_init(&p->mutex);
1268        file->private_data = p;
1269
1270        return nonseekable_open(inode, file);
1271}
1272
1273static int debugfs_radix_release(struct inode *inode, struct file *file)
1274{
1275        struct debugfs_radix_state *p = file->private_data;
1276
1277        kvm_put_kvm(p->kvm);
1278        kfree(p);
1279        return 0;
1280}
1281
1282static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
1283                                 size_t len, loff_t *ppos)
1284{
1285        struct debugfs_radix_state *p = file->private_data;
1286        ssize_t ret, r;
1287        unsigned long n;
1288        struct kvm *kvm;
1289        unsigned long gpa;
1290        pgd_t *pgt;
1291        struct kvm_nested_guest *nested;
1292        pgd_t *pgdp;
1293        p4d_t p4d, *p4dp;
1294        pud_t pud, *pudp;
1295        pmd_t pmd, *pmdp;
1296        pte_t *ptep;
1297        int shift;
1298        unsigned long pte;
1299
1300        kvm = p->kvm;
1301        if (!kvm_is_radix(kvm))
1302                return 0;
1303
1304        ret = mutex_lock_interruptible(&p->mutex);
1305        if (ret)
1306                return ret;
1307
1308        if (p->chars_left) {
1309                n = p->chars_left;
1310                if (n > len)
1311                        n = len;
1312                r = copy_to_user(buf, p->buf + p->buf_index, n);
1313                n -= r;
1314                p->chars_left -= n;
1315                p->buf_index += n;
1316                buf += n;
1317                len -= n;
1318                ret = n;
1319                if (r) {
1320                        if (!n)
1321                                ret = -EFAULT;
1322                        goto out;
1323                }
1324        }
1325
1326        gpa = p->gpa;
1327        nested = NULL;
1328        pgt = NULL;
1329        while (len != 0 && p->lpid >= 0) {
1330                if (gpa >= RADIX_PGTABLE_RANGE) {
1331                        gpa = 0;
1332                        pgt = NULL;
1333                        if (nested) {
1334                                kvmhv_put_nested(nested);
1335                                nested = NULL;
1336                        }
1337                        p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
1338                        p->hdr = 0;
1339                        if (p->lpid < 0)
1340                                break;
1341                }
1342                if (!pgt) {
1343                        if (p->lpid == 0) {
1344                                pgt = kvm->arch.pgtable;
1345                        } else {
1346                                nested = kvmhv_get_nested(kvm, p->lpid, false);
1347                                if (!nested) {
1348                                        gpa = RADIX_PGTABLE_RANGE;
1349                                        continue;
1350                                }
1351                                pgt = nested->shadow_pgtable;
1352                        }
1353                }
1354                n = 0;
1355                if (!p->hdr) {
1356                        if (p->lpid > 0)
1357                                n = scnprintf(p->buf, sizeof(p->buf),
1358                                              "\nNested LPID %d: ", p->lpid);
1359                        n += scnprintf(p->buf + n, sizeof(p->buf) - n,
1360                                      "pgdir: %lx\n", (unsigned long)pgt);
1361                        p->hdr = 1;
1362                        goto copy;
1363                }
1364
1365                pgdp = pgt + pgd_index(gpa);
1366                p4dp = p4d_offset(pgdp, gpa);
1367                p4d = READ_ONCE(*p4dp);
1368                if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
1369                        gpa = (gpa & P4D_MASK) + P4D_SIZE;
1370                        continue;
1371                }
1372
1373                pudp = pud_offset(&p4d, gpa);
1374                pud = READ_ONCE(*pudp);
1375                if (!(pud_val(pud) & _PAGE_PRESENT)) {
1376                        gpa = (gpa & PUD_MASK) + PUD_SIZE;
1377                        continue;
1378                }
1379                if (pud_val(pud) & _PAGE_PTE) {
1380                        pte = pud_val(pud);
1381                        shift = PUD_SHIFT;
1382                        goto leaf;
1383                }
1384
1385                pmdp = pmd_offset(&pud, gpa);
1386                pmd = READ_ONCE(*pmdp);
1387                if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
1388                        gpa = (gpa & PMD_MASK) + PMD_SIZE;
1389                        continue;
1390                }
1391                if (pmd_val(pmd) & _PAGE_PTE) {
1392                        pte = pmd_val(pmd);
1393                        shift = PMD_SHIFT;
1394                        goto leaf;
1395                }
1396
1397                ptep = pte_offset_kernel(&pmd, gpa);
1398                pte = pte_val(READ_ONCE(*ptep));
1399                if (!(pte & _PAGE_PRESENT)) {
1400                        gpa += PAGE_SIZE;
1401                        continue;
1402                }
1403                shift = PAGE_SHIFT;
1404        leaf:
1405                n = scnprintf(p->buf, sizeof(p->buf),
1406                              " %lx: %lx %d\n", gpa, pte, shift);
1407                gpa += 1ul << shift;
1408        copy:
1409                p->chars_left = n;
1410                if (n > len)
1411                        n = len;
1412                r = copy_to_user(buf, p->buf, n);
1413                n -= r;
1414                p->chars_left -= n;
1415                p->buf_index = n;
1416                buf += n;
1417                len -= n;
1418                ret += n;
1419                if (r) {
1420                        if (!ret)
1421                                ret = -EFAULT;
1422                        break;
1423                }
1424        }
1425        p->gpa = gpa;
1426        if (nested)
1427                kvmhv_put_nested(nested);
1428
1429 out:
1430        mutex_unlock(&p->mutex);
1431        return ret;
1432}
1433
1434static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
1435                           size_t len, loff_t *ppos)
1436{
1437        return -EACCES;
1438}
1439
1440static const struct file_operations debugfs_radix_fops = {
1441        .owner   = THIS_MODULE,
1442        .open    = debugfs_radix_open,
1443        .release = debugfs_radix_release,
1444        .read    = debugfs_radix_read,
1445        .write   = debugfs_radix_write,
1446        .llseek  = generic_file_llseek,
1447};
1448
1449void kvmhv_radix_debugfs_init(struct kvm *kvm)
1450{
1451        debugfs_create_file("radix", 0400, kvm->arch.debugfs_dir, kvm,
1452                            &debugfs_radix_fops);
1453}
1454
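/* Create the kmem caches used for partition-scoped PTE and PMD pages. */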
1455int kvmppc_radix_init(void)
1456{
1457        unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
1458
1459        kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
1460        if (!kvm_pte_cache)
1461                return -ENOMEM;
1462
1463        size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;
1464
1465        kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
1466        if (!kvm_pmd_cache) {
1467                kmem_cache_destroy(kvm_pte_cache);
1468                return -ENOMEM;
1469        }
1470
1471        return 0;
1472}
1473
1474void kvmppc_radix_exit(void)
1475{
1476        kmem_cache_destroy(kvm_pte_cache);
1477        kmem_cache_destroy(kvm_pmd_cache);
1478}
1479