linux/arch/powerpc/kvm/book3s_64_mmu_radix.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *
   4 * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   5 */
   6
   7#include <linux/types.h>
   8#include <linux/string.h>
   9#include <linux/kvm.h>
  10#include <linux/kvm_host.h>
  11#include <linux/anon_inodes.h>
  12#include <linux/file.h>
  13#include <linux/debugfs.h>
  14#include <linux/pgtable.h>
  15
  16#include <asm/kvm_ppc.h>
  17#include <asm/kvm_book3s.h>
  18#include <asm/page.h>
  19#include <asm/mmu.h>
  20#include <asm/pgalloc.h>
  21#include <asm/pte-walk.h>
  22#include <asm/ultravisor.h>
  23#include <asm/kvm_book3s_uvmem.h>
  24
  25/*
  26 * Supported radix tree geometry.
  27 * Like p9, we support either 5 or 9 bits at the first (lowest) level,
  28 * for a page size of 64k or 4k.
  29 */
  30static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
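
/*
 * Illustrative sketch (not part of the kernel build): with the 52-bit
 * effective-address space enforced in kvmppc_mmu_walk_radix_tree() and the
 * 13/9/9 bits consumed by the three upper levels, a level-0 size of 5 or 9
 * bits leaves a leaf page shift of 16 (64k) or 12 (4k).
 */
static inline int kvm_radix_leaf_shift_sketch(int level0_bits)
{
        /* 52 - 13 - 9 - 9 = 21 address bits remain below the level-1 index */
        return 21 - level0_bits;        /* 5 -> 16 (64k page), 9 -> 12 (4k page) */
}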
  31
  32unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
  33                                              gva_t eaddr, void *to, void *from,
  34                                              unsigned long n)
  35{
  36        int old_pid, old_lpid;
  37        unsigned long quadrant, ret = n;
  38        bool is_load = !!to;
  39
  40        /* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
  41        if (kvmhv_on_pseries())
  42                return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
  43                                          (to != NULL) ? __pa(to): 0,
  44                                          (from != NULL) ? __pa(from): 0, n);
  45
  46        quadrant = 1;
  47        if (!pid)
  48                quadrant = 2;
  49        if (is_load)
  50                from = (void *) (eaddr | (quadrant << 62));
  51        else
  52                to = (void *) (eaddr | (quadrant << 62));
  53
  54        preempt_disable();
  55
  56        /* switch the lpid first to avoid running host with unallocated pid */
  57        old_lpid = mfspr(SPRN_LPID);
  58        if (old_lpid != lpid)
  59                mtspr(SPRN_LPID, lpid);
  60        if (quadrant == 1) {
  61                old_pid = mfspr(SPRN_PID);
  62                if (old_pid != pid)
  63                        mtspr(SPRN_PID, pid);
  64        }
  65        isync();
  66
  67        if (is_load)
  68                ret = copy_from_user_nofault(to, (const void __user *)from, n);
  69        else
  70                ret = copy_to_user_nofault((void __user *)to, from, n);
  71
  72        /* switch the pid first to avoid running host with unallocated pid */
  73        if (quadrant == 1 && pid != old_pid)
  74                mtspr(SPRN_PID, old_pid);
  75        if (lpid != old_lpid)
  76                mtspr(SPRN_LPID, old_lpid);
  77        isync();
  78
  79        preempt_enable();
  80
  81        return ret;
  82}
  83EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix);
  84
  85static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
  86                                          void *to, void *from, unsigned long n)
  87{
  88        int lpid = vcpu->kvm->arch.lpid;
  89        int pid = vcpu->arch.pid;
  90
   91        /* This would cause a data segment interrupt so don't allow the access */
  92        if (eaddr & (0x3FFUL << 52))
  93                return -EINVAL;
  94
  95        /* Should we be using the nested lpid */
  96        if (vcpu->arch.nested)
  97                lpid = vcpu->arch.nested->shadow_lpid;
  98
  99        /* If accessing quadrant 3 then pid is expected to be 0 */
 100        if (((eaddr >> 62) & 0x3) == 0x3)
 101                pid = 0;
 102
 103        eaddr &= ~(0xFFFUL << 52);
 104
 105        return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
 106}
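
/*
 * Illustrative sketch (not part of the kernel build) of the address the two
 * helpers above combine to produce: bits 52-63 of the guest effective
 * address are cleared and the quadrant (1 for a guest-process access, 2
 * when the pid is 0) is re-inserted in the top two bits before the
 * copy_{to,from}_user_nofault() access.
 */
static inline unsigned long kvm_radix_quadrant_ea_sketch(unsigned long eaddr,
                                                         int pid)
{
        unsigned long quadrant = pid ? 1 : 2;

        eaddr &= ~(0xFFFUL << 52);              /* strip segment/quadrant bits */
        return eaddr | (quadrant << 62);        /* host quadrant 1 or 2 address */
}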
 107
 108long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
 109                                 unsigned long n)
 110{
 111        long ret;
 112
 113        ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
 114        if (ret > 0)
 115                memset(to + (n - ret), 0, ret);
 116
 117        return ret;
 118}
 119EXPORT_SYMBOL_GPL(kvmhv_copy_from_guest_radix);
 120
 121long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
 122                               unsigned long n)
 123{
 124        return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
 125}
 126EXPORT_SYMBOL_GPL(kvmhv_copy_to_guest_radix);
 127
 128int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
 129                               struct kvmppc_pte *gpte, u64 root,
 130                               u64 *pte_ret_p)
 131{
 132        struct kvm *kvm = vcpu->kvm;
 133        int ret, level, ps;
 134        unsigned long rts, bits, offset, index;
 135        u64 pte, base, gpa;
 136        __be64 rpte;
 137
 138        rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
 139                ((root & RTS2_MASK) >> RTS2_SHIFT);
 140        bits = root & RPDS_MASK;
 141        base = root & RPDB_MASK;
 142
 143        offset = rts + 31;
 144
 145        /* Current implementations only support 52-bit space */
 146        if (offset != 52)
 147                return -EINVAL;
 148
 149        /* Walk each level of the radix tree */
 150        for (level = 3; level >= 0; --level) {
 151                u64 addr;
 152                /* Check a valid size */
 153                if (level && bits != p9_supported_radix_bits[level])
 154                        return -EINVAL;
 155                if (level == 0 && !(bits == 5 || bits == 9))
 156                        return -EINVAL;
 157                offset -= bits;
 158                index = (eaddr >> offset) & ((1UL << bits) - 1);
 159                /* Check that low bits of page table base are zero */
 160                if (base & ((1UL << (bits + 3)) - 1))
 161                        return -EINVAL;
 162                /* Read the entry from guest memory */
 163                addr = base + (index * sizeof(rpte));
 164                vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 165                ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
 166                srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 167                if (ret) {
 168                        if (pte_ret_p)
 169                                *pte_ret_p = addr;
 170                        return ret;
 171                }
 172                pte = __be64_to_cpu(rpte);
 173                if (!(pte & _PAGE_PRESENT))
 174                        return -ENOENT;
 175                /* Check if a leaf entry */
 176                if (pte & _PAGE_PTE)
 177                        break;
 178                /* Get ready to walk the next level */
 179                base = pte & RPDB_MASK;
 180                bits = pte & RPDS_MASK;
 181        }
 182
 183        /* Need a leaf at lowest level; 512GB pages not supported */
 184        if (level < 0 || level == 3)
 185                return -EINVAL;
 186
 187        /* We found a valid leaf PTE */
 188        /* Offset is now log base 2 of the page size */
 189        gpa = pte & 0x01fffffffffff000ul;
 190        if (gpa & ((1ul << offset) - 1))
 191                return -EINVAL;
 192        gpa |= eaddr & ((1ul << offset) - 1);
 193        for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
 194                if (offset == mmu_psize_defs[ps].shift)
 195                        break;
 196        gpte->page_size = ps;
 197        gpte->page_shift = offset;
 198
 199        gpte->eaddr = eaddr;
 200        gpte->raddr = gpa;
 201
 202        /* Work out permissions */
 203        gpte->may_read = !!(pte & _PAGE_READ);
 204        gpte->may_write = !!(pte & _PAGE_WRITE);
 205        gpte->may_execute = !!(pte & _PAGE_EXEC);
 206
 207        gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
 208
 209        if (pte_ret_p)
 210                *pte_ret_p = pte;
 211
 212        return 0;
 213}
 214
 215/*
 216 * Used to walk a partition or process table radix tree in guest memory
 217 * Note: We exploit the fact that a partition table and a process
 218 * table have the same layout, a partition-scoped page table and a
 219 * process-scoped page table have the same layout, and the 2nd
 220 * doubleword of a partition table entry has the same layout as
 221 * the PTCR register.
 222 */
 223int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
 224                                     struct kvmppc_pte *gpte, u64 table,
 225                                     int table_index, u64 *pte_ret_p)
 226{
 227        struct kvm *kvm = vcpu->kvm;
 228        int ret;
 229        unsigned long size, ptbl, root;
 230        struct prtb_entry entry;
 231
 232        if ((table & PRTS_MASK) > 24)
 233                return -EINVAL;
 234        size = 1ul << ((table & PRTS_MASK) + 12);
 235
 236        /* Is the table big enough to contain this entry? */
 237        if ((table_index * sizeof(entry)) >= size)
 238                return -EINVAL;
 239
 240        /* Read the table to find the root of the radix tree */
 241        ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
 242        vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 243        ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
 244        srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 245        if (ret)
 246                return ret;
 247
 248        /* Root is stored in the first double word */
 249        root = be64_to_cpu(entry.prtb0);
 250
 251        return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
 252}
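
/*
 * Illustrative sketch (not part of the kernel build) of the bounds check
 * above: the low bits of the PTCR-format value give log2(table size) - 12,
 * so a table whose size field is prts holds (1ul << (prts + 12)) / 16
 * entries, each entry being two doublewords.
 */
static inline unsigned long kvm_radix_table_entries_sketch(unsigned long prts)
{
        return (1ul << (prts + 12)) / (2 * sizeof(u64));
}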
 253
 254int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 255                           struct kvmppc_pte *gpte, bool data, bool iswrite)
 256{
 257        u32 pid;
 258        u64 pte;
 259        int ret;
 260
 261        /* Work out effective PID */
 262        switch (eaddr >> 62) {
 263        case 0:
 264                pid = vcpu->arch.pid;
 265                break;
 266        case 3:
 267                pid = 0;
 268                break;
 269        default:
 270                return -EINVAL;
 271        }
 272
 273        ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
 274                                vcpu->kvm->arch.process_table, pid, &pte);
 275        if (ret)
 276                return ret;
 277
 278        /* Check privilege (applies only to process scoped translations) */
 279        if (kvmppc_get_msr(vcpu) & MSR_PR) {
 280                if (pte & _PAGE_PRIVILEGED) {
 281                        gpte->may_read = 0;
 282                        gpte->may_write = 0;
 283                        gpte->may_execute = 0;
 284                }
 285        } else {
 286                if (!(pte & _PAGE_PRIVILEGED)) {
 287                        /* Check AMR/IAMR to see if strict mode is in force */
 288                        if (vcpu->arch.amr & (1ul << 62))
 289                                gpte->may_read = 0;
 290                        if (vcpu->arch.amr & (1ul << 63))
 291                                gpte->may_write = 0;
 292                        if (vcpu->arch.iamr & (1ul << 62))
 293                                gpte->may_execute = 0;
 294                }
 295        }
 296
 297        return 0;
 298}
 299
 300void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
 301                             unsigned int pshift, unsigned int lpid)
 302{
 303        unsigned long psize = PAGE_SIZE;
 304        int psi;
 305        long rc;
 306        unsigned long rb;
 307
 308        if (pshift)
 309                psize = 1UL << pshift;
 310        else
 311                pshift = PAGE_SHIFT;
 312
 313        addr &= ~(psize - 1);
 314
 315        if (!kvmhv_on_pseries()) {
 316                radix__flush_tlb_lpid_page(lpid, addr, psize);
 317                return;
 318        }
 319
 320        psi = shift_to_mmu_psize(pshift);
 321        rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
 322        rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
 323                                lpid, rb);
 324        if (rc)
 325                pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
 326}
 327
 328static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
 329{
 330        long rc;
 331
 332        if (!kvmhv_on_pseries()) {
 333                radix__flush_pwc_lpid(lpid);
 334                return;
 335        }
 336
 337        rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
 338                                lpid, TLBIEL_INVAL_SET_LPID);
 339        if (rc)
 340                pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
 341}
 342
 343static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
 344                                      unsigned long clr, unsigned long set,
 345                                      unsigned long addr, unsigned int shift)
 346{
 347        return __radix_pte_update(ptep, clr, set);
 348}
 349
 350void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
 351                             pte_t *ptep, pte_t pte)
 352{
 353        radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
 354}
 355
 356static struct kmem_cache *kvm_pte_cache;
 357static struct kmem_cache *kvm_pmd_cache;
 358
 359static pte_t *kvmppc_pte_alloc(void)
 360{
 361        pte_t *pte;
 362
 363        pte = kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
 364        /* pmd_populate() will only reference _pa(pte). */
 365        kmemleak_ignore(pte);
 366
 367        return pte;
 368}
 369
 370static void kvmppc_pte_free(pte_t *ptep)
 371{
 372        kmem_cache_free(kvm_pte_cache, ptep);
 373}
 374
 375static pmd_t *kvmppc_pmd_alloc(void)
 376{
 377        pmd_t *pmd;
 378
 379        pmd = kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
 380        /* pud_populate() will only reference _pa(pmd). */
 381        kmemleak_ignore(pmd);
 382
 383        return pmd;
 384}
 385
 386static void kvmppc_pmd_free(pmd_t *pmdp)
 387{
 388        kmem_cache_free(kvm_pmd_cache, pmdp);
 389}
 390
 391/* Called with kvm->mmu_lock held */
 392void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
 393                      unsigned int shift,
 394                      const struct kvm_memory_slot *memslot,
 395                      unsigned int lpid)
 396
 397{
 398        unsigned long old;
 399        unsigned long gfn = gpa >> PAGE_SHIFT;
 400        unsigned long page_size = PAGE_SIZE;
 401        unsigned long hpa;
 402
 403        old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
 404        kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
 405
 406        /* The following only applies to L1 entries */
 407        if (lpid != kvm->arch.lpid)
 408                return;
 409
 410        if (!memslot) {
 411                memslot = gfn_to_memslot(kvm, gfn);
 412                if (!memslot)
 413                        return;
 414        }
 415        if (shift) { /* 1GB or 2MB page */
 416                page_size = 1ul << shift;
 417                if (shift == PMD_SHIFT)
 418                        kvm->stat.num_2M_pages--;
 419                else if (shift == PUD_SHIFT)
 420                        kvm->stat.num_1G_pages--;
 421        }
 422
 423        gpa &= ~(page_size - 1);
 424        hpa = old & PTE_RPN_MASK;
 425        kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
 426
 427        if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
 428                kvmppc_update_dirty_map(memslot, gfn, page_size);
 429}
 430
 431/*
  432 * The kvmppc_free_p?d functions free existing page tables, recursively
  433 * descending to clear and free child tables.
 434 * Callers are responsible for flushing the PWC.
 435 *
 436 * When page tables are being unmapped/freed as part of page fault path
 437 * (full == false), valid ptes are generally not expected; however, there
 438 * is one situation where they arise, which is when dirty page logging is
 439 * turned off for a memslot while the VM is running.  The new memslot
 440 * becomes visible to page faults before the memslot commit function
 441 * gets to flush the memslot, which can lead to a 2MB page mapping being
 442 * installed for a guest physical address where there are already 64kB
 443 * (or 4kB) mappings (of sub-pages of the same 2MB page).
 444 */
 445static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
 446                                  unsigned int lpid)
 447{
 448        if (full) {
 449                memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
 450        } else {
 451                pte_t *p = pte;
 452                unsigned long it;
 453
 454                for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
 455                        if (pte_val(*p) == 0)
 456                                continue;
 457                        kvmppc_unmap_pte(kvm, p,
 458                                         pte_pfn(*p) << PAGE_SHIFT,
 459                                         PAGE_SHIFT, NULL, lpid);
 460                }
 461        }
 462
 463        kvmppc_pte_free(pte);
 464}
 465
 466static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
 467                                  unsigned int lpid)
 468{
 469        unsigned long im;
 470        pmd_t *p = pmd;
 471
 472        for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
 473                if (!pmd_present(*p))
 474                        continue;
 475                if (pmd_is_leaf(*p)) {
 476                        if (full) {
 477                                pmd_clear(p);
 478                        } else {
 479                                WARN_ON_ONCE(1);
 480                                kvmppc_unmap_pte(kvm, (pte_t *)p,
 481                                         pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
 482                                         PMD_SHIFT, NULL, lpid);
 483                        }
 484                } else {
 485                        pte_t *pte;
 486
 487                        pte = pte_offset_map(p, 0);
 488                        kvmppc_unmap_free_pte(kvm, pte, full, lpid);
 489                        pmd_clear(p);
 490                }
 491        }
 492        kvmppc_pmd_free(pmd);
 493}
 494
 495static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
 496                                  unsigned int lpid)
 497{
 498        unsigned long iu;
 499        pud_t *p = pud;
 500
 501        for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
 502                if (!pud_present(*p))
 503                        continue;
 504                if (pud_is_leaf(*p)) {
 505                        pud_clear(p);
 506                } else {
 507                        pmd_t *pmd;
 508
 509                        pmd = pmd_offset(p, 0);
 510                        kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
 511                        pud_clear(p);
 512                }
 513        }
 514        pud_free(kvm->mm, pud);
 515}
 516
 517void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
 518{
 519        unsigned long ig;
 520
 521        for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
 522                p4d_t *p4d = p4d_offset(pgd, 0);
 523                pud_t *pud;
 524
 525                if (!p4d_present(*p4d))
 526                        continue;
 527                pud = pud_offset(p4d, 0);
 528                kvmppc_unmap_free_pud(kvm, pud, lpid);
 529                p4d_clear(p4d);
 530        }
 531}
 532
 533void kvmppc_free_radix(struct kvm *kvm)
 534{
 535        if (kvm->arch.pgtable) {
 536                kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
 537                                          kvm->arch.lpid);
 538                pgd_free(kvm->mm, kvm->arch.pgtable);
 539                kvm->arch.pgtable = NULL;
 540        }
 541}
 542
 543static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
 544                                        unsigned long gpa, unsigned int lpid)
 545{
 546        pte_t *pte = pte_offset_kernel(pmd, 0);
 547
  548        /*
  549         * Clearing the pmd entry and then flushing the PWC ensures that the
  550         * pte page will no longer be cached by the MMU, so it can be freed
  551         * without flushing the PWC again.
  552         */
 553        pmd_clear(pmd);
 554        kvmppc_radix_flush_pwc(kvm, lpid);
 555
 556        kvmppc_unmap_free_pte(kvm, pte, false, lpid);
 557}
 558
 559static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
 560                                        unsigned long gpa, unsigned int lpid)
 561{
 562        pmd_t *pmd = pmd_offset(pud, 0);
 563
  564        /*
  565         * Clearing the pud entry and then flushing the PWC ensures that the
  566         * pmd page and any child pte pages will no longer be cached by the
  567         * MMU, so they can be freed without flushing the PWC again.
  568         */
 569        pud_clear(pud);
 570        kvmppc_radix_flush_pwc(kvm, lpid);
 571
 572        kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
 573}
 574
  575/*
  576 * A number of bits may legitimately differ between different faults on
  577 * the same partition-scoped entry: the R and C bits, in the course of
  578 * cleaning and aging, and the write bit, either because the access was
  579 * upgraded or because a read fault raced with a write fault that set
  580 * those bits first.
  581 */
 582#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
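
/*
 * Illustrative sketch (not part of the kernel build) of the consistency
 * check used at each level below: two PTE values raced in for the same GPA
 * may legitimately differ only in the write, dirty and accessed bits.
 */
static inline bool kvm_radix_ptes_compatible_sketch(unsigned long old_pte,
                                                    unsigned long new_pte)
{
        return ((old_pte ^ new_pte) & PTE_BITS_MUST_MATCH) == 0;
}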
 583
 584int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
 585                      unsigned long gpa, unsigned int level,
 586                      unsigned long mmu_seq, unsigned int lpid,
 587                      unsigned long *rmapp, struct rmap_nested **n_rmap)
 588{
 589        pgd_t *pgd;
 590        p4d_t *p4d;
 591        pud_t *pud, *new_pud = NULL;
 592        pmd_t *pmd, *new_pmd = NULL;
 593        pte_t *ptep, *new_ptep = NULL;
 594        int ret;
 595
 596        /* Traverse the guest's 2nd-level tree, allocate new levels needed */
 597        pgd = pgtable + pgd_index(gpa);
 598        p4d = p4d_offset(pgd, gpa);
 599
 600        pud = NULL;
 601        if (p4d_present(*p4d))
 602                pud = pud_offset(p4d, gpa);
 603        else
 604                new_pud = pud_alloc_one(kvm->mm, gpa);
 605
 606        pmd = NULL;
 607        if (pud && pud_present(*pud) && !pud_is_leaf(*pud))
 608                pmd = pmd_offset(pud, gpa);
 609        else if (level <= 1)
 610                new_pmd = kvmppc_pmd_alloc();
 611
 612        if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_is_leaf(*pmd)))
 613                new_ptep = kvmppc_pte_alloc();
 614
 615        /* Check if we might have been invalidated; let the guest retry if so */
 616        spin_lock(&kvm->mmu_lock);
 617        ret = -EAGAIN;
 618        if (mmu_notifier_retry(kvm, mmu_seq))
 619                goto out_unlock;
 620
 621        /* Now traverse again under the lock and change the tree */
 622        ret = -ENOMEM;
 623        if (p4d_none(*p4d)) {
 624                if (!new_pud)
 625                        goto out_unlock;
 626                p4d_populate(kvm->mm, p4d, new_pud);
 627                new_pud = NULL;
 628        }
 629        pud = pud_offset(p4d, gpa);
 630        if (pud_is_leaf(*pud)) {
 631                unsigned long hgpa = gpa & PUD_MASK;
 632
 633                /* Check if we raced and someone else has set the same thing */
 634                if (level == 2) {
 635                        if (pud_raw(*pud) == pte_raw(pte)) {
 636                                ret = 0;
 637                                goto out_unlock;
 638                        }
 639                        /* Valid 1GB page here already, add our extra bits */
 640                        WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
 641                                                        PTE_BITS_MUST_MATCH);
 642                        kvmppc_radix_update_pte(kvm, (pte_t *)pud,
 643                                              0, pte_val(pte), hgpa, PUD_SHIFT);
 644                        ret = 0;
 645                        goto out_unlock;
 646                }
 647                /*
 648                 * If we raced with another CPU which has just put
 649                 * a 1GB pte in after we saw a pmd page, try again.
 650                 */
 651                if (!new_pmd) {
 652                        ret = -EAGAIN;
 653                        goto out_unlock;
 654                }
 655                /* Valid 1GB page here already, remove it */
 656                kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
 657                                 lpid);
 658        }
 659        if (level == 2) {
 660                if (!pud_none(*pud)) {
 661                        /*
 662                         * There's a page table page here, but we wanted to
 663                         * install a large page, so remove and free the page
 664                         * table page.
 665                         */
 666                        kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
 667                }
 668                kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
 669                if (rmapp && n_rmap)
 670                        kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
 671                ret = 0;
 672                goto out_unlock;
 673        }
 674        if (pud_none(*pud)) {
 675                if (!new_pmd)
 676                        goto out_unlock;
 677                pud_populate(kvm->mm, pud, new_pmd);
 678                new_pmd = NULL;
 679        }
 680        pmd = pmd_offset(pud, gpa);
 681        if (pmd_is_leaf(*pmd)) {
 682                unsigned long lgpa = gpa & PMD_MASK;
 683
 684                /* Check if we raced and someone else has set the same thing */
 685                if (level == 1) {
 686                        if (pmd_raw(*pmd) == pte_raw(pte)) {
 687                                ret = 0;
 688                                goto out_unlock;
 689                        }
 690                        /* Valid 2MB page here already, add our extra bits */
 691                        WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
 692                                                        PTE_BITS_MUST_MATCH);
 693                        kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
 694                                        0, pte_val(pte), lgpa, PMD_SHIFT);
 695                        ret = 0;
 696                        goto out_unlock;
 697                }
 698
 699                /*
 700                 * If we raced with another CPU which has just put
 701                 * a 2MB pte in after we saw a pte page, try again.
 702                 */
 703                if (!new_ptep) {
 704                        ret = -EAGAIN;
 705                        goto out_unlock;
 706                }
 707                /* Valid 2MB page here already, remove it */
 708                kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
 709                                 lpid);
 710        }
 711        if (level == 1) {
 712                if (!pmd_none(*pmd)) {
 713                        /*
 714                         * There's a page table page here, but we wanted to
 715                         * install a large page, so remove and free the page
 716                         * table page.
 717                         */
 718                        kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
 719                }
 720                kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
 721                if (rmapp && n_rmap)
 722                        kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
 723                ret = 0;
 724                goto out_unlock;
 725        }
 726        if (pmd_none(*pmd)) {
 727                if (!new_ptep)
 728                        goto out_unlock;
 729                pmd_populate(kvm->mm, pmd, new_ptep);
 730                new_ptep = NULL;
 731        }
 732        ptep = pte_offset_kernel(pmd, gpa);
 733        if (pte_present(*ptep)) {
 734                /* Check if someone else set the same thing */
 735                if (pte_raw(*ptep) == pte_raw(pte)) {
 736                        ret = 0;
 737                        goto out_unlock;
 738                }
 739                /* Valid page here already, add our extra bits */
 740                WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
 741                                                        PTE_BITS_MUST_MATCH);
 742                kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
 743                ret = 0;
 744                goto out_unlock;
 745        }
 746        kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
 747        if (rmapp && n_rmap)
 748                kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
 749        ret = 0;
 750
 751 out_unlock:
 752        spin_unlock(&kvm->mmu_lock);
 753        if (new_pud)
 754                pud_free(kvm->mm, new_pud);
 755        if (new_pmd)
 756                kvmppc_pmd_free(new_pmd);
 757        if (new_ptep)
 758                kvmppc_pte_free(new_ptep);
 759        return ret;
 760}
 761
 762bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing,
 763                             unsigned long gpa, unsigned int lpid)
 764{
 765        unsigned long pgflags;
 766        unsigned int shift;
 767        pte_t *ptep;
 768
 769        /*
 770         * Need to set an R or C bit in the 2nd-level tables;
 771         * since we are just helping out the hardware here,
 772         * it is sufficient to do what the hardware does.
 773         */
 774        pgflags = _PAGE_ACCESSED;
 775        if (writing)
 776                pgflags |= _PAGE_DIRTY;
 777
 778        if (nested)
 779                ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
 780        else
 781                ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
 782
 783        if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
 784                kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
 785                return true;
 786        }
 787        return false;
 788}
 789
 790int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 791                                   unsigned long gpa,
 792                                   struct kvm_memory_slot *memslot,
 793                                   bool writing, bool kvm_ro,
 794                                   pte_t *inserted_pte, unsigned int *levelp)
 795{
 796        struct kvm *kvm = vcpu->kvm;
 797        struct page *page = NULL;
 798        unsigned long mmu_seq;
 799        unsigned long hva, gfn = gpa >> PAGE_SHIFT;
 800        bool upgrade_write = false;
 801        bool *upgrade_p = &upgrade_write;
 802        pte_t pte, *ptep;
 803        unsigned int shift, level;
 804        int ret;
 805        bool large_enable;
 806
 807        /* used to check for invalidations in progress */
 808        mmu_seq = kvm->mmu_notifier_seq;
 809        smp_rmb();
 810
 811        /*
 812         * Do a fast check first, since __gfn_to_pfn_memslot doesn't
 813         * do it with !atomic && !async, which is how we call it.
 814         * We always ask for write permission since the common case
 815         * is that the page is writable.
 816         */
 817        hva = gfn_to_hva_memslot(memslot, gfn);
 818        if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) {
 819                upgrade_write = true;
 820        } else {
 821                unsigned long pfn;
 822
 823                /* Call KVM generic code to do the slow-path check */
 824                pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
 825                                           writing, upgrade_p);
 826                if (is_error_noslot_pfn(pfn))
 827                        return -EFAULT;
 828                page = NULL;
 829                if (pfn_valid(pfn)) {
 830                        page = pfn_to_page(pfn);
 831                        if (PageReserved(page))
 832                                page = NULL;
 833                }
 834        }
 835
 836        /*
 837         * Read the PTE from the process' radix tree and use that
 838         * so we get the shift and attribute bits.
 839         */
 840        spin_lock(&kvm->mmu_lock);
 841        ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
 842        pte = __pte(0);
 843        if (ptep)
 844                pte = READ_ONCE(*ptep);
 845        spin_unlock(&kvm->mmu_lock);
 846        /*
 847         * If the PTE disappeared temporarily due to a THP
 848         * collapse, just return and let the guest try again.
 849         */
 850        if (!pte_present(pte)) {
 851                if (page)
 852                        put_page(page);
 853                return RESUME_GUEST;
 854        }
 855
 856        /* If we're logging dirty pages, always map single pages */
 857        large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);
 858
 859        /* Get pte level from shift/size */
 860        if (large_enable && shift == PUD_SHIFT &&
 861            (gpa & (PUD_SIZE - PAGE_SIZE)) ==
 862            (hva & (PUD_SIZE - PAGE_SIZE))) {
 863                level = 2;
 864        } else if (large_enable && shift == PMD_SHIFT &&
 865                   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
 866                   (hva & (PMD_SIZE - PAGE_SIZE))) {
 867                level = 1;
 868        } else {
 869                level = 0;
 870                if (shift > PAGE_SHIFT) {
 871                        /*
 872                         * If the pte maps more than one page, bring over
 873                         * bits from the virtual address to get the real
 874                         * address of the specific single page we want.
 875                         */
 876                        unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
 877                        pte = __pte(pte_val(pte) | (hva & rpnmask));
 878                }
 879        }
 880
 881        pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
 882        if (writing || upgrade_write) {
 883                if (pte_val(pte) & _PAGE_WRITE)
 884                        pte = __pte(pte_val(pte) | _PAGE_DIRTY);
 885        } else {
 886                pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
 887        }
 888
 889        /* Allocate space in the tree and write the PTE */
 890        ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
 891                                mmu_seq, kvm->arch.lpid, NULL, NULL);
 892        if (inserted_pte)
 893                *inserted_pte = pte;
 894        if (levelp)
 895                *levelp = level;
 896
 897        if (page) {
 898                if (!ret && (pte_val(pte) & _PAGE_WRITE))
 899                        set_page_dirty_lock(page);
 900                put_page(page);
 901        }
 902
 903        /* Increment number of large pages if we (successfully) inserted one */
 904        if (!ret) {
 905                if (level == 1)
 906                        kvm->stat.num_2M_pages++;
 907                else if (level == 2)
 908                        kvm->stat.num_1G_pages++;
 909        }
 910
 911        return ret;
 912}
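
/*
 * Illustrative sketch (not part of the kernel build) of the level selection
 * above: a 1GB (level 2) or 2MB (level 1) mapping is used only when dirty
 * logging is off, the host PTE covers that size, and gpa and hva are
 * congruent modulo the page size; otherwise a single small page (level 0)
 * is mapped.
 */
static inline unsigned int kvm_radix_map_level_sketch(unsigned long gpa,
                                                      unsigned long hva,
                                                      unsigned int host_shift,
                                                      bool large_enable)
{
        if (large_enable && host_shift == PUD_SHIFT &&
            !((gpa ^ hva) & (PUD_SIZE - PAGE_SIZE)))
                return 2;
        if (large_enable && host_shift == PMD_SHIFT &&
            !((gpa ^ hva) & (PMD_SIZE - PAGE_SIZE)))
                return 1;
        return 0;
}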
 913
 914int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
 915                                   unsigned long ea, unsigned long dsisr)
 916{
 917        struct kvm *kvm = vcpu->kvm;
 918        unsigned long gpa, gfn;
 919        struct kvm_memory_slot *memslot;
 920        long ret;
 921        bool writing = !!(dsisr & DSISR_ISSTORE);
 922        bool kvm_ro = false;
 923
 924        /* Check for unusual errors */
 925        if (dsisr & DSISR_UNSUPP_MMU) {
 926                pr_err("KVM: Got unsupported MMU fault\n");
 927                return -EFAULT;
 928        }
 929        if (dsisr & DSISR_BADACCESS) {
 930                /* Reflect to the guest as DSI */
 931                pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
 932                kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
 933                return RESUME_GUEST;
 934        }
 935
 936        /* Translate the logical address */
 937        gpa = vcpu->arch.fault_gpa & ~0xfffUL;
 938        gpa &= ~0xF000000000000000ul;
 939        gfn = gpa >> PAGE_SHIFT;
 940        if (!(dsisr & DSISR_PRTABLE_FAULT))
 941                gpa |= ea & 0xfff;
 942
 943        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
 944                return kvmppc_send_page_to_uv(kvm, gfn);
 945
 946        /* Get the corresponding memslot */
 947        memslot = gfn_to_memslot(kvm, gfn);
 948
 949        /* No memslot means it's an emulated MMIO region */
 950        if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
 951                if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
 952                             DSISR_SET_RC)) {
 953                        /*
 954                         * Bad address in guest page table tree, or other
 955                         * unusual error - reflect it to the guest as DSI.
 956                         */
 957                        kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
 958                        return RESUME_GUEST;
 959                }
 960                return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
 961        }
 962
 963        if (memslot->flags & KVM_MEM_READONLY) {
 964                if (writing) {
 965                        /* give the guest a DSI */
 966                        kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
 967                                                       DSISR_PROTFAULT);
 968                        return RESUME_GUEST;
 969                }
 970                kvm_ro = true;
 971        }
 972
 973        /* Failed to set the reference/change bits */
 974        if (dsisr & DSISR_SET_RC) {
 975                spin_lock(&kvm->mmu_lock);
 976                if (kvmppc_hv_handle_set_rc(kvm, false, writing,
 977                                            gpa, kvm->arch.lpid))
 978                        dsisr &= ~DSISR_SET_RC;
 979                spin_unlock(&kvm->mmu_lock);
 980
 981                if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
 982                               DSISR_PROTFAULT | DSISR_SET_RC)))
 983                        return RESUME_GUEST;
 984        }
 985
 986        /* Try to insert a pte */
 987        ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
 988                                             kvm_ro, NULL, NULL);
 989
 990        if (ret == 0 || ret == -EAGAIN)
 991                ret = RESUME_GUEST;
 992        return ret;
 993}
 994
 995/* Called with kvm->mmu_lock held */
 996int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 997                    unsigned long gfn)
 998{
 999        pte_t *ptep;
1000        unsigned long gpa = gfn << PAGE_SHIFT;
1001        unsigned int shift;
1002
1003        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
1004                uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
1005                return 0;
1006        }
1007
1008        ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1009        if (ptep && pte_present(*ptep))
1010                kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
1011                                 kvm->arch.lpid);
1012        return 0;
1013}
1014
1015/* Called with kvm->mmu_lock held */
1016int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
1017                  unsigned long gfn)
1018{
1019        pte_t *ptep;
1020        unsigned long gpa = gfn << PAGE_SHIFT;
1021        unsigned int shift;
1022        int ref = 0;
1023        unsigned long old, *rmapp;
1024
1025        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1026                return ref;
1027
1028        ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1029        if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
1030                old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
1031                                              gpa, shift);
1032                /* XXX need to flush tlb here? */
1033                /* Also clear bit in ptes in shadow pgtable for nested guests */
1034                rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1035                kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
1036                                               old & PTE_RPN_MASK,
1037                                               1UL << shift);
1038                ref = 1;
1039        }
1040        return ref;
1041}
1042
1043/* Called with kvm->mmu_lock held */
1044int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
1045                       unsigned long gfn)
1046{
1047        pte_t *ptep;
1048        unsigned long gpa = gfn << PAGE_SHIFT;
1049        unsigned int shift;
1050        int ref = 0;
1051
1052        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1053                return ref;
1054
1055        ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1056        if (ptep && pte_present(*ptep) && pte_young(*ptep))
1057                ref = 1;
1058        return ref;
1059}
1060
1061/* Returns the number of PAGE_SIZE pages that are dirty */
1062static int kvm_radix_test_clear_dirty(struct kvm *kvm,
1063                                struct kvm_memory_slot *memslot, int pagenum)
1064{
1065        unsigned long gfn = memslot->base_gfn + pagenum;
1066        unsigned long gpa = gfn << PAGE_SHIFT;
1067        pte_t *ptep, pte;
1068        unsigned int shift;
1069        int ret = 0;
1070        unsigned long old, *rmapp;
1071
1072        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1073                return ret;
1074
1075        /*
1076         * For performance reasons we don't hold kvm->mmu_lock while walking the
1077         * partition scoped table.
1078         */
1079        ptep = find_kvm_secondary_pte_unlocked(kvm, gpa, &shift);
1080        if (!ptep)
1081                return 0;
1082
1083        pte = READ_ONCE(*ptep);
1084        if (pte_present(pte) && pte_dirty(pte)) {
1085                spin_lock(&kvm->mmu_lock);
 1086                /*
 1087                 * Recheck the pte now that we hold the lock
 1088                 */
1089                if (pte_val(pte) != pte_val(*ptep)) {
 1090                        /*
 1091                         * We have KVM_MEM_LOG_DIRTY_PAGES enabled. Hence we can
 1092                         * only find PAGE_SIZE pte entries here. We can continue
 1093                         * to use the pte addr returned by the above page table
 1094                         * walk.
 1095                         */
1096                        if (!pte_present(*ptep) || !pte_dirty(*ptep)) {
1097                                spin_unlock(&kvm->mmu_lock);
1098                                return 0;
1099                        }
1100                }
1101
1102                ret = 1;
1103                VM_BUG_ON(shift);
1104                old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
1105                                              gpa, shift);
1106                kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
1107                /* Also clear bit in ptes in shadow pgtable for nested guests */
1108                rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1109                kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
1110                                               old & PTE_RPN_MASK,
1111                                               1UL << shift);
1112                spin_unlock(&kvm->mmu_lock);
1113        }
1114        return ret;
1115}
1116
1117long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
1118                        struct kvm_memory_slot *memslot, unsigned long *map)
1119{
1120        unsigned long i, j;
1121        int npages;
1122
1123        for (i = 0; i < memslot->npages; i = j) {
1124                npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
1125
1126                /*
1127                 * Note that if npages > 0 then i must be a multiple of npages,
1128                 * since huge pages are only used to back the guest at guest
1129                 * real addresses that are a multiple of their size.
1130                 * Since we have at most one PTE covering any given guest
1131                 * real address, if npages > 1 we can skip to i + npages.
1132                 */
1133                j = i + 1;
1134                if (npages) {
1135                        set_dirty_bits(map, i, npages);
1136                        j = i + npages;
1137                }
1138        }
1139        return 0;
1140}
1141
1142void kvmppc_radix_flush_memslot(struct kvm *kvm,
1143                                const struct kvm_memory_slot *memslot)
1144{
1145        unsigned long n;
1146        pte_t *ptep;
1147        unsigned long gpa;
1148        unsigned int shift;
1149
1150        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
1151                kvmppc_uvmem_drop_pages(memslot, kvm, true);
1152
1153        if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
1154                return;
1155
1156        gpa = memslot->base_gfn << PAGE_SHIFT;
1157        spin_lock(&kvm->mmu_lock);
1158        for (n = memslot->npages; n; --n) {
1159                ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
1160                if (ptep && pte_present(*ptep))
1161                        kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
1162                                         kvm->arch.lpid);
1163                gpa += PAGE_SIZE;
1164        }
1165        /*
1166         * Increase the mmu notifier sequence number to prevent any page
1167         * fault that read the memslot earlier from writing a PTE.
1168         */
1169        kvm->mmu_notifier_seq++;
1170        spin_unlock(&kvm->mmu_lock);
1171}
1172
1173static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
1174                                 int psize, int *indexp)
1175{
1176        if (!mmu_psize_defs[psize].shift)
1177                return;
1178        info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
1179                (mmu_psize_defs[psize].ap << 29);
1180        ++(*indexp);
1181}
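
/*
 * Illustrative sketch (not part of the kernel build): decoding one
 * ap_encodings[] word built above, which packs the log2 page size in the
 * low bits and the 3-bit AP value in bits 29-31.
 */
static inline void kvm_radix_decode_ap_sketch(u32 enc, unsigned int *page_shift,
                                              unsigned int *ap)
{
        *page_shift = enc & ((1u << 29) - 1);   /* in practice at most 30 */
        *ap = enc >> 29;
}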
1182
1183int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
1184{
1185        int i;
1186
1187        if (!radix_enabled())
1188                return -EINVAL;
1189        memset(info, 0, sizeof(*info));
1190
1191        /* 4k page size */
1192        info->geometries[0].page_shift = 12;
1193        info->geometries[0].level_bits[0] = 9;
1194        for (i = 1; i < 4; ++i)
1195                info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
1196        /* 64k page size */
1197        info->geometries[1].page_shift = 16;
1198        for (i = 0; i < 4; ++i)
1199                info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];
1200
1201        i = 0;
1202        add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
1203        add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
1204        add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
1205        add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);
1206
1207        return 0;
1208}
1209
1210int kvmppc_init_vm_radix(struct kvm *kvm)
1211{
1212        kvm->arch.pgtable = pgd_alloc(kvm->mm);
1213        if (!kvm->arch.pgtable)
1214                return -ENOMEM;
1215        return 0;
1216}
1217
1218static void pte_ctor(void *addr)
1219{
1220        memset(addr, 0, RADIX_PTE_TABLE_SIZE);
1221}
1222
1223static void pmd_ctor(void *addr)
1224{
1225        memset(addr, 0, RADIX_PMD_TABLE_SIZE);
1226}
1227
1228struct debugfs_radix_state {
1229        struct kvm      *kvm;
1230        struct mutex    mutex;
1231        unsigned long   gpa;
1232        int             lpid;
1233        int             chars_left;
1234        int             buf_index;
1235        char            buf[128];
1236        u8              hdr;
1237};
1238
1239static int debugfs_radix_open(struct inode *inode, struct file *file)
1240{
1241        struct kvm *kvm = inode->i_private;
1242        struct debugfs_radix_state *p;
1243
1244        p = kzalloc(sizeof(*p), GFP_KERNEL);
1245        if (!p)
1246                return -ENOMEM;
1247
1248        kvm_get_kvm(kvm);
1249        p->kvm = kvm;
1250        mutex_init(&p->mutex);
1251        file->private_data = p;
1252
1253        return nonseekable_open(inode, file);
1254}
1255
1256static int debugfs_radix_release(struct inode *inode, struct file *file)
1257{
1258        struct debugfs_radix_state *p = file->private_data;
1259
1260        kvm_put_kvm(p->kvm);
1261        kfree(p);
1262        return 0;
1263}
1264
1265static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
1266                                 size_t len, loff_t *ppos)
1267{
1268        struct debugfs_radix_state *p = file->private_data;
1269        ssize_t ret, r;
1270        unsigned long n;
1271        struct kvm *kvm;
1272        unsigned long gpa;
1273        pgd_t *pgt;
1274        struct kvm_nested_guest *nested;
1275        pgd_t *pgdp;
1276        p4d_t p4d, *p4dp;
1277        pud_t pud, *pudp;
1278        pmd_t pmd, *pmdp;
1279        pte_t *ptep;
1280        int shift;
1281        unsigned long pte;
1282
1283        kvm = p->kvm;
1284        if (!kvm_is_radix(kvm))
1285                return 0;
1286
1287        ret = mutex_lock_interruptible(&p->mutex);
1288        if (ret)
1289                return ret;
1290
1291        if (p->chars_left) {
1292                n = p->chars_left;
1293                if (n > len)
1294                        n = len;
1295                r = copy_to_user(buf, p->buf + p->buf_index, n);
1296                n -= r;
1297                p->chars_left -= n;
1298                p->buf_index += n;
1299                buf += n;
1300                len -= n;
1301                ret = n;
1302                if (r) {
1303                        if (!n)
1304                                ret = -EFAULT;
1305                        goto out;
1306                }
1307        }
1308
1309        gpa = p->gpa;
1310        nested = NULL;
1311        pgt = NULL;
1312        while (len != 0 && p->lpid >= 0) {
1313                if (gpa >= RADIX_PGTABLE_RANGE) {
1314                        gpa = 0;
1315                        pgt = NULL;
1316                        if (nested) {
1317                                kvmhv_put_nested(nested);
1318                                nested = NULL;
1319                        }
1320                        p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
1321                        p->hdr = 0;
1322                        if (p->lpid < 0)
1323                                break;
1324                }
1325                if (!pgt) {
1326                        if (p->lpid == 0) {
1327                                pgt = kvm->arch.pgtable;
1328                        } else {
1329                                nested = kvmhv_get_nested(kvm, p->lpid, false);
1330                                if (!nested) {
1331                                        gpa = RADIX_PGTABLE_RANGE;
1332                                        continue;
1333                                }
1334                                pgt = nested->shadow_pgtable;
1335                        }
1336                }
1337                n = 0;
1338                if (!p->hdr) {
1339                        if (p->lpid > 0)
1340                                n = scnprintf(p->buf, sizeof(p->buf),
1341                                              "\nNested LPID %d: ", p->lpid);
1342                        n += scnprintf(p->buf + n, sizeof(p->buf) - n,
1343                                      "pgdir: %lx\n", (unsigned long)pgt);
1344                        p->hdr = 1;
1345                        goto copy;
1346                }
1347
1348                pgdp = pgt + pgd_index(gpa);
1349                p4dp = p4d_offset(pgdp, gpa);
1350                p4d = READ_ONCE(*p4dp);
1351                if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
1352                        gpa = (gpa & P4D_MASK) + P4D_SIZE;
1353                        continue;
1354                }
1355
1356                pudp = pud_offset(&p4d, gpa);
1357                pud = READ_ONCE(*pudp);
1358                if (!(pud_val(pud) & _PAGE_PRESENT)) {
1359                        gpa = (gpa & PUD_MASK) + PUD_SIZE;
1360                        continue;
1361                }
1362                if (pud_val(pud) & _PAGE_PTE) {
1363                        pte = pud_val(pud);
1364                        shift = PUD_SHIFT;
1365                        goto leaf;
1366                }
1367
1368                pmdp = pmd_offset(&pud, gpa);
1369                pmd = READ_ONCE(*pmdp);
1370                if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
1371                        gpa = (gpa & PMD_MASK) + PMD_SIZE;
1372                        continue;
1373                }
1374                if (pmd_val(pmd) & _PAGE_PTE) {
1375                        pte = pmd_val(pmd);
1376                        shift = PMD_SHIFT;
1377                        goto leaf;
1378                }
1379
1380                ptep = pte_offset_kernel(&pmd, gpa);
1381                pte = pte_val(READ_ONCE(*ptep));
1382                if (!(pte & _PAGE_PRESENT)) {
1383                        gpa += PAGE_SIZE;
1384                        continue;
1385                }
1386                shift = PAGE_SHIFT;
1387        leaf:
1388                n = scnprintf(p->buf, sizeof(p->buf),
1389                              " %lx: %lx %d\n", gpa, pte, shift);
1390                gpa += 1ul << shift;
1391        copy:
1392                p->chars_left = n;
1393                if (n > len)
1394                        n = len;
1395                r = copy_to_user(buf, p->buf, n);
1396                n -= r;
1397                p->chars_left -= n;
1398                p->buf_index = n;
1399                buf += n;
1400                len -= n;
1401                ret += n;
1402                if (r) {
1403                        if (!ret)
1404                                ret = -EFAULT;
1405                        break;
1406                }
1407        }
1408        p->gpa = gpa;
1409        if (nested)
1410                kvmhv_put_nested(nested);
1411
1412 out:
1413        mutex_unlock(&p->mutex);
1414        return ret;
1415}
1416
1417static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
1418                           size_t len, loff_t *ppos)
1419{
1420        return -EACCES;
1421}
1422
1423static const struct file_operations debugfs_radix_fops = {
1424        .owner   = THIS_MODULE,
1425        .open    = debugfs_radix_open,
1426        .release = debugfs_radix_release,
1427        .read    = debugfs_radix_read,
1428        .write   = debugfs_radix_write,
1429        .llseek  = generic_file_llseek,
1430};
1431
1432void kvmhv_radix_debugfs_init(struct kvm *kvm)
1433{
1434        debugfs_create_file("radix", 0400, kvm->arch.debugfs_dir, kvm,
1435                            &debugfs_radix_fops);
1436}
1437
1438int kvmppc_radix_init(void)
1439{
1440        unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
1441
1442        kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
1443        if (!kvm_pte_cache)
1444                return -ENOMEM;
1445
1446        size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;
1447
1448        kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
1449        if (!kvm_pmd_cache) {
1450                kmem_cache_destroy(kvm_pte_cache);
1451                return -ENOMEM;
1452        }
1453
1454        return 0;
1455}
1456
1457void kvmppc_radix_exit(void)
1458{
1459        kmem_cache_destroy(kvm_pte_cache);
1460        kmem_cache_destroy(kvm_pmd_cache);
1461}
1462