linux/arch/powerpc/kvm/book3s_hv_rm_mmu.c
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * Copyright 2010-2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/hugetlb.h>
#include <linux/module.h>

#include <asm/tlbflush.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu-hash64.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>

/* Translate address of a vmalloc'd thing to a linear map address */
static void *real_vmalloc_addr(void *x)
{
        unsigned long addr = (unsigned long) x;
        pte_t *p;

        p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
        if (!p || !pte_present(*p))
                return NULL;
        /* assume we don't have huge pages in vmalloc space... */
        addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
        return __va(addr);
}

/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
static int global_invalidates(struct kvm *kvm, unsigned long flags)
{
        int global;

        /*
         * If there is only one vcore, and it's currently running,
         * we can use tlbiel as long as we mark all other physical
         * cores as potentially having stale TLB entries for this lpid.
         * If we're not using MMU notifiers, we never take pages away
         * from the guest, so we can use tlbiel if requested.
         * Otherwise, don't use tlbiel.
         */
        if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore)
                global = 0;
        else if (kvm->arch.using_mmu_notifiers)
                global = 1;
        else
                global = !(flags & H_LOCAL);

        if (!global) {
                /* any other core might now have stale TLB entries... */
                smp_wmb();
                cpumask_setall(&kvm->arch.need_tlb_flush);
                cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
                                  &kvm->arch.need_tlb_flush);
        }

        return global;
}

/*
 * Add this HPTE into the chain for the real page.
 * Must be called with the chain locked; it unlocks the chain.
 */
void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
                             unsigned long *rmap, long pte_index, int realmode)
{
        struct revmap_entry *head, *tail;
        unsigned long i;

        if (*rmap & KVMPPC_RMAP_PRESENT) {
                i = *rmap & KVMPPC_RMAP_INDEX;
                head = &kvm->arch.revmap[i];
                if (realmode)
                        head = real_vmalloc_addr(head);
                tail = &kvm->arch.revmap[head->back];
                if (realmode)
                        tail = real_vmalloc_addr(tail);
                rev->forw = i;
                rev->back = head->back;
                tail->forw = pte_index;
                head->back = pte_index;
        } else {
                rev->forw = rev->back = pte_index;
                *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) |
                        pte_index | KVMPPC_RMAP_PRESENT;
        }
        unlock_rmap(rmap);
}
EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);

/* Remove this HPTE from the chain for a real page */
static void remove_revmap_chain(struct kvm *kvm, long pte_index,
                                struct revmap_entry *rev,
                                unsigned long hpte_v, unsigned long hpte_r)
{
        struct revmap_entry *next, *prev;
        unsigned long gfn, ptel, head;
        struct kvm_memory_slot *memslot;
        unsigned long *rmap;
        unsigned long rcbits;

        rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
        ptel = rev->guest_rpte |= rcbits;
        gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
        memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
        if (!memslot)
                return;

        rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
        lock_rmap(rmap);

        head = *rmap & KVMPPC_RMAP_INDEX;
        next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]);
        prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]);
        next->back = rev->back;
        prev->forw = rev->forw;
        if (head == pte_index) {
                head = rev->forw;
                if (head == pte_index)
                        *rmap &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
                else
                        *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head;
        }
        *rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
        unlock_rmap(rmap);
}

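/*
 * Look up the Linux PTE that maps host virtual address 'hva' in 'pgdir'.
 * On entry *pte_sizep is the minimum page size the caller needs; on
 * return it is set to the size of the backing page.  Returns a zero PTE
 * if no PTE is present or if the backing page is smaller than requested;
 * otherwise returns the PTE as read and updated for this access by
 * kvmppc_read_update_linux_pte().
 */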
static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
                              int writing, unsigned long *pte_sizep)
{
        pte_t *ptep;
        unsigned long ps = *pte_sizep;
        unsigned int hugepage_shift;

        ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift);
        if (!ptep)
                return __pte(0);
        if (hugepage_shift)
                *pte_sizep = 1ul << hugepage_shift;
        else
                *pte_sizep = PAGE_SIZE;
        if (ps > *pte_sizep)
                return __pte(0);
        return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
}

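/*
 * Unlock an HPTE by rewriting its first doubleword with a value that has
 * HPTE_V_HVLOCK clear.  The release barrier ensures that all earlier
 * updates to the entry are visible before the lock bit disappears.
 */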
static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v)
{
        asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
        hpte[0] = hpte_v;
}

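/*
 * Insert an HPTE on behalf of the guest (the guts of the H_ENTER
 * hypercall): validate the proposed entry, translate the guest real
 * address through the memslot (or mark the entry for MMIO emulation),
 * find and lock a slot in the HPT group, link the entry into the
 * reverse-map chain and finally write the HPTE.  'realmode' indicates
 * whether we are running in real mode, in which case vmalloc'd
 * structures have to be accessed through real_vmalloc_addr().
 * The index actually used is returned via *pte_idx_ret.
 */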
long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
                       long pte_index, unsigned long pteh, unsigned long ptel,
                       pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
{
        unsigned long i, pa, gpa, gfn, psize;
        unsigned long slot_fn, hva;
        unsigned long *hpte;
        struct revmap_entry *rev;
        unsigned long g_ptel;
        struct kvm_memory_slot *memslot;
        unsigned long *physp, pte_size;
        unsigned long is_io;
        unsigned long *rmap;
        pte_t pte;
        unsigned int writing;
        unsigned long mmu_seq;
        unsigned long rcbits;

        psize = hpte_page_size(pteh, ptel);
        if (!psize)
                return H_PARAMETER;
        writing = hpte_is_writable(ptel);
        pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
        ptel &= ~HPTE_GR_RESERVED;
        g_ptel = ptel;

        /* used later to detect if we might have been invalidated */
        mmu_seq = kvm->mmu_notifier_seq;
        smp_rmb();

        /* Find the memslot (if any) for this address */
        gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
        gfn = gpa >> PAGE_SHIFT;
        memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
        pa = 0;
        is_io = ~0ul;
        rmap = NULL;
        if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) {
                /* PPC970 can't do emulated MMIO */
                if (!cpu_has_feature(CPU_FTR_ARCH_206))
                        return H_PARAMETER;
                /* Emulated MMIO - mark this with key=31 */
                pteh |= HPTE_V_ABSENT;
                ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO;
                goto do_insert;
        }

        /* Check if the requested page fits entirely in the memslot. */
        if (!slot_is_aligned(memslot, psize))
                return H_PARAMETER;
        slot_fn = gfn - memslot->base_gfn;
        rmap = &memslot->arch.rmap[slot_fn];

        if (!kvm->arch.using_mmu_notifiers) {
                physp = memslot->arch.slot_phys;
                if (!physp)
                        return H_PARAMETER;
                physp += slot_fn;
                if (realmode)
                        physp = real_vmalloc_addr(physp);
                pa = *physp;
                if (!pa)
                        return H_TOO_HARD;
                is_io = pa & (HPTE_R_I | HPTE_R_W);
                pte_size = PAGE_SIZE << (pa & KVMPPC_PAGE_ORDER_MASK);
                pa &= PAGE_MASK;
        } else {
                /* Translate to host virtual address */
                hva = __gfn_to_hva_memslot(memslot, gfn);

                /* Look up the Linux PTE for the backing page */
                pte_size = psize;
                pte = lookup_linux_pte(pgdir, hva, writing, &pte_size);
                if (pte_present(pte)) {
                        if (writing && !pte_write(pte))
                                /* make the actual HPTE be read-only */
                                ptel = hpte_make_readonly(ptel);
                        is_io = hpte_cache_bits(pte_val(pte));
                        pa = pte_pfn(pte) << PAGE_SHIFT;
                }
        }

        if (pte_size < psize)
                return H_PARAMETER;
        if (pa && pte_size > psize)
                pa |= gpa & (pte_size - 1);

        ptel &= ~(HPTE_R_PP0 - psize);
        ptel |= pa;

        if (pa)
                pteh |= HPTE_V_VALID;
        else
                pteh |= HPTE_V_ABSENT;

        /* Check WIMG */
        if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) {
                if (is_io)
                        return H_PARAMETER;
                /*
                 * Allow guest to map emulated device memory as
                 * uncacheable, but actually make it cacheable.
                 */
                ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G);
                ptel |= HPTE_R_M;
        }

        /* Find and lock the HPTEG slot to use */
 do_insert:
        if (pte_index >= kvm->arch.hpt_npte)
                return H_PARAMETER;
        if (likely((flags & H_EXACT) == 0)) {
                pte_index &= ~7UL;
                hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
                for (i = 0; i < 8; ++i) {
                        if ((*hpte & HPTE_V_VALID) == 0 &&
                            try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
                                          HPTE_V_ABSENT))
                                break;
                        hpte += 2;
                }
                if (i == 8) {
                        /*
                         * Since try_lock_hpte doesn't retry (not even stdcx.
                         * failures), it could be that there is a free slot
                         * but we transiently failed to lock it.  Try again,
                         * actually locking each slot and checking it.
                         */
                        hpte -= 16;
                        for (i = 0; i < 8; ++i) {
                                while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
                                        cpu_relax();
                                if (!(*hpte & (HPTE_V_VALID | HPTE_V_ABSENT)))
                                        break;
                                *hpte &= ~HPTE_V_HVLOCK;
                                hpte += 2;
                        }
                        if (i == 8)
                                return H_PTEG_FULL;
                }
                pte_index += i;
        } else {
                hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
                if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
                                   HPTE_V_ABSENT)) {
                        /* Lock the slot and check again */
                        while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
                                cpu_relax();
                        if (*hpte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
                                *hpte &= ~HPTE_V_HVLOCK;
                                return H_PTEG_FULL;
                        }
                }
        }

        /* Save away the guest's idea of the second HPTE dword */
        rev = &kvm->arch.revmap[pte_index];
        if (realmode)
                rev = real_vmalloc_addr(rev);
        if (rev) {
                rev->guest_rpte = g_ptel;
                note_hpte_modification(kvm, rev);
        }

        /* Link HPTE into reverse-map chain */
        if (pteh & HPTE_V_VALID) {
                if (realmode)
                        rmap = real_vmalloc_addr(rmap);
                lock_rmap(rmap);
                /* Check for pending invalidations under the rmap chain lock */
                if (kvm->arch.using_mmu_notifiers &&
                    mmu_notifier_retry(kvm, mmu_seq)) {
                        /* inval in progress, write a non-present HPTE */
                        pteh |= HPTE_V_ABSENT;
                        pteh &= ~HPTE_V_VALID;
                        unlock_rmap(rmap);
                } else {
                        kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index,
                                                realmode);
                        /* Only set R/C in real HPTE if already set in *rmap */
                        rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
                        ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C);
                }
        }

        hpte[1] = ptel;

        /* Write the first HPTE dword, unlocking the HPTE and making it valid */
        eieio();
        hpte[0] = pteh;
        asm volatile("ptesync" : : : "memory");

        *pte_idx_ret = pte_index;
        return H_SUCCESS;
}
EXPORT_SYMBOL_GPL(kvmppc_do_h_enter);

long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
                    long pte_index, unsigned long pteh, unsigned long ptel)
{
        return kvmppc_do_h_enter(vcpu->kvm, flags, pte_index, pteh, ptel,
                                 vcpu->arch.pgdir, true, &vcpu->arch.gpr[4]);
}

#ifdef __BIG_ENDIAN__
#define LOCK_TOKEN      (*(u32 *)(&get_paca()->lock_token))
#else
#define LOCK_TOKEN      (*(u32 *)(&get_paca()->paca_index))
#endif

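/*
 * Make a single attempt to take a tlbie lock word (kvm->arch.tlbie_lock)
 * using lwarx/stwcx., storing this CPU's LOCK_TOKEN as the owner.
 * Returns 1 if the lock was acquired, 0 otherwise; callers spin with
 * cpu_relax() between attempts.
 */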
static inline int try_lock_tlbie(unsigned int *lock)
{
        unsigned int tmp, old;
        unsigned int token = LOCK_TOKEN;

        asm volatile("1:lwarx   %1,0,%2\n"
                     "  cmpwi   cr0,%1,0\n"
                     "  bne     2f\n"
                     "  stwcx.  %3,0,%2\n"
                     "  bne-    1b\n"
                     "  isync\n"
                     "2:"
                     : "=&r" (tmp), "=&r" (old)
                     : "r" (lock), "r" (token)
                     : "cc", "memory");
        return old == 0;
}

/*
 * tlbie/tlbiel is a bit different on the PPC970 compared to later
 * processors such as POWER7; the large page bit is in the instruction
 * not RB, and the top 16 bits and the bottom 12 bits of the VA
 * in RB must be 0.
 */
static void do_tlbies_970(struct kvm *kvm, unsigned long *rbvalues,
                          long npages, int global, bool need_sync)
{
        long i;

        if (global) {
                while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
                        cpu_relax();
                if (need_sync)
                        asm volatile("ptesync" : : : "memory");
                for (i = 0; i < npages; ++i) {
                        unsigned long rb = rbvalues[i];

                        if (rb & 1)             /* large page */
                                asm volatile("tlbie %0,1" : :
                                             "r" (rb & 0x0000fffffffff000ul));
                        else
                                asm volatile("tlbie %0,0" : :
                                             "r" (rb & 0x0000fffffffff000ul));
                }
                asm volatile("eieio; tlbsync; ptesync" : : : "memory");
                kvm->arch.tlbie_lock = 0;
        } else {
                if (need_sync)
                        asm volatile("ptesync" : : : "memory");
                for (i = 0; i < npages; ++i) {
                        unsigned long rb = rbvalues[i];

                        if (rb & 1)             /* large page */
                                asm volatile("tlbiel %0,1" : :
                                             "r" (rb & 0x0000fffffffff000ul));
                        else
                                asm volatile("tlbiel %0,0" : :
                                             "r" (rb & 0x0000fffffffff000ul));
                }
                asm volatile("ptesync" : : : "memory");
        }
}

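/*
 * Invalidate the translations described by the RB values in rbvalues[].
 * If 'global', broadcast tlbie instructions while holding the per-VM
 * tlbie lock; otherwise use tlbiel on this CPU only.  'need_sync'
 * selects whether a ptesync is done before the invalidations.
 */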
static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
                      long npages, int global, bool need_sync)
{
        long i;

        if (cpu_has_feature(CPU_FTR_ARCH_201)) {
                /* PPC970 tlbie instruction is a bit different */
                do_tlbies_970(kvm, rbvalues, npages, global, need_sync);
                return;
        }
        if (global) {
                while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
                        cpu_relax();
                if (need_sync)
                        asm volatile("ptesync" : : : "memory");
                for (i = 0; i < npages; ++i)
                        asm volatile(PPC_TLBIE(%1,%0) : :
                                     "r" (rbvalues[i]), "r" (kvm->arch.lpid));
                asm volatile("eieio; tlbsync; ptesync" : : : "memory");
                kvm->arch.tlbie_lock = 0;
        } else {
                if (need_sync)
                        asm volatile("ptesync" : : : "memory");
                for (i = 0; i < npages; ++i)
                        asm volatile("tlbiel %0" : : "r" (rbvalues[i]));
                asm volatile("ptesync" : : : "memory");
        }
}

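/*
 * Remove an HPTE (the guts of the H_REMOVE hypercall): lock the entry,
 * check the AVPN / ANDCOND conditions, invalidate the translation if it
 * was valid and unlink it from the reverse-map chain.  The old first
 * doubleword and the guest view of the second doubleword are returned
 * in hpret[0] and hpret[1].
 */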
long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
                        unsigned long pte_index, unsigned long avpn,
                        unsigned long *hpret)
{
        unsigned long *hpte;
        unsigned long v, r, rb;
        struct revmap_entry *rev;

        if (pte_index >= kvm->arch.hpt_npte)
                return H_PARAMETER;
        hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
        while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
                cpu_relax();
        if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
            ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) ||
            ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) {
                hpte[0] &= ~HPTE_V_HVLOCK;
                return H_NOT_FOUND;
        }

        rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
        v = hpte[0] & ~HPTE_V_HVLOCK;
        if (v & HPTE_V_VALID) {
                hpte[0] &= ~HPTE_V_VALID;
                rb = compute_tlbie_rb(v, hpte[1], pte_index);
                do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
                /* Read PTE low word after tlbie to get final R/C values */
                remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]);
        }
        r = rev->guest_rpte & ~HPTE_GR_RESERVED;
        note_hpte_modification(kvm, rev);
        unlock_hpte(hpte, 0);

        hpret[0] = v;
        hpret[1] = r;
        return H_SUCCESS;
}
EXPORT_SYMBOL_GPL(kvmppc_do_h_remove);

long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
                     unsigned long pte_index, unsigned long avpn)
{
        return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn,
                                  &vcpu->arch.gpr[4]);
}

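/*
 * H_BULK_REMOVE: process up to four remove requests passed in
 * vcpu->arch.gpr[4..11] (one flags/index word and one AVPN/ANDCOND word
 * per request), batching the TLB invalidations.  The return code for
 * each request is written back into the top byte of its flags/index
 * word, along with the entry's R and C bits.
 */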
long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
{
        struct kvm *kvm = vcpu->kvm;
        unsigned long *args = &vcpu->arch.gpr[4];
        unsigned long *hp, *hptes[4], tlbrb[4];
        long int i, j, k, n, found, indexes[4];
        unsigned long flags, req, pte_index, rcbits;
        int global;
        long int ret = H_SUCCESS;
        struct revmap_entry *rev, *revs[4];

        global = global_invalidates(kvm, 0);
        for (i = 0; i < 4 && ret == H_SUCCESS; ) {
                n = 0;
                for (; i < 4; ++i) {
                        j = i * 2;
                        pte_index = args[j];
                        flags = pte_index >> 56;
                        pte_index &= ((1ul << 56) - 1);
                        req = flags >> 6;
                        flags &= 3;
                        if (req == 3) {         /* no more requests */
                                i = 4;
                                break;
                        }
                        if (req != 1 || flags == 3 ||
                            pte_index >= kvm->arch.hpt_npte) {
                                /* parameter error */
                                args[j] = ((0xa0 | flags) << 56) + pte_index;
                                ret = H_PARAMETER;
                                break;
                        }
                        hp = (unsigned long *)
                                (kvm->arch.hpt_virt + (pte_index << 4));
                        /* to avoid deadlock, don't spin except for first */
                        if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) {
                                if (n)
                                        break;
                                while (!try_lock_hpte(hp, HPTE_V_HVLOCK))
                                        cpu_relax();
                        }
                        found = 0;
                        if (hp[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) {
                                switch (flags & 3) {
                                case 0:         /* absolute */
                                        found = 1;
                                        break;
                                case 1:         /* andcond */
                                        if (!(hp[0] & args[j + 1]))
                                                found = 1;
                                        break;
                                case 2:         /* AVPN */
                                        if ((hp[0] & ~0x7fUL) == args[j + 1])
                                                found = 1;
                                        break;
                                }
                        }
                        if (!found) {
                                hp[0] &= ~HPTE_V_HVLOCK;
                                args[j] = ((0x90 | flags) << 56) + pte_index;
                                continue;
                        }

                        args[j] = ((0x80 | flags) << 56) + pte_index;
                        rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
                        note_hpte_modification(kvm, rev);

                        if (!(hp[0] & HPTE_V_VALID)) {
                                /* insert R and C bits from PTE */
                                rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
                                args[j] |= rcbits << (56 - 5);
                                hp[0] = 0;
                                continue;
                        }

                        hp[0] &= ~HPTE_V_VALID;         /* leave it locked */
                        tlbrb[n] = compute_tlbie_rb(hp[0], hp[1], pte_index);
                        indexes[n] = j;
                        hptes[n] = hp;
                        revs[n] = rev;
                        ++n;
                }

                if (!n)
                        break;

                /* Now that we've collected a batch, do the tlbies */
                do_tlbies(kvm, tlbrb, n, global, true);

                /* Read PTE low words after tlbie to get final R/C values */
                for (k = 0; k < n; ++k) {
                        j = indexes[k];
                        pte_index = args[j] & ((1ul << 56) - 1);
                        hp = hptes[k];
                        rev = revs[k];
                        remove_revmap_chain(kvm, pte_index, rev, hp[0], hp[1]);
                        rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
                        args[j] |= rcbits << (56 - 5);
                        hp[0] = 0;
                }
        }

        return ret;
}

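/*
 * H_PROTECT: update the pp, key and N bits of an existing HPTE and of
 * the guest's view of it.  If the entry is valid the old translation is
 * invalidated first, and if the guest asks for write access to a page
 * that the host has mapped read-only (when MMU notifiers are in use),
 * the new permissions are reduced to read-only.
 */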
long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
                      unsigned long pte_index, unsigned long avpn,
                      unsigned long va)
{
        struct kvm *kvm = vcpu->kvm;
        unsigned long *hpte;
        struct revmap_entry *rev;
        unsigned long v, r, rb, mask, bits;

        if (pte_index >= kvm->arch.hpt_npte)
                return H_PARAMETER;

        hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
        while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
                cpu_relax();
        if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
            ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) {
                hpte[0] &= ~HPTE_V_HVLOCK;
                return H_NOT_FOUND;
        }

        v = hpte[0];
        bits = (flags << 55) & HPTE_R_PP0;
        bits |= (flags << 48) & HPTE_R_KEY_HI;
        bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);

        /* Update guest view of 2nd HPTE dword */
        mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
                HPTE_R_KEY_HI | HPTE_R_KEY_LO;
        rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
        if (rev) {
                r = (rev->guest_rpte & ~mask) | bits;
                rev->guest_rpte = r;
                note_hpte_modification(kvm, rev);
        }
        r = (hpte[1] & ~mask) | bits;

        /* Update HPTE */
        if (v & HPTE_V_VALID) {
                rb = compute_tlbie_rb(v, r, pte_index);
                hpte[0] = v & ~HPTE_V_VALID;
                do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
                /*
                 * If the host has this page as readonly but the guest
                 * wants to make it read/write, reduce the permissions.
                 * Checking the host permissions involves finding the
                 * memslot and then the Linux PTE for the page.
                 */
                if (hpte_is_writable(r) && kvm->arch.using_mmu_notifiers) {
                        unsigned long psize, gfn, hva;
                        struct kvm_memory_slot *memslot;
                        pgd_t *pgdir = vcpu->arch.pgdir;
                        pte_t pte;

                        psize = hpte_page_size(v, r);
                        gfn = ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
                        memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
                        if (memslot) {
                                hva = __gfn_to_hva_memslot(memslot, gfn);
                                pte = lookup_linux_pte(pgdir, hva, 1, &psize);
                                if (pte_present(pte) && !pte_write(pte))
                                        r = hpte_make_readonly(r);
                        }
                }
        }
        hpte[1] = r;
        eieio();
        hpte[0] = v & ~HPTE_V_HVLOCK;
        asm volatile("ptesync" : : : "memory");
        return H_SUCCESS;
}

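/*
 * H_READ: copy one HPTE (or four, if H_READ_4 is set) into gpr[4]
 * onwards.  Entries that are absent (paged out) are reported as valid,
 * and the guest's view of the second doubleword is reconstructed from
 * the saved guest_rpte plus the current R and C bits.
 */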
long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
                   unsigned long pte_index)
{
        struct kvm *kvm = vcpu->kvm;
        unsigned long *hpte, v, r;
        int i, n = 1;
        struct revmap_entry *rev = NULL;

        if (pte_index >= kvm->arch.hpt_npte)
                return H_PARAMETER;
        if (flags & H_READ_4) {
                pte_index &= ~3;
                n = 4;
        }
        rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
        for (i = 0; i < n; ++i, ++pte_index) {
                hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
                v = hpte[0] & ~HPTE_V_HVLOCK;
                r = hpte[1];
                if (v & HPTE_V_ABSENT) {
                        v &= ~HPTE_V_ABSENT;
                        v |= HPTE_V_VALID;
                }
                if (v & HPTE_V_VALID) {
                        r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C));
                        r &= ~HPTE_GR_RESERVED;
                }
                vcpu->arch.gpr[4 + i * 2] = v;
                vcpu->arch.gpr[5 + i * 2] = r;
        }
        return H_SUCCESS;
}

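/*
 * Clear the valid bit of an HPTE and do a global tlbie for its
 * translation.  The caller is expected to hold the lock (HPTE_V_HVLOCK)
 * on the entry.
 */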
void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep,
                        unsigned long pte_index)
{
        unsigned long rb;

        hptep[0] &= ~HPTE_V_VALID;
        rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index);
        do_tlbies(kvm, &rb, 1, 1, true);
}
EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte);

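/*
 * Clear the reference (R) bit of an HPTE by rewriting just the byte
 * that contains it, then invalidate the translation so that a
 * subsequent access will set R again.
 */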
void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep,
                           unsigned long pte_index)
{
        unsigned long rb;
        unsigned char rbyte;

        rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index);
        rbyte = (hptep[1] & ~HPTE_R_R) >> 8;
        /* modify only the second-last byte, which contains the ref bit */
        *((char *)hptep + 14) = rbyte;
        do_tlbies(kvm, &rb, 1, 1, false);
}
EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte);

static int slb_base_page_shift[4] = {
        24,     /* 16M */
        16,     /* 64k */
        34,     /* 16G */
        20,     /* 1M, unsupported */
};

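/*
 * Search the hashed page table for an entry matching the effective
 * address and SLB entry contents, trying the primary and then the
 * secondary hash group.  'valid' selects which of HPTE_V_VALID /
 * HPTE_V_ABSENT count as a match.  On success the entry is left locked
 * (HPTE_V_HVLOCK set) and its index is returned; -1 means no matching
 * entry was found.
 */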
long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
                              unsigned long valid)
{
        unsigned int i;
        unsigned int pshift;
        unsigned long somask;
        unsigned long vsid, hash;
        unsigned long avpn;
        unsigned long *hpte;
        unsigned long mask, val;
        unsigned long v, r;

        /* Get page shift, work out hash and AVPN etc. */
        mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY;
        val = 0;
        pshift = 12;
        if (slb_v & SLB_VSID_L) {
                mask |= HPTE_V_LARGE;
                val |= HPTE_V_LARGE;
                pshift = slb_base_page_shift[(slb_v & SLB_VSID_LP) >> 4];
        }
        if (slb_v & SLB_VSID_B_1T) {
                somask = (1UL << 40) - 1;
                vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T;
                vsid ^= vsid << 25;
        } else {
                somask = (1UL << 28) - 1;
                vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
        }
        hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask;
        avpn = slb_v & ~(somask >> 16); /* also includes B */
        avpn |= (eaddr & somask) >> 16;

        if (pshift >= 24)
                avpn &= ~((1UL << (pshift - 16)) - 1);
        else
                avpn &= ~0x7fUL;
        val |= avpn;

        for (;;) {
                hpte = (unsigned long *)(kvm->arch.hpt_virt + (hash << 7));

                for (i = 0; i < 16; i += 2) {
                        /* Read the PTE racily */
                        v = hpte[i] & ~HPTE_V_HVLOCK;

                        /* Check valid/absent, hash, segment size and AVPN */
                        if (!(v & valid) || (v & mask) != val)
                                continue;

                        /* Lock the PTE and read it under the lock */
                        while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK))
                                cpu_relax();
                        v = hpte[i] & ~HPTE_V_HVLOCK;
                        r = hpte[i+1];

                        /*
                         * Check the HPTE again, including large page size
                         * Since we don't currently allow any MPSS (mixed
                         * page-size segment) page sizes, it is sufficient
                         * to check against the actual page size.
                         */
                        if ((v & valid) && (v & mask) == val &&
                            hpte_page_size(v, r) == (1ul << pshift))
                                /* Return with the HPTE still locked */
                                return (hash << 3) + (i >> 1);

                        /* Unlock and move on */
                        hpte[i] = v;
                }

                if (val & HPTE_V_SECONDARY)
                        break;
                val |= HPTE_V_SECONDARY;
                hash = hash ^ kvm->arch.hpt_mask;
        }
        return -1;
}
EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte);

/*
 * Called in real mode to check whether an HPTE not found fault
 * is due to accessing a paged-out page or an emulated MMIO page,
 * or if a protection fault is due to accessing a page that the
 * guest wanted read/write access to but which we made read-only.
 * Returns a possibly modified status (DSISR) value if the fault
 * should simply be passed through to the guest as an interrupt,
 * -1 to pass the fault up to host kernel mode code, -2 to do that
 * and also load the instruction word (for MMIO emulation),
 * or 0 if we should make the guest retry the access.
 */
long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
                          unsigned long slb_v, unsigned int status, bool data)
{
        struct kvm *kvm = vcpu->kvm;
        long int index;
        unsigned long v, r, gr;
        unsigned long *hpte;
        unsigned long valid;
        struct revmap_entry *rev;
        unsigned long pp, key;

        /* For protection fault, expect to find a valid HPTE */
        valid = HPTE_V_VALID;
        if (status & DSISR_NOHPTE)
                valid |= HPTE_V_ABSENT;

        index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid);
        if (index < 0) {
                if (status & DSISR_NOHPTE)
                        return status;  /* there really was no HPTE */
                return 0;               /* for prot fault, HPTE disappeared */
        }
        hpte = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
        v = hpte[0] & ~HPTE_V_HVLOCK;
        r = hpte[1];
        rev = real_vmalloc_addr(&kvm->arch.revmap[index]);
        gr = rev->guest_rpte;

        unlock_hpte(hpte, v);

        /* For not found, if the HPTE is valid by now, retry the instruction */
        if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID))
                return 0;

        /* Check access permissions to the page */
        pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
        key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
        status &= ~DSISR_NOHPTE;        /* DSISR_NOHPTE == SRR1_ISI_NOPT */
        if (!data) {
                if (gr & (HPTE_R_N | HPTE_R_G))
                        return status | SRR1_ISI_N_OR_G;
                if (!hpte_read_permission(pp, slb_v & key))
                        return status | SRR1_ISI_PROT;
        } else if (status & DSISR_ISSTORE) {
                /* check write permission */
                if (!hpte_write_permission(pp, slb_v & key))
                        return status | DSISR_PROTFAULT;
        } else {
                if (!hpte_read_permission(pp, slb_v & key))
                        return status | DSISR_PROTFAULT;
        }

        /* Check storage key, if applicable */
        if (data && (vcpu->arch.shregs.msr & MSR_DR)) {
                unsigned int perm = hpte_get_skey_perm(gr, vcpu->arch.amr);
                if (status & DSISR_ISSTORE)
                        perm >>= 1;
                if (perm & 1)
                        return status | DSISR_KEYFAULT;
        }

        /* Save HPTE info for virtual-mode handler */
        vcpu->arch.pgfault_addr = addr;
        vcpu->arch.pgfault_index = index;
        vcpu->arch.pgfault_hpte[0] = v;
        vcpu->arch.pgfault_hpte[1] = r;

        /* Check the storage key to see if it is possibly emulated MMIO */
        if (data && (vcpu->arch.shregs.msr & MSR_IR) &&
            (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
            (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
                return -2;      /* MMIO emulation - load instr word */

        return -1;              /* send fault up to host kernel mode */
}