linux/arch/powerpc/kvm/book3s_hv_rm_mmu.c
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * Copyright 2010-2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/hugetlb.h>
#include <linux/module.h>

#include <asm/tlbflush.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu-hash64.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>

/* Translate address of a vmalloc'd thing to a linear map address */
static void *real_vmalloc_addr(void *x)
{
        unsigned long addr = (unsigned long) x;
        pte_t *p;

        p = find_linux_pte(swapper_pg_dir, addr);
        if (!p || !pte_present(*p))
                return NULL;
        /* assume we don't have huge pages in vmalloc space... */
        addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
        return __va(addr);
}
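/*
 * Note: this file runs in hypervisor real mode, where data accesses
 * bypass the page tables, so a vmalloc'd pointer cannot be dereferenced
 * directly.  real_vmalloc_addr() above walks the kernel page tables to
 * find the physical page behind such a pointer and returns the
 * equivalent linear-map address, which is usable in real mode.
 */
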
/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
static int global_invalidates(struct kvm *kvm, unsigned long flags)
{
        int global;

        /*
         * If there is only one vcore, and it's currently running,
         * we can use tlbiel as long as we mark all other physical
         * cores as potentially having stale TLB entries for this lpid.
         * If we're not using MMU notifiers, we never take pages away
         * from the guest, so we can use tlbiel if requested.
         * Otherwise, don't use tlbiel.
         */
        if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore)
                global = 0;
        else if (kvm->arch.using_mmu_notifiers)
                global = 1;
        else
                global = !(flags & H_LOCAL);

        if (!global) {
                /* any other core might now have stale TLB entries... */
                smp_wmb();
                cpumask_setall(&kvm->arch.need_tlb_flush);
                cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
                                  &kvm->arch.need_tlb_flush);
        }

        return global;
}
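/*
 * Interpretation (the flush itself lives outside this file): a vcore
 * that is later scheduled onto a physical core still marked in
 * need_tlb_flush is expected to flush this LPID's translations before
 * running guest code.  That is what makes the core-local tlbiel fast
 * path chosen above safe.
 */
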
/*
 * Add this HPTE into the chain for the real page.
 * Must be called with the chain locked; it unlocks the chain.
 */
void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
                             unsigned long *rmap, long pte_index, int realmode)
{
        struct revmap_entry *head, *tail;
        unsigned long i;

        if (*rmap & KVMPPC_RMAP_PRESENT) {
                i = *rmap & KVMPPC_RMAP_INDEX;
                head = &kvm->arch.revmap[i];
                if (realmode)
                        head = real_vmalloc_addr(head);
                tail = &kvm->arch.revmap[head->back];
                if (realmode)
                        tail = real_vmalloc_addr(tail);
                rev->forw = i;
                rev->back = head->back;
                tail->forw = pte_index;
                head->back = pte_index;
        } else {
                rev->forw = rev->back = pte_index;
                *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) |
                        pte_index | KVMPPC_RMAP_PRESENT;
        }
        unlock_rmap(rmap);
}
EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
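/*
 * The reverse map is a circular doubly-linked list threaded through the
 * revmap_entry array: forw and back hold HPTE indexes rather than
 * pointers, and the rmap word for a guest page stores the index of the
 * head entry together with the KVMPPC_RMAP_PRESENT flag, as can be seen
 * in kvmppc_add_revmap_chain() above and remove_revmap_chain() below.
 */
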
/*
 * Note modification of an HPTE; set the HPTE modified bit
 * if anyone is interested.
 */
static inline void note_hpte_modification(struct kvm *kvm,
                                          struct revmap_entry *rev)
{
        if (atomic_read(&kvm->arch.hpte_mod_interest))
                rev->guest_rpte |= HPTE_GR_MODIFIED;
}

/* Remove this HPTE from the chain for a real page */
static void remove_revmap_chain(struct kvm *kvm, long pte_index,
                                struct revmap_entry *rev,
                                unsigned long hpte_v, unsigned long hpte_r)
{
        struct revmap_entry *next, *prev;
        unsigned long gfn, ptel, head;
        struct kvm_memory_slot *memslot;
        unsigned long *rmap;
        unsigned long rcbits;

        rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
        ptel = rev->guest_rpte |= rcbits;
        gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
        memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
        if (!memslot)
                return;

        rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
        lock_rmap(rmap);

        head = *rmap & KVMPPC_RMAP_INDEX;
        next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]);
        prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]);
        next->back = rev->back;
        prev->forw = rev->forw;
        if (head == pte_index) {
                head = rev->forw;
                if (head == pte_index)
                        *rmap &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
                else
                        *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head;
        }
        *rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
        unlock_rmap(rmap);
}

static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva,
                              int writing, unsigned long *pte_sizep)
{
        pte_t *ptep;
        unsigned long ps = *pte_sizep;
        unsigned int shift;

        ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
        if (!ptep)
                return __pte(0);
        if (shift)
                *pte_sizep = 1ul << shift;
        else
                *pte_sizep = PAGE_SIZE;
        if (ps > *pte_sizep)
                return __pte(0);
        if (!pte_present(*ptep))
                return __pte(0);
        return kvmppc_read_update_linux_pte(ptep, writing);
}
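/*
 * Note on lookup_linux_pte(): *pte_sizep is both an input and an
 * output.  On entry it holds the minimum page size the caller needs;
 * on exit it holds the actual size of the page backing hva.  A PTE
 * whose page is smaller than the requested size is treated as absent.
 */
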
static inline void unlock_hpte(unsigned long *hpte, unsigned long hpte_v)
{
        asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
        hpte[0] = hpte_v;
}
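/*
 * Each HPTE is two 64-bit dwords; dword 0 carries the valid/absent bits
 * and the software HPTE_V_HVLOCK lock bit.  The release barrier in
 * unlock_hpte() orders all prior updates (e.g. to dword 1) before the
 * store that drops the lock, pairing with the acquire semantics of
 * try_lock_hpte() (defined in a header elsewhere in the tree).
 */
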
long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
                       long pte_index, unsigned long pteh, unsigned long ptel,
                       pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
{
        unsigned long i, pa, gpa, gfn, psize;
        unsigned long slot_fn, hva;
        unsigned long *hpte;
        struct revmap_entry *rev;
        unsigned long g_ptel;
        struct kvm_memory_slot *memslot;
        unsigned long *physp, pte_size;
        unsigned long is_io;
        unsigned long *rmap;
        pte_t pte;
        unsigned int writing;
        unsigned long mmu_seq;
        unsigned long rcbits;

        psize = hpte_page_size(pteh, ptel);
        if (!psize)
                return H_PARAMETER;
        writing = hpte_is_writable(ptel);
        pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
        ptel &= ~HPTE_GR_RESERVED;
        g_ptel = ptel;

        /* used later to detect if we might have been invalidated */
        mmu_seq = kvm->mmu_notifier_seq;
        smp_rmb();

        /* Find the memslot (if any) for this address */
        gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
        gfn = gpa >> PAGE_SHIFT;
        memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
        pa = 0;
        is_io = ~0ul;
        rmap = NULL;
        if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) {
                /* PPC970 can't do emulated MMIO */
                if (!cpu_has_feature(CPU_FTR_ARCH_206))
                        return H_PARAMETER;
                /* Emulated MMIO - mark this with key=31 */
                pteh |= HPTE_V_ABSENT;
                ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO;
                goto do_insert;
        }

        /* Check if the requested page fits entirely in the memslot. */
        if (!slot_is_aligned(memslot, psize))
                return H_PARAMETER;
        slot_fn = gfn - memslot->base_gfn;
        rmap = &memslot->arch.rmap[slot_fn];

        if (!kvm->arch.using_mmu_notifiers) {
                physp = memslot->arch.slot_phys;
                if (!physp)
                        return H_PARAMETER;
                physp += slot_fn;
                if (realmode)
                        physp = real_vmalloc_addr(physp);
                pa = *physp;
                if (!pa)
                        return H_TOO_HARD;
                is_io = pa & (HPTE_R_I | HPTE_R_W);
                pte_size = PAGE_SIZE << (pa & KVMPPC_PAGE_ORDER_MASK);
                pa &= PAGE_MASK;
        } else {
                /* Translate to host virtual address */
                hva = __gfn_to_hva_memslot(memslot, gfn);

                /* Look up the Linux PTE for the backing page */
                pte_size = psize;
                pte = lookup_linux_pte(pgdir, hva, writing, &pte_size);
                if (pte_present(pte)) {
                        if (writing && !pte_write(pte))
                                /* make the actual HPTE be read-only */
                                ptel = hpte_make_readonly(ptel);
                        is_io = hpte_cache_bits(pte_val(pte));
                        pa = pte_pfn(pte) << PAGE_SHIFT;
                }
        }

        if (pte_size < psize)
                return H_PARAMETER;
        if (pa && pte_size > psize)
                pa |= gpa & (pte_size - 1);

        ptel &= ~(HPTE_R_PP0 - psize);
        ptel |= pa;

        if (pa)
                pteh |= HPTE_V_VALID;
        else
                pteh |= HPTE_V_ABSENT;

        /* Check WIMG */
        if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) {
                if (is_io)
                        return H_PARAMETER;
                /*
                 * Allow guest to map emulated device memory as
                 * uncacheable, but actually make it cacheable.
                 */
                ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G);
                ptel |= HPTE_R_M;
        }

        /* Find and lock the HPTEG slot to use */
 do_insert:
        if (pte_index >= kvm->arch.hpt_npte)
                return H_PARAMETER;
        if (likely((flags & H_EXACT) == 0)) {
                pte_index &= ~7UL;
                hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
                for (i = 0; i < 8; ++i) {
                        if ((*hpte & HPTE_V_VALID) == 0 &&
                            try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
                                          HPTE_V_ABSENT))
                                break;
                        hpte += 2;
                }
                if (i == 8) {
                        /*
                         * Since try_lock_hpte doesn't retry (not even stdcx.
                         * failures), it could be that there is a free slot
                         * but we transiently failed to lock it.  Try again,
                         * actually locking each slot and checking it.
                         */
                        hpte -= 16;
                        for (i = 0; i < 8; ++i) {
                                while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
                                        cpu_relax();
                                if (!(*hpte & (HPTE_V_VALID | HPTE_V_ABSENT)))
                                        break;
                                *hpte &= ~HPTE_V_HVLOCK;
                                hpte += 2;
                        }
                        if (i == 8)
                                return H_PTEG_FULL;
                }
                pte_index += i;
        } else {
                hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
                if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
                                   HPTE_V_ABSENT)) {
                        /* Lock the slot and check again */
                        while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
                                cpu_relax();
                        if (*hpte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
                                *hpte &= ~HPTE_V_HVLOCK;
                                return H_PTEG_FULL;
                        }
                }
        }

        /* Save away the guest's idea of the second HPTE dword */
        rev = &kvm->arch.revmap[pte_index];
        if (realmode)
                rev = real_vmalloc_addr(rev);
        if (rev) {
                rev->guest_rpte = g_ptel;
                note_hpte_modification(kvm, rev);
        }

        /* Link HPTE into reverse-map chain */
        if (pteh & HPTE_V_VALID) {
                if (realmode)
                        rmap = real_vmalloc_addr(rmap);
                lock_rmap(rmap);
                /* Check for pending invalidations under the rmap chain lock */
                if (kvm->arch.using_mmu_notifiers &&
                    mmu_notifier_retry(kvm, mmu_seq)) {
                        /* inval in progress, write a non-present HPTE */
                        pteh |= HPTE_V_ABSENT;
                        pteh &= ~HPTE_V_VALID;
                        unlock_rmap(rmap);
                } else {
                        kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index,
                                                realmode);
                        /* Only set R/C in real HPTE if already set in *rmap */
                        rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
                        ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C);
                }
        }

        hpte[1] = ptel;

        /* Write the first HPTE dword, unlocking the HPTE and making it valid */
        eieio();
        hpte[0] = pteh;
        asm volatile("ptesync" : : : "memory");

        *pte_idx_ret = pte_index;
        return H_SUCCESS;
}
EXPORT_SYMBOL_GPL(kvmppc_do_h_enter);

long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
                    long pte_index, unsigned long pteh, unsigned long ptel)
{
        return kvmppc_do_h_enter(vcpu->kvm, flags, pte_index, pteh, ptel,
                                 vcpu->arch.pgdir, true, &vcpu->arch.gpr[4]);
}
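/*
 * Usage sketch (illustrative only; the actual dispatch lives in the
 * real-mode hcall handler): the guest's H_ENTER arguments arrive in
 * registers and would be passed through roughly like this:
 *
 *	flags     = kvmppc_get_gpr(vcpu, 4);
 *	pte_index = kvmppc_get_gpr(vcpu, 5);
 *	pteh      = kvmppc_get_gpr(vcpu, 6);
 *	ptel      = kvmppc_get_gpr(vcpu, 7);
 *	ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel);
 *
 * On H_SUCCESS the index of the slot actually used has been stored in
 * vcpu->arch.gpr[4] for return to the guest.
 */
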
#define LOCK_TOKEN      (*(u32 *)(&get_paca()->lock_token))

static inline int try_lock_tlbie(unsigned int *lock)
{
        unsigned int tmp, old;
        unsigned int token = LOCK_TOKEN;

        asm volatile("1:lwarx   %1,0,%2\n"
                     "  cmpwi   cr0,%1,0\n"
                     "  bne     2f\n"
                     "  stwcx.  %3,0,%2\n"
                     "  bne-    1b\n"
                     "  isync\n"
                     "2:"
                     : "=&r" (tmp), "=&r" (old)
                     : "r" (lock), "r" (token)
                     : "cc", "memory");
        return old == 0;
}
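/*
 * try_lock_tlbie() is a single-attempt lock probe: lwarx reads the lock
 * word, and a non-zero value means someone else holds it (return 0).
 * Otherwise stwcx. tries to store this cpu's paca lock token, looping
 * only if the reservation is lost; the trailing isync keeps later
 * accesses from being performed before the lock is observed held.
 */
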
long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
                        unsigned long pte_index, unsigned long avpn,
                        unsigned long *hpret)
{
        unsigned long *hpte;
        unsigned long v, r, rb;
        struct revmap_entry *rev;

        if (pte_index >= kvm->arch.hpt_npte)
                return H_PARAMETER;
        hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
        while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
                cpu_relax();
        if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
            ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn) ||
            ((flags & H_ANDCOND) && (hpte[0] & avpn) != 0)) {
                hpte[0] &= ~HPTE_V_HVLOCK;
                return H_NOT_FOUND;
        }

        rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
        v = hpte[0] & ~HPTE_V_HVLOCK;
        if (v & HPTE_V_VALID) {
                hpte[0] &= ~HPTE_V_VALID;
                rb = compute_tlbie_rb(v, hpte[1], pte_index);
                if (global_invalidates(kvm, flags)) {
                        while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
                                cpu_relax();
                        asm volatile("ptesync" : : : "memory");
                        asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
                                     : : "r" (rb), "r" (kvm->arch.lpid));
                        asm volatile("ptesync" : : : "memory");
                        kvm->arch.tlbie_lock = 0;
                } else {
                        asm volatile("ptesync" : : : "memory");
                        asm volatile("tlbiel %0" : : "r" (rb));
                        asm volatile("ptesync" : : : "memory");
                }
                /* Read PTE low word after tlbie to get final R/C values */
                remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]);
        }
        r = rev->guest_rpte & ~HPTE_GR_RESERVED;
        note_hpte_modification(kvm, rev);
        unlock_hpte(hpte, 0);

        hpret[0] = v;
        hpret[1] = r;
        return H_SUCCESS;
}
EXPORT_SYMBOL_GPL(kvmppc_do_h_remove);

long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
                     unsigned long pte_index, unsigned long avpn)
{
        return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn,
                                  &vcpu->arch.gpr[4]);
}

long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
{
        struct kvm *kvm = vcpu->kvm;
        unsigned long *args = &vcpu->arch.gpr[4];
        unsigned long *hp, *hptes[4], tlbrb[4];
        long int i, j, k, n, found, indexes[4];
        unsigned long flags, req, pte_index, rcbits;
        long int local = 0;
        long int ret = H_SUCCESS;
        struct revmap_entry *rev, *revs[4];

        if (atomic_read(&kvm->online_vcpus) == 1)
                local = 1;
        for (i = 0; i < 4 && ret == H_SUCCESS; ) {
                n = 0;
                for (; i < 4; ++i) {
                        j = i * 2;
                        pte_index = args[j];
                        flags = pte_index >> 56;
                        pte_index &= ((1ul << 56) - 1);
                        req = flags >> 6;
                        flags &= 3;
                        if (req == 3) {         /* no more requests */
                                i = 4;
                                break;
                        }
                        if (req != 1 || flags == 3 ||
                            pte_index >= kvm->arch.hpt_npte) {
                                /* parameter error */
                                args[j] = ((0xa0 | flags) << 56) + pte_index;
                                ret = H_PARAMETER;
                                break;
                        }
                        hp = (unsigned long *)
                                (kvm->arch.hpt_virt + (pte_index << 4));
                        /* to avoid deadlock, don't spin except for first */
                        if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) {
                                if (n)
                                        break;
                                while (!try_lock_hpte(hp, HPTE_V_HVLOCK))
                                        cpu_relax();
                        }
                        found = 0;
                        if (hp[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) {
                                switch (flags & 3) {
                                case 0:         /* absolute */
                                        found = 1;
                                        break;
                                case 1:         /* andcond */
                                        if (!(hp[0] & args[j + 1]))
                                                found = 1;
                                        break;
                                case 2:         /* AVPN */
                                        if ((hp[0] & ~0x7fUL) == args[j + 1])
                                                found = 1;
                                        break;
                                }
                        }
                        if (!found) {
                                hp[0] &= ~HPTE_V_HVLOCK;
                                args[j] = ((0x90 | flags) << 56) + pte_index;
                                continue;
                        }

                        args[j] = ((0x80 | flags) << 56) + pte_index;
                        rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
                        note_hpte_modification(kvm, rev);

                        if (!(hp[0] & HPTE_V_VALID)) {
                                /* insert R and C bits from PTE */
                                rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
                                args[j] |= rcbits << (56 - 5);
                                hp[0] = 0;
                                continue;
                        }

                        hp[0] &= ~HPTE_V_VALID;         /* leave it locked */
                        tlbrb[n] = compute_tlbie_rb(hp[0], hp[1], pte_index);
                        indexes[n] = j;
                        hptes[n] = hp;
                        revs[n] = rev;
                        ++n;
                }

                if (!n)
                        break;

                /* Now that we've collected a batch, do the tlbies */
                if (!local) {
                        while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
                                cpu_relax();
                        asm volatile("ptesync" : : : "memory");
                        for (k = 0; k < n; ++k)
                                asm volatile(PPC_TLBIE(%1,%0) : :
                                             "r" (tlbrb[k]),
                                             "r" (kvm->arch.lpid));
                        asm volatile("eieio; tlbsync; ptesync" : : : "memory");
                        kvm->arch.tlbie_lock = 0;
                } else {
                        asm volatile("ptesync" : : : "memory");
                        for (k = 0; k < n; ++k)
                                asm volatile("tlbiel %0" : : "r" (tlbrb[k]));
                        asm volatile("ptesync" : : : "memory");
                }

                /* Read PTE low words after tlbie to get final R/C values */
                for (k = 0; k < n; ++k) {
                        j = indexes[k];
                        pte_index = args[j] & ((1ul << 56) - 1);
                        hp = hptes[k];
                        rev = revs[k];
                        remove_revmap_chain(kvm, pte_index, rev, hp[0], hp[1]);
                        rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
                        args[j] |= rcbits << (56 - 5);
                        hp[0] = 0;
                }
        }

        return ret;
}
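/*
 * H_BULK_REMOVE argument encoding, as decoded above: each of the four
 * request slots is a pair of doublewords.  The first packs the request
 * type (bits 63:62, with 3 meaning end of list), the absolute/andcond/
 * AVPN flags (bits 57:56) and the PTE index (low 56 bits); the second
 * holds the ANDCOND mask or AVPN value.  The return code (0x80 success,
 * 0x90 not found, 0xa0 parameter error, plus the flags) is written back
 * into the top byte, with the R and C bits folded in at bits 59:58.
 */
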
long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
                      unsigned long pte_index, unsigned long avpn,
                      unsigned long va)
{
        struct kvm *kvm = vcpu->kvm;
        unsigned long *hpte;
        struct revmap_entry *rev;
        unsigned long v, r, rb, mask, bits;

        if (pte_index >= kvm->arch.hpt_npte)
                return H_PARAMETER;

        hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
        while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
                cpu_relax();
        if ((hpte[0] & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
            ((flags & H_AVPN) && (hpte[0] & ~0x7fUL) != avpn)) {
                hpte[0] &= ~HPTE_V_HVLOCK;
                return H_NOT_FOUND;
        }

        v = hpte[0];
        bits = (flags << 55) & HPTE_R_PP0;
        bits |= (flags << 48) & HPTE_R_KEY_HI;
        bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);

        /* Update guest view of 2nd HPTE dword */
        mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
                HPTE_R_KEY_HI | HPTE_R_KEY_LO;
        rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
        if (rev) {
                r = (rev->guest_rpte & ~mask) | bits;
                rev->guest_rpte = r;
                note_hpte_modification(kvm, rev);
        }
        r = (hpte[1] & ~mask) | bits;

        /* Update HPTE */
        if (v & HPTE_V_VALID) {
                rb = compute_tlbie_rb(v, r, pte_index);
                hpte[0] = v & ~HPTE_V_VALID;
                if (global_invalidates(kvm, flags)) {
                        while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
                                cpu_relax();
                        asm volatile("ptesync" : : : "memory");
                        asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
                                     : : "r" (rb), "r" (kvm->arch.lpid));
                        asm volatile("ptesync" : : : "memory");
                        kvm->arch.tlbie_lock = 0;
                } else {
                        asm volatile("ptesync" : : : "memory");
                        asm volatile("tlbiel %0" : : "r" (rb));
                        asm volatile("ptesync" : : : "memory");
                }
                /*
                 * If the host has this page as readonly but the guest
                 * wants to make it read/write, reduce the permissions.
                 * Checking the host permissions involves finding the
                 * memslot and then the Linux PTE for the page.
                 */
                if (hpte_is_writable(r) && kvm->arch.using_mmu_notifiers) {
                        unsigned long psize, gfn, hva;
                        struct kvm_memory_slot *memslot;
                        pgd_t *pgdir = vcpu->arch.pgdir;
                        pte_t pte;

                        psize = hpte_page_size(v, r);
                        gfn = ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT;
                        memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn);
                        if (memslot) {
                                hva = __gfn_to_hva_memslot(memslot, gfn);
                                pte = lookup_linux_pte(pgdir, hva, 1, &psize);
                                if (pte_present(pte) && !pte_write(pte))
                                        r = hpte_make_readonly(r);
                        }
                }
        }
        hpte[1] = r;
        eieio();
        hpte[0] = v & ~HPTE_V_HVLOCK;
        asm volatile("ptesync" : : : "memory");
        return H_SUCCESS;
}

long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
                   unsigned long pte_index)
{
        struct kvm *kvm = vcpu->kvm;
        unsigned long *hpte, v, r;
        int i, n = 1;
        struct revmap_entry *rev = NULL;

        if (pte_index >= kvm->arch.hpt_npte)
                return H_PARAMETER;
        if (flags & H_READ_4) {
                pte_index &= ~3;
                n = 4;
        }
        rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
        for (i = 0; i < n; ++i, ++pte_index) {
                hpte = (unsigned long *)(kvm->arch.hpt_virt + (pte_index << 4));
                v = hpte[0] & ~HPTE_V_HVLOCK;
                r = hpte[1];
                if (v & HPTE_V_ABSENT) {
                        v &= ~HPTE_V_ABSENT;
                        v |= HPTE_V_VALID;
                }
                if (v & HPTE_V_VALID) {
                        r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C));
                        r &= ~HPTE_GR_RESERVED;
                }
                vcpu->arch.gpr[4 + i * 2] = v;
                vcpu->arch.gpr[5 + i * 2] = r;
        }
        return H_SUCCESS;
}

void kvmppc_invalidate_hpte(struct kvm *kvm, unsigned long *hptep,
                        unsigned long pte_index)
{
        unsigned long rb;

        hptep[0] &= ~HPTE_V_VALID;
        rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index);
        while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
                cpu_relax();
        asm volatile("ptesync" : : : "memory");
        asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
                     : : "r" (rb), "r" (kvm->arch.lpid));
        asm volatile("ptesync" : : : "memory");
        kvm->arch.tlbie_lock = 0;
}
EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte);

void kvmppc_clear_ref_hpte(struct kvm *kvm, unsigned long *hptep,
                           unsigned long pte_index)
{
        unsigned long rb;
        unsigned char rbyte;

        rb = compute_tlbie_rb(hptep[0], hptep[1], pte_index);
        rbyte = (hptep[1] & ~HPTE_R_R) >> 8;
        /* modify only the second-last byte, which contains the ref bit */
        *((char *)hptep + 14) = rbyte;
        while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
                cpu_relax();
        asm volatile(PPC_TLBIE(%1,%0)"; eieio; tlbsync"
                     : : "r" (rb), "r" (kvm->arch.lpid));
        asm volatile("ptesync" : : : "memory");
        kvm->arch.tlbie_lock = 0;
}
EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte);

static int slb_base_page_shift[4] = {
        24,     /* 16M */
        16,     /* 64k */
        34,     /* 16G */
        20,     /* 1M, unsupported */
};
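/*
 * slb_base_page_shift is indexed by the SLB_VSID_LP field of the SLB
 * entry (extracted as (slb_v & SLB_VSID_LP) >> 4 below), giving the
 * base page shift for each large-page encoding: 16M, 64k, 16G, and an
 * unsupported 1M encoding.
 */
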
long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
                              unsigned long valid)
{
        unsigned int i;
        unsigned int pshift;
        unsigned long somask;
        unsigned long vsid, hash;
        unsigned long avpn;
        unsigned long *hpte;
        unsigned long mask, val;
        unsigned long v, r;

        /* Get page shift, work out hash and AVPN etc. */
        mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY;
        val = 0;
        pshift = 12;
        if (slb_v & SLB_VSID_L) {
                mask |= HPTE_V_LARGE;
                val |= HPTE_V_LARGE;
                pshift = slb_base_page_shift[(slb_v & SLB_VSID_LP) >> 4];
        }
        if (slb_v & SLB_VSID_B_1T) {
                somask = (1UL << 40) - 1;
                vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T;
                vsid ^= vsid << 25;
        } else {
                somask = (1UL << 28) - 1;
                vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
        }
        hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask;
        avpn = slb_v & ~(somask >> 16); /* also includes B */
        avpn |= (eaddr & somask) >> 16;

        if (pshift >= 24)
                avpn &= ~((1UL << (pshift - 16)) - 1);
        else
                avpn &= ~0x7fUL;
        val |= avpn;

        for (;;) {
                hpte = (unsigned long *)(kvm->arch.hpt_virt + (hash << 7));

                for (i = 0; i < 16; i += 2) {
                        /* Read the PTE racily */
                        v = hpte[i] & ~HPTE_V_HVLOCK;

                        /* Check valid/absent, hash, segment size and AVPN */
                        if (!(v & valid) || (v & mask) != val)
                                continue;

                        /* Lock the PTE and read it under the lock */
                        while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK))
                                cpu_relax();
                        v = hpte[i] & ~HPTE_V_HVLOCK;
                        r = hpte[i+1];

                        /*
                         * Check the HPTE again, including large page size.
                         * Since we don't currently allow any MPSS (mixed
                         * page-size segment) page sizes, it is sufficient
                         * to check against the actual page size.
                         */
                        if ((v & valid) && (v & mask) == val &&
                            hpte_page_size(v, r) == (1ul << pshift))
                                /* Return with the HPTE still locked */
                                return (hash << 3) + (i >> 1);

                        /* Unlock and move on */
                        hpte[i] = v;
                }

                if (val & HPTE_V_SECONDARY)
                        break;
                val |= HPTE_V_SECONDARY;
                hash = hash ^ kvm->arch.hpt_mask;
        }
        return -1;
}
EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte);
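/*
 * kvmppc_hv_find_lock_hpte() searches the primary hash group first; if
 * no match is found it sets HPTE_V_SECONDARY in the match value and
 * searches the secondary group (hash ^ hpt_mask), mirroring the
 * architected HPT lookup.  On success the HPTE is returned still
 * locked, so callers must unlock it when done.
 */
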
/*
 * Called in real mode to check whether an HPTE not found fault
 * is due to accessing a paged-out page or an emulated MMIO page,
 * or if a protection fault is due to accessing a page that the
 * guest wanted read/write access to but which we made read-only.
 * Returns a possibly modified status (DSISR) value if the fault
 * should be reflected to the guest (i.e. pass the interrupt to
 * the guest), -1 to pass the fault up to host kernel mode code,
 * -2 to do that and also load the instruction word (for MMIO
 * emulation), or 0 if we should make the guest retry the access.
 */
long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
                          unsigned long slb_v, unsigned int status, bool data)
{
        struct kvm *kvm = vcpu->kvm;
        long int index;
        unsigned long v, r, gr;
        unsigned long *hpte;
        unsigned long valid;
        struct revmap_entry *rev;
        unsigned long pp, key;

        /* For protection fault, expect to find a valid HPTE */
        valid = HPTE_V_VALID;
        if (status & DSISR_NOHPTE)
                valid |= HPTE_V_ABSENT;

        index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid);
        if (index < 0) {
                if (status & DSISR_NOHPTE)
                        return status;  /* there really was no HPTE */
                return 0;               /* for prot fault, HPTE disappeared */
        }
        hpte = (unsigned long *)(kvm->arch.hpt_virt + (index << 4));
        v = hpte[0] & ~HPTE_V_HVLOCK;
        r = hpte[1];
        rev = real_vmalloc_addr(&kvm->arch.revmap[index]);
        gr = rev->guest_rpte;

        unlock_hpte(hpte, v);

        /* For not found, if the HPTE is valid by now, retry the instruction */
        if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID))
                return 0;

        /* Check access permissions to the page */
        pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
        key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
        status &= ~DSISR_NOHPTE;        /* DSISR_NOHPTE == SRR1_ISI_NOPT */
        if (!data) {
                if (gr & (HPTE_R_N | HPTE_R_G))
                        return status | SRR1_ISI_N_OR_G;
                if (!hpte_read_permission(pp, slb_v & key))
                        return status | SRR1_ISI_PROT;
        } else if (status & DSISR_ISSTORE) {
                /* check write permission */
                if (!hpte_write_permission(pp, slb_v & key))
                        return status | DSISR_PROTFAULT;
        } else {
                if (!hpte_read_permission(pp, slb_v & key))
                        return status | DSISR_PROTFAULT;
        }

        /* Check storage key, if applicable */
        if (data && (vcpu->arch.shregs.msr & MSR_DR)) {
                unsigned int perm = hpte_get_skey_perm(gr, vcpu->arch.amr);
                if (status & DSISR_ISSTORE)
                        perm >>= 1;
                if (perm & 1)
                        return status | DSISR_KEYFAULT;
        }

        /* Save HPTE info for virtual-mode handler */
        vcpu->arch.pgfault_addr = addr;
        vcpu->arch.pgfault_index = index;
        vcpu->arch.pgfault_hpte[0] = v;
        vcpu->arch.pgfault_hpte[1] = r;

        /* Check the storage key to see if it is possibly emulated MMIO */
        if (data && (vcpu->arch.shregs.msr & MSR_IR) &&
            (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
            (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
                return -2;      /* MMIO emulation - load instr word */

        return -1;              /* send fault up to host kernel mode */
}