linux/arch/powerpc/kvm/book3s_hv_rm_mmu.c
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * Copyright 2010-2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/hugetlb.h>
#include <linux/module.h>

#include <asm/tlbflush.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu-hash64.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>

/* Translate address of a vmalloc'd thing to a linear map address */
static void *real_vmalloc_addr(void *x)
{
        unsigned long addr = (unsigned long) x;
        pte_t *p;

        p = find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL);
        if (!p || !pte_present(*p))
                return NULL;
        /* assume we don't have huge pages in vmalloc space... */
        addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK);
        return __va(addr);
}

/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */
static int global_invalidates(struct kvm *kvm, unsigned long flags)
{
        int global;

        /*
         * If there is only one vcore, and it's currently running,
         * as indicated by local_paca->kvm_hstate.kvm_vcpu being set,
         * we can use tlbiel as long as we mark all other physical
         * cores as potentially having stale TLB entries for this lpid.
         * Otherwise, don't use tlbiel.
         */
        if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcpu)
                global = 0;
        else
                global = 1;

        if (!global) {
                /* any other core might now have stale TLB entries... */
                smp_wmb();
                cpumask_setall(&kvm->arch.need_tlb_flush);
                cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu,
                                  &kvm->arch.need_tlb_flush);
        }

        return global;
}

/*
 * Add this HPTE into the chain for the real page.
 * Must be called with the chain locked; it unlocks the chain.
 */
void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
                             unsigned long *rmap, long pte_index, int realmode)
{
        struct revmap_entry *head, *tail;
        unsigned long i;

        if (*rmap & KVMPPC_RMAP_PRESENT) {
                i = *rmap & KVMPPC_RMAP_INDEX;
                head = &kvm->arch.revmap[i];
                if (realmode)
                        head = real_vmalloc_addr(head);
                tail = &kvm->arch.revmap[head->back];
                if (realmode)
                        tail = real_vmalloc_addr(tail);
                rev->forw = i;
                rev->back = head->back;
                tail->forw = pte_index;
                head->back = pte_index;
        } else {
                rev->forw = rev->back = pte_index;
                *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) |
                        pte_index | KVMPPC_RMAP_PRESENT;
        }
        unlock_rmap(rmap);
}
EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);

/* Remove this HPTE from the chain for a real page */
static void remove_revmap_chain(struct kvm *kvm, long pte_index,
                                struct revmap_entry *rev,
                                unsigned long hpte_v, unsigned long hpte_r)
{
        struct revmap_entry *next, *prev;
        unsigned long gfn, ptel, head;
        struct kvm_memory_slot *memslot;
        unsigned long *rmap;
        unsigned long rcbits;

        rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
        ptel = rev->guest_rpte |= rcbits;
        gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
        memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
        if (!memslot)
                return;

        rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
        lock_rmap(rmap);

        head = *rmap & KVMPPC_RMAP_INDEX;
        next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]);
        prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]);
        next->back = rev->back;
        prev->forw = rev->forw;
        if (head == pte_index) {
                head = rev->forw;
                if (head == pte_index)
                        *rmap &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
                else
                        *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head;
        }
        *rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
        unlock_rmap(rmap);
}

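/*
 * Look up the Linux PTE for a host virtual address and update its
 * accessed (and, for writes, dirty) bits.  *pte_sizep is set to the
 * size of the page backing the PTE; if that size is smaller than the
 * size the caller asked for, an empty PTE is returned instead.
 */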
static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva,
                              int writing, unsigned long *pte_sizep)
{
        pte_t *ptep;
        unsigned long ps = *pte_sizep;
        unsigned int hugepage_shift;

        ptep = find_linux_pte_or_hugepte(pgdir, hva, &hugepage_shift);
        if (!ptep)
                return __pte(0);
        if (hugepage_shift)
                *pte_sizep = 1ul << hugepage_shift;
        else
                *pte_sizep = PAGE_SIZE;
        if (ps > *pte_sizep)
                return __pte(0);
        return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
}

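/*
 * Write the first doubleword of an HPTE after a release barrier;
 * callers pass hpte_v without HPTE_V_HVLOCK set, so this also drops
 * the lock bit.
 */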
static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v)
{
        asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
        hpte[0] = cpu_to_be64(hpte_v);
}

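/*
 * Core of the H_ENTER hypercall: insert an HPTE into the guest hash table.
 * Translates the guest real address to a host page via the memslot and the
 * Linux page tables, finds and locks a free slot in the HPTE group (or the
 * exact slot if H_EXACT), links the entry into the reverse-map chain, and
 * writes the HPTE.  Called from both real mode and virtual mode; the index
 * of the entry actually used is returned via *pte_idx_ret.
 */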
long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
                       long pte_index, unsigned long pteh, unsigned long ptel,
                       pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
{
        unsigned long i, pa, gpa, gfn, psize;
        unsigned long slot_fn, hva;
        __be64 *hpte;
        struct revmap_entry *rev;
        unsigned long g_ptel;
        struct kvm_memory_slot *memslot;
        unsigned long pte_size;
        unsigned long is_io;
        unsigned long *rmap;
        pte_t pte;
        unsigned int writing;
        unsigned long mmu_seq;
        unsigned long rcbits;

        psize = hpte_page_size(pteh, ptel);
        if (!psize)
                return H_PARAMETER;
        writing = hpte_is_writable(ptel);
        pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID);
        ptel &= ~HPTE_GR_RESERVED;
        g_ptel = ptel;

        /* used later to detect if we might have been invalidated */
        mmu_seq = kvm->mmu_notifier_seq;
        smp_rmb();

        /* Find the memslot (if any) for this address */
        gpa = (ptel & HPTE_R_RPN) & ~(psize - 1);
        gfn = gpa >> PAGE_SHIFT;
        memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
        pa = 0;
        is_io = ~0ul;
        rmap = NULL;
        if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) {
                /* Emulated MMIO - mark this with key=31 */
                pteh |= HPTE_V_ABSENT;
                ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO;
                goto do_insert;
        }

        /* Check if the requested page fits entirely in the memslot. */
        if (!slot_is_aligned(memslot, psize))
                return H_PARAMETER;
        slot_fn = gfn - memslot->base_gfn;
        rmap = &memslot->arch.rmap[slot_fn];

        /* Translate to host virtual address */
        hva = __gfn_to_hva_memslot(memslot, gfn);

        /* Look up the Linux PTE for the backing page */
        pte_size = psize;
        pte = lookup_linux_pte_and_update(pgdir, hva, writing, &pte_size);
        if (pte_present(pte) && !pte_protnone(pte)) {
                if (writing && !pte_write(pte))
                        /* make the actual HPTE be read-only */
                        ptel = hpte_make_readonly(ptel);
                is_io = hpte_cache_bits(pte_val(pte));
                pa = pte_pfn(pte) << PAGE_SHIFT;
                pa |= hva & (pte_size - 1);
                pa |= gpa & ~PAGE_MASK;
        }

        if (pte_size < psize)
                return H_PARAMETER;

        ptel &= ~(HPTE_R_PP0 - psize);
        ptel |= pa;

        if (pa)
                pteh |= HPTE_V_VALID;
        else
                pteh |= HPTE_V_ABSENT;

        /* Check WIMG */
        if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) {
                if (is_io)
                        return H_PARAMETER;
                /*
                 * Allow guest to map emulated device memory as
                 * uncacheable, but actually make it cacheable.
                 */
                ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G);
                ptel |= HPTE_R_M;
        }

        /* Find and lock the HPTEG slot to use */
 do_insert:
        if (pte_index >= kvm->arch.hpt_npte)
                return H_PARAMETER;
        if (likely((flags & H_EXACT) == 0)) {
                pte_index &= ~7UL;
                hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
                for (i = 0; i < 8; ++i) {
                        if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0 &&
                            try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
                                          HPTE_V_ABSENT))
                                break;
                        hpte += 2;
                }
                if (i == 8) {
                        /*
                         * Since try_lock_hpte doesn't retry (not even stdcx.
                         * failures), it could be that there is a free slot
                         * but we transiently failed to lock it.  Try again,
                         * actually locking each slot and checking it.
                         */
                        hpte -= 16;
                        for (i = 0; i < 8; ++i) {
                                u64 pte;
                                while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
                                        cpu_relax();
                                pte = be64_to_cpu(*hpte);
                                if (!(pte & (HPTE_V_VALID | HPTE_V_ABSENT)))
                                        break;
                                *hpte &= ~cpu_to_be64(HPTE_V_HVLOCK);
                                hpte += 2;
                        }
                        if (i == 8)
                                return H_PTEG_FULL;
                }
                pte_index += i;
        } else {
                hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
                if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
                                   HPTE_V_ABSENT)) {
                        /* Lock the slot and check again */
                        u64 pte;

                        while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
                                cpu_relax();
                        pte = be64_to_cpu(*hpte);
                        if (pte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
                                *hpte &= ~cpu_to_be64(HPTE_V_HVLOCK);
                                return H_PTEG_FULL;
                        }
                }
        }

        /* Save away the guest's idea of the second HPTE dword */
        rev = &kvm->arch.revmap[pte_index];
        if (realmode)
                rev = real_vmalloc_addr(rev);
        if (rev) {
                rev->guest_rpte = g_ptel;
                note_hpte_modification(kvm, rev);
        }

        /* Link HPTE into reverse-map chain */
        if (pteh & HPTE_V_VALID) {
                if (realmode)
                        rmap = real_vmalloc_addr(rmap);
                lock_rmap(rmap);
                /* Check for pending invalidations under the rmap chain lock */
                if (mmu_notifier_retry(kvm, mmu_seq)) {
                        /* inval in progress, write a non-present HPTE */
                        pteh |= HPTE_V_ABSENT;
                        pteh &= ~HPTE_V_VALID;
                        unlock_rmap(rmap);
                } else {
                        kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index,
                                                realmode);
                        /* Only set R/C in real HPTE if already set in *rmap */
                        rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
                        ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C);
                }
        }

        hpte[1] = cpu_to_be64(ptel);

        /* Write the first HPTE dword, unlocking the HPTE and making it valid */
        eieio();
        hpte[0] = cpu_to_be64(pteh);
        asm volatile("ptesync" : : : "memory");

        *pte_idx_ret = pte_index;
        return H_SUCCESS;
}
EXPORT_SYMBOL_GPL(kvmppc_do_h_enter);

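/*
 * Real-mode handler for the H_ENTER hypercall; the index of the HPTE
 * actually used is returned to the guest in GPR4.
 */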
long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
                    long pte_index, unsigned long pteh, unsigned long ptel)
{
        return kvmppc_do_h_enter(vcpu->kvm, flags, pte_index, pteh, ptel,
                                 vcpu->arch.pgdir, true, &vcpu->arch.gpr[4]);
}

#ifdef __BIG_ENDIAN__
#define LOCK_TOKEN      (*(u32 *)(&get_paca()->lock_token))
#else
#define LOCK_TOKEN      (*(u32 *)(&get_paca()->paca_index))
#endif

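/*
 * Try to take the per-VM tlbie lock.  Returns 1 if the lock was taken,
 * 0 if it was already held.  Open-coded with lwarx/stwcx. since this
 * can be called in real mode.
 */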
static inline int try_lock_tlbie(unsigned int *lock)
{
        unsigned int tmp, old;
        unsigned int token = LOCK_TOKEN;

        asm volatile("1:lwarx   %1,0,%2\n"
                     "  cmpwi   cr0,%1,0\n"
                     "  bne     2f\n"
                     "  stwcx.  %3,0,%2\n"
                     "  bne-    1b\n"
                     "  isync\n"
                     "2:"
                     : "=&r" (tmp), "=&r" (old)
                     : "r" (lock), "r" (token)
                     : "cc", "memory");
        return old == 0;
}

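/*
 * Invalidate the translations described by the RB values in rbvalues[].
 * If global, take the tlbie lock and broadcast tlbie for this LPID;
 * otherwise use tlbiel, which only affects the local core.  need_sync
 * says whether a ptesync is needed first to make preceding HPTE updates
 * visible before the invalidations.
 */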
static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
                      long npages, int global, bool need_sync)
{
        long i;

        if (global) {
                while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
                        cpu_relax();
                if (need_sync)
                        asm volatile("ptesync" : : : "memory");
                for (i = 0; i < npages; ++i)
                        asm volatile(PPC_TLBIE(%1,%0) : :
                                     "r" (rbvalues[i]), "r" (kvm->arch.lpid));
                asm volatile("eieio; tlbsync; ptesync" : : : "memory");
                kvm->arch.tlbie_lock = 0;
        } else {
                if (need_sync)
                        asm volatile("ptesync" : : : "memory");
                for (i = 0; i < npages; ++i)
                        asm volatile("tlbiel %0" : : "r" (rbvalues[i]));
                asm volatile("ptesync" : : : "memory");
        }
}

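/*
 * Core of the H_REMOVE hypercall: invalidate one HPTE, subject to the
 * optional AVPN/ANDCOND match conditions, flush it from the TLB and
 * unlink it from the reverse-map chain.  The first doubleword and the
 * guest's view of the second doubleword (with final R/C bits) are
 * returned via hpret[].
 */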
long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
                        unsigned long pte_index, unsigned long avpn,
                        unsigned long *hpret)
{
        __be64 *hpte;
        unsigned long v, r, rb;
        struct revmap_entry *rev;
        u64 pte;

        if (pte_index >= kvm->arch.hpt_npte)
                return H_PARAMETER;
        hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
        while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
                cpu_relax();
        pte = be64_to_cpu(hpte[0]);
        if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
            ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) ||
            ((flags & H_ANDCOND) && (pte & avpn) != 0)) {
                hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
                return H_NOT_FOUND;
        }

        rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
        v = pte & ~HPTE_V_HVLOCK;
        if (v & HPTE_V_VALID) {
                u64 pte1;

                pte1 = be64_to_cpu(hpte[1]);
                hpte[0] &= ~cpu_to_be64(HPTE_V_VALID);
                rb = compute_tlbie_rb(v, pte1, pte_index);
                do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
                /* Read PTE low word after tlbie to get final R/C values */
                remove_revmap_chain(kvm, pte_index, rev, v, pte1);
        }
        r = rev->guest_rpte & ~HPTE_GR_RESERVED;
        note_hpte_modification(kvm, rev);
        unlock_hpte(hpte, 0);

        hpret[0] = v;
        hpret[1] = r;
        return H_SUCCESS;
}
EXPORT_SYMBOL_GPL(kvmppc_do_h_remove);

long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
                     unsigned long pte_index, unsigned long avpn)
{
        return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn,
                                  &vcpu->arch.gpr[4]);
}

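/*
 * H_BULK_REMOVE: process up to 4 remove requests packed into GPRs 4-11,
 * each consisting of a control word (request type and flags in the top
 * byte, PTE index in the low 56 bits) and an AVPN/ANDCOND value.
 * Matching HPTEs are invalidated in batches so their tlbies can be
 * issued together, and the return code for each request is written back
 * into the top byte of its control word.
 */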
long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
{
        struct kvm *kvm = vcpu->kvm;
        unsigned long *args = &vcpu->arch.gpr[4];
        __be64 *hp, *hptes[4];
        unsigned long tlbrb[4];
        long int i, j, k, n, found, indexes[4];
        unsigned long flags, req, pte_index, rcbits;
        int global;
        long int ret = H_SUCCESS;
        struct revmap_entry *rev, *revs[4];
        u64 hp0;

        global = global_invalidates(kvm, 0);
        for (i = 0; i < 4 && ret == H_SUCCESS; ) {
                n = 0;
                for (; i < 4; ++i) {
                        j = i * 2;
                        pte_index = args[j];
                        flags = pte_index >> 56;
                        pte_index &= ((1ul << 56) - 1);
                        req = flags >> 6;
                        flags &= 3;
                        if (req == 3) {         /* no more requests */
                                i = 4;
                                break;
                        }
                        if (req != 1 || flags == 3 ||
                            pte_index >= kvm->arch.hpt_npte) {
                                /* parameter error */
                                args[j] = ((0xa0 | flags) << 56) + pte_index;
                                ret = H_PARAMETER;
                                break;
                        }
                        hp = (__be64 *) (kvm->arch.hpt_virt + (pte_index << 4));
                        /* to avoid deadlock, don't spin except for first */
                        if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) {
                                if (n)
                                        break;
                                while (!try_lock_hpte(hp, HPTE_V_HVLOCK))
                                        cpu_relax();
                        }
                        found = 0;
                        hp0 = be64_to_cpu(hp[0]);
                        if (hp0 & (HPTE_V_ABSENT | HPTE_V_VALID)) {
                                switch (flags & 3) {
                                case 0:         /* absolute */
                                        found = 1;
                                        break;
                                case 1:         /* andcond */
                                        if (!(hp0 & args[j + 1]))
                                                found = 1;
                                        break;
                                case 2:         /* AVPN */
                                        if ((hp0 & ~0x7fUL) == args[j + 1])
                                                found = 1;
                                        break;
                                }
                        }
                        if (!found) {
                                hp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
                                args[j] = ((0x90 | flags) << 56) + pte_index;
                                continue;
                        }

                        args[j] = ((0x80 | flags) << 56) + pte_index;
                        rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
                        note_hpte_modification(kvm, rev);

                        if (!(hp0 & HPTE_V_VALID)) {
                                /* insert R and C bits from PTE */
                                rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
                                args[j] |= rcbits << (56 - 5);
                                hp[0] = 0;
                                continue;
                        }

                        /* leave it locked */
                        hp[0] &= ~cpu_to_be64(HPTE_V_VALID);
                        tlbrb[n] = compute_tlbie_rb(be64_to_cpu(hp[0]),
                                be64_to_cpu(hp[1]), pte_index);
                        indexes[n] = j;
                        hptes[n] = hp;
                        revs[n] = rev;
                        ++n;
                }

                if (!n)
                        break;

                /* Now that we've collected a batch, do the tlbies */
                do_tlbies(kvm, tlbrb, n, global, true);

                /* Read PTE low words after tlbie to get final R/C values */
                for (k = 0; k < n; ++k) {
                        j = indexes[k];
                        pte_index = args[j] & ((1ul << 56) - 1);
                        hp = hptes[k];
                        rev = revs[k];
                        remove_revmap_chain(kvm, pte_index, rev,
                                be64_to_cpu(hp[0]), be64_to_cpu(hp[1]));
                        rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
                        args[j] |= rcbits << (56 - 5);
                        hp[0] = 0;
                }
        }

        return ret;
}

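/*
 * H_PROTECT: update the protection (pp/N) and storage-key bits of an
 * HPTE and of the guest's view of it.  If the entry is valid and its
 * second doubleword actually changes, the old translation is
 * invalidated and flushed from the TLB before the new value is written.
 */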
long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
                      unsigned long pte_index, unsigned long avpn,
                      unsigned long va)
{
        struct kvm *kvm = vcpu->kvm;
        __be64 *hpte;
        struct revmap_entry *rev;
        unsigned long v, r, rb, mask, bits;
        u64 pte;

        if (pte_index >= kvm->arch.hpt_npte)
                return H_PARAMETER;

        hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
        while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
                cpu_relax();
        pte = be64_to_cpu(hpte[0]);
        if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
            ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn)) {
                hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
                return H_NOT_FOUND;
        }

        v = pte;
        bits = (flags << 55) & HPTE_R_PP0;
        bits |= (flags << 48) & HPTE_R_KEY_HI;
        bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO);

        /* Update guest view of 2nd HPTE dword */
        mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
                HPTE_R_KEY_HI | HPTE_R_KEY_LO;
        rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
        if (rev) {
                r = (rev->guest_rpte & ~mask) | bits;
                rev->guest_rpte = r;
                note_hpte_modification(kvm, rev);
        }

        /* Update HPTE */
        if (v & HPTE_V_VALID) {
                /*
                 * If the page is valid, don't let it transition from
                 * readonly to writable.  If it should be writable, we'll
                 * take a trap and let the page fault code sort it out.
                 */
                pte = be64_to_cpu(hpte[1]);
                r = (pte & ~mask) | bits;
                if (hpte_is_writable(r) && !hpte_is_writable(pte))
                        r = hpte_make_readonly(r);
                /* If the PTE is changing, invalidate it first */
                if (r != pte) {
                        rb = compute_tlbie_rb(v, r, pte_index);
                        hpte[0] = cpu_to_be64((v & ~HPTE_V_VALID) |
                                              HPTE_V_ABSENT);
                        do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags),
                                  true);
                        hpte[1] = cpu_to_be64(r);
                }
        }
        unlock_hpte(hpte, v & ~HPTE_V_HVLOCK);
        asm volatile("ptesync" : : : "memory");
        return H_SUCCESS;
}

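/*
 * H_READ: return the contents of one HPTE (or four, with H_READ_4) in
 * GPRs 4 and up.  Entries that are absent (present only to the
 * hypervisor) are reported to the guest as valid, and the guest's view
 * of the second doubleword is returned together with the current R/C
 * bits.
 */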
long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
                   unsigned long pte_index)
{
        struct kvm *kvm = vcpu->kvm;
        __be64 *hpte;
        unsigned long v, r;
        int i, n = 1;
        struct revmap_entry *rev = NULL;

        if (pte_index >= kvm->arch.hpt_npte)
                return H_PARAMETER;
        if (flags & H_READ_4) {
                pte_index &= ~3;
                n = 4;
        }
        rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
        for (i = 0; i < n; ++i, ++pte_index) {
                hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
                v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
                r = be64_to_cpu(hpte[1]);
                if (v & HPTE_V_ABSENT) {
                        v &= ~HPTE_V_ABSENT;
                        v |= HPTE_V_VALID;
                }
                if (v & HPTE_V_VALID) {
                        r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C));
                        r &= ~HPTE_GR_RESERVED;
                }
                vcpu->arch.gpr[4 + i * 2] = v;
                vcpu->arch.gpr[5 + i * 2] = r;
        }
        return H_SUCCESS;
}

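/*
 * Clear the valid bit of an HPTE and flush the corresponding
 * translation from all TLBs with a global tlbie.
 */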
void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
                        unsigned long pte_index)
{
        unsigned long rb;

        hptep[0] &= ~cpu_to_be64(HPTE_V_VALID);
        rb = compute_tlbie_rb(be64_to_cpu(hptep[0]), be64_to_cpu(hptep[1]),
                              pte_index);
        do_tlbies(kvm, &rb, 1, 1, true);
}
EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte);

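/*
 * Clear the reference (R) bit of an HPTE and flush the translation
 * globally so that subsequent accesses set the bit again.
 */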
void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep,
                           unsigned long pte_index)
{
        unsigned long rb;
        unsigned char rbyte;

        rb = compute_tlbie_rb(be64_to_cpu(hptep[0]), be64_to_cpu(hptep[1]),
                              pte_index);
        rbyte = (be64_to_cpu(hptep[1]) & ~HPTE_R_R) >> 8;
        /* modify only the second-last byte, which contains the ref bit */
        *((char *)hptep + 14) = rbyte;
        do_tlbies(kvm, &rb, 1, 1, false);
}
EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte);

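/* Base page shifts for large-page SLB entries, indexed by the LP field */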
static int slb_base_page_shift[4] = {
        24,     /* 16M */
        16,     /* 64k */
        34,     /* 16G */
        20,     /* 1M, unsupported */
};

/*
 * Search the hash table for an HPTE matching the given effective address
 * and SLB value, and return its index with the entry still locked
 * (HPTE_V_HVLOCK held), or -1 if no match is found.
 *
 * When called from virtual mode, the caller must be protected by
 * preempt_disable(); otherwise, being preempted while holding
 * HPTE_V_HVLOCK can lead to deadlock.
 */
long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
                              unsigned long valid)
{
        unsigned int i;
        unsigned int pshift;
        unsigned long somask;
        unsigned long vsid, hash;
        unsigned long avpn;
        __be64 *hpte;
        unsigned long mask, val;
        unsigned long v, r;

        /* Get page shift, work out hash and AVPN etc. */
        mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY;
        val = 0;
        pshift = 12;
        if (slb_v & SLB_VSID_L) {
                mask |= HPTE_V_LARGE;
                val |= HPTE_V_LARGE;
                pshift = slb_base_page_shift[(slb_v & SLB_VSID_LP) >> 4];
        }
        if (slb_v & SLB_VSID_B_1T) {
                somask = (1UL << 40) - 1;
                vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T;
                vsid ^= vsid << 25;
        } else {
                somask = (1UL << 28) - 1;
                vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
        }
        hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask;
        avpn = slb_v & ~(somask >> 16); /* also includes B */
        avpn |= (eaddr & somask) >> 16;

        if (pshift >= 24)
                avpn &= ~((1UL << (pshift - 16)) - 1);
        else
                avpn &= ~0x7fUL;
        val |= avpn;

        for (;;) {
                hpte = (__be64 *)(kvm->arch.hpt_virt + (hash << 7));

                for (i = 0; i < 16; i += 2) {
                        /* Read the PTE racily */
                        v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK;

                        /* Check valid/absent, hash, segment size and AVPN */
                        if (!(v & valid) || (v & mask) != val)
                                continue;

                        /* Lock the PTE and read it under the lock */
                        while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK))
                                cpu_relax();
                        v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK;
                        r = be64_to_cpu(hpte[i+1]);

                        /*
                         * Check the HPTE again, including base page size
                         */
                        if ((v & valid) && (v & mask) == val &&
                            hpte_base_page_size(v, r) == (1ul << pshift))
                                /* Return with the HPTE still locked */
                                return (hash << 3) + (i >> 1);

                        /* Unlock and move on */
                        hpte[i] = cpu_to_be64(v);
                }

                if (val & HPTE_V_SECONDARY)
                        break;
                val |= HPTE_V_SECONDARY;
                hash = hash ^ kvm->arch.hpt_mask;
        }
        return -1;
}
EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte);

/*
 * Called in real mode to check whether an HPTE not found fault
 * is due to accessing a paged-out page or an emulated MMIO page,
 * or whether a protection fault is due to accessing a page that the
 * guest wanted read/write access to but which we made read-only.
 * Returns a possibly modified status (DSISR) value if the fault is
 * none of those (i.e. pass the interrupt to the guest),
 * -1 to pass the fault up to host kernel mode code, -2 to do that
 * and also load the instruction word (for MMIO emulation),
 * or 0 if we should make the guest retry the access.
 */
long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
                          unsigned long slb_v, unsigned int status, bool data)
{
        struct kvm *kvm = vcpu->kvm;
        long int index;
        unsigned long v, r, gr;
        __be64 *hpte;
        unsigned long valid;
        struct revmap_entry *rev;
        unsigned long pp, key;

        /* For protection fault, expect to find a valid HPTE */
        valid = HPTE_V_VALID;
        if (status & DSISR_NOHPTE)
                valid |= HPTE_V_ABSENT;

        index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid);
        if (index < 0) {
                if (status & DSISR_NOHPTE)
                        return status;  /* there really was no HPTE */
                return 0;               /* for prot fault, HPTE disappeared */
        }
        hpte = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
        v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
        r = be64_to_cpu(hpte[1]);
        rev = real_vmalloc_addr(&kvm->arch.revmap[index]);
        gr = rev->guest_rpte;

        unlock_hpte(hpte, v);

        /* For not found, if the HPTE is valid by now, retry the instruction */
        if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID))
                return 0;

        /* Check access permissions to the page */
        pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
        key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
        status &= ~DSISR_NOHPTE;        /* DSISR_NOHPTE == SRR1_ISI_NOPT */
        if (!data) {
                if (gr & (HPTE_R_N | HPTE_R_G))
                        return status | SRR1_ISI_N_OR_G;
                if (!hpte_read_permission(pp, slb_v & key))
                        return status | SRR1_ISI_PROT;
        } else if (status & DSISR_ISSTORE) {
                /* check write permission */
                if (!hpte_write_permission(pp, slb_v & key))
                        return status | DSISR_PROTFAULT;
        } else {
                if (!hpte_read_permission(pp, slb_v & key))
                        return status | DSISR_PROTFAULT;
        }

        /* Check storage key, if applicable */
        if (data && (vcpu->arch.shregs.msr & MSR_DR)) {
                unsigned int perm = hpte_get_skey_perm(gr, vcpu->arch.amr);
                if (status & DSISR_ISSTORE)
                        perm >>= 1;
                if (perm & 1)
                        return status | DSISR_KEYFAULT;
        }

        /* Save HPTE info for virtual-mode handler */
        vcpu->arch.pgfault_addr = addr;
        vcpu->arch.pgfault_index = index;
        vcpu->arch.pgfault_hpte[0] = v;
        vcpu->arch.pgfault_hpte[1] = r;

        /* Check the storage key to see if it is possibly emulated MMIO */
        if (data && (vcpu->arch.shregs.msr & MSR_IR) &&
            (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) ==
            (HPTE_R_KEY_HI | HPTE_R_KEY_LO))
                return -2;      /* MMIO emulation - load instr word */

        return -1;              /* send fault up to host kernel mode */
}