linux/arch/powerpc/kvm/book3s_64_mmu_hv.c
   1/*
   2 * This program is free software; you can redistribute it and/or modify
   3 * it under the terms of the GNU General Public License, version 2, as
   4 * published by the Free Software Foundation.
   5 *
   6 * This program is distributed in the hope that it will be useful,
   7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
   8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   9 * GNU General Public License for more details.
  10 *
  11 * You should have received a copy of the GNU General Public License
  12 * along with this program; if not, write to the Free Software
  13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  14 *
  15 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  16 */
  17
  18#include <linux/types.h>
  19#include <linux/string.h>
  20#include <linux/kvm.h>
  21#include <linux/kvm_host.h>
  22#include <linux/highmem.h>
  23#include <linux/gfp.h>
  24#include <linux/slab.h>
  25#include <linux/hugetlb.h>
  26#include <linux/vmalloc.h>
  27#include <linux/srcu.h>
  28#include <linux/anon_inodes.h>
  29#include <linux/file.h>
  30#include <linux/debugfs.h>
  31
  32#include <asm/tlbflush.h>
  33#include <asm/kvm_ppc.h>
  34#include <asm/kvm_book3s.h>
  35#include <asm/mmu-hash64.h>
  36#include <asm/hvcall.h>
  37#include <asm/synch.h>
  38#include <asm/ppc-opcode.h>
  39#include <asm/cputable.h>
  40
  41#include "trace_hv.h"
  42
   43/* The Power architecture requires the HPT to be at least 256kB */
  44#define PPC_MIN_HPT_ORDER       18
  45
  46static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
  47                                long pte_index, unsigned long pteh,
  48                                unsigned long ptel, unsigned long *pte_idx_ret);
  49static void kvmppc_rmap_reset(struct kvm *kvm);
  50
  51long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
  52{
  53        unsigned long hpt = 0;
  54        struct revmap_entry *rev;
  55        struct page *page = NULL;
  56        long order = KVM_DEFAULT_HPT_ORDER;
  57
  58        if (htab_orderp) {
  59                order = *htab_orderp;
  60                if (order < PPC_MIN_HPT_ORDER)
  61                        order = PPC_MIN_HPT_ORDER;
  62        }
  63
  64        kvm->arch.hpt_cma_alloc = 0;
  65        page = kvm_alloc_hpt(1ul << (order - PAGE_SHIFT));
  66        if (page) {
  67                hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
  68                memset((void *)hpt, 0, (1ul << order));
  69                kvm->arch.hpt_cma_alloc = 1;
  70        }
  71
  72        /* Lastly try successively smaller sizes from the page allocator */
  73        while (!hpt && order > PPC_MIN_HPT_ORDER) {
  74                hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
  75                                       __GFP_NOWARN, order - PAGE_SHIFT);
  76                if (!hpt)
  77                        --order;
  78        }
  79
  80        if (!hpt)
  81                return -ENOMEM;
  82
  83        kvm->arch.hpt_virt = hpt;
  84        kvm->arch.hpt_order = order;
  85        /* HPTEs are 2**4 bytes long */
  86        kvm->arch.hpt_npte = 1ul << (order - 4);
  87        /* 128 (2**7) bytes in each HPTEG */
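             /* hpt_mask = (number of HPTEGs) - 1; hash values are ANDed with this */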
  88        kvm->arch.hpt_mask = (1ul << (order - 7)) - 1;
  89
  90        /* Allocate reverse map array */
  91        rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte);
  92        if (!rev) {
  93                pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
  94                goto out_freehpt;
  95        }
  96        kvm->arch.revmap = rev;
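             /* SDR1 = HPT real address | HTABSIZE, where HTABSIZE = log2(HPT size) - 18 */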
  97        kvm->arch.sdr1 = __pa(hpt) | (order - 18);
  98
  99        pr_info("KVM guest htab at %lx (order %ld), LPID %x\n",
 100                hpt, order, kvm->arch.lpid);
 101
 102        if (htab_orderp)
 103                *htab_orderp = order;
 104        return 0;
 105
 106 out_freehpt:
 107        if (kvm->arch.hpt_cma_alloc)
 108                kvm_release_hpt(page, 1 << (order - PAGE_SHIFT));
 109        else
 110                free_pages(hpt, order - PAGE_SHIFT);
 111        return -ENOMEM;
 112}
 113
 114long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
 115{
 116        long err = -EBUSY;
 117        long order;
 118
 119        mutex_lock(&kvm->lock);
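             /* don't reset or reallocate the HPT while any vcpu could be using it */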
 120        if (kvm->arch.hpte_setup_done) {
 121                kvm->arch.hpte_setup_done = 0;
 122                /* order hpte_setup_done vs. vcpus_running */
 123                smp_mb();
 124                if (atomic_read(&kvm->arch.vcpus_running)) {
 125                        kvm->arch.hpte_setup_done = 1;
 126                        goto out;
 127                }
 128        }
 129        if (kvm->arch.hpt_virt) {
 130                order = kvm->arch.hpt_order;
 131                /* Set the entire HPT to 0, i.e. invalid HPTEs */
 132                memset((void *)kvm->arch.hpt_virt, 0, 1ul << order);
 133                /*
 134                 * Reset all the reverse-mapping chains for all memslots
 135                 */
 136                kvmppc_rmap_reset(kvm);
 137                /* Ensure that each vcpu will flush its TLB on next entry. */
 138                cpumask_setall(&kvm->arch.need_tlb_flush);
 139                *htab_orderp = order;
 140                err = 0;
 141        } else {
 142                err = kvmppc_alloc_hpt(kvm, htab_orderp);
 143                order = *htab_orderp;
 144        }
 145 out:
 146        mutex_unlock(&kvm->lock);
 147        return err;
 148}
 149
 150void kvmppc_free_hpt(struct kvm *kvm)
 151{
 152        kvmppc_free_lpid(kvm->arch.lpid);
 153        vfree(kvm->arch.revmap);
 154        if (kvm->arch.hpt_cma_alloc)
 155                kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt),
 156                                1 << (kvm->arch.hpt_order - PAGE_SHIFT));
 157        else
 158                free_pages(kvm->arch.hpt_virt,
 159                           kvm->arch.hpt_order - PAGE_SHIFT);
 160}
 161
 162/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
 163static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
 164{
 165        return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
 166}
 167
 168/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */
 169static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
 170{
 171        return (pgsize == 0x10000) ? 0x1000 : 0;
 172}
 173
 174void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
 175                     unsigned long porder)
 176{
 177        unsigned long i;
 178        unsigned long npages;
 179        unsigned long hp_v, hp_r;
 180        unsigned long addr, hash;
 181        unsigned long psize;
 182        unsigned long hp0, hp1;
 183        unsigned long idx_ret;
 184        long ret;
 185        struct kvm *kvm = vcpu->kvm;
 186
 187        psize = 1ul << porder;
 188        npages = memslot->npages >> (porder - PAGE_SHIFT);
 189
 190        /* VRMA can't be > 1TB */
 191        if (npages > 1ul << (40 - porder))
 192                npages = 1ul << (40 - porder);
 193        /* Can't use more than 1 HPTE per HPTEG */
 194        if (npages > kvm->arch.hpt_mask + 1)
 195                npages = kvm->arch.hpt_mask + 1;
 196
 197        hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
 198                HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
 199        hp1 = hpte1_pgsize_encoding(psize) |
 200                HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;
 201
 202        for (i = 0; i < npages; ++i) {
 203                addr = i << porder;
 204                /* can't use hpt_hash since va > 64 bits */
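                     /* primary hash for a 1TB segment: (vsid ^ (vsid << 25)) ^ page index */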
 205                hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask;
 206                /*
 207                 * We assume that the hash table is empty and no
 208                 * vcpus are using it at this stage.  Since we create
 209                 * at most one HPTE per HPTEG, we just assume entry 7
 210                 * is available and use it.
 211                 */
 212                hash = (hash << 3) + 7;
 213                hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
 214                hp_r = hp1 | addr;
 215                ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r,
 216                                                 &idx_ret);
 217                if (ret != H_SUCCESS) {
 218                        pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
 219                               addr, ret);
 220                        break;
 221                }
 222        }
 223}
 224
 225int kvmppc_mmu_hv_init(void)
 226{
 227        unsigned long host_lpid, rsvd_lpid;
 228
 229        if (!cpu_has_feature(CPU_FTR_HVMODE))
 230                return -EINVAL;
 231
 232        /* POWER7 has 10-bit LPIDs (12-bit in POWER8) */
 233        host_lpid = mfspr(SPRN_LPID);
 234        rsvd_lpid = LPID_RSVD;
 235
 236        kvmppc_init_lpid(rsvd_lpid + 1);
 237
 238        kvmppc_claim_lpid(host_lpid);
 239        /* rsvd_lpid is reserved for use in partition switching */
 240        kvmppc_claim_lpid(rsvd_lpid);
 241
 242        return 0;
 243}
 244
 245static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
 246{
 247        unsigned long msr = vcpu->arch.intr_msr;
 248
 249        /* If transactional, change to suspend mode on IRQ delivery */
 250        if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr))
 251                msr |= MSR_TS_S;
 252        else
 253                msr |= vcpu->arch.shregs.msr & MSR_TS_MASK;
 254        kvmppc_set_msr(vcpu, msr);
 255}
 256
 257long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
 258                                long pte_index, unsigned long pteh,
 259                                unsigned long ptel, unsigned long *pte_idx_ret)
 260{
 261        long ret;
 262
 263        /* Protect linux PTE lookup from page table destruction */
 264        rcu_read_lock_sched();  /* this disables preemption too */
 265        ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel,
 266                                current->mm->pgd, false, pte_idx_ret);
 267        rcu_read_unlock_sched();
 268        if (ret == H_TOO_HARD) {
 269                /* this can't happen */
 270                pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
 271                ret = H_RESOURCE;       /* or something */
 272        }
 273        return ret;
 274
 275}
 276
 277static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
 278                                                         gva_t eaddr)
 279{
 280        u64 mask;
 281        int i;
 282
 283        for (i = 0; i < vcpu->arch.slb_nr; i++) {
 284                if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
 285                        continue;
 286
 287                if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
 288                        mask = ESID_MASK_1T;
 289                else
 290                        mask = ESID_MASK;
 291
 292                if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
 293                        return &vcpu->arch.slb[i];
 294        }
 295        return NULL;
 296}
 297
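     /* Combine the HPTE's real page number with the byte offset from the EA */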
 298static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
 299                        unsigned long ea)
 300{
 301        unsigned long ra_mask;
 302
 303        ra_mask = hpte_page_size(v, r) - 1;
 304        return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
 305}
 306
 307static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 308                        struct kvmppc_pte *gpte, bool data, bool iswrite)
 309{
 310        struct kvm *kvm = vcpu->kvm;
 311        struct kvmppc_slb *slbe;
 312        unsigned long slb_v;
 313        unsigned long pp, key;
 314        unsigned long v, gr;
 315        __be64 *hptep;
 316        int index;
 317        int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
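             /* non-zero iff the relevant relocation bit (IR or DR) is on in the guest MSR */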
 318
 319        /* Get SLB entry */
 320        if (virtmode) {
 321                slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
 322                if (!slbe)
 323                        return -EINVAL;
 324                slb_v = slbe->origv;
 325        } else {
 326                /* real mode access */
 327                slb_v = vcpu->kvm->arch.vrma_slb_v;
 328        }
 329
 330        preempt_disable();
 331        /* Find the HPTE in the hash table */
 332        index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
 333                                         HPTE_V_VALID | HPTE_V_ABSENT);
 334        if (index < 0) {
 335                preempt_enable();
 336                return -ENOENT;
 337        }
 338        hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
 339        v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
 340        gr = kvm->arch.revmap[index].guest_rpte;
 341
 342        unlock_hpte(hptep, v);
 343        preempt_enable();
 344
 345        gpte->eaddr = eaddr;
 346        gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);
 347
 348        /* Get PP bits and key for permission check */
 349        pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
 350        key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
 351        key &= slb_v;
 352
 353        /* Calculate permissions */
 354        gpte->may_read = hpte_read_permission(pp, key);
 355        gpte->may_write = hpte_write_permission(pp, key);
 356        gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G));
 357
 358        /* Storage key permission check for POWER7 */
 359        if (data && virtmode) {
 360                int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr);
 361                if (amrfield & 1)
 362                        gpte->may_read = 0;
 363                if (amrfield & 2)
 364                        gpte->may_write = 0;
 365        }
 366
 367        /* Get the guest physical address */
 368        gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr);
 369        return 0;
 370}
 371
 372/*
 373 * Quick test for whether an instruction is a load or a store.
 374 * If the instruction is a load or a store, then this will indicate
 375 * which it is, at least on server processors.  (Embedded processors
 376 * have some external PID instructions that don't follow the rule
 377 * embodied here.)  If the instruction isn't a load or store, then
 378 * this doesn't return anything useful.
 379 */
 380static int instruction_is_store(unsigned int instr)
 381{
 382        unsigned int mask;
 383
 384        mask = 0x10000000;
 385        if ((instr & 0xfc000000) == 0x7c000000)
 386                mask = 0x100;           /* major opcode 31 */
 387        return (instr & mask) != 0;
 388}
 389
 390static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
 391                                  unsigned long gpa, gva_t ea, int is_store)
 392{
 393        u32 last_inst;
 394
 395        /*
 396         * If we fail, we just return to the guest and try executing it again.
 397         */
 398        if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
 399                EMULATE_DONE)
 400                return RESUME_GUEST;
 401
 402        /*
 403         * WARNING: We do not know for sure whether the instruction we just
 404         * read from memory is the same that caused the fault in the first
  405         * place.  If the instruction we read is neither a load nor a store,
 406         * then it can't access memory, so we don't need to worry about
 407         * enforcing access permissions.  So, assuming it is a load or
 408         * store, we just check that its direction (load or store) is
 409         * consistent with the original fault, since that's what we
 410         * checked the access permissions against.  If there is a mismatch
 411         * we just return and retry the instruction.
 412         */
 413
 414        if (instruction_is_store(last_inst) != !!is_store)
 415                return RESUME_GUEST;
 416
 417        /*
  418         * MMIO accesses are emulated by looking at the hash for
  419         * translation once, then performing the access later. The
  420         * translation could be invalidated in the meantime, at which
 421         * point performing the subsequent memory access on the old
 422         * physical address could possibly be a security hole for the
 423         * guest (but not the host).
 424         *
 425         * This is less of an issue for MMIO stores since they aren't
 426         * globally visible. It could be an issue for MMIO loads to
 427         * a certain extent but we'll ignore it for now.
 428         */
 429
 430        vcpu->arch.paddr_accessed = gpa;
 431        vcpu->arch.vaddr_accessed = ea;
 432        return kvmppc_emulate_mmio(run, vcpu);
 433}
 434
 435int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 436                                unsigned long ea, unsigned long dsisr)
 437{
 438        struct kvm *kvm = vcpu->kvm;
 439        unsigned long hpte[3], r;
 440        __be64 *hptep;
 441        unsigned long mmu_seq, psize, pte_size;
 442        unsigned long gpa_base, gfn_base;
 443        unsigned long gpa, gfn, hva, pfn;
 444        struct kvm_memory_slot *memslot;
 445        unsigned long *rmap;
 446        struct revmap_entry *rev;
 447        struct page *page, *pages[1];
 448        long index, ret, npages;
 449        unsigned long is_io;
 450        unsigned int writing, write_ok;
 451        struct vm_area_struct *vma;
 452        unsigned long rcbits;
 453
 454        /*
 455         * Real-mode code has already searched the HPT and found the
 456         * entry we're interested in.  Lock the entry and check that
 457         * it hasn't changed.  If it has, just return and re-execute the
 458         * instruction.
 459         */
 460        if (ea != vcpu->arch.pgfault_addr)
 461                return RESUME_GUEST;
 462        index = vcpu->arch.pgfault_index;
 463        hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
 464        rev = &kvm->arch.revmap[index];
 465        preempt_disable();
 466        while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
 467                cpu_relax();
 468        hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
 469        hpte[1] = be64_to_cpu(hptep[1]);
 470        hpte[2] = r = rev->guest_rpte;
 471        unlock_hpte(hptep, hpte[0]);
 472        preempt_enable();
 473
 474        if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
 475            hpte[1] != vcpu->arch.pgfault_hpte[1])
 476                return RESUME_GUEST;
 477
 478        /* Translate the logical address and get the page */
 479        psize = hpte_page_size(hpte[0], r);
 480        gpa_base = r & HPTE_R_RPN & ~(psize - 1);
 481        gfn_base = gpa_base >> PAGE_SHIFT;
 482        gpa = gpa_base | (ea & (psize - 1));
 483        gfn = gpa >> PAGE_SHIFT;
 484        memslot = gfn_to_memslot(kvm, gfn);
 485
 486        trace_kvm_page_fault_enter(vcpu, hpte, memslot, ea, dsisr);
 487
 488        /* No memslot means it's an emulated MMIO region */
 489        if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
 490                return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
 491                                              dsisr & DSISR_ISSTORE);
 492
 493        /*
 494         * This should never happen, because of the slot_is_aligned()
 495         * check in kvmppc_do_h_enter().
 496         */
 497        if (gfn_base < memslot->base_gfn)
 498                return -EFAULT;
 499
 500        /* used to check for invalidations in progress */
 501        mmu_seq = kvm->mmu_notifier_seq;
 502        smp_rmb();
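             /* invalidations after this point are caught by mmu_notifier_retry() below */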
 503
 504        ret = -EFAULT;
 505        is_io = 0;
 506        pfn = 0;
 507        page = NULL;
 508        pte_size = PAGE_SIZE;
 509        writing = (dsisr & DSISR_ISSTORE) != 0;
 510        /* If writing != 0, then the HPTE must allow writing, if we get here */
 511        write_ok = writing;
 512        hva = gfn_to_hva_memslot(memslot, gfn);
 513        npages = get_user_pages_fast(hva, 1, writing, pages);
 514        if (npages < 1) {
 515                /* Check if it's an I/O mapping */
 516                down_read(&current->mm->mmap_sem);
 517                vma = find_vma(current->mm, hva);
 518                if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
 519                    (vma->vm_flags & VM_PFNMAP)) {
 520                        pfn = vma->vm_pgoff +
 521                                ((hva - vma->vm_start) >> PAGE_SHIFT);
 522                        pte_size = psize;
 523                        is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot));
 524                        write_ok = vma->vm_flags & VM_WRITE;
 525                }
 526                up_read(&current->mm->mmap_sem);
 527                if (!pfn)
 528                        goto out_put;
 529        } else {
 530                page = pages[0];
 531                pfn = page_to_pfn(page);
 532                if (PageHuge(page)) {
 533                        page = compound_head(page);
 534                        pte_size <<= compound_order(page);
 535                }
 536                /* if the guest wants write access, see if that is OK */
 537                if (!writing && hpte_is_writable(r)) {
 538                        pte_t *ptep, pte;
 539                        unsigned long flags;
 540                        /*
  541                         * We need to protect against page table destruction,
  542                         * and against hugepage split and collapse.
 543                         */
 544                        local_irq_save(flags);
 545                        ptep = find_linux_pte_or_hugepte(current->mm->pgd,
 546                                                         hva, NULL);
 547                        if (ptep) {
 548                                pte = kvmppc_read_update_linux_pte(ptep, 1);
 549                                if (pte_write(pte))
 550                                        write_ok = 1;
 551                        }
 552                        local_irq_restore(flags);
 553                }
 554        }
 555
 556        if (psize > pte_size)
 557                goto out_put;
 558
 559        /* Check WIMG vs. the actual page we're accessing */
 560        if (!hpte_cache_flags_ok(r, is_io)) {
 561                if (is_io)
 562                        goto out_put;
 563
 564                /*
 565                 * Allow guest to map emulated device memory as
 566                 * uncacheable, but actually make it cacheable.
 567                 */
 568                r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
 569        }
 570
 571        /*
 572         * Set the HPTE to point to pfn.
 573         * Since the pfn is at PAGE_SIZE granularity, make sure we
 574         * don't mask out lower-order bits if psize < PAGE_SIZE.
 575         */
 576        if (psize < PAGE_SIZE)
 577                psize = PAGE_SIZE;
 578        r = (r & ~(HPTE_R_PP0 - psize)) | ((pfn << PAGE_SHIFT) & ~(psize - 1));
 579        if (hpte_is_writable(r) && !write_ok)
 580                r = hpte_make_readonly(r);
 581        ret = RESUME_GUEST;
 582        preempt_disable();
 583        while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
 584                cpu_relax();
 585        if ((be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK) != hpte[0] ||
 586                be64_to_cpu(hptep[1]) != hpte[1] ||
 587                rev->guest_rpte != hpte[2])
 588                /* HPTE has been changed under us; let the guest retry */
 589                goto out_unlock;
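             /* we now have a host page for this HPTE, so make it valid rather than absent */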
 590        hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
 591
 592        /* Always put the HPTE in the rmap chain for the page base address */
 593        rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn];
 594        lock_rmap(rmap);
 595
 596        /* Check if we might have been invalidated; let the guest retry if so */
 597        ret = RESUME_GUEST;
 598        if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
 599                unlock_rmap(rmap);
 600                goto out_unlock;
 601        }
 602
 603        /* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
 604        rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
 605        r &= rcbits | ~(HPTE_R_R | HPTE_R_C);
 606
 607        if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) {
 608                /* HPTE was previously valid, so we need to invalidate it */
 609                unlock_rmap(rmap);
 610                hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
 611                kvmppc_invalidate_hpte(kvm, hptep, index);
 612                /* don't lose previous R and C bits */
 613                r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
 614        } else {
 615                kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
 616        }
 617
 618        hptep[1] = cpu_to_be64(r);
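             /* the eieio below orders the second-dword store before the HPTE is published */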
 619        eieio();
 620        __unlock_hpte(hptep, hpte[0]);
 621        asm volatile("ptesync" : : : "memory");
 622        preempt_enable();
 623        if (page && hpte_is_writable(r))
 624                SetPageDirty(page);
 625
 626 out_put:
 627        trace_kvm_page_fault_exit(vcpu, hpte, ret);
 628
 629        if (page) {
 630                /*
  631                 * We drop pages[0] here, not page, because page might
 632                 * have been set to the head page of a compound, but
 633                 * we have to drop the reference on the correct tail
 634                 * page to match the get inside gup()
 635                 */
 636                put_page(pages[0]);
 637        }
 638        return ret;
 639
 640 out_unlock:
 641        __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
 642        preempt_enable();
 643        goto out_put;
 644}
 645
 646static void kvmppc_rmap_reset(struct kvm *kvm)
 647{
 648        struct kvm_memslots *slots;
 649        struct kvm_memory_slot *memslot;
 650        int srcu_idx;
 651
 652        srcu_idx = srcu_read_lock(&kvm->srcu);
 653        slots = kvm->memslots;
 654        kvm_for_each_memslot(memslot, slots) {
 655                /*
 656                 * This assumes it is acceptable to lose reference and
 657                 * change bits across a reset.
 658                 */
 659                memset(memslot->arch.rmap, 0,
 660                       memslot->npages * sizeof(*memslot->arch.rmap));
 661        }
 662        srcu_read_unlock(&kvm->srcu, srcu_idx);
 663}
 664
 665static int kvm_handle_hva_range(struct kvm *kvm,
 666                                unsigned long start,
 667                                unsigned long end,
 668                                int (*handler)(struct kvm *kvm,
 669                                               unsigned long *rmapp,
 670                                               unsigned long gfn))
 671{
 672        int ret;
 673        int retval = 0;
 674        struct kvm_memslots *slots;
 675        struct kvm_memory_slot *memslot;
 676
 677        slots = kvm_memslots(kvm);
 678        kvm_for_each_memslot(memslot, slots) {
 679                unsigned long hva_start, hva_end;
 680                gfn_t gfn, gfn_end;
 681
 682                hva_start = max(start, memslot->userspace_addr);
 683                hva_end = min(end, memslot->userspace_addr +
 684                                        (memslot->npages << PAGE_SHIFT));
 685                if (hva_start >= hva_end)
 686                        continue;
 687                /*
 688                 * {gfn(page) | page intersects with [hva_start, hva_end)} =
 689                 * {gfn, gfn+1, ..., gfn_end-1}.
 690                 */
 691                gfn = hva_to_gfn_memslot(hva_start, memslot);
 692                gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
 693
 694                for (; gfn < gfn_end; ++gfn) {
 695                        gfn_t gfn_offset = gfn - memslot->base_gfn;
 696
 697                        ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn);
 698                        retval |= ret;
 699                }
 700        }
 701
 702        return retval;
 703}
 704
 705static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 706                          int (*handler)(struct kvm *kvm, unsigned long *rmapp,
 707                                         unsigned long gfn))
 708{
 709        return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
 710}
 711
 712static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 713                           unsigned long gfn)
 714{
 715        struct revmap_entry *rev = kvm->arch.revmap;
 716        unsigned long h, i, j;
 717        __be64 *hptep;
 718        unsigned long ptel, psize, rcbits;
 719
 720        for (;;) {
 721                lock_rmap(rmapp);
 722                if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
 723                        unlock_rmap(rmapp);
 724                        break;
 725                }
 726
 727                /*
 728                 * To avoid an ABBA deadlock with the HPTE lock bit,
 729                 * we can't spin on the HPTE lock while holding the
 730                 * rmap chain lock.
 731                 */
 732                i = *rmapp & KVMPPC_RMAP_INDEX;
 733                hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
 734                if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
 735                        /* unlock rmap before spinning on the HPTE lock */
 736                        unlock_rmap(rmapp);
 737                        while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
 738                                cpu_relax();
 739                        continue;
 740                }
 741                j = rev[i].forw;
 742                if (j == i) {
 743                        /* chain is now empty */
 744                        *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
 745                } else {
 746                        /* remove i from chain */
 747                        h = rev[i].back;
 748                        rev[h].forw = j;
 749                        rev[j].back = h;
 750                        rev[i].forw = rev[i].back = i;
 751                        *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
 752                }
 753
 754                /* Now check and modify the HPTE */
 755                ptel = rev[i].guest_rpte;
 756                psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel);
 757                if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
 758                    hpte_rpn(ptel, psize) == gfn) {
 759                        hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
 760                        kvmppc_invalidate_hpte(kvm, hptep, i);
 761                        /* Harvest R and C */
 762                        rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
 763                        *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
 764                        if (rcbits & ~rev[i].guest_rpte) {
 765                                rev[i].guest_rpte = ptel | rcbits;
 766                                note_hpte_modification(kvm, &rev[i]);
 767                        }
 768                }
 769                unlock_rmap(rmapp);
 770                __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
 771        }
 772        return 0;
 773}
 774
 775int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva)
 776{
 777        kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
 778        return 0;
 779}
 780
 781int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
 782{
 783        kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp);
 784        return 0;
 785}
 786
 787void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
 788                                  struct kvm_memory_slot *memslot)
 789{
 790        unsigned long *rmapp;
 791        unsigned long gfn;
 792        unsigned long n;
 793
 794        rmapp = memslot->arch.rmap;
 795        gfn = memslot->base_gfn;
 796        for (n = memslot->npages; n; --n) {
 797                /*
 798                 * Testing the present bit without locking is OK because
 799                 * the memslot has been marked invalid already, and hence
 800                 * no new HPTEs referencing this page can be created,
 801                 * thus the present bit can't go from 0 to 1.
 802                 */
 803                if (*rmapp & KVMPPC_RMAP_PRESENT)
 804                        kvm_unmap_rmapp(kvm, rmapp, gfn);
 805                ++rmapp;
 806                ++gfn;
 807        }
 808}
 809
 810static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 811                         unsigned long gfn)
 812{
 813        struct revmap_entry *rev = kvm->arch.revmap;
 814        unsigned long head, i, j;
 815        __be64 *hptep;
 816        int ret = 0;
 817
 818 retry:
 819        lock_rmap(rmapp);
 820        if (*rmapp & KVMPPC_RMAP_REFERENCED) {
 821                *rmapp &= ~KVMPPC_RMAP_REFERENCED;
 822                ret = 1;
 823        }
 824        if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
 825                unlock_rmap(rmapp);
 826                return ret;
 827        }
 828
 829        i = head = *rmapp & KVMPPC_RMAP_INDEX;
 830        do {
 831                hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
 832                j = rev[i].forw;
 833
 834                /* If this HPTE isn't referenced, ignore it */
 835                if (!(be64_to_cpu(hptep[1]) & HPTE_R_R))
 836                        continue;
 837
 838                if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
 839                        /* unlock rmap before spinning on the HPTE lock */
 840                        unlock_rmap(rmapp);
 841                        while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
 842                                cpu_relax();
 843                        goto retry;
 844                }
 845
 846                /* Now check and modify the HPTE */
 847                if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
 848                    (be64_to_cpu(hptep[1]) & HPTE_R_R)) {
 849                        kvmppc_clear_ref_hpte(kvm, hptep, i);
 850                        if (!(rev[i].guest_rpte & HPTE_R_R)) {
 851                                rev[i].guest_rpte |= HPTE_R_R;
 852                                note_hpte_modification(kvm, &rev[i]);
 853                        }
 854                        ret = 1;
 855                }
 856                __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
 857        } while ((i = j) != head);
 858
 859        unlock_rmap(rmapp);
 860        return ret;
 861}
 862
 863int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end)
 864{
 865        return kvm_handle_hva_range(kvm, start, end, kvm_age_rmapp);
 866}
 867
 868static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 869                              unsigned long gfn)
 870{
 871        struct revmap_entry *rev = kvm->arch.revmap;
 872        unsigned long head, i, j;
 873        unsigned long *hp;
 874        int ret = 1;
 875
 876        if (*rmapp & KVMPPC_RMAP_REFERENCED)
 877                return 1;
 878
 879        lock_rmap(rmapp);
 880        if (*rmapp & KVMPPC_RMAP_REFERENCED)
 881                goto out;
 882
 883        if (*rmapp & KVMPPC_RMAP_PRESENT) {
 884                i = head = *rmapp & KVMPPC_RMAP_INDEX;
 885                do {
 886                        hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4));
 887                        j = rev[i].forw;
 888                        if (be64_to_cpu(hp[1]) & HPTE_R_R)
 889                                goto out;
 890                } while ((i = j) != head);
 891        }
 892        ret = 0;
 893
 894 out:
 895        unlock_rmap(rmapp);
 896        return ret;
 897}
 898
 899int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva)
 900{
 901        return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp);
 902}
 903
 904void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
 905{
 906        kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
 907}
 908
 909static int vcpus_running(struct kvm *kvm)
 910{
 911        return atomic_read(&kvm->arch.vcpus_running) != 0;
 912}
 913
 914/*
 915 * Returns the number of system pages that are dirty.
 916 * This can be more than 1 if we find a huge-page HPTE.
 917 */
 918static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
 919{
 920        struct revmap_entry *rev = kvm->arch.revmap;
 921        unsigned long head, i, j;
 922        unsigned long n;
 923        unsigned long v, r;
 924        __be64 *hptep;
 925        int npages_dirty = 0;
 926
 927 retry:
 928        lock_rmap(rmapp);
 929        if (*rmapp & KVMPPC_RMAP_CHANGED) {
 930                *rmapp &= ~KVMPPC_RMAP_CHANGED;
 931                npages_dirty = 1;
 932        }
 933        if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
 934                unlock_rmap(rmapp);
 935                return npages_dirty;
 936        }
 937
 938        i = head = *rmapp & KVMPPC_RMAP_INDEX;
 939        do {
 940                unsigned long hptep1;
 941                hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
 942                j = rev[i].forw;
 943
 944                /*
 945                 * Checking the C (changed) bit here is racy since there
 946                 * is no guarantee about when the hardware writes it back.
 947                 * If the HPTE is not writable then it is stable since the
 948                 * page can't be written to, and we would have done a tlbie
 949                 * (which forces the hardware to complete any writeback)
 950                 * when making the HPTE read-only.
 951                 * If vcpus are running then this call is racy anyway
 952                 * since the page could get dirtied subsequently, so we
 953                 * expect there to be a further call which would pick up
 954                 * any delayed C bit writeback.
 955                 * Otherwise we need to do the tlbie even if C==0 in
 956                 * order to pick up any delayed writeback of C.
 957                 */
 958                hptep1 = be64_to_cpu(hptep[1]);
 959                if (!(hptep1 & HPTE_R_C) &&
 960                    (!hpte_is_writable(hptep1) || vcpus_running(kvm)))
 961                        continue;
 962
 963                if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
 964                        /* unlock rmap before spinning on the HPTE lock */
 965                        unlock_rmap(rmapp);
 966                        while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK))
 967                                cpu_relax();
 968                        goto retry;
 969                }
 970
 971                /* Now check and modify the HPTE */
 972                if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) {
 973                        __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
 974                        continue;
 975                }
 976
 977                /* need to make it temporarily absent so C is stable */
 978                hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
 979                kvmppc_invalidate_hpte(kvm, hptep, i);
 980                v = be64_to_cpu(hptep[0]);
 981                r = be64_to_cpu(hptep[1]);
 982                if (r & HPTE_R_C) {
 983                        hptep[1] = cpu_to_be64(r & ~HPTE_R_C);
 984                        if (!(rev[i].guest_rpte & HPTE_R_C)) {
 985                                rev[i].guest_rpte |= HPTE_R_C;
 986                                note_hpte_modification(kvm, &rev[i]);
 987                        }
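                             /* a huge-page HPTE dirties more than one system page */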
 988                        n = hpte_page_size(v, r);
 989                        n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
 990                        if (n > npages_dirty)
 991                                npages_dirty = n;
 992                        eieio();
 993                }
 994                v &= ~HPTE_V_ABSENT;
 995                v |= HPTE_V_VALID;
 996                __unlock_hpte(hptep, v);
 997        } while ((i = j) != head);
 998
 999        unlock_rmap(rmapp);
1000        return npages_dirty;
1001}
1002
1003static void harvest_vpa_dirty(struct kvmppc_vpa *vpa,
1004                              struct kvm_memory_slot *memslot,
1005                              unsigned long *map)
1006{
1007        unsigned long gfn;
1008
1009        if (!vpa->dirty || !vpa->pinned_addr)
1010                return;
1011        gfn = vpa->gpa >> PAGE_SHIFT;
1012        if (gfn < memslot->base_gfn ||
1013            gfn >= memslot->base_gfn + memslot->npages)
1014                return;
1015
1016        vpa->dirty = false;
1017        if (map)
1018                __set_bit_le(gfn - memslot->base_gfn, map);
1019}
1020
1021long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot,
1022                             unsigned long *map)
1023{
1024        unsigned long i, j;
1025        unsigned long *rmapp;
1026        struct kvm_vcpu *vcpu;
1027
1028        preempt_disable();
1029        rmapp = memslot->arch.rmap;
1030        for (i = 0; i < memslot->npages; ++i) {
1031                int npages = kvm_test_clear_dirty_npages(kvm, rmapp);
1032                /*
1033                 * Note that if npages > 0 then i must be a multiple of npages,
1034                 * since we always put huge-page HPTEs in the rmap chain
1035                 * corresponding to their page base address.
1036                 */
1037                if (npages && map)
1038                        for (j = i; npages; ++j, --npages)
1039                                __set_bit_le(j, map);
1040                ++rmapp;
1041        }
1042
1043        /* Harvest dirty bits from VPA and DTL updates */
1044        /* Note: we never modify the SLB shadow buffer areas */
1045        kvm_for_each_vcpu(i, vcpu, kvm) {
1046                spin_lock(&vcpu->arch.vpa_update_lock);
1047                harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map);
1048                harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map);
1049                spin_unlock(&vcpu->arch.vpa_update_lock);
1050        }
1051        preempt_enable();
1052        return 0;
1053}
1054
1055void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
1056                            unsigned long *nb_ret)
1057{
1058        struct kvm_memory_slot *memslot;
1059        unsigned long gfn = gpa >> PAGE_SHIFT;
1060        struct page *page, *pages[1];
1061        int npages;
1062        unsigned long hva, offset;
1063        int srcu_idx;
1064
1065        srcu_idx = srcu_read_lock(&kvm->srcu);
1066        memslot = gfn_to_memslot(kvm, gfn);
1067        if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
1068                goto err;
1069        hva = gfn_to_hva_memslot(memslot, gfn);
1070        npages = get_user_pages_fast(hva, 1, 1, pages);
1071        if (npages < 1)
1072                goto err;
1073        page = pages[0];
1074        srcu_read_unlock(&kvm->srcu, srcu_idx);
1075
1076        offset = gpa & (PAGE_SIZE - 1);
1077        if (nb_ret)
1078                *nb_ret = PAGE_SIZE - offset;
1079        return page_address(page) + offset;
1080
1081 err:
1082        srcu_read_unlock(&kvm->srcu, srcu_idx);
1083        return NULL;
1084}
1085
1086void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
1087                             bool dirty)
1088{
1089        struct page *page = virt_to_page(va);
1090        struct kvm_memory_slot *memslot;
1091        unsigned long gfn;
1092        unsigned long *rmap;
1093        int srcu_idx;
1094
1095        put_page(page);
1096
1097        if (!dirty)
1098                return;
1099
1100        /* We need to mark this page dirty in the rmap chain */
1101        gfn = gpa >> PAGE_SHIFT;
1102        srcu_idx = srcu_read_lock(&kvm->srcu);
1103        memslot = gfn_to_memslot(kvm, gfn);
1104        if (memslot) {
1105                rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
1106                lock_rmap(rmap);
1107                *rmap |= KVMPPC_RMAP_CHANGED;
1108                unlock_rmap(rmap);
1109        }
1110        srcu_read_unlock(&kvm->srcu, srcu_idx);
1111}
1112
1113/*
1114 * Functions for reading and writing the hash table via reads and
1115 * writes on a file descriptor.
1116 *
1117 * Reads return the guest view of the hash table, which has to be
1118 * pieced together from the real hash table and the guest_rpte
1119 * values in the revmap array.
1120 *
1121 * On writes, each HPTE written is considered in turn, and if it
1122 * is valid, it is written to the HPT as if an H_ENTER with the
1123 * exact flag set was done.  When the invalid count is non-zero
1124 * in the header written to the stream, the kernel will make
1125 * sure that that many HPTEs are invalid, and invalidate them
1126 * if not.
1127 */
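     /*
      * The stream is a sequence of records of the form
      *   struct kvm_get_htab_header { __u32 index; __u16 n_valid; __u16 n_invalid; }
      * (see the KVM uapi header), each followed by n_valid HPTEs of
      * 16 bytes (two big-endian doublewords) each.
      */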
1128
1129struct kvm_htab_ctx {
1130        unsigned long   index;
1131        unsigned long   flags;
1132        struct kvm      *kvm;
1133        int             first_pass;
1134};
1135
1136#define HPTE_SIZE       (2 * sizeof(unsigned long))
1137
1138/*
1139 * Returns 1 if this HPT entry has been modified or has pending
1140 * R/C bit changes.
1141 */
1142static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp)
1143{
1144        unsigned long rcbits_unset;
1145
1146        if (revp->guest_rpte & HPTE_GR_MODIFIED)
1147                return 1;
1148
1149        /* Also need to consider changes in reference and changed bits */
1150        rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
1151        if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) &&
1152            (be64_to_cpu(hptp[1]) & rcbits_unset))
1153                return 1;
1154
1155        return 0;
1156}
1157
1158static long record_hpte(unsigned long flags, __be64 *hptp,
1159                        unsigned long *hpte, struct revmap_entry *revp,
1160                        int want_valid, int first_pass)
1161{
1162        unsigned long v, r;
1163        unsigned long rcbits_unset;
1164        int ok = 1;
1165        int valid, dirty;
1166
1167        /* Unmodified entries are uninteresting except on the first pass */
1168        dirty = hpte_dirty(revp, hptp);
1169        if (!first_pass && !dirty)
1170                return 0;
1171
1172        valid = 0;
1173        if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) {
1174                valid = 1;
1175                if ((flags & KVM_GET_HTAB_BOLTED_ONLY) &&
1176                    !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED))
1177                        valid = 0;
1178        }
1179        if (valid != want_valid)
1180                return 0;
1181
1182        v = r = 0;
1183        if (valid || dirty) {
1184                /* lock the HPTE so it's stable and read it */
1185                preempt_disable();
1186                while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
1187                        cpu_relax();
1188                v = be64_to_cpu(hptp[0]);
1189
1190                /* re-evaluate valid and dirty from synchronized HPTE value */
1191                valid = !!(v & HPTE_V_VALID);
1192                dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
1193
1194                /* Harvest R and C into guest view if necessary */
1195                rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
1196                if (valid && (rcbits_unset & be64_to_cpu(hptp[1]))) {
1197                        revp->guest_rpte |= (be64_to_cpu(hptp[1]) &
1198                                (HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED;
1199                        dirty = 1;
1200                }
1201
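                     /* an ABSENT HPTE is valid from the guest's point of view */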
1202                if (v & HPTE_V_ABSENT) {
1203                        v &= ~HPTE_V_ABSENT;
1204                        v |= HPTE_V_VALID;
1205                        valid = 1;
1206                }
1207                if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
1208                        valid = 0;
1209
1210                r = revp->guest_rpte;
1211                /* only clear modified if this is the right sort of entry */
1212                if (valid == want_valid && dirty) {
1213                        r &= ~HPTE_GR_MODIFIED;
1214                        revp->guest_rpte = r;
1215                }
1216                unlock_hpte(hptp, be64_to_cpu(hptp[0]));
1217                preempt_enable();
1218                if (!(valid == want_valid && (first_pass || dirty)))
1219                        ok = 0;
1220        }
1221        hpte[0] = cpu_to_be64(v);
1222        hpte[1] = cpu_to_be64(r);
1223        return ok;
1224}
1225
1226static ssize_t kvm_htab_read(struct file *file, char __user *buf,
1227                             size_t count, loff_t *ppos)
1228{
1229        struct kvm_htab_ctx *ctx = file->private_data;
1230        struct kvm *kvm = ctx->kvm;
1231        struct kvm_get_htab_header hdr;
1232        __be64 *hptp;
1233        struct revmap_entry *revp;
1234        unsigned long i, nb, nw;
1235        unsigned long __user *lbuf;
1236        struct kvm_get_htab_header __user *hptr;
1237        unsigned long flags;
1238        int first_pass;
1239        unsigned long hpte[2];
1240
1241        if (!access_ok(VERIFY_WRITE, buf, count))
1242                return -EFAULT;
1243
1244        first_pass = ctx->first_pass;
1245        flags = ctx->flags;
1246
1247        i = ctx->index;
1248        hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
1249        revp = kvm->arch.revmap + i;
1250        lbuf = (unsigned long __user *)buf;
1251
1252        nb = 0;
1253        while (nb + sizeof(hdr) + HPTE_SIZE < count) {
1254                /* Initialize header */
1255                hptr = (struct kvm_get_htab_header __user *)buf;
1256                hdr.n_valid = 0;
1257                hdr.n_invalid = 0;
1258                nw = nb;
1259                nb += sizeof(hdr);
1260                lbuf = (unsigned long __user *)(buf + sizeof(hdr));
1261
 1262                /* On passes after the first, skip clean (unmodified) entries */
1263                if (!first_pass) {
1264                        while (i < kvm->arch.hpt_npte &&
1265                               !hpte_dirty(revp, hptp)) {
1266                                ++i;
1267                                hptp += 2;
1268                                ++revp;
1269                        }
1270                }
1271                hdr.index = i;
1272
1273                /* Grab a series of valid entries */
1274                while (i < kvm->arch.hpt_npte &&
1275                       hdr.n_valid < 0xffff &&
1276                       nb + HPTE_SIZE < count &&
1277                       record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
1278                        /* valid entry, write it out */
1279                        ++hdr.n_valid;
1280                        if (__put_user(hpte[0], lbuf) ||
1281                            __put_user(hpte[1], lbuf + 1))
1282                                return -EFAULT;
1283                        nb += HPTE_SIZE;
1284                        lbuf += 2;
1285                        ++i;
1286                        hptp += 2;
1287                        ++revp;
1288                }
1289                /* Now skip invalid entries while we can */
1290                while (i < kvm->arch.hpt_npte &&
1291                       hdr.n_invalid < 0xffff &&
1292                       record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
1293                        /* found an invalid entry */
1294                        ++hdr.n_invalid;
1295                        ++i;
1296                        hptp += 2;
1297                        ++revp;
1298                }
1299
1300                if (hdr.n_valid || hdr.n_invalid) {
1301                        /* write back the header */
1302                        if (__copy_to_user(hptr, &hdr, sizeof(hdr)))
1303                                return -EFAULT;
1304                        nw = nb;
1305                        buf = (char __user *)lbuf;
1306                } else {
1307                        nb = nw;
1308                }
1309
1310                /* Check if we've wrapped around the hash table */
1311                if (i >= kvm->arch.hpt_npte) {
1312                        i = 0;
1313                        ctx->first_pass = 0;
1314                        break;
1315                }
1316        }
1317
1318        ctx->index = i;
1319
1320        return nb;
1321}
1322
1323static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
1324                              size_t count, loff_t *ppos)
1325{
1326        struct kvm_htab_ctx *ctx = file->private_data;
1327        struct kvm *kvm = ctx->kvm;
1328        struct kvm_get_htab_header hdr;
1329        unsigned long i, j;
1330        unsigned long v, r;
1331        unsigned long __user *lbuf;
1332        __be64 *hptp;
1333        unsigned long tmp[2];
1334        ssize_t nb;
1335        long int err, ret;
1336        int hpte_setup;
1337
1338        if (!access_ok(VERIFY_READ, buf, count))
1339                return -EFAULT;
1340
1341        /* lock out vcpus from running while we're doing this */
1342        mutex_lock(&kvm->lock);
1343        hpte_setup = kvm->arch.hpte_setup_done;
1344        if (hpte_setup) {
1345                kvm->arch.hpte_setup_done = 0;  /* temporarily */
1346                /* order hpte_setup_done vs. vcpus_running */
1347                smp_mb();
1348                if (atomic_read(&kvm->arch.vcpus_running)) {
1349                        kvm->arch.hpte_setup_done = 1;
1350                        mutex_unlock(&kvm->lock);
1351                        return -EBUSY;
1352                }
1353        }
1354
1355        err = 0;
1356        for (nb = 0; nb + sizeof(hdr) <= count; ) {
1357                err = -EFAULT;
1358                if (__copy_from_user(&hdr, buf, sizeof(hdr)))
1359                        break;
1360
1361                err = 0;
1362                if (nb + hdr.n_valid * HPTE_SIZE > count)
1363                        break;
1364
1365                nb += sizeof(hdr);
1366                buf += sizeof(hdr);
1367
1368                err = -EINVAL;
1369                i = hdr.index;
1370                if (i >= kvm->arch.hpt_npte ||
1371                    i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte)
1372                        break;
1373
1374                hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
1375                lbuf = (unsigned long __user *)buf;
1376                for (j = 0; j < hdr.n_valid; ++j) {
1377                        __be64 hpte_v;
1378                        __be64 hpte_r;
1379
1380                        err = -EFAULT;
1381                        if (__get_user(hpte_v, lbuf) ||
1382                            __get_user(hpte_r, lbuf + 1))
1383                                goto out;
1384                        v = be64_to_cpu(hpte_v);
1385                        r = be64_to_cpu(hpte_r);
1386                        err = -EINVAL;
1387                        if (!(v & HPTE_V_VALID))
1388                                goto out;
1389                        lbuf += 2;
1390                        nb += HPTE_SIZE;
1391
1392                        if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
1393                                kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
1394                        err = -EIO;
1395                        ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r,
1396                                                         tmp);
1397                        if (ret != H_SUCCESS) {
1398                                pr_err("kvm_htab_write ret %ld i=%ld v=%lx "
1399                                       "r=%lx\n", ret, i, v, r);
1400                                goto out;
1401                        }
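                             /* a VRMA HPTE tells us the VRMA page size: set vrma_slb_v and LPCR[VRMASD] */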
1402                        if (!hpte_setup && is_vrma_hpte(v)) {
1403                                unsigned long psize = hpte_base_page_size(v, r);
1404                                unsigned long senc = slb_pgsize_encoding(psize);
1405                                unsigned long lpcr;
1406
1407                                kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
1408                                        (VRMA_VSID << SLB_VSID_SHIFT_1T);
1409                                lpcr = senc << (LPCR_VRMASD_SH - 4);
1410                                kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
1411                                hpte_setup = 1;
1412                        }
1413                        ++i;
1414                        hptp += 2;
1415                }
1416
1417                for (j = 0; j < hdr.n_invalid; ++j) {
1418                        if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
1419                                kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
1420                        ++i;
1421                        hptp += 2;
1422                }
1423                err = 0;
1424        }
1425
1426 out:
1427        /* Order HPTE updates vs. hpte_setup_done */
1428        smp_wmb();
1429        kvm->arch.hpte_setup_done = hpte_setup;
1430        mutex_unlock(&kvm->lock);
1431
1432        if (err)
1433                return err;
1434        return nb;
1435}
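
/*
 * Editor's note: a minimal sketch of how userspace might drive this write
 * path when restoring a guest's HPT, e.g. on the destination side of a
 * migration.  It assumes "vmfd" is an open VM file descriptor and that
 * "v"/"r" hold one valid HPTE in host-endian form (headers such as
 * <linux/kvm.h>, <sys/ioctl.h> and <endian.h> assumed); a real restore
 * loop, as in QEMU, is considerably more involved.  Not part of this file:
 *
 *	struct kvm_get_htab_fd ghf = {
 *		.flags = KVM_GET_HTAB_WRITE,
 *		.start_index = 0,
 *	};
 *	int htab_fd = ioctl(vmfd, KVM_PPC_GET_HTAB_FD, &ghf);
 *
 *	struct {
 *		struct kvm_get_htab_header hdr;
 *		uint64_t hpte[2];
 *	} __attribute__((packed)) rec = {
 *		.hdr  = { .index = 0, .n_valid = 1, .n_invalid = 0 },
 *		.hpte = { htobe64(v), htobe64(r) },	// stream is big-endian
 *	};
 *	if (htab_fd >= 0 && write(htab_fd, &rec, sizeof(rec)) != sizeof(rec))
 *		perror("htab write");
 */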
1436
1437static int kvm_htab_release(struct inode *inode, struct file *filp)
1438{
1439        struct kvm_htab_ctx *ctx = filp->private_data;
1440
1441        filp->private_data = NULL;
1442        if (!(ctx->flags & KVM_GET_HTAB_WRITE))
1443                atomic_dec(&ctx->kvm->arch.hpte_mod_interest);
1444        kvm_put_kvm(ctx->kvm);
1445        kfree(ctx);
1446        return 0;
1447}
1448
1449static const struct file_operations kvm_htab_fops = {
1450        .read           = kvm_htab_read,
1451        .write          = kvm_htab_write,
1452        .llseek         = default_llseek,
1453        .release        = kvm_htab_release,
1454};
1455
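/*
 * Handle the KVM_PPC_GET_HTAB_FD ioctl: give userspace an anonymous file
 * descriptor through which it can stream the guest HPT out (read) or
 * replace its contents (write), typically for migration.  Readers also
 * register an interest in HPTE updates (hpte_mod_interest) so that entries
 * which change after the first pass get sent again.
 */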
1456int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
1457{
1458        int ret;
1459        struct kvm_htab_ctx *ctx;
1460        int rwflag;
1461
1462        /* reject flags we don't recognize */
1463        if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE))
1464                return -EINVAL;
1465        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1466        if (!ctx)
1467                return -ENOMEM;
1468        kvm_get_kvm(kvm);
1469        ctx->kvm = kvm;
1470        ctx->index = ghf->start_index;
1471        ctx->flags = ghf->flags;
1472        ctx->first_pass = 1;
1473
1474        rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
1475        ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC);
1476        if (ret < 0) {
1477                kvm_put_kvm(kvm);
                kfree(ctx);
1478                return ret;
1479        }
1480
1481        if (rwflag == O_RDONLY) {
1482                mutex_lock(&kvm->slots_lock);
1483                atomic_inc(&kvm->arch.hpte_mod_interest);
1484                /* make sure kvmppc_do_h_enter etc. see the increment */
1485                synchronize_srcu_expedited(&kvm->srcu);
1486                mutex_unlock(&kvm->slots_lock);
1487        }
1488
1489        return ret;
1490}
1491
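/*
 * Per-open state for the debugfs HPT dumper below.  buf[] holds a formatted
 * line that did not completely fit into the previous read(); chars_left and
 * buf_index track how much of it is still owed to userspace.
 */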
1492struct debugfs_htab_state {
1493        struct kvm      *kvm;
1494        struct mutex    mutex;
1495        unsigned long   hpt_index;
1496        int             chars_left;
1497        int             buf_index;
1498        char            buf[64];
1499};
1500
1501static int debugfs_htab_open(struct inode *inode, struct file *file)
1502{
1503        struct kvm *kvm = inode->i_private;
1504        struct debugfs_htab_state *p;
1505
1506        p = kzalloc(sizeof(*p), GFP_KERNEL);
1507        if (!p)
1508                return -ENOMEM;
1509
1510        kvm_get_kvm(kvm);
1511        p->kvm = kvm;
1512        mutex_init(&p->mutex);
1513        file->private_data = p;
1514
1515        return nonseekable_open(inode, file);
1516}
1517
1518static int debugfs_htab_release(struct inode *inode, struct file *file)
1519{
1520        struct debugfs_htab_state *p = file->private_data;
1521
1522        kvm_put_kvm(p->kvm);
1523        kfree(p);
1524        return 0;
1525}
1526
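/*
 * Dump the guest HPT, one line per valid (or absent) entry, formatted as
 * "index V R guest_rpte" in hex.  Each HPTE is locked briefly so that the
 * V and R doublewords are read consistently.
 */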
1527static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
1528                                 size_t len, loff_t *ppos)
1529{
1530        struct debugfs_htab_state *p = file->private_data;
1531        ssize_t ret, r;
1532        unsigned long i, n;
1533        unsigned long v, hr, gr;
1534        struct kvm *kvm;
1535        __be64 *hptp;
1536
1537        ret = mutex_lock_interruptible(&p->mutex);
1538        if (ret)
1539                return ret;
1540
1541        if (p->chars_left) {
1542                n = p->chars_left;
1543                if (n > len)
1544                        n = len;
1545                r = copy_to_user(buf, p->buf + p->buf_index, n);
1546                n -= r;
1547                p->chars_left -= n;
1548                p->buf_index += n;
1549                buf += n;
1550                len -= n;
1551                ret = n;
1552                if (r) {
1553                        if (!n)
1554                                ret = -EFAULT;
1555                        goto out;
1556                }
1557        }
1558
1559        kvm = p->kvm;
1560        i = p->hpt_index;
1561        hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
1562        for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) {
1563                if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)))
1564                        continue;
1565
1566                /* lock the HPTE so it's stable and read it */
1567                preempt_disable();
1568                while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
1569                        cpu_relax();
1570                v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK;
1571                hr = be64_to_cpu(hptp[1]);
1572                gr = kvm->arch.revmap[i].guest_rpte;
1573                unlock_hpte(hptp, v);
1574                preempt_enable();
1575
1576                if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
1577                        continue;
1578
1579                n = scnprintf(p->buf, sizeof(p->buf),
1580                              "%6lx %.16lx %.16lx %.16lx\n",
1581                              i, v, hr, gr);
1582                p->chars_left = n;
1583                if (n > len)
1584                        n = len;
1585                r = copy_to_user(buf, p->buf, n);
1586                n -= r;
1587                p->chars_left -= n;
1588                p->buf_index = n;
1589                buf += n;
1590                len -= n;
1591                ret += n;
1592                if (r) {
1593                        if (!ret)
1594                                ret = -EFAULT;
1595                        goto out;
1596                }
1597        }
1598        p->hpt_index = i;
1599
1600 out:
1601        mutex_unlock(&p->mutex);
1602        return ret;
1603}
1604
1605static ssize_t debugfs_htab_write(struct file *file, const char __user *buf,
1606                           size_t len, loff_t *ppos)
1607{
1608        return -EACCES;
1609}
1610
1611static const struct file_operations debugfs_htab_fops = {
1612        .owner   = THIS_MODULE,
1613        .open    = debugfs_htab_open,
1614        .release = debugfs_htab_release,
1615        .read    = debugfs_htab_read,
1616        .write   = debugfs_htab_write,
1617        .llseek  = generic_file_llseek,
1618};
1619
1620void kvmppc_mmu_debugfs_init(struct kvm *kvm)
1621{
1622        kvm->arch.htab_dentry = debugfs_create_file("htab", 0400,
1623                                                    kvm->arch.debugfs_dir, kvm,
1624                                                    &debugfs_htab_fops);
1625}
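
/*
 * Editor's note: kvm->arch.debugfs_dir is the per-VM directory that the HV
 * code creates under the kvm debugfs root, so with debugfs mounted at
 * /sys/kernel/debug the dump can be read with something like
 * "cat /sys/kernel/debug/kvm/vm<pid>/htab".  The directory name is chosen
 * elsewhere, so treat the exact path as illustrative.
 */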
1626
1627void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
1628{
1629        struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
1630
1631        vcpu->arch.slb_nr = 32;         /* POWER7/POWER8 */
1632
1633        mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
1634        mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
1635
1636        vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
1637}
1638