linux/arch/powerpc/kvm/book3s_64_mmu_hv.c
   1/*
   2 * This program is free software; you can redistribute it and/or modify
   3 * it under the terms of the GNU General Public License, version 2, as
   4 * published by the Free Software Foundation.
   5 *
   6 * This program is distributed in the hope that it will be useful,
   7 * but WITHOUT ANY WARRANTY; without even the implied warranty of
   8 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   9 * GNU General Public License for more details.
  10 *
  11 * You should have received a copy of the GNU General Public License
  12 * along with this program; if not, write to the Free Software
  13 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  14 *
  15 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  16 */
  17
  18#include <linux/types.h>
  19#include <linux/string.h>
  20#include <linux/kvm.h>
  21#include <linux/kvm_host.h>
  22#include <linux/highmem.h>
  23#include <linux/gfp.h>
  24#include <linux/slab.h>
  25#include <linux/hugetlb.h>
  26#include <linux/vmalloc.h>
  27#include <linux/srcu.h>
  28#include <linux/anon_inodes.h>
  29#include <linux/file.h>
  30#include <linux/debugfs.h>
  31
  32#include <asm/tlbflush.h>
  33#include <asm/kvm_ppc.h>
  34#include <asm/kvm_book3s.h>
  35#include <asm/book3s/64/mmu-hash.h>
  36#include <asm/hvcall.h>
  37#include <asm/synch.h>
  38#include <asm/ppc-opcode.h>
  39#include <asm/cputable.h>
  40#include <asm/pte-walk.h>
  41
  42#include "trace_hv.h"
  43
  44//#define DEBUG_RESIZE_HPT      1
  45
  46#ifdef DEBUG_RESIZE_HPT
  47#define resize_hpt_debug(resize, ...)                           \
  48        do {                                                    \
  49                printk(KERN_DEBUG "RESIZE HPT %p: ", resize);   \
  50                printk(__VA_ARGS__);                            \
  51        } while (0)
  52#else
  53#define resize_hpt_debug(resize, ...)                           \
  54        do { } while (0)
  55#endif
  56
  57static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
  58                                long pte_index, unsigned long pteh,
  59                                unsigned long ptel, unsigned long *pte_idx_ret);
  60
  61struct kvm_resize_hpt {
  62        /* These fields read-only after init */
  63        struct kvm *kvm;
  64        struct work_struct work;
  65        u32 order;
  66
  67        /* These fields protected by kvm->lock */
  68        int error;
  69        bool prepare_done;
  70
  71        /* Private to the work thread, until prepare_done is true,
  72         * then protected by kvm->resize_hpt_sem */
  73        struct kvm_hpt_info hpt;
  74};
  75
  76static void kvmppc_rmap_reset(struct kvm *kvm);
  77
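     /*
      * Allocate a hashed page table of 2^order bytes plus the matching
      * reverse-map array.  The HPT itself is taken from the CMA reserve
      * when possible, falling back to the normal page allocator.
      */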
  78int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
  79{
  80        unsigned long hpt = 0;
  81        int cma = 0;
  82        struct page *page = NULL;
  83        struct revmap_entry *rev;
  84        unsigned long npte;
  85
  86        if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER))
  87                return -EINVAL;
  88
  89        page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT));
  90        if (page) {
  91                hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
  92                memset((void *)hpt, 0, (1ul << order));
  93                cma = 1;
  94        }
  95
  96        if (!hpt)
  97                hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_RETRY_MAYFAIL
  98                                       |__GFP_NOWARN, order - PAGE_SHIFT);
  99
 100        if (!hpt)
 101                return -ENOMEM;
 102
 103        /* HPTEs are 2**4 bytes long */
 104        npte = 1ul << (order - 4);
 105
 106        /* Allocate reverse map array */
 107        rev = vmalloc(sizeof(struct revmap_entry) * npte);
 108        if (!rev) {
 109                pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n");
 110                if (cma)
 111                        kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT));
 112                else
 113                        free_pages(hpt, order - PAGE_SHIFT);
 114                return -ENOMEM;
 115        }
 116
 117        info->order = order;
 118        info->virt = hpt;
 119        info->cma = cma;
 120        info->rev = rev;
 121
 122        return 0;
 123}
 124
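     /*
      * Install an already-allocated HPT as this guest's hash table and
      * derive the SDR1 value from it (HTABORG | HTABSIZE, where
      * HTABSIZE is order - 18).
      */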
 125void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info)
 126{
 127        atomic64_set(&kvm->arch.mmio_update, 0);
 128        kvm->arch.hpt = *info;
 129        kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18);
 130
 131        pr_debug("KVM guest htab at %lx (order %ld), LPID %x\n",
 132                 info->virt, (long)info->order, kvm->arch.lpid);
 133}
 134
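     /*
      * (Re)initialise the guest HPT for the given order: reuse and clear
      * the existing table if it is already that size, otherwise free it
      * and allocate a new one.  Fails with -EBUSY if vcpus are running.
      */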
 135long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
 136{
 137        long err = -EBUSY;
 138        struct kvm_hpt_info info;
 139
 140        if (kvm_is_radix(kvm))
 141                return -EINVAL;
 142
 143        mutex_lock(&kvm->lock);
 144        if (kvm->arch.hpte_setup_done) {
 145                kvm->arch.hpte_setup_done = 0;
 146                /* order hpte_setup_done vs. vcpus_running */
 147                smp_mb();
 148                if (atomic_read(&kvm->arch.vcpus_running)) {
 149                        kvm->arch.hpte_setup_done = 1;
 150                        goto out;
 151                }
 152        }
 153        if (kvm->arch.hpt.order == order) {
 154                /* We already have a suitable HPT */
 155
 156                /* Set the entire HPT to 0, i.e. invalid HPTEs */
 157                memset((void *)kvm->arch.hpt.virt, 0, 1ul << order);
 158                /*
 159                 * Reset all the reverse-mapping chains for all memslots
 160                 */
 161                kvmppc_rmap_reset(kvm);
 162                /* Ensure that each vcpu will flush its TLB on next entry. */
 163                cpumask_setall(&kvm->arch.need_tlb_flush);
 164                err = 0;
 165                goto out;
 166        }
 167
 168        if (kvm->arch.hpt.virt) {
 169                kvmppc_free_hpt(&kvm->arch.hpt);
 170                kvmppc_rmap_reset(kvm);
 171        }
 172
 173        err = kvmppc_allocate_hpt(&info, order);
 174        if (err < 0)
 175                goto out;
 176        kvmppc_set_hpt(kvm, &info);
 177
 178out:
 179        mutex_unlock(&kvm->lock);
 180        return err;
 181}
 182
 183void kvmppc_free_hpt(struct kvm_hpt_info *info)
 184{
 185        vfree(info->rev);
 186        if (info->cma)
 187                kvm_free_hpt_cma(virt_to_page(info->virt),
 188                                 1 << (info->order - PAGE_SHIFT));
 189        else if (info->virt)
 190                free_pages(info->virt, info->order - PAGE_SHIFT);
 191        info->virt = 0;
 192        info->order = 0;
 193}
 194
 195/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
 196static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
 197{
 198        return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
 199}
 200
 201/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */
 202static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
 203{
 204        return (pgsize == 0x10000) ? 0x1000 : 0;
 205}
 206
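     /*
      * Create bolted HPTEs covering the VRMA (virtual real mode area) so
      * the guest can run in real mode, using pages of size 1ul << porder.
      */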
 207void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
 208                     unsigned long porder)
 209{
 210        unsigned long i;
 211        unsigned long npages;
 212        unsigned long hp_v, hp_r;
 213        unsigned long addr, hash;
 214        unsigned long psize;
 215        unsigned long hp0, hp1;
 216        unsigned long idx_ret;
 217        long ret;
 218        struct kvm *kvm = vcpu->kvm;
 219
 220        psize = 1ul << porder;
 221        npages = memslot->npages >> (porder - PAGE_SHIFT);
 222
 223        /* VRMA can't be > 1TB */
 224        if (npages > 1ul << (40 - porder))
 225                npages = 1ul << (40 - porder);
 226        /* Can't use more than 1 HPTE per HPTEG */
 227        if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1)
 228                npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1;
 229
 230        hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
 231                HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
 232        hp1 = hpte1_pgsize_encoding(psize) |
 233                HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;
 234
 235        for (i = 0; i < npages; ++i) {
 236                addr = i << porder;
 237                /* can't use hpt_hash since va > 64 bits */
 238                hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25)))
 239                        & kvmppc_hpt_mask(&kvm->arch.hpt);
 240                /*
 241                 * We assume that the hash table is empty and no
 242                 * vcpus are using it at this stage.  Since we create
 243                 * at most one HPTE per HPTEG, we just assume entry 7
 244                 * is available and use it.
 245                 */
 246                hash = (hash << 3) + 7;
 247                hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
 248                hp_r = hp1 | addr;
 249                ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r,
 250                                                 &idx_ret);
 251                if (ret != H_SUCCESS) {
 252                        pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
 253                               addr, ret);
 254                        break;
 255                }
 256        }
 257}
 258
 259int kvmppc_mmu_hv_init(void)
 260{
 261        unsigned long host_lpid, rsvd_lpid;
 262
 263        if (!cpu_has_feature(CPU_FTR_HVMODE))
 264                return -EINVAL;
 265
 266        /* POWER7 has 10-bit LPIDs (12-bit in POWER8) */
 267        host_lpid = mfspr(SPRN_LPID);
 268        rsvd_lpid = LPID_RSVD;
 269
 270        kvmppc_init_lpid(rsvd_lpid + 1);
 271
 272        kvmppc_claim_lpid(host_lpid);
 273        /* rsvd_lpid is reserved for use in partition switching */
 274        kvmppc_claim_lpid(rsvd_lpid);
 275
 276        return 0;
 277}
 278
 279static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
 280{
 281        unsigned long msr = vcpu->arch.intr_msr;
 282
 283        /* If transactional, change to suspend mode on IRQ delivery */
 284        if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr))
 285                msr |= MSR_TS_S;
 286        else
 287                msr |= vcpu->arch.shregs.msr & MSR_TS_MASK;
 288        kvmppc_set_msr(vcpu, msr);
 289}
 290
 291static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
 292                                long pte_index, unsigned long pteh,
 293                                unsigned long ptel, unsigned long *pte_idx_ret)
 294{
 295        long ret;
 296
 297        /* Protect linux PTE lookup from page table destruction */
 298        rcu_read_lock_sched();  /* this disables preemption too */
 299        ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel,
 300                                current->mm->pgd, false, pte_idx_ret);
 301        rcu_read_unlock_sched();
 302        if (ret == H_TOO_HARD) {
 303                /* this can't happen */
 304                pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
 305                ret = H_RESOURCE;       /* or something */
 306        }
 307        return ret;
 308
 309}
 310
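     /* Find the guest SLB entry, if any, that translates the given EA. */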
 311static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
 312                                                         gva_t eaddr)
 313{
 314        u64 mask;
 315        int i;
 316
 317        for (i = 0; i < vcpu->arch.slb_nr; i++) {
 318                if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
 319                        continue;
 320
 321                if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
 322                        mask = ESID_MASK_1T;
 323                else
 324                        mask = ESID_MASK;
 325
 326                if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
 327                        return &vcpu->arch.slb[i];
 328        }
 329        return NULL;
 330}
 331
 332static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
 333                        unsigned long ea)
 334{
 335        unsigned long ra_mask;
 336
 337        ra_mask = hpte_page_size(v, r) - 1;
 338        return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
 339}
 340
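     /*
      * Translate a guest effective address to a guest real address for
      * instruction emulation, searching the guest HPT and working out the
      * access permissions from the PP bits, key bits and AMR.
      */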
 341static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 342                        struct kvmppc_pte *gpte, bool data, bool iswrite)
 343{
 344        struct kvm *kvm = vcpu->kvm;
 345        struct kvmppc_slb *slbe;
 346        unsigned long slb_v;
 347        unsigned long pp, key;
 348        unsigned long v, orig_v, gr;
 349        __be64 *hptep;
 350        int index;
 351        int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
 352
 353        /* Get SLB entry */
 354        if (virtmode) {
 355                slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
 356                if (!slbe)
 357                        return -EINVAL;
 358                slb_v = slbe->origv;
 359        } else {
 360                /* real mode access */
 361                slb_v = vcpu->kvm->arch.vrma_slb_v;
 362        }
 363
 364        preempt_disable();
 365        /* Find the HPTE in the hash table */
 366        index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
 367                                         HPTE_V_VALID | HPTE_V_ABSENT);
 368        if (index < 0) {
 369                preempt_enable();
 370                return -ENOENT;
 371        }
 372        hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
 373        v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
 374        if (cpu_has_feature(CPU_FTR_ARCH_300))
 375                v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1]));
 376        gr = kvm->arch.hpt.rev[index].guest_rpte;
 377
 378        unlock_hpte(hptep, orig_v);
 379        preempt_enable();
 380
 381        gpte->eaddr = eaddr;
 382        gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);
 383
 384        /* Get PP bits and key for permission check */
 385        pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
 386        key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
 387        key &= slb_v;
 388
 389        /* Calculate permissions */
 390        gpte->may_read = hpte_read_permission(pp, key);
 391        gpte->may_write = hpte_write_permission(pp, key);
 392        gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G));
 393
 394        /* Storage key permission check for POWER7 */
 395        if (data && virtmode) {
 396                int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr);
 397                if (amrfield & 1)
 398                        gpte->may_read = 0;
 399                if (amrfield & 2)
 400                        gpte->may_write = 0;
 401        }
 402
 403        /* Get the guest physical address */
 404        gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr);
 405        return 0;
 406}
 407
 408/*
 409 * Quick test for whether an instruction is a load or a store.
 410 * If the instruction is a load or a store, then this will indicate
 411 * which it is, at least on server processors.  (Embedded processors
 412 * have some external PID instructions that don't follow the rule
 413 * embodied here.)  If the instruction isn't a load or store, then
 414 * this doesn't return anything useful.
 415 */
 416static int instruction_is_store(unsigned int instr)
 417{
 418        unsigned int mask;
 419
 420        mask = 0x10000000;
 421        if ((instr & 0xfc000000) == 0x7c000000)
 422                mask = 0x100;           /* major opcode 31 */
 423        return (instr & mask) != 0;
 424}
 425
 426int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
 427                           unsigned long gpa, gva_t ea, int is_store)
 428{
 429        u32 last_inst;
 430
 431        /*
 432         * If we fail, we just return to the guest and try executing it again.
 433         */
 434        if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
 435                EMULATE_DONE)
 436                return RESUME_GUEST;
 437
 438        /*
 439         * WARNING: We do not know for sure whether the instruction we just
  440         * read from memory is the same one that caused the fault in the first
  441         * place.  If the instruction we read is neither a load nor a store,
 442         * then it can't access memory, so we don't need to worry about
 443         * enforcing access permissions.  So, assuming it is a load or
 444         * store, we just check that its direction (load or store) is
 445         * consistent with the original fault, since that's what we
 446         * checked the access permissions against.  If there is a mismatch
 447         * we just return and retry the instruction.
 448         */
 449
 450        if (instruction_is_store(last_inst) != !!is_store)
 451                return RESUME_GUEST;
 452
 453        /*
  454         * Emulated accesses work by looking at the hash for
 455         * translation once, then performing the access later. The
  456         * translation could be invalidated in the meantime, at which
 457         * point performing the subsequent memory access on the old
 458         * physical address could possibly be a security hole for the
 459         * guest (but not the host).
 460         *
 461         * This is less of an issue for MMIO stores since they aren't
 462         * globally visible. It could be an issue for MMIO loads to
 463         * a certain extent but we'll ignore it for now.
 464         */
 465
 466        vcpu->arch.paddr_accessed = gpa;
 467        vcpu->arch.vaddr_accessed = ea;
 468        return kvmppc_emulate_mmio(run, vcpu);
 469}
 470
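     /*
      * Handle an HPT page fault that the real-mode handler could not
      * resolve: revalidate the HPTE found at fault time, look up the host
      * page backing the guest address (or treat it as emulated MMIO if
      * there is no memslot), and update the HPTE to point at it.
      */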
 471int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 472                                unsigned long ea, unsigned long dsisr)
 473{
 474        struct kvm *kvm = vcpu->kvm;
 475        unsigned long hpte[3], r;
 476        unsigned long hnow_v, hnow_r;
 477        __be64 *hptep;
 478        unsigned long mmu_seq, psize, pte_size;
 479        unsigned long gpa_base, gfn_base;
 480        unsigned long gpa, gfn, hva, pfn;
 481        struct kvm_memory_slot *memslot;
 482        unsigned long *rmap;
 483        struct revmap_entry *rev;
 484        struct page *page, *pages[1];
 485        long index, ret, npages;
 486        bool is_ci;
 487        unsigned int writing, write_ok;
 488        struct vm_area_struct *vma;
 489        unsigned long rcbits;
 490        long mmio_update;
 491
 492        if (kvm_is_radix(kvm))
 493                return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr);
 494
 495        /*
 496         * Real-mode code has already searched the HPT and found the
 497         * entry we're interested in.  Lock the entry and check that
 498         * it hasn't changed.  If it has, just return and re-execute the
 499         * instruction.
 500         */
 501        if (ea != vcpu->arch.pgfault_addr)
 502                return RESUME_GUEST;
 503
 504        if (vcpu->arch.pgfault_cache) {
 505                mmio_update = atomic64_read(&kvm->arch.mmio_update);
 506                if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) {
 507                        r = vcpu->arch.pgfault_cache->rpte;
 508                        psize = hpte_page_size(vcpu->arch.pgfault_hpte[0], r);
 509                        gpa_base = r & HPTE_R_RPN & ~(psize - 1);
 510                        gfn_base = gpa_base >> PAGE_SHIFT;
 511                        gpa = gpa_base | (ea & (psize - 1));
 512                        return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
 513                                                dsisr & DSISR_ISSTORE);
 514                }
 515        }
 516        index = vcpu->arch.pgfault_index;
 517        hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
 518        rev = &kvm->arch.hpt.rev[index];
 519        preempt_disable();
 520        while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
 521                cpu_relax();
 522        hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
 523        hpte[1] = be64_to_cpu(hptep[1]);
 524        hpte[2] = r = rev->guest_rpte;
 525        unlock_hpte(hptep, hpte[0]);
 526        preempt_enable();
 527
 528        if (cpu_has_feature(CPU_FTR_ARCH_300)) {
 529                hpte[0] = hpte_new_to_old_v(hpte[0], hpte[1]);
 530                hpte[1] = hpte_new_to_old_r(hpte[1]);
 531        }
 532        if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
 533            hpte[1] != vcpu->arch.pgfault_hpte[1])
 534                return RESUME_GUEST;
 535
 536        /* Translate the logical address and get the page */
 537        psize = hpte_page_size(hpte[0], r);
 538        gpa_base = r & HPTE_R_RPN & ~(psize - 1);
 539        gfn_base = gpa_base >> PAGE_SHIFT;
 540        gpa = gpa_base | (ea & (psize - 1));
 541        gfn = gpa >> PAGE_SHIFT;
 542        memslot = gfn_to_memslot(kvm, gfn);
 543
 544        trace_kvm_page_fault_enter(vcpu, hpte, memslot, ea, dsisr);
 545
 546        /* No memslot means it's an emulated MMIO region */
 547        if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
 548                return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
 549                                              dsisr & DSISR_ISSTORE);
 550
 551        /*
 552         * This should never happen, because of the slot_is_aligned()
 553         * check in kvmppc_do_h_enter().
 554         */
 555        if (gfn_base < memslot->base_gfn)
 556                return -EFAULT;
 557
 558        /* used to check for invalidations in progress */
 559        mmu_seq = kvm->mmu_notifier_seq;
 560        smp_rmb();
 561
 562        ret = -EFAULT;
 563        is_ci = false;
 564        pfn = 0;
 565        page = NULL;
 566        pte_size = PAGE_SIZE;
 567        writing = (dsisr & DSISR_ISSTORE) != 0;
 568        /* If writing != 0, then the HPTE must allow writing, if we get here */
 569        write_ok = writing;
 570        hva = gfn_to_hva_memslot(memslot, gfn);
 571        npages = get_user_pages_fast(hva, 1, writing, pages);
 572        if (npages < 1) {
 573                /* Check if it's an I/O mapping */
 574                down_read(&current->mm->mmap_sem);
 575                vma = find_vma(current->mm, hva);
 576                if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
 577                    (vma->vm_flags & VM_PFNMAP)) {
 578                        pfn = vma->vm_pgoff +
 579                                ((hva - vma->vm_start) >> PAGE_SHIFT);
 580                        pte_size = psize;
 581                        is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot))));
 582                        write_ok = vma->vm_flags & VM_WRITE;
 583                }
 584                up_read(&current->mm->mmap_sem);
 585                if (!pfn)
 586                        goto out_put;
 587        } else {
 588                page = pages[0];
 589                pfn = page_to_pfn(page);
 590                if (PageHuge(page)) {
 591                        page = compound_head(page);
 592                        pte_size <<= compound_order(page);
 593                }
 594                /* if the guest wants write access, see if that is OK */
 595                if (!writing && hpte_is_writable(r)) {
 596                        pte_t *ptep, pte;
 597                        unsigned long flags;
 598                        /*
  599                         * We need to protect against page table destruction,
 600                         * hugepage split and collapse.
 601                         */
 602                        local_irq_save(flags);
 603                        ptep = find_current_mm_pte(current->mm->pgd,
 604                                                   hva, NULL, NULL);
 605                        if (ptep) {
 606                                pte = kvmppc_read_update_linux_pte(ptep, 1);
 607                                if (__pte_write(pte))
 608                                        write_ok = 1;
 609                        }
 610                        local_irq_restore(flags);
 611                }
 612        }
 613
 614        if (psize > pte_size)
 615                goto out_put;
 616
 617        /* Check WIMG vs. the actual page we're accessing */
 618        if (!hpte_cache_flags_ok(r, is_ci)) {
 619                if (is_ci)
 620                        goto out_put;
 621                /*
 622                 * Allow guest to map emulated device memory as
 623                 * uncacheable, but actually make it cacheable.
 624                 */
 625                r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M;
 626        }
 627
 628        /*
 629         * Set the HPTE to point to pfn.
 630         * Since the pfn is at PAGE_SIZE granularity, make sure we
 631         * don't mask out lower-order bits if psize < PAGE_SIZE.
 632         */
 633        if (psize < PAGE_SIZE)
 634                psize = PAGE_SIZE;
 635        r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) |
 636                                        ((pfn << PAGE_SHIFT) & ~(psize - 1));
 637        if (hpte_is_writable(r) && !write_ok)
 638                r = hpte_make_readonly(r);
 639        ret = RESUME_GUEST;
 640        preempt_disable();
 641        while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
 642                cpu_relax();
 643        hnow_v = be64_to_cpu(hptep[0]);
 644        hnow_r = be64_to_cpu(hptep[1]);
 645        if (cpu_has_feature(CPU_FTR_ARCH_300)) {
 646                hnow_v = hpte_new_to_old_v(hnow_v, hnow_r);
 647                hnow_r = hpte_new_to_old_r(hnow_r);
 648        }
 649
 650        /*
  651         * If the HPT is being resized, don't update the HPTE;
 652         * instead let the guest retry after the resize operation is complete.
 653         * The synchronization for hpte_setup_done test vs. set is provided
 654         * by the HPTE lock.
 655         */
 656        if (!kvm->arch.hpte_setup_done)
 657                goto out_unlock;
 658
 659        if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] ||
 660            rev->guest_rpte != hpte[2])
 661                /* HPTE has been changed under us; let the guest retry */
 662                goto out_unlock;
 663        hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID;
 664
 665        /* Always put the HPTE in the rmap chain for the page base address */
 666        rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn];
 667        lock_rmap(rmap);
 668
 669        /* Check if we might have been invalidated; let the guest retry if so */
 670        ret = RESUME_GUEST;
 671        if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) {
 672                unlock_rmap(rmap);
 673                goto out_unlock;
 674        }
 675
 676        /* Only set R/C in real HPTE if set in both *rmap and guest_rpte */
 677        rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT;
 678        r &= rcbits | ~(HPTE_R_R | HPTE_R_C);
 679
 680        if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) {
 681                /* HPTE was previously valid, so we need to invalidate it */
 682                unlock_rmap(rmap);
 683                hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
 684                kvmppc_invalidate_hpte(kvm, hptep, index);
 685                /* don't lose previous R and C bits */
 686                r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
 687        } else {
 688                kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0);
 689        }
 690
 691        if (cpu_has_feature(CPU_FTR_ARCH_300)) {
 692                r = hpte_old_to_new_r(hpte[0], r);
 693                hpte[0] = hpte_old_to_new_v(hpte[0]);
 694        }
 695        hptep[1] = cpu_to_be64(r);
 696        eieio();
 697        __unlock_hpte(hptep, hpte[0]);
 698        asm volatile("ptesync" : : : "memory");
 699        preempt_enable();
 700        if (page && hpte_is_writable(r))
 701                SetPageDirty(page);
 702
 703 out_put:
 704        trace_kvm_page_fault_exit(vcpu, hpte, ret);
 705
 706        if (page) {
 707                /*
  708                 * We drop pages[0] here, not page, because page might
 709                 * have been set to the head page of a compound, but
 710                 * we have to drop the reference on the correct tail
 711                 * page to match the get inside gup()
 712                 */
 713                put_page(pages[0]);
 714        }
 715        return ret;
 716
 717 out_unlock:
 718        __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
 719        preempt_enable();
 720        goto out_put;
 721}
 722
 723static void kvmppc_rmap_reset(struct kvm *kvm)
 724{
 725        struct kvm_memslots *slots;
 726        struct kvm_memory_slot *memslot;
 727        int srcu_idx;
 728
 729        srcu_idx = srcu_read_lock(&kvm->srcu);
 730        slots = kvm_memslots(kvm);
 731        kvm_for_each_memslot(memslot, slots) {
 732                /*
 733                 * This assumes it is acceptable to lose reference and
 734                 * change bits across a reset.
 735                 */
 736                memset(memslot->arch.rmap, 0,
 737                       memslot->npages * sizeof(*memslot->arch.rmap));
 738        }
 739        srcu_read_unlock(&kvm->srcu, srcu_idx);
 740}
 741
 742typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot,
 743                              unsigned long gfn);
 744
 745static int kvm_handle_hva_range(struct kvm *kvm,
 746                                unsigned long start,
 747                                unsigned long end,
 748                                hva_handler_fn handler)
 749{
 750        int ret;
 751        int retval = 0;
 752        struct kvm_memslots *slots;
 753        struct kvm_memory_slot *memslot;
 754
 755        slots = kvm_memslots(kvm);
 756        kvm_for_each_memslot(memslot, slots) {
 757                unsigned long hva_start, hva_end;
 758                gfn_t gfn, gfn_end;
 759
 760                hva_start = max(start, memslot->userspace_addr);
 761                hva_end = min(end, memslot->userspace_addr +
 762                                        (memslot->npages << PAGE_SHIFT));
 763                if (hva_start >= hva_end)
 764                        continue;
 765                /*
 766                 * {gfn(page) | page intersects with [hva_start, hva_end)} =
 767                 * {gfn, gfn+1, ..., gfn_end-1}.
 768                 */
 769                gfn = hva_to_gfn_memslot(hva_start, memslot);
 770                gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
 771
 772                for (; gfn < gfn_end; ++gfn) {
 773                        ret = handler(kvm, memslot, gfn);
 774                        retval |= ret;
 775                }
 776        }
 777
 778        return retval;
 779}
 780
 781static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 782                          hva_handler_fn handler)
 783{
 784        return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
 785}
 786
 787/* Must be called with both HPTE and rmap locked */
 788static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
 789                              unsigned long *rmapp, unsigned long gfn)
 790{
 791        __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
 792        struct revmap_entry *rev = kvm->arch.hpt.rev;
 793        unsigned long j, h;
 794        unsigned long ptel, psize, rcbits;
 795
 796        j = rev[i].forw;
 797        if (j == i) {
 798                /* chain is now empty */
 799                *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
 800        } else {
 801                /* remove i from chain */
 802                h = rev[i].back;
 803                rev[h].forw = j;
 804                rev[j].back = h;
 805                rev[i].forw = rev[i].back = i;
 806                *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
 807        }
 808
 809        /* Now check and modify the HPTE */
 810        ptel = rev[i].guest_rpte;
 811        psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel);
 812        if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
 813            hpte_rpn(ptel, psize) == gfn) {
 814                hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
 815                kvmppc_invalidate_hpte(kvm, hptep, i);
 816                hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
 817                /* Harvest R and C */
 818                rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
 819                *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
 820                if (rcbits & HPTE_R_C)
 821                        kvmppc_update_rmap_change(rmapp, psize);
 822                if (rcbits & ~rev[i].guest_rpte) {
 823                        rev[i].guest_rpte = ptel | rcbits;
 824                        note_hpte_modification(kvm, &rev[i]);
 825                }
 826        }
 827}
 828
 829static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
 830                           unsigned long gfn)
 831{
 832        unsigned long i;
 833        __be64 *hptep;
 834        unsigned long *rmapp;
 835
 836        rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
 837        for (;;) {
 838                lock_rmap(rmapp);
 839                if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
 840                        unlock_rmap(rmapp);
 841                        break;
 842                }
 843
 844                /*
 845                 * To avoid an ABBA deadlock with the HPTE lock bit,
 846                 * we can't spin on the HPTE lock while holding the
 847                 * rmap chain lock.
 848                 */
 849                i = *rmapp & KVMPPC_RMAP_INDEX;
 850                hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
 851                if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
 852                        /* unlock rmap before spinning on the HPTE lock */
 853                        unlock_rmap(rmapp);
 854                        while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
 855                                cpu_relax();
 856                        continue;
 857                }
 858
 859                kvmppc_unmap_hpte(kvm, i, rmapp, gfn);
 860                unlock_rmap(rmapp);
 861                __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
 862        }
 863        return 0;
 864}
 865
 866int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva)
 867{
 868        hva_handler_fn handler;
 869
 870        handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
 871        kvm_handle_hva(kvm, hva, handler);
 872        return 0;
 873}
 874
 875int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end)
 876{
 877        hva_handler_fn handler;
 878
 879        handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
 880        kvm_handle_hva_range(kvm, start, end, handler);
 881        return 0;
 882}
 883
 884void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
 885                                  struct kvm_memory_slot *memslot)
 886{
 887        unsigned long gfn;
 888        unsigned long n;
 889        unsigned long *rmapp;
 890
 891        gfn = memslot->base_gfn;
 892        rmapp = memslot->arch.rmap;
 893        for (n = memslot->npages; n; --n, ++gfn) {
 894                if (kvm_is_radix(kvm)) {
 895                        kvm_unmap_radix(kvm, memslot, gfn);
 896                        continue;
 897                }
 898                /*
 899                 * Testing the present bit without locking is OK because
 900                 * the memslot has been marked invalid already, and hence
 901                 * no new HPTEs referencing this page can be created,
 902                 * thus the present bit can't go from 0 to 1.
 903                 */
 904                if (*rmapp & KVMPPC_RMAP_PRESENT)
 905                        kvm_unmap_rmapp(kvm, memslot, gfn);
 906                ++rmapp;
 907        }
 908}
 909
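     /*
      * Test and clear the referenced (R) bit for the HPTEs in this gfn's
      * rmap chain; returns 1 if any page in the chain had been referenced.
      */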
 910static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
 911                         unsigned long gfn)
 912{
 913        struct revmap_entry *rev = kvm->arch.hpt.rev;
 914        unsigned long head, i, j;
 915        __be64 *hptep;
 916        int ret = 0;
 917        unsigned long *rmapp;
 918
 919        rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
 920 retry:
 921        lock_rmap(rmapp);
 922        if (*rmapp & KVMPPC_RMAP_REFERENCED) {
 923                *rmapp &= ~KVMPPC_RMAP_REFERENCED;
 924                ret = 1;
 925        }
 926        if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
 927                unlock_rmap(rmapp);
 928                return ret;
 929        }
 930
 931        i = head = *rmapp & KVMPPC_RMAP_INDEX;
 932        do {
 933                hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
 934                j = rev[i].forw;
 935
 936                /* If this HPTE isn't referenced, ignore it */
 937                if (!(be64_to_cpu(hptep[1]) & HPTE_R_R))
 938                        continue;
 939
 940                if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
 941                        /* unlock rmap before spinning on the HPTE lock */
 942                        unlock_rmap(rmapp);
 943                        while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK)
 944                                cpu_relax();
 945                        goto retry;
 946                }
 947
 948                /* Now check and modify the HPTE */
 949                if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
 950                    (be64_to_cpu(hptep[1]) & HPTE_R_R)) {
 951                        kvmppc_clear_ref_hpte(kvm, hptep, i);
 952                        if (!(rev[i].guest_rpte & HPTE_R_R)) {
 953                                rev[i].guest_rpte |= HPTE_R_R;
 954                                note_hpte_modification(kvm, &rev[i]);
 955                        }
 956                        ret = 1;
 957                }
 958                __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
 959        } while ((i = j) != head);
 960
 961        unlock_rmap(rmapp);
 962        return ret;
 963}
 964
 965int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end)
 966{
 967        hva_handler_fn handler;
 968
 969        handler = kvm_is_radix(kvm) ? kvm_age_radix : kvm_age_rmapp;
 970        return kvm_handle_hva_range(kvm, start, end, handler);
 971}
 972
 973static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
 974                              unsigned long gfn)
 975{
 976        struct revmap_entry *rev = kvm->arch.hpt.rev;
 977        unsigned long head, i, j;
 978        unsigned long *hp;
 979        int ret = 1;
 980        unsigned long *rmapp;
 981
 982        rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
 983        if (*rmapp & KVMPPC_RMAP_REFERENCED)
 984                return 1;
 985
 986        lock_rmap(rmapp);
 987        if (*rmapp & KVMPPC_RMAP_REFERENCED)
 988                goto out;
 989
 990        if (*rmapp & KVMPPC_RMAP_PRESENT) {
 991                i = head = *rmapp & KVMPPC_RMAP_INDEX;
 992                do {
 993                        hp = (unsigned long *)(kvm->arch.hpt.virt + (i << 4));
 994                        j = rev[i].forw;
 995                        if (be64_to_cpu(hp[1]) & HPTE_R_R)
 996                                goto out;
 997                } while ((i = j) != head);
 998        }
 999        ret = 0;
1000
1001 out:
1002        unlock_rmap(rmapp);
1003        return ret;
1004}
1005
1006int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva)
1007{
1008        hva_handler_fn handler;
1009
1010        handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp;
1011        return kvm_handle_hva(kvm, hva, handler);
1012}
1013
1014void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte)
1015{
1016        hva_handler_fn handler;
1017
1018        handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp;
1019        kvm_handle_hva(kvm, hva, handler);
1020}
1021
1022static int vcpus_running(struct kvm *kvm)
1023{
1024        return atomic_read(&kvm->arch.vcpus_running) != 0;
1025}
1026
1027/*
1028 * Returns the number of system pages that are dirty.
1029 * This can be more than 1 if we find a huge-page HPTE.
1030 */
1031static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
1032{
1033        struct revmap_entry *rev = kvm->arch.hpt.rev;
1034        unsigned long head, i, j;
1035        unsigned long n;
1036        unsigned long v, r;
1037        __be64 *hptep;
1038        int npages_dirty = 0;
1039
1040 retry:
1041        lock_rmap(rmapp);
1042        if (*rmapp & KVMPPC_RMAP_CHANGED) {
1043                long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER)
1044                        >> KVMPPC_RMAP_CHG_SHIFT;
1045                *rmapp &= ~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER);
1046                npages_dirty = 1;
1047                if (change_order > PAGE_SHIFT)
1048                        npages_dirty = 1ul << (change_order - PAGE_SHIFT);
1049        }
1050        if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
1051                unlock_rmap(rmapp);
1052                return npages_dirty;
1053        }
1054
1055        i = head = *rmapp & KVMPPC_RMAP_INDEX;
1056        do {
1057                unsigned long hptep1;
1058                hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
1059                j = rev[i].forw;
1060
1061                /*
1062                 * Checking the C (changed) bit here is racy since there
1063                 * is no guarantee about when the hardware writes it back.
1064                 * If the HPTE is not writable then it is stable since the
1065                 * page can't be written to, and we would have done a tlbie
1066                 * (which forces the hardware to complete any writeback)
1067                 * when making the HPTE read-only.
1068                 * If vcpus are running then this call is racy anyway
1069                 * since the page could get dirtied subsequently, so we
1070                 * expect there to be a further call which would pick up
1071                 * any delayed C bit writeback.
1072                 * Otherwise we need to do the tlbie even if C==0 in
1073                 * order to pick up any delayed writeback of C.
1074                 */
1075                hptep1 = be64_to_cpu(hptep[1]);
1076                if (!(hptep1 & HPTE_R_C) &&
1077                    (!hpte_is_writable(hptep1) || vcpus_running(kvm)))
1078                        continue;
1079
1080                if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
1081                        /* unlock rmap before spinning on the HPTE lock */
1082                        unlock_rmap(rmapp);
1083                        while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK))
1084                                cpu_relax();
1085                        goto retry;
1086                }
1087
1088                /* Now check and modify the HPTE */
1089                if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) {
1090                        __unlock_hpte(hptep, be64_to_cpu(hptep[0]));
1091                        continue;
1092                }
1093
1094                /* need to make it temporarily absent so C is stable */
1095                hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
1096                kvmppc_invalidate_hpte(kvm, hptep, i);
1097                v = be64_to_cpu(hptep[0]);
1098                r = be64_to_cpu(hptep[1]);
1099                if (r & HPTE_R_C) {
1100                        hptep[1] = cpu_to_be64(r & ~HPTE_R_C);
1101                        if (!(rev[i].guest_rpte & HPTE_R_C)) {
1102                                rev[i].guest_rpte |= HPTE_R_C;
1103                                note_hpte_modification(kvm, &rev[i]);
1104                        }
1105                        n = hpte_page_size(v, r);
1106                        n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
1107                        if (n > npages_dirty)
1108                                npages_dirty = n;
1109                        eieio();
1110                }
1111                v &= ~HPTE_V_ABSENT;
1112                v |= HPTE_V_VALID;
1113                __unlock_hpte(hptep, v);
1114        } while ((i = j) != head);
1115
1116        unlock_rmap(rmapp);
1117        return npages_dirty;
1118}
1119
1120void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa,
1121                              struct kvm_memory_slot *memslot,
1122                              unsigned long *map)
1123{
1124        unsigned long gfn;
1125
1126        if (!vpa->dirty || !vpa->pinned_addr)
1127                return;
1128        gfn = vpa->gpa >> PAGE_SHIFT;
1129        if (gfn < memslot->base_gfn ||
1130            gfn >= memslot->base_gfn + memslot->npages)
1131                return;
1132
1133        vpa->dirty = false;
1134        if (map)
1135                __set_bit_le(gfn - memslot->base_gfn, map);
1136}
1137
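     /*
      * Scan every page of the memslot, harvesting the changed (C) bits
      * from the HPT, and set the corresponding bits in the dirty bitmap.
      */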
1138long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
1139                        struct kvm_memory_slot *memslot, unsigned long *map)
1140{
1141        unsigned long i, j;
1142        unsigned long *rmapp;
1143
1144        preempt_disable();
1145        rmapp = memslot->arch.rmap;
1146        for (i = 0; i < memslot->npages; ++i) {
1147                int npages = kvm_test_clear_dirty_npages(kvm, rmapp);
1148                /*
1149                 * Note that if npages > 0 then i must be a multiple of npages,
1150                 * since we always put huge-page HPTEs in the rmap chain
1151                 * corresponding to their page base address.
1152                 */
1153                if (npages && map)
1154                        for (j = i; npages; ++j, --npages)
1155                                __set_bit_le(j, map);
1156                ++rmapp;
1157        }
1158        preempt_enable();
1159        return 0;
1160}
1161
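     /*
      * Pin the host page backing guest physical address gpa and return a
      * kernel pointer to it; *nb_ret is set to the number of bytes
      * available before the end of the page.
      */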
1162void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa,
1163                            unsigned long *nb_ret)
1164{
1165        struct kvm_memory_slot *memslot;
1166        unsigned long gfn = gpa >> PAGE_SHIFT;
1167        struct page *page, *pages[1];
1168        int npages;
1169        unsigned long hva, offset;
1170        int srcu_idx;
1171
1172        srcu_idx = srcu_read_lock(&kvm->srcu);
1173        memslot = gfn_to_memslot(kvm, gfn);
1174        if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
1175                goto err;
1176        hva = gfn_to_hva_memslot(memslot, gfn);
1177        npages = get_user_pages_fast(hva, 1, 1, pages);
1178        if (npages < 1)
1179                goto err;
1180        page = pages[0];
1181        srcu_read_unlock(&kvm->srcu, srcu_idx);
1182
1183        offset = gpa & (PAGE_SIZE - 1);
1184        if (nb_ret)
1185                *nb_ret = PAGE_SIZE - offset;
1186        return page_address(page) + offset;
1187
1188 err:
1189        srcu_read_unlock(&kvm->srcu, srcu_idx);
1190        return NULL;
1191}
1192
1193void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
1194                             bool dirty)
1195{
1196        struct page *page = virt_to_page(va);
1197        struct kvm_memory_slot *memslot;
1198        unsigned long gfn;
1199        unsigned long *rmap;
1200        int srcu_idx;
1201
1202        put_page(page);
1203
1204        if (!dirty)
1205                return;
1206
1207        /* We need to mark this page dirty in the rmap chain */
1208        gfn = gpa >> PAGE_SHIFT;
1209        srcu_idx = srcu_read_lock(&kvm->srcu);
1210        memslot = gfn_to_memslot(kvm, gfn);
1211        if (memslot) {
1212                if (!kvm_is_radix(kvm)) {
1213                        rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
1214                        lock_rmap(rmap);
1215                        *rmap |= KVMPPC_RMAP_CHANGED;
1216                        unlock_rmap(rmap);
1217                } else if (memslot->dirty_bitmap) {
1218                        mark_page_dirty(kvm, gfn);
1219                }
1220        }
1221        srcu_read_unlock(&kvm->srcu, srcu_idx);
1222}
1223
1224/*
1225 * HPT resizing
1226 */
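     /*
      * A resize is prepared asynchronously: kvm_vm_ioctl_resize_hpt_prepare()
      * schedules resize_hpt_prepare_work() to allocate the new HPT, and
      * kvm_vm_ioctl_resize_hpt_commit() later rehashes the old entries into
      * it and pivots the guest onto the new table.
      */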
1227static int resize_hpt_allocate(struct kvm_resize_hpt *resize)
1228{
1229        int rc;
1230
1231        rc = kvmppc_allocate_hpt(&resize->hpt, resize->order);
1232        if (rc < 0)
1233                return rc;
1234
1235        resize_hpt_debug(resize, "resize_hpt_allocate(): HPT @ 0x%lx\n",
1236                         resize->hpt.virt);
1237
1238        return 0;
1239}
1240
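     /*
      * Move a single HPTE from the old HPT into the new one, recomputing
      * its hash (and hence its PTEG) from the AVPN and the old slot; a
      * collision with a bolted entry in the new table returns -ENOSPC.
      */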
1241static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
1242                                            unsigned long idx)
1243{
1244        struct kvm *kvm = resize->kvm;
1245        struct kvm_hpt_info *old = &kvm->arch.hpt;
1246        struct kvm_hpt_info *new = &resize->hpt;
1247        unsigned long old_hash_mask = (1ULL << (old->order - 7)) - 1;
1248        unsigned long new_hash_mask = (1ULL << (new->order - 7)) - 1;
1249        __be64 *hptep, *new_hptep;
1250        unsigned long vpte, rpte, guest_rpte;
1251        int ret;
1252        struct revmap_entry *rev;
1253        unsigned long apsize, psize, avpn, pteg, hash;
1254        unsigned long new_idx, new_pteg, replace_vpte;
1255
1256        hptep = (__be64 *)(old->virt + (idx << 4));
1257
1258        /* Guest is stopped, so new HPTEs can't be added or faulted
1259         * in, only unmapped or altered by host actions.  So, it's
1260         * safe to check this before we take the HPTE lock */
1261        vpte = be64_to_cpu(hptep[0]);
1262        if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT))
1263                return 0; /* nothing to do */
1264
1265        while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
1266                cpu_relax();
1267
1268        vpte = be64_to_cpu(hptep[0]);
1269
1270        ret = 0;
1271        if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT))
1272                /* Nothing to do */
1273                goto out;
1274
1275        /* Unmap */
1276        rev = &old->rev[idx];
1277        guest_rpte = rev->guest_rpte;
1278
1279        ret = -EIO;
1280        apsize = hpte_page_size(vpte, guest_rpte);
1281        if (!apsize)
1282                goto out;
1283
1284        if (vpte & HPTE_V_VALID) {
1285                unsigned long gfn = hpte_rpn(guest_rpte, apsize);
1286                int srcu_idx = srcu_read_lock(&kvm->srcu);
1287                struct kvm_memory_slot *memslot =
1288                        __gfn_to_memslot(kvm_memslots(kvm), gfn);
1289
1290                if (memslot) {
1291                        unsigned long *rmapp;
1292                        rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
1293
1294                        lock_rmap(rmapp);
1295                        kvmppc_unmap_hpte(kvm, idx, rmapp, gfn);
1296                        unlock_rmap(rmapp);
1297                }
1298
1299                srcu_read_unlock(&kvm->srcu, srcu_idx);
1300        }
1301
1302        /* Reload PTE after unmap */
1303        vpte = be64_to_cpu(hptep[0]);
1304
1305        BUG_ON(vpte & HPTE_V_VALID);
1306        BUG_ON(!(vpte & HPTE_V_ABSENT));
1307
1308        ret = 0;
1309        if (!(vpte & HPTE_V_BOLTED))
1310                goto out;
1311
1312        rpte = be64_to_cpu(hptep[1]);
1313        psize = hpte_base_page_size(vpte, rpte);
1314        avpn = HPTE_V_AVPN_VAL(vpte) & ~((psize - 1) >> 23);
1315        pteg = idx / HPTES_PER_GROUP;
1316        if (vpte & HPTE_V_SECONDARY)
1317                pteg = ~pteg;
1318
1319        if (!(vpte & HPTE_V_1TB_SEG)) {
1320                unsigned long offset, vsid;
1321
1322                /* We only have 28 - 23 bits of offset in avpn */
1323                offset = (avpn & 0x1f) << 23;
1324                vsid = avpn >> 5;
1325                /* We can find more bits from the pteg value */
1326                if (psize < (1ULL << 23))
1327                        offset |= ((vsid ^ pteg) & old_hash_mask) * psize;
1328
1329                hash = vsid ^ (offset / psize);
1330        } else {
1331                unsigned long offset, vsid;
1332
1333                /* We only have 40 - 23 bits of seg_off in avpn */
1334                offset = (avpn & 0x1ffff) << 23;
1335                vsid = avpn >> 17;
1336                if (psize < (1ULL << 23))
1337                        offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) * psize;
1338
1339                hash = vsid ^ (vsid << 25) ^ (offset / psize);
1340        }
1341
1342        new_pteg = hash & new_hash_mask;
1343        if (vpte & HPTE_V_SECONDARY) {
1344                BUG_ON(~pteg != (hash & old_hash_mask));
1345                new_pteg = ~new_pteg;
1346        } else {
1347                BUG_ON(pteg != (hash & old_hash_mask));
1348        }
1349
1350        new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP);
1351        new_hptep = (__be64 *)(new->virt + (new_idx << 4));
1352
1353        replace_vpte = be64_to_cpu(new_hptep[0]);
1354
1355        if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
1356                BUG_ON(new->order >= old->order);
1357
1358                if (replace_vpte & HPTE_V_BOLTED) {
1359                        if (vpte & HPTE_V_BOLTED)
1360                                /* Bolted collision, nothing we can do */
1361                                ret = -ENOSPC;
1362                        /* Discard the new HPTE */
1363                        goto out;
1364                }
1365
1366                /* Discard the previous HPTE */
1367        }
1368
1369        new_hptep[1] = cpu_to_be64(rpte);
1370        new->rev[new_idx].guest_rpte = guest_rpte;
1371        /* No need for a barrier, since new HPT isn't active */
1372        new_hptep[0] = cpu_to_be64(vpte);
1373        unlock_hpte(new_hptep, vpte);
1374
1375out:
1376        unlock_hpte(hptep, vpte);
1377        return ret;
1378}
1379
1380static int resize_hpt_rehash(struct kvm_resize_hpt *resize)
1381{
1382        struct kvm *kvm = resize->kvm;
1383        unsigned  long i;
1384        int rc;
1385
1386        /*
1387         * resize_hpt_rehash_hpte() doesn't handle the new-format HPTEs
1388         * that POWER9 uses, and could well hit a BUG_ON on POWER9.
1389         */
1390        if (cpu_has_feature(CPU_FTR_ARCH_300))
1391                return -EIO;
1392        for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) {
1393                rc = resize_hpt_rehash_hpte(resize, i);
1394                if (rc != 0)
1395                        return rc;
1396        }
1397
1398        return 0;
1399}
1400
1401static void resize_hpt_pivot(struct kvm_resize_hpt *resize)
1402{
1403        struct kvm *kvm = resize->kvm;
1404        struct kvm_hpt_info hpt_tmp;
1405
1406        /* Exchange the pending tables in the resize structure with
1407         * the active tables */
1408
1409        resize_hpt_debug(resize, "resize_hpt_pivot()\n");
1410
1411        spin_lock(&kvm->mmu_lock);
1412        asm volatile("ptesync" : : : "memory");
1413
1414        hpt_tmp = kvm->arch.hpt;
1415        kvmppc_set_hpt(kvm, &resize->hpt);
1416        resize->hpt = hpt_tmp;
1417
1418        spin_unlock(&kvm->mmu_lock);
1419
1420        synchronize_srcu_expedited(&kvm->srcu);
1421
1422        resize_hpt_debug(resize, "resize_hpt_pivot() done\n");
1423}
1424
1425static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize)
1426{
1427        BUG_ON(kvm->arch.resize_hpt != resize);
1428
1429        if (!resize)
1430                return;
1431
1432        if (resize->hpt.virt)
1433                kvmppc_free_hpt(&resize->hpt);
1434
1435        kvm->arch.resize_hpt = NULL;
1436        kfree(resize);
1437}
1438
1439static void resize_hpt_prepare_work(struct work_struct *work)
1440{
1441        struct kvm_resize_hpt *resize = container_of(work,
1442                                                     struct kvm_resize_hpt,
1443                                                     work);
1444        struct kvm *kvm = resize->kvm;
1445        int err;
1446
1447        resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n",
1448                         resize->order);
1449
1450        err = resize_hpt_allocate(resize);
1451
1452        mutex_lock(&kvm->lock);
1453
1454        resize->error = err;
1455        resize->prepare_done = true;
1456
1457        mutex_unlock(&kvm->lock);
1458}
1459
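     /*
      * KVM_PPC_RESIZE_HPT_PREPARE: start (or poll) an HPT resize to the
      * requested shift.  Returns 0 once the new table is allocated, a
      * positive estimate in milliseconds while work is still pending, or
      * a negative errno; shift == 0 cancels any pending resize.  Typical
      * userspace flow (sketch): call PREPARE until it returns 0, then
      * issue KVM_PPC_RESIZE_HPT_COMMIT with the same shift.
      */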
1460long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
1461                                     struct kvm_ppc_resize_hpt *rhpt)
1462{
1463        unsigned long flags = rhpt->flags;
1464        unsigned long shift = rhpt->shift;
1465        struct kvm_resize_hpt *resize;
1466        int ret;
1467
1468        if (flags != 0)
1469                return -EINVAL;
1470
1471        if (shift && ((shift < 18) || (shift > 46)))
1472                return -EINVAL;
1473
1474        mutex_lock(&kvm->lock);
1475
1476        resize = kvm->arch.resize_hpt;
1477
1478        if (resize) {
1479                if (resize->order == shift) {
1480                        /* Suitable resize in progress */
1481                        if (resize->prepare_done) {
1482                                ret = resize->error;
1483                                if (ret != 0)
1484                                        resize_hpt_release(kvm, resize);
1485                        } else {
1486                                ret = 100; /* estimated time in ms */
1487                        }
1488
1489                        goto out;
1490                }
1491
1492                /* not suitable, cancel it */
1493                resize_hpt_release(kvm, resize);
1494        }
1495
1496        ret = 0;
1497        if (!shift)
1498                goto out; /* nothing to do */
1499
1500        /* start new resize */
1501
1502        resize = kzalloc(sizeof(*resize), GFP_KERNEL);
1503        if (!resize) {
1504                ret = -ENOMEM;
1505                goto out;
1506        }
1507        resize->order = shift;
1508        resize->kvm = kvm;
1509        INIT_WORK(&resize->work, resize_hpt_prepare_work);
1510        kvm->arch.resize_hpt = resize;
1511
1512        schedule_work(&resize->work);
1513
1514        ret = 100; /* estimated time in ms */
1515
1516out:
1517        mutex_unlock(&kvm->lock);
1518        return ret;
1519}
1520
1521static void resize_hpt_boot_vcpu(void *opaque)
1522{
1523        /* Nothing to do, just force a KVM exit */
1524}
1525
1526long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
1527                                    struct kvm_ppc_resize_hpt *rhpt)
1528{
1529        unsigned long flags = rhpt->flags;
1530        unsigned long shift = rhpt->shift;
1531        struct kvm_resize_hpt *resize;
1532        long ret;
1533
1534        if (flags != 0)
1535                return -EINVAL;
1536
1537        if (shift && ((shift < 18) || (shift > 46)))
1538                return -EINVAL;
1539
1540        mutex_lock(&kvm->lock);
1541
1542        resize = kvm->arch.resize_hpt;
1543
1544        /* This shouldn't be possible */
1545        ret = -EIO;
1546        if (WARN_ON(!kvm->arch.hpte_setup_done))
1547                goto out_no_hpt;
1548
1549        /* Stop VCPUs from running while we mess with the HPT */
1550        kvm->arch.hpte_setup_done = 0;
1551        smp_mb();
1552
1553        /* Boot all CPUs out of the guest so they re-read
1554         * hpte_setup_done */
1555        on_each_cpu(resize_hpt_boot_vcpu, NULL, 1);
1556
1557        ret = -ENXIO;
1558        if (!resize || (resize->order != shift))
1559                goto out;
1560
1561        ret = -EBUSY;
1562        if (!resize->prepare_done)
1563                goto out;
1564
1565        ret = resize->error;
1566        if (ret != 0)
1567                goto out;
1568
1569        ret = resize_hpt_rehash(resize);
1570        if (ret != 0)
1571                goto out;
1572
1573        resize_hpt_pivot(resize);
1574
1575out:
1576        /* Let VCPUs run again */
1577        kvm->arch.hpte_setup_done = 1;
1578        smp_mb();
1579out_no_hpt:
1580        resize_hpt_release(kvm, resize);
1581        mutex_unlock(&kvm->lock);
1582        return ret;
1583}
1584
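/*
 * Illustrative userspace-side sketch (not part of this file) of how a
 * VM manager might drive the two ioctls above: poll
 * KVM_PPC_RESIZE_HPT_PREPARE until it stops returning a positive
 * "estimated milliseconds" value, then issue KVM_PPC_RESIZE_HPT_COMMIT
 * with the same shift.  The vm_fd variable and the choice of shift are
 * hypothetical; the ioctl names and struct kvm_ppc_resize_hpt come
 * from the KVM uapi.
 *
 *        struct kvm_ppc_resize_hpt rhpt = {
 *                .flags = 0,             // must be zero
 *                .shift = 30,            // requested HPT order, 18..46
 *        };
 *        int ret;
 *
 *        do {
 *                ret = ioctl(vm_fd, KVM_PPC_RESIZE_HPT_PREPARE, &rhpt);
 *                if (ret > 0)            // allocation still in progress
 *                        usleep(ret * 1000);
 *        } while (ret > 0);
 *
 *        if (ret == 0)
 *                ret = ioctl(vm_fd, KVM_PPC_RESIZE_HPT_COMMIT, &rhpt);
 *
 * A negative result (ioctl() returning -1 with errno set) means the
 * resize was rejected or failed; passing shift == 0 to PREPARE cancels
 * a pending resize.
 */
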
1585/*
1586 * Functions for reading and writing the hash table via reads and
1587 * writes on a file descriptor.
1588 *
1589 * Reads return the guest view of the hash table, which has to be
1590 * pieced together from the real hash table and the guest_rpte
1591 * values in the revmap array.
1592 *
1593 * On writes, each HPTE written is considered in turn, and if it
1594 * is valid, it is written to the HPT as if an H_ENTER with the
1595 * H_EXACT flag set was done.  When the invalid count is non-zero
1596 * in the header written to the stream, the kernel will make
1597 * sure that that many HPTEs are invalid, and invalidate them
1598 * if not.
1599 */
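
/*
 * Illustrative userspace-side sketch (not part of this file) of
 * consuming the save stream.  vm_fd, the buffer size and the missing
 * error handling are hypothetical; KVM_PPC_GET_HTAB_FD,
 * struct kvm_get_htab_fd and struct kvm_get_htab_header are the uapi
 * names used by this interface.
 *
 *        struct kvm_get_htab_fd ghf = { .flags = 0, .start_index = 0 };
 *        int fd = ioctl(vm_fd, KVM_PPC_GET_HTAB_FD, &ghf);
 *        char buf[65536];
 *        ssize_t nb = read(fd, buf, sizeof(buf));
 *        char *p = buf;
 *
 *        while (p + sizeof(struct kvm_get_htab_header) <= buf + nb) {
 *                struct kvm_get_htab_header *hdr = (void *)p;
 *                uint64_t *hpte = (uint64_t *)(hdr + 1);
 *
 *                // hdr->n_valid HPTEs of two 64-bit words each (the V
 *                // and R doublewords) follow the header; the
 *                // hdr->n_invalid entries that come next in the
 *                // guest's table carry no payload at all.
 *                p = (char *)(hpte + 2 * hdr->n_valid);
 *        }
 *
 * Repeated reads continue from where the previous one stopped; the
 * first pass covers the whole table and later passes return only
 * entries that have changed since they were last read.
 */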
1600
1601struct kvm_htab_ctx {
1602        unsigned long   index;
1603        unsigned long   flags;
1604        struct kvm      *kvm;
1605        int             first_pass;
1606};
1607
1608#define HPTE_SIZE       (2 * sizeof(unsigned long))
1609
1610/*
1611 * Returns 1 if this HPT entry has been modified or has pending
1612 * R/C bit changes.
1613 */
1614static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp)
1615{
1616        unsigned long rcbits_unset;
1617
1618        if (revp->guest_rpte & HPTE_GR_MODIFIED)
1619                return 1;
1620
1621        /* Also need to consider changes in reference and changed bits */
1622        rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
1623        if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) &&
1624            (be64_to_cpu(hptp[1]) & rcbits_unset))
1625                return 1;
1626
1627        return 0;
1628}
1629
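/*
 * Fill hpte[0..1] with the guest-visible view of the HPTE at hptp,
 * harvesting any hardware R/C updates into the revmap's guest_rpte and
 * presenting ABSENT entries as valid.  Returns 1 if the entry matches
 * want_valid and should be emitted on this pass, 0 otherwise.
 */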
1630static long record_hpte(unsigned long flags, __be64 *hptp,
1631                        unsigned long *hpte, struct revmap_entry *revp,
1632                        int want_valid, int first_pass)
1633{
1634        unsigned long v, r, hr;
1635        unsigned long rcbits_unset;
1636        int ok = 1;
1637        int valid, dirty;
1638
1639        /* Unmodified entries are uninteresting except on the first pass */
1640        dirty = hpte_dirty(revp, hptp);
1641        if (!first_pass && !dirty)
1642                return 0;
1643
1644        valid = 0;
1645        if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) {
1646                valid = 1;
1647                if ((flags & KVM_GET_HTAB_BOLTED_ONLY) &&
1648                    !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED))
1649                        valid = 0;
1650        }
1651        if (valid != want_valid)
1652                return 0;
1653
1654        v = r = 0;
1655        if (valid || dirty) {
1656                /* lock the HPTE so it's stable and read it */
1657                preempt_disable();
1658                while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
1659                        cpu_relax();
1660                v = be64_to_cpu(hptp[0]);
1661                hr = be64_to_cpu(hptp[1]);
1662                if (cpu_has_feature(CPU_FTR_ARCH_300)) {
1663                        v = hpte_new_to_old_v(v, hr);
1664                        hr = hpte_new_to_old_r(hr);
1665                }
1666
1667                /* re-evaluate valid and dirty from synchronized HPTE value */
1668                valid = !!(v & HPTE_V_VALID);
1669                dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED);
1670
1671                /* Harvest R and C into guest view if necessary */
1672                rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C);
1673                if (valid && (rcbits_unset & hr)) {
1674                        revp->guest_rpte |= (hr &
1675                                (HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED;
1676                        dirty = 1;
1677                }
1678
1679                if (v & HPTE_V_ABSENT) {
1680                        v &= ~HPTE_V_ABSENT;
1681                        v |= HPTE_V_VALID;
1682                        valid = 1;
1683                }
1684                if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED))
1685                        valid = 0;
1686
1687                r = revp->guest_rpte;
1688                /* only clear modified if this is the right sort of entry */
1689                if (valid == want_valid && dirty) {
1690                        r &= ~HPTE_GR_MODIFIED;
1691                        revp->guest_rpte = r;
1692                }
1693                unlock_hpte(hptp, be64_to_cpu(hptp[0]));
1694                preempt_enable();
1695                if (!(valid == want_valid && (first_pass || dirty)))
1696                        ok = 0;
1697        }
1698        hpte[0] = cpu_to_be64(v);
1699        hpte[1] = cpu_to_be64(r);
1700        return ok;
1701}
1702
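/*
 * read() on the HTAB fd: emit a sequence of kvm_get_htab_header
 * records, each followed by its run of valid HPTEs.  On the first pass
 * every entry is reported; on later passes only entries with pending
 * modifications or R/C updates are included.
 */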
1703static ssize_t kvm_htab_read(struct file *file, char __user *buf,
1704                             size_t count, loff_t *ppos)
1705{
1706        struct kvm_htab_ctx *ctx = file->private_data;
1707        struct kvm *kvm = ctx->kvm;
1708        struct kvm_get_htab_header hdr;
1709        __be64 *hptp;
1710        struct revmap_entry *revp;
1711        unsigned long i, nb, nw;
1712        unsigned long __user *lbuf;
1713        struct kvm_get_htab_header __user *hptr;
1714        unsigned long flags;
1715        int first_pass;
1716        unsigned long hpte[2];
1717
1718        if (!access_ok(VERIFY_WRITE, buf, count))
1719                return -EFAULT;
1720
1721        first_pass = ctx->first_pass;
1722        flags = ctx->flags;
1723
1724        i = ctx->index;
1725        hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
1726        revp = kvm->arch.hpt.rev + i;
1727        lbuf = (unsigned long __user *)buf;
1728
1729        nb = 0;
1730        while (nb + sizeof(hdr) + HPTE_SIZE < count) {
1731                /* Initialize header */
1732                hptr = (struct kvm_get_htab_header __user *)buf;
1733                hdr.n_valid = 0;
1734                hdr.n_invalid = 0;
1735                nw = nb;
1736                nb += sizeof(hdr);
1737                lbuf = (unsigned long __user *)(buf + sizeof(hdr));
1738
1739                /* Skip uninteresting entries, i.e. ones clean on a non-first pass */
1740                if (!first_pass) {
1741                        while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
1742                               !hpte_dirty(revp, hptp)) {
1743                                ++i;
1744                                hptp += 2;
1745                                ++revp;
1746                        }
1747                }
1748                hdr.index = i;
1749
1750                /* Grab a series of valid entries */
1751                while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
1752                       hdr.n_valid < 0xffff &&
1753                       nb + HPTE_SIZE < count &&
1754                       record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
1755                        /* valid entry, write it out */
1756                        ++hdr.n_valid;
1757                        if (__put_user(hpte[0], lbuf) ||
1758                            __put_user(hpte[1], lbuf + 1))
1759                                return -EFAULT;
1760                        nb += HPTE_SIZE;
1761                        lbuf += 2;
1762                        ++i;
1763                        hptp += 2;
1764                        ++revp;
1765                }
1766                /* Now skip invalid entries while we can */
1767                while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
1768                       hdr.n_invalid < 0xffff &&
1769                       record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
1770                        /* found an invalid entry */
1771                        ++hdr.n_invalid;
1772                        ++i;
1773                        hptp += 2;
1774                        ++revp;
1775                }
1776
1777                if (hdr.n_valid || hdr.n_invalid) {
1778                        /* write back the header */
1779                        if (__copy_to_user(hptr, &hdr, sizeof(hdr)))
1780                                return -EFAULT;
1781                        nw = nb;
1782                        buf = (char __user *)lbuf;
1783                } else {
1784                        nb = nw;
1785                }
1786
1787                /* Check if we've wrapped around the hash table */
1788                if (i >= kvmppc_hpt_npte(&kvm->arch.hpt)) {
1789                        i = 0;
1790                        ctx->first_pass = 0;
1791                        break;
1792                }
1793        }
1794
1795        ctx->index = i;
1796
1797        return nb;
1798}
1799
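/*
 * write() on the HTAB fd (restore direction): parse the same header
 * stream and re-create each valid HPTE with an H_EXACT H_ENTER,
 * invalidating n_invalid entries after each run.  Also re-derives the
 * VRMA SLB value and LPCR_VRMASD from the first VRMA HPTE seen if HPT
 * setup hasn't been completed yet.
 */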
1800static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
1801                              size_t count, loff_t *ppos)
1802{
1803        struct kvm_htab_ctx *ctx = file->private_data;
1804        struct kvm *kvm = ctx->kvm;
1805        struct kvm_get_htab_header hdr;
1806        unsigned long i, j;
1807        unsigned long v, r;
1808        unsigned long __user *lbuf;
1809        __be64 *hptp;
1810        unsigned long tmp[2];
1811        ssize_t nb;
1812        long int err, ret;
1813        int hpte_setup;
1814
1815        if (!access_ok(VERIFY_READ, buf, count))
1816                return -EFAULT;
1817
1818        /* lock out vcpus from running while we're doing this */
1819        mutex_lock(&kvm->lock);
1820        hpte_setup = kvm->arch.hpte_setup_done;
1821        if (hpte_setup) {
1822                kvm->arch.hpte_setup_done = 0;  /* temporarily */
1823                /* order hpte_setup_done vs. vcpus_running */
1824                smp_mb();
1825                if (atomic_read(&kvm->arch.vcpus_running)) {
1826                        kvm->arch.hpte_setup_done = 1;
1827                        mutex_unlock(&kvm->lock);
1828                        return -EBUSY;
1829                }
1830        }
1831
1832        err = 0;
1833        for (nb = 0; nb + sizeof(hdr) <= count; ) {
1834                err = -EFAULT;
1835                if (__copy_from_user(&hdr, buf, sizeof(hdr)))
1836                        break;
1837
1838                err = 0;
1839                if (nb + hdr.n_valid * HPTE_SIZE > count)
1840                        break;
1841
1842                nb += sizeof(hdr);
1843                buf += sizeof(hdr);
1844
1845                err = -EINVAL;
1846                i = hdr.index;
1847                if (i >= kvmppc_hpt_npte(&kvm->arch.hpt) ||
1848                    i + hdr.n_valid + hdr.n_invalid > kvmppc_hpt_npte(&kvm->arch.hpt))
1849                        break;
1850
1851                hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
1852                lbuf = (unsigned long __user *)buf;
1853                for (j = 0; j < hdr.n_valid; ++j) {
1854                        __be64 hpte_v;
1855                        __be64 hpte_r;
1856
1857                        err = -EFAULT;
1858                        if (__get_user(hpte_v, lbuf) ||
1859                            __get_user(hpte_r, lbuf + 1))
1860                                goto out;
1861                        v = be64_to_cpu(hpte_v);
1862                        r = be64_to_cpu(hpte_r);
1863                        err = -EINVAL;
1864                        if (!(v & HPTE_V_VALID))
1865                                goto out;
1866                        lbuf += 2;
1867                        nb += HPTE_SIZE;
1868
1869                        if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
1870                                kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
1871                        err = -EIO;
1872                        ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r,
1873                                                         tmp);
1874                        if (ret != H_SUCCESS) {
1875                                pr_err("kvm_htab_write ret %ld i=%ld v=%lx "
1876                                       "r=%lx\n", ret, i, v, r);
1877                                goto out;
1878                        }
1879                        if (!hpte_setup && is_vrma_hpte(v)) {
1880                                unsigned long psize = hpte_base_page_size(v, r);
1881                                unsigned long senc = slb_pgsize_encoding(psize);
1882                                unsigned long lpcr;
1883
1884                                kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
1885                                        (VRMA_VSID << SLB_VSID_SHIFT_1T);
1886                                lpcr = senc << (LPCR_VRMASD_SH - 4);
1887                                kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
1888                                hpte_setup = 1;
1889                        }
1890                        ++i;
1891                        hptp += 2;
1892                }
1893
1894                for (j = 0; j < hdr.n_invalid; ++j) {
1895                        if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT))
1896                                kvmppc_do_h_remove(kvm, 0, i, 0, tmp);
1897                        ++i;
1898                        hptp += 2;
1899                }
1900                err = 0;
1901        }
1902
1903 out:
1904        /* Order HPTE updates vs. hpte_setup_done */
1905        smp_wmb();
1906        kvm->arch.hpte_setup_done = hpte_setup;
1907        mutex_unlock(&kvm->lock);
1908
1909        if (err)
1910                return err;
1911        return nb;
1912}
1913
1914static int kvm_htab_release(struct inode *inode, struct file *filp)
1915{
1916        struct kvm_htab_ctx *ctx = filp->private_data;
1917
1918        filp->private_data = NULL;
1919        if (!(ctx->flags & KVM_GET_HTAB_WRITE))
1920                atomic_dec(&ctx->kvm->arch.hpte_mod_interest);
1921        kvm_put_kvm(ctx->kvm);
1922        kfree(ctx);
1923        return 0;
1924}
1925
1926static const struct file_operations kvm_htab_fops = {
1927        .read           = kvm_htab_read,
1928        .write          = kvm_htab_write,
1929        .llseek         = default_llseek,
1930        .release        = kvm_htab_release,
1931};
1932
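/*
 * Handler for KVM_PPC_GET_HTAB_FD: create the anonymous file used for
 * the read (save) or write (restore) direction above.  For the read
 * direction, bump hpte_mod_interest so the real-mode HPTE update paths
 * start flagging modified entries for the migration stream.
 */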
1933int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
1934{
1935        int ret;
1936        struct kvm_htab_ctx *ctx;
1937        int rwflag;
1938
1939        /* reject flags we don't recognize */
1940        if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE))
1941                return -EINVAL;
1942        ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
1943        if (!ctx)
1944                return -ENOMEM;
1945        kvm_get_kvm(kvm);
1946        ctx->kvm = kvm;
1947        ctx->index = ghf->start_index;
1948        ctx->flags = ghf->flags;
1949        ctx->first_pass = 1;
1950
1951        rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
1952        ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC);
1953        if (ret < 0) {
1954                kfree(ctx);
1955                kvm_put_kvm(kvm);
1956                return ret;
1957        }
1958
1959        if (rwflag == O_RDONLY) {
1960                mutex_lock(&kvm->slots_lock);
1961                atomic_inc(&kvm->arch.hpte_mod_interest);
1962                /* make sure kvmppc_do_h_enter etc. see the increment */
1963                synchronize_srcu_expedited(&kvm->srcu);
1964                mutex_unlock(&kvm->slots_lock);
1965        }
1966
1967        return ret;
1968}
1969
1970struct debugfs_htab_state {
1971        struct kvm      *kvm;
1972        struct mutex    mutex;
1973        unsigned long   hpt_index;
1974        int             chars_left;
1975        int             buf_index;
1976        char            buf[64];
1977};
1978
1979static int debugfs_htab_open(struct inode *inode, struct file *file)
1980{
1981        struct kvm *kvm = inode->i_private;
1982        struct debugfs_htab_state *p;
1983
1984        p = kzalloc(sizeof(*p), GFP_KERNEL);
1985        if (!p)
1986                return -ENOMEM;
1987
1988        kvm_get_kvm(kvm);
1989        p->kvm = kvm;
1990        mutex_init(&p->mutex);
1991        file->private_data = p;
1992
1993        return nonseekable_open(inode, file);
1994}
1995
1996static int debugfs_htab_release(struct inode *inode, struct file *file)
1997{
1998        struct debugfs_htab_state *p = file->private_data;
1999
2000        kvm_put_kvm(p->kvm);
2001        kfree(p);
2002        return 0;
2003}
2004
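/*
 * Dump the HPT through debugfs: one line per valid (or absent) entry,
 * formatted as "index v r guest_rpte".  Partially copied lines are
 * carried over in chars_left/buf_index for the next read() call.
 */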
2005static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
2006                                 size_t len, loff_t *ppos)
2007{
2008        struct debugfs_htab_state *p = file->private_data;
2009        ssize_t ret, r;
2010        unsigned long i, n;
2011        unsigned long v, hr, gr;
2012        struct kvm *kvm;
2013        __be64 *hptp;
2014
2015        ret = mutex_lock_interruptible(&p->mutex);
2016        if (ret)
2017                return ret;
2018
2019        if (p->chars_left) {
2020                n = p->chars_left;
2021                if (n > len)
2022                        n = len;
2023                r = copy_to_user(buf, p->buf + p->buf_index, n);
2024                n -= r;
2025                p->chars_left -= n;
2026                p->buf_index += n;
2027                buf += n;
2028                len -= n;
2029                ret = n;
2030                if (r) {
2031                        if (!n)
2032                                ret = -EFAULT;
2033                        goto out;
2034                }
2035        }
2036
2037        kvm = p->kvm;
2038        i = p->hpt_index;
2039        hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
2040        for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt);
2041             ++i, hptp += 2) {
2042                if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)))
2043                        continue;
2044
2045                /* lock the HPTE so it's stable and read it */
2046                preempt_disable();
2047                while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
2048                        cpu_relax();
2049                v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK;
2050                hr = be64_to_cpu(hptp[1]);
2051                gr = kvm->arch.hpt.rev[i].guest_rpte;
2052                unlock_hpte(hptp, v);
2053                preempt_enable();
2054
2055                if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
2056                        continue;
2057
2058                n = scnprintf(p->buf, sizeof(p->buf),
2059                              "%6lx %.16lx %.16lx %.16lx\n",
2060                              i, v, hr, gr);
2061                p->chars_left = n;
2062                if (n > len)
2063                        n = len;
2064                r = copy_to_user(buf, p->buf, n);
2065                n -= r;
2066                p->chars_left -= n;
2067                p->buf_index = n;
2068                buf += n;
2069                len -= n;
2070                ret += n;
2071                if (r) {
2072                        if (!ret)
2073                                ret = -EFAULT;
2074                        goto out;
2075                }
2076        }
2077        p->hpt_index = i;
2078
2079 out:
2080        mutex_unlock(&p->mutex);
2081        return ret;
2082}
2083
2084static ssize_t debugfs_htab_write(struct file *file, const char __user *buf,
2085                           size_t len, loff_t *ppos)
2086{
2087        return -EACCES;
2088}
2089
2090static const struct file_operations debugfs_htab_fops = {
2091        .owner   = THIS_MODULE,
2092        .open    = debugfs_htab_open,
2093        .release = debugfs_htab_release,
2094        .read    = debugfs_htab_read,
2095        .write   = debugfs_htab_write,
2096        .llseek  = generic_file_llseek,
2097};
2098
2099void kvmppc_mmu_debugfs_init(struct kvm *kvm)
2100{
2101        kvm->arch.htab_dentry = debugfs_create_file("htab", 0400,
2102                                                    kvm->arch.debugfs_dir, kvm,
2103                                                    &debugfs_htab_fops);
2104}
2105
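/*
 * Set up the per-vcpu MMU callbacks, selecting the radix or HPT
 * translation routine to match the guest's MMU mode.
 */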
2106void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
2107{
2108        struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
2109
2110        vcpu->arch.slb_nr = 32;         /* POWER7/POWER8 */
2111
2112        if (kvm_is_radix(vcpu->kvm))
2113                mmu->xlate = kvmppc_mmu_radix_xlate;
2114        else
2115                mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
2116        mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
2117
2118        vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
2119}
2120