linux/arch/powerpc/kvm/book3s_hv.c
   1/*
   2 * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   3 * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
   4 *
   5 * Authors:
   6 *    Paul Mackerras <paulus@au1.ibm.com>
   7 *    Alexander Graf <agraf@suse.de>
   8 *    Kevin Wolf <mail@kevin-wolf.de>
   9 *
  10 * Description: KVM functions specific to running on Book 3S
  11 * processors in hypervisor mode (specifically POWER7 and later).
  12 *
  13 * This file is derived from arch/powerpc/kvm/book3s.c,
  14 * by Alexander Graf <agraf@suse.de>.
  15 *
  16 * This program is free software; you can redistribute it and/or modify
  17 * it under the terms of the GNU General Public License, version 2, as
  18 * published by the Free Software Foundation.
  19 */
  20
  21#include <linux/kvm_host.h>
  22#include <linux/err.h>
  23#include <linux/slab.h>
  24#include <linux/preempt.h>
  25#include <linux/sched.h>
  26#include <linux/delay.h>
  27#include <linux/export.h>
  28#include <linux/fs.h>
  29#include <linux/anon_inodes.h>
  30#include <linux/cpumask.h>
  31#include <linux/spinlock.h>
  32#include <linux/page-flags.h>
  33#include <linux/srcu.h>
  34#include <linux/miscdevice.h>
  35#include <linux/debugfs.h>
  36
  37#include <asm/reg.h>
  38#include <asm/cputable.h>
  39#include <asm/cacheflush.h>
  40#include <asm/tlbflush.h>
  41#include <asm/uaccess.h>
  42#include <asm/io.h>
  43#include <asm/kvm_ppc.h>
  44#include <asm/kvm_book3s.h>
  45#include <asm/mmu_context.h>
  46#include <asm/lppaca.h>
  47#include <asm/processor.h>
  48#include <asm/cputhreads.h>
  49#include <asm/page.h>
  50#include <asm/hvcall.h>
  51#include <asm/switch_to.h>
  52#include <asm/smp.h>
  53#include <asm/dbell.h>
  54#include <linux/gfp.h>
  55#include <linux/vmalloc.h>
  56#include <linux/highmem.h>
  57#include <linux/hugetlb.h>
  58#include <linux/module.h>
  59
  60#include "book3s.h"
  61
  62#define CREATE_TRACE_POINTS
  63#include "trace_hv.h"
  64
  65/* #define EXIT_DEBUG */
  66/* #define EXIT_DEBUG_SIMPLE */
  67/* #define EXIT_DEBUG_INT */
  68
  69/* Used to indicate that a guest page fault needs to be handled */
  70#define RESUME_PAGE_FAULT       (RESUME_GUEST | RESUME_FLAG_ARCH1)
  71
  72/* Used as a "null" value for timebase values */
  73#define TB_NIL  (~(u64)0)
  74
  75static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);
  76
  77static int dynamic_mt_modes = 6;
  78module_param(dynamic_mt_modes, int, S_IRUGO | S_IWUSR);
  79MODULE_PARM_DESC(dynamic_mt_modes, "Set of allowed dynamic micro-threading modes: 0 (= none), 2, 4, or 6 (= 2 or 4)");
  80static int target_smt_mode;
  81module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
  82MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
  83
  84#ifdef CONFIG_KVM_XICS
  85static struct kernel_param_ops module_param_ops = {
  86        .set = param_set_int,
  87        .get = param_get_int,
  88};
  89
  90module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
  91                                                        S_IRUGO | S_IWUSR);
  92MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
  93#endif
  94
  95static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
  96static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
  97
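/*
 * Try to interrupt the given hardware thread: use a msgsnd doorbell if
 * the target is in the same core on POWER8, or an XICS IPI if the
 * target thread's ICP address is known.  Returns true if an IPI was
 * sent, false if the caller should fall back to smp_send_reschedule().
 */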
  98static bool kvmppc_ipi_thread(int cpu)
  99{
 100        /* On POWER8 for IPIs to threads in the same core, use msgsnd */
 101        if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
 102                preempt_disable();
 103                if (cpu_first_thread_sibling(cpu) ==
 104                    cpu_first_thread_sibling(smp_processor_id())) {
 105                        unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
 106                        msg |= cpu_thread_in_core(cpu);
 107                        smp_mb();
 108                        __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
 109                        preempt_enable();
 110                        return true;
 111                }
 112                preempt_enable();
 113        }
 114
 115#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
 116        if (cpu >= 0 && cpu < nr_cpu_ids && paca[cpu].kvm_hstate.xics_phys) {
 117                xics_wake_cpu(cpu);
 118                return true;
 119        }
 120#endif
 121
 122        return false;
 123}
 124
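/*
 * Kick a vcpu: wake any task sleeping on its wait queue, then try to
 * interrupt the hardware thread the vcpu is running on, falling back
 * to an ordinary reschedule IPI.
 */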
 125static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 126{
 127        int cpu;
 128        struct swait_queue_head *wqp;
 129
 130        wqp = kvm_arch_vcpu_wq(vcpu);
 131        if (swait_active(wqp)) {
 132                swake_up(wqp);
 133                ++vcpu->stat.halt_wakeup;
 134        }
 135
 136        if (kvmppc_ipi_thread(vcpu->arch.thread_cpu))
 137                return;
 138
 139        /* CPU points to the first thread of the core */
 140        cpu = vcpu->cpu;
 141        if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu))
 142                smp_send_reschedule(cpu);
 143}
 144
 145/*
 146 * We use the vcpu_load/put functions to measure stolen time.
 147 * Stolen time is counted as time when either the vcpu is able to
 148 * run as part of a virtual core, but the task running the vcore
 149 * is preempted or sleeping, or when the vcpu needs something done
 150 * in the kernel by the task running the vcpu, but that task is
 151 * preempted or sleeping.  Those two things have to be counted
 152 * separately, since one of the vcpu tasks will take on the job
 153 * of running the core, and the other vcpu tasks in the vcore will
 154 * sleep waiting for it to do that, but that sleep shouldn't count
 155 * as stolen time.
 156 *
 157 * Hence we accumulate stolen time when the vcpu can run as part of
 158 * a vcore using vc->stolen_tb, and the stolen time when the vcpu
 159 * needs its task to do other things in the kernel (for example,
 160 * service a page fault) in busy_stolen.  We don't accumulate
 161 * stolen time for a vcore when it is inactive, or for a vcpu
 162 * when it is in state RUNNING or NOTREADY.  NOTREADY is a bit of
 163 * a misnomer; it means that the vcpu task is not executing in
 164 * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in
 165 * the kernel.  We don't have any way of dividing up that time
 166 * between time that the vcpu is genuinely stopped, time that
 167 * the task is actively working on behalf of the vcpu, and time
 168 * that the task is preempted, so we don't count any of it as
 169 * stolen.
 170 *
 171 * Updates to busy_stolen are protected by arch.tbacct_lock;
 172 * updates to vc->stolen_tb are protected by the vcore->stoltb_lock
 173 * spinlock.  The stolen times are measured in units of timebase ticks.
 174 * (Note that the != TB_NIL checks below are purely defensive;
 175 * they should never fail.)
 176 */
 177
 178static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc)
 179{
 180        unsigned long flags;
 181
 182        spin_lock_irqsave(&vc->stoltb_lock, flags);
 183        vc->preempt_tb = mftb();
 184        spin_unlock_irqrestore(&vc->stoltb_lock, flags);
 185}
 186
 187static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc)
 188{
 189        unsigned long flags;
 190
 191        spin_lock_irqsave(&vc->stoltb_lock, flags);
 192        if (vc->preempt_tb != TB_NIL) {
 193                vc->stolen_tb += mftb() - vc->preempt_tb;
 194                vc->preempt_tb = TB_NIL;
 195        }
 196        spin_unlock_irqrestore(&vc->stoltb_lock, flags);
 197}
 198
 199static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
 200{
 201        struct kvmppc_vcore *vc = vcpu->arch.vcore;
 202        unsigned long flags;
 203
 204        /*
 205         * We can test vc->runner without taking the vcore lock,
 206         * because only this task ever sets vc->runner to this
 207         * vcpu, and once it is set to this vcpu, only this task
 208         * ever sets it to NULL.
 209         */
 210        if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
 211                kvmppc_core_end_stolen(vc);
 212
 213        spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
 214        if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
 215            vcpu->arch.busy_preempt != TB_NIL) {
 216                vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt;
 217                vcpu->arch.busy_preempt = TB_NIL;
 218        }
 219        spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
 220}
 221
 222static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
 223{
 224        struct kvmppc_vcore *vc = vcpu->arch.vcore;
 225        unsigned long flags;
 226
 227        if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
 228                kvmppc_core_start_stolen(vc);
 229
 230        spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
 231        if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
 232                vcpu->arch.busy_preempt = mftb();
 233        spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
 234}
 235
 236static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr)
 237{
 238        /*
 239         * Check for illegal transactional state bit combination
 240         * and if we find it, force the TS field to a safe state.
 241         */
 242        if ((msr & MSR_TS_MASK) == MSR_TS_MASK)
 243                msr &= ~MSR_TS_MASK;
 244        vcpu->arch.shregs.msr = msr;
 245        kvmppc_end_cede(vcpu);
 246}
 247
 248static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
 249{
 250        vcpu->arch.pvr = pvr;
 251}
 252
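/*
 * Set the guest's architecture compatibility level (used by the
 * KVM_REG_PPC_ARCH_COMPAT one_reg interface): translate the requested
 * PVR_ARCH_* value into the corresponding PCR bits and record both in
 * the vcore.
 */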
 253static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
 254{
 255        unsigned long pcr = 0;
 256        struct kvmppc_vcore *vc = vcpu->arch.vcore;
 257
 258        if (arch_compat) {
 259                switch (arch_compat) {
 260                case PVR_ARCH_205:
 261                        /*
 262                         * If an arch bit is set in PCR, all the defined
 263                         * higher-order arch bits also have to be set.
 264                         */
 265                        pcr = PCR_ARCH_206 | PCR_ARCH_205;
 266                        break;
 267                case PVR_ARCH_206:
 268                case PVR_ARCH_206p:
 269                        pcr = PCR_ARCH_206;
 270                        break;
 271                case PVR_ARCH_207:
 272                        break;
 273                default:
 274                        return -EINVAL;
 275                }
 276
 277                if (!cpu_has_feature(CPU_FTR_ARCH_207S)) {
 278                        /* POWER7 can't emulate POWER8 */
 279                        if (!(pcr & PCR_ARCH_206))
 280                                return -EINVAL;
 281                        pcr &= ~PCR_ARCH_206;
 282                }
 283        }
 284
 285        spin_lock(&vc->lock);
 286        vc->arch_compat = arch_compat;
 287        vc->pcr = pcr;
 288        spin_unlock(&vc->lock);
 289
 290        return 0;
 291}
 292
 293static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
 294{
 295        int r;
 296
 297        pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id);
 298        pr_err("pc  = %.16lx  msr = %.16llx  trap = %x\n",
 299               vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap);
 300        for (r = 0; r < 16; ++r)
 301                pr_err("r%2d = %.16lx  r%d = %.16lx\n",
 302                       r, kvmppc_get_gpr(vcpu, r),
 303                       r+16, kvmppc_get_gpr(vcpu, r+16));
 304        pr_err("ctr = %.16lx  lr  = %.16lx\n",
 305               vcpu->arch.ctr, vcpu->arch.lr);
 306        pr_err("srr0 = %.16llx srr1 = %.16llx\n",
 307               vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1);
 308        pr_err("sprg0 = %.16llx sprg1 = %.16llx\n",
 309               vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
 310        pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
 311               vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
 312        pr_err("cr = %.8x  xer = %.16lx  dsisr = %.8x\n",
 313               vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr);
 314        pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
 315        pr_err("fault dar = %.16lx dsisr = %.8x\n",
 316               vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
 317        pr_err("SLB (%d entries):\n", vcpu->arch.slb_max);
 318        for (r = 0; r < vcpu->arch.slb_max; ++r)
 319                pr_err("  ESID = %.16llx VSID = %.16llx\n",
 320                       vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv);
 321        pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n",
 322               vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1,
 323               vcpu->arch.last_inst);
 324}
 325
 326static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
 327{
 328        struct kvm_vcpu *ret;
 329
 330        mutex_lock(&kvm->lock);
 331        ret = kvm_get_vcpu_by_id(kvm, id);
 332        mutex_unlock(&kvm->lock);
 333        return ret;
 334}
 335
 336static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
 337{
 338        vpa->__old_status |= LPPACA_OLD_SHARED_PROC;
 339        vpa->yield_count = cpu_to_be32(1);
 340}
 341
 342static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v,
 343                   unsigned long addr, unsigned long len)
 344{
 345        /* check address is cacheline aligned */
 346        if (addr & (L1_CACHE_BYTES - 1))
 347                return -EINVAL;
 348        spin_lock(&vcpu->arch.vpa_update_lock);
 349        if (v->next_gpa != addr || v->len != len) {
 350                v->next_gpa = addr;
 351                v->len = addr ? len : 0;
 352                v->update_pending = 1;
 353        }
 354        spin_unlock(&vcpu->arch.vpa_update_lock);
 355        return 0;
 356}
 357
 358/* Length for a per-processor buffer is passed in at offset 4 in the buffer */
 359struct reg_vpa {
 360        u32 dummy;
 361        union {
 362                __be16 hword;
 363                __be32 word;
 364        } length;
 365};
 366
 367static int vpa_is_registered(struct kvmppc_vpa *vpap)
 368{
 369        if (vpap->update_pending)
 370                return vpap->next_gpa != 0;
 371        return vpap->pinned_addr != NULL;
 372}
 373
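/*
 * Handle the H_REGISTER_VPA hypercall: register or deregister a vcpu's
 * VPA, SLB shadow buffer or dispatch trace log.  Only the new address
 * and length are recorded here; the guest pages are pinned later by
 * kvmppc_update_vpa() when the vcpu is next run.
 */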
 374static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu,
 375                                       unsigned long flags,
 376                                       unsigned long vcpuid, unsigned long vpa)
 377{
 378        struct kvm *kvm = vcpu->kvm;
 379        unsigned long len, nb;
 380        void *va;
 381        struct kvm_vcpu *tvcpu;
 382        int err;
 383        int subfunc;
 384        struct kvmppc_vpa *vpap;
 385
 386        tvcpu = kvmppc_find_vcpu(kvm, vcpuid);
 387        if (!tvcpu)
 388                return H_PARAMETER;
 389
 390        subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK;
 391        if (subfunc == H_VPA_REG_VPA || subfunc == H_VPA_REG_DTL ||
 392            subfunc == H_VPA_REG_SLB) {
 393                /* Registering new area - address must be cache-line aligned */
 394                if ((vpa & (L1_CACHE_BYTES - 1)) || !vpa)
 395                        return H_PARAMETER;
 396
 397                /* convert logical addr to kernel addr and read length */
 398                va = kvmppc_pin_guest_page(kvm, vpa, &nb);
 399                if (va == NULL)
 400                        return H_PARAMETER;
 401                if (subfunc == H_VPA_REG_VPA)
 402                        len = be16_to_cpu(((struct reg_vpa *)va)->length.hword);
 403                else
 404                        len = be32_to_cpu(((struct reg_vpa *)va)->length.word);
 405                kvmppc_unpin_guest_page(kvm, va, vpa, false);
 406
 407                /* Check length */
 408                if (len > nb || len < sizeof(struct reg_vpa))
 409                        return H_PARAMETER;
 410        } else {
 411                vpa = 0;
 412                len = 0;
 413        }
 414
 415        err = H_PARAMETER;
 416        vpap = NULL;
 417        spin_lock(&tvcpu->arch.vpa_update_lock);
 418
 419        switch (subfunc) {
 420        case H_VPA_REG_VPA:             /* register VPA */
 421                if (len < sizeof(struct lppaca))
 422                        break;
 423                vpap = &tvcpu->arch.vpa;
 424                err = 0;
 425                break;
 426
 427        case H_VPA_REG_DTL:             /* register DTL */
 428                if (len < sizeof(struct dtl_entry))
 429                        break;
 430                len -= len % sizeof(struct dtl_entry);
 431
 432                /* Check that they have previously registered a VPA */
 433                err = H_RESOURCE;
 434                if (!vpa_is_registered(&tvcpu->arch.vpa))
 435                        break;
 436
 437                vpap = &tvcpu->arch.dtl;
 438                err = 0;
 439                break;
 440
 441        case H_VPA_REG_SLB:             /* register SLB shadow buffer */
 442                /* Check that they have previously registered a VPA */
 443                err = H_RESOURCE;
 444                if (!vpa_is_registered(&tvcpu->arch.vpa))
 445                        break;
 446
 447                vpap = &tvcpu->arch.slb_shadow;
 448                err = 0;
 449                break;
 450
 451        case H_VPA_DEREG_VPA:           /* deregister VPA */
 452                /* Check they don't still have a DTL or SLB buf registered */
 453                err = H_RESOURCE;
 454                if (vpa_is_registered(&tvcpu->arch.dtl) ||
 455                    vpa_is_registered(&tvcpu->arch.slb_shadow))
 456                        break;
 457
 458                vpap = &tvcpu->arch.vpa;
 459                err = 0;
 460                break;
 461
 462        case H_VPA_DEREG_DTL:           /* deregister DTL */
 463                vpap = &tvcpu->arch.dtl;
 464                err = 0;
 465                break;
 466
 467        case H_VPA_DEREG_SLB:           /* deregister SLB shadow buffer */
 468                vpap = &tvcpu->arch.slb_shadow;
 469                err = 0;
 470                break;
 471        }
 472
 473        if (vpap) {
 474                vpap->next_gpa = vpa;
 475                vpap->len = len;
 476                vpap->update_pending = 1;
 477        }
 478
 479        spin_unlock(&tvcpu->arch.vpa_update_lock);
 480
 481        return err;
 482}
 483
 484static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap)
 485{
 486        struct kvm *kvm = vcpu->kvm;
 487        void *va;
 488        unsigned long nb;
 489        unsigned long gpa;
 490
 491        /*
 492         * We need to pin the page pointed to by vpap->next_gpa,
 493         * but we can't call kvmppc_pin_guest_page under the lock
 494         * as it does get_user_pages() and down_read().  So we
 495         * have to drop the lock, pin the page, then get the lock
 496         * again and check that a new area didn't get registered
 497         * in the meantime.
 498         */
 499        for (;;) {
 500                gpa = vpap->next_gpa;
 501                spin_unlock(&vcpu->arch.vpa_update_lock);
 502                va = NULL;
 503                nb = 0;
 504                if (gpa)
 505                        va = kvmppc_pin_guest_page(kvm, gpa, &nb);
 506                spin_lock(&vcpu->arch.vpa_update_lock);
 507                if (gpa == vpap->next_gpa)
 508                        break;
 509                /* sigh... unpin that one and try again */
 510                if (va)
 511                        kvmppc_unpin_guest_page(kvm, va, gpa, false);
 512        }
 513
 514        vpap->update_pending = 0;
 515        if (va && nb < vpap->len) {
 516                /*
 517                 * If it's now too short, it must be that userspace
 518                 * has changed the mappings underlying guest memory,
 519                 * so unregister the region.
 520                 */
 521                kvmppc_unpin_guest_page(kvm, va, gpa, false);
 522                va = NULL;
 523        }
 524        if (vpap->pinned_addr)
 525                kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa,
 526                                        vpap->dirty);
 527        vpap->gpa = gpa;
 528        vpap->pinned_addr = va;
 529        vpap->dirty = false;
 530        if (va)
 531                vpap->pinned_end = va + vpap->len;
 532}
 533
 534static void kvmppc_update_vpas(struct kvm_vcpu *vcpu)
 535{
 536        if (!(vcpu->arch.vpa.update_pending ||
 537              vcpu->arch.slb_shadow.update_pending ||
 538              vcpu->arch.dtl.update_pending))
 539                return;
 540
 541        spin_lock(&vcpu->arch.vpa_update_lock);
 542        if (vcpu->arch.vpa.update_pending) {
 543                kvmppc_update_vpa(vcpu, &vcpu->arch.vpa);
 544                if (vcpu->arch.vpa.pinned_addr)
 545                        init_vpa(vcpu, vcpu->arch.vpa.pinned_addr);
 546        }
 547        if (vcpu->arch.dtl.update_pending) {
 548                kvmppc_update_vpa(vcpu, &vcpu->arch.dtl);
 549                vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr;
 550                vcpu->arch.dtl_index = 0;
 551        }
 552        if (vcpu->arch.slb_shadow.update_pending)
 553                kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow);
 554        spin_unlock(&vcpu->arch.vpa_update_lock);
 555}
 556
 557/*
 558 * Return the accumulated stolen time for the vcore up until `now'.
 559 * The caller should hold the vcore lock.
 560 */
 561static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now)
 562{
 563        u64 p;
 564        unsigned long flags;
 565
 566        spin_lock_irqsave(&vc->stoltb_lock, flags);
 567        p = vc->stolen_tb;
 568        if (vc->vcore_state != VCORE_INACTIVE &&
 569            vc->preempt_tb != TB_NIL)
 570                p += now - vc->preempt_tb;
 571        spin_unlock_irqrestore(&vc->stoltb_lock, flags);
 572        return p;
 573}
 574
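/*
 * Write an entry into the vcpu's dispatch trace log (if one is
 * registered), charging the stolen time accumulated since the last
 * dispatch, and advance the DTL index in the VPA.
 */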
 575static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
 576                                    struct kvmppc_vcore *vc)
 577{
 578        struct dtl_entry *dt;
 579        struct lppaca *vpa;
 580        unsigned long stolen;
 581        unsigned long core_stolen;
 582        u64 now;
 583
 584        dt = vcpu->arch.dtl_ptr;
 585        vpa = vcpu->arch.vpa.pinned_addr;
 586        now = mftb();
 587        core_stolen = vcore_stolen_time(vc, now);
 588        stolen = core_stolen - vcpu->arch.stolen_logged;
 589        vcpu->arch.stolen_logged = core_stolen;
 590        spin_lock_irq(&vcpu->arch.tbacct_lock);
 591        stolen += vcpu->arch.busy_stolen;
 592        vcpu->arch.busy_stolen = 0;
 593        spin_unlock_irq(&vcpu->arch.tbacct_lock);
 594        if (!dt || !vpa)
 595                return;
 596        memset(dt, 0, sizeof(struct dtl_entry));
 597        dt->dispatch_reason = 7;
 598        dt->processor_id = cpu_to_be16(vc->pcpu + vcpu->arch.ptid);
 599        dt->timebase = cpu_to_be64(now + vc->tb_offset);
 600        dt->enqueue_to_dispatch_time = cpu_to_be32(stolen);
 601        dt->srr0 = cpu_to_be64(kvmppc_get_pc(vcpu));
 602        dt->srr1 = cpu_to_be64(vcpu->arch.shregs.msr);
 603        ++dt;
 604        if (dt == vcpu->arch.dtl.pinned_end)
 605                dt = vcpu->arch.dtl.pinned_addr;
 606        vcpu->arch.dtl_ptr = dt;
 607        /* order writing *dt vs. writing vpa->dtl_idx */
 608        smp_wmb();
 609        vpa->dtl_idx = cpu_to_be64(++vcpu->arch.dtl_index);
 610        vcpu->arch.dtl.dirty = true;
 611}
 612
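/*
 * Return true if the guest is allowed to use POWER8 (ISA 2.07)
 * facilities, either explicitly or because no compatibility level has
 * been set on a POWER8 host.
 */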
 613static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu)
 614{
 615        if (vcpu->arch.vcore->arch_compat >= PVR_ARCH_207)
 616                return true;
 617        if ((!vcpu->arch.vcore->arch_compat) &&
 618            cpu_has_feature(CPU_FTR_ARCH_207S))
 619                return true;
 620        return false;
 621}
 622
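/*
 * Handle the H_SET_MODE hypercall for the resources emulated here:
 * the CIABR and the DAWR/DAWRX debug registers.  Other resources
 * return H_TOO_HARD.
 */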
 623static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags,
 624                             unsigned long resource, unsigned long value1,
 625                             unsigned long value2)
 626{
 627        switch (resource) {
 628        case H_SET_MODE_RESOURCE_SET_CIABR:
 629                if (!kvmppc_power8_compatible(vcpu))
 630                        return H_P2;
 631                if (value2)
 632                        return H_P4;
 633                if (mflags)
 634                        return H_UNSUPPORTED_FLAG_START;
 635                /* Guests can't breakpoint the hypervisor */
 636                if ((value1 & CIABR_PRIV) == CIABR_PRIV_HYPER)
 637                        return H_P3;
 638                vcpu->arch.ciabr  = value1;
 639                return H_SUCCESS;
 640        case H_SET_MODE_RESOURCE_SET_DAWR:
 641                if (!kvmppc_power8_compatible(vcpu))
 642                        return H_P2;
 643                if (mflags)
 644                        return H_UNSUPPORTED_FLAG_START;
 645                if (value2 & DABRX_HYP)
 646                        return H_P4;
 647                vcpu->arch.dawr  = value1;
 648                vcpu->arch.dawrx = value2;
 649                return H_SUCCESS;
 650        default:
 651                return H_TOO_HARD;
 652        }
 653}
 654
 655static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
 656{
 657        struct kvmppc_vcore *vcore = target->arch.vcore;
 658
 659        /*
 660         * We expect to have been called by the real mode handler
 661         * (kvmppc_rm_h_confer()) which would have directly returned
 662         * H_SUCCESS if the source vcore wasn't idle (e.g. if it may
 663         * have useful work to do and should not confer) so we don't
 664         * recheck that here.
 665         */
 666
 667        spin_lock(&vcore->lock);
 668        if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
 669            vcore->vcore_state != VCORE_INACTIVE &&
 670            vcore->runner)
 671                target = vcore->runner;
 672        spin_unlock(&vcore->lock);
 673
 674        return kvm_vcpu_yield_to(target);
 675}
 676
 677static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu)
 678{
 679        int yield_count = 0;
 680        struct lppaca *lppaca;
 681
 682        spin_lock(&vcpu->arch.vpa_update_lock);
 683        lppaca = (struct lppaca *)vcpu->arch.vpa.pinned_addr;
 684        if (lppaca)
 685                yield_count = be32_to_cpu(lppaca->yield_count);
 686        spin_unlock(&vcpu->arch.vpa_update_lock);
 687        return yield_count;
 688}
 689
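/*
 * Handle a hypercall in the host kernel (virtual mode).  Returns
 * RESUME_GUEST if the hypercall has been completed (with the result
 * in GPR3), RESUME_HOST to hand it to userspace, or a negative error
 * from a failed H_RTAS call.
 */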
 690int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 691{
 692        unsigned long req = kvmppc_get_gpr(vcpu, 3);
 693        unsigned long target, ret = H_SUCCESS;
 694        int yield_count;
 695        struct kvm_vcpu *tvcpu;
 696        int idx, rc;
 697
 698        if (req <= MAX_HCALL_OPCODE &&
 699            !test_bit(req/4, vcpu->kvm->arch.enabled_hcalls))
 700                return RESUME_HOST;
 701
 702        switch (req) {
 703        case H_CEDE:
 704                break;
 705        case H_PROD:
 706                target = kvmppc_get_gpr(vcpu, 4);
 707                tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
 708                if (!tvcpu) {
 709                        ret = H_PARAMETER;
 710                        break;
 711                }
 712                tvcpu->arch.prodded = 1;
 713                smp_mb();
 714                if (vcpu->arch.ceded) {
 715                        if (swait_active(&vcpu->wq)) {
 716                                swake_up(&vcpu->wq);
 717                                vcpu->stat.halt_wakeup++;
 718                        }
 719                }
 720                break;
 721        case H_CONFER:
 722                target = kvmppc_get_gpr(vcpu, 4);
 723                if (target == -1)
 724                        break;
 725                tvcpu = kvmppc_find_vcpu(vcpu->kvm, target);
 726                if (!tvcpu) {
 727                        ret = H_PARAMETER;
 728                        break;
 729                }
 730                yield_count = kvmppc_get_gpr(vcpu, 5);
 731                if (kvmppc_get_yield_count(tvcpu) != yield_count)
 732                        break;
 733                kvm_arch_vcpu_yield_to(tvcpu);
 734                break;
 735        case H_REGISTER_VPA:
 736                ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4),
 737                                        kvmppc_get_gpr(vcpu, 5),
 738                                        kvmppc_get_gpr(vcpu, 6));
 739                break;
 740        case H_RTAS:
 741                if (list_empty(&vcpu->kvm->arch.rtas_tokens))
 742                        return RESUME_HOST;
 743
 744                idx = srcu_read_lock(&vcpu->kvm->srcu);
 745                rc = kvmppc_rtas_hcall(vcpu);
 746                srcu_read_unlock(&vcpu->kvm->srcu, idx);
 747
 748                if (rc == -ENOENT)
 749                        return RESUME_HOST;
 750                else if (rc == 0)
 751                        break;
 752
 753                /* Send the error out to userspace via KVM_RUN */
 754                return rc;
 755        case H_LOGICAL_CI_LOAD:
 756                ret = kvmppc_h_logical_ci_load(vcpu);
 757                if (ret == H_TOO_HARD)
 758                        return RESUME_HOST;
 759                break;
 760        case H_LOGICAL_CI_STORE:
 761                ret = kvmppc_h_logical_ci_store(vcpu);
 762                if (ret == H_TOO_HARD)
 763                        return RESUME_HOST;
 764                break;
 765        case H_SET_MODE:
 766                ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4),
 767                                        kvmppc_get_gpr(vcpu, 5),
 768                                        kvmppc_get_gpr(vcpu, 6),
 769                                        kvmppc_get_gpr(vcpu, 7));
 770                if (ret == H_TOO_HARD)
 771                        return RESUME_HOST;
 772                break;
 773        case H_XIRR:
 774        case H_CPPR:
 775        case H_EOI:
 776        case H_IPI:
 777        case H_IPOLL:
 778        case H_XIRR_X:
 779                if (kvmppc_xics_enabled(vcpu)) {
 780                        ret = kvmppc_xics_hcall(vcpu, req);
 781                        break;
 782                }
 783                return RESUME_HOST;
 784        case H_PUT_TCE:
 785                ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
 786                                                kvmppc_get_gpr(vcpu, 5),
 787                                                kvmppc_get_gpr(vcpu, 6));
 788                if (ret == H_TOO_HARD)
 789                        return RESUME_HOST;
 790                break;
 791        case H_PUT_TCE_INDIRECT:
 792                ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
 793                                                kvmppc_get_gpr(vcpu, 5),
 794                                                kvmppc_get_gpr(vcpu, 6),
 795                                                kvmppc_get_gpr(vcpu, 7));
 796                if (ret == H_TOO_HARD)
 797                        return RESUME_HOST;
 798                break;
 799        case H_STUFF_TCE:
 800                ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
 801                                                kvmppc_get_gpr(vcpu, 5),
 802                                                kvmppc_get_gpr(vcpu, 6),
 803                                                kvmppc_get_gpr(vcpu, 7));
 804                if (ret == H_TOO_HARD)
 805                        return RESUME_HOST;
 806                break;
 807        default:
 808                return RESUME_HOST;
 809        }
 810        kvmppc_set_gpr(vcpu, 3, ret);
 811        vcpu->arch.hcall_needed = 0;
 812        return RESUME_GUEST;
 813}
 814
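/*
 * Return 1 if the given hypercall is implemented by HV KVM, either
 * here in virtual mode or in the real-mode handlers.
 */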
 815static int kvmppc_hcall_impl_hv(unsigned long cmd)
 816{
 817        switch (cmd) {
 818        case H_CEDE:
 819        case H_PROD:
 820        case H_CONFER:
 821        case H_REGISTER_VPA:
 822        case H_SET_MODE:
 823        case H_LOGICAL_CI_LOAD:
 824        case H_LOGICAL_CI_STORE:
 825#ifdef CONFIG_KVM_XICS
 826        case H_XIRR:
 827        case H_CPPR:
 828        case H_EOI:
 829        case H_IPI:
 830        case H_IPOLL:
 831        case H_XIRR_X:
 832#endif
 833                return 1;
 834        }
 835
 836        /* See if it's in the real-mode table */
 837        return kvmppc_hcall_impl_hv_realmode(cmd);
 838}
 839
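/*
 * Called on an emulation-assist interrupt when the guest is being
 * debugged: if the offending instruction is the software-breakpoint
 * instruction, exit to userspace with KVM_EXIT_DEBUG; otherwise queue
 * an illegal-instruction program interrupt for the guest.
 */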
 840static int kvmppc_emulate_debug_inst(struct kvm_run *run,
 841                                        struct kvm_vcpu *vcpu)
 842{
 843        u32 last_inst;
 844
 845        if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
 846                                        EMULATE_DONE) {
 847                /*
 848                 * Fetch failed, so return to guest and
 849                 * try executing it again.
 850                 */
 851                return RESUME_GUEST;
 852        }
 853
 854        if (last_inst == KVMPPC_INST_SW_BREAKPOINT) {
 855                run->exit_reason = KVM_EXIT_DEBUG;
 856                run->debug.arch.address = kvmppc_get_pc(vcpu);
 857                return RESUME_HOST;
 858        } else {
 859                kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
 860                return RESUME_GUEST;
 861        }
 862}
 863
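/*
 * Handle an exit from the guest: decode vcpu->arch.trap and either
 * deal with it here (returning RESUME_GUEST or RESUME_PAGE_FAULT) or
 * fill in the kvm_run exit information and return RESUME_HOST so the
 * caller exits to userspace.
 */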
 864static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 865                                 struct task_struct *tsk)
 866{
 867        int r = RESUME_HOST;
 868
 869        vcpu->stat.sum_exits++;
 870
 871        /*
 872         * This can happen if an interrupt occurs in the last stages
 873         * of guest entry or the first stages of guest exit (i.e. after
 874         * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
 875         * and before setting it to KVM_GUEST_MODE_HOST_HV).
 876         * That can happen due to a bug, or due to a machine check
 877         * occurring at just the wrong time.
 878         */
 879        if (vcpu->arch.shregs.msr & MSR_HV) {
 880                printk(KERN_EMERG "KVM trap in HV mode!\n");
 881                printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
 882                        vcpu->arch.trap, kvmppc_get_pc(vcpu),
 883                        vcpu->arch.shregs.msr);
 884                kvmppc_dump_regs(vcpu);
 885                run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 886                run->hw.hardware_exit_reason = vcpu->arch.trap;
 887                return RESUME_HOST;
 888        }
 889        run->exit_reason = KVM_EXIT_UNKNOWN;
 890        run->ready_for_interrupt_injection = 1;
 891        switch (vcpu->arch.trap) {
 892        /* We're good on these - the host merely wanted to get our attention */
 893        case BOOK3S_INTERRUPT_HV_DECREMENTER:
 894                vcpu->stat.dec_exits++;
 895                r = RESUME_GUEST;
 896                break;
 897        case BOOK3S_INTERRUPT_EXTERNAL:
 898        case BOOK3S_INTERRUPT_H_DOORBELL:
 899                vcpu->stat.ext_intr_exits++;
 900                r = RESUME_GUEST;
 901                break;
 902        /* HMI is a hypervisor interrupt and the host has handled it.  Resume guest. */
 903        case BOOK3S_INTERRUPT_HMI:
 904        case BOOK3S_INTERRUPT_PERFMON:
 905                r = RESUME_GUEST;
 906                break;
 907        case BOOK3S_INTERRUPT_MACHINE_CHECK:
 908                /*
 909                 * Deliver a machine check interrupt to the guest.
 910                 * We have to do this, even if the host has handled the
 911                 * machine check, because machine checks use SRR0/1 and
 912                 * the interrupt might have trashed guest state in them.
 913                 */
 914                kvmppc_book3s_queue_irqprio(vcpu,
 915                                            BOOK3S_INTERRUPT_MACHINE_CHECK);
 916                r = RESUME_GUEST;
 917                break;
 918        case BOOK3S_INTERRUPT_PROGRAM:
 919        {
 920                ulong flags;
 921                /*
 922                 * Normally program interrupts are delivered directly
 923                 * to the guest by the hardware, but we can get here
 924                 * as a result of a hypervisor emulation interrupt
 925                 * (e40) getting turned into a 700 by BML RTAS.
 926                 */
 927                flags = vcpu->arch.shregs.msr & 0x1f0000ull;
 928                kvmppc_core_queue_program(vcpu, flags);
 929                r = RESUME_GUEST;
 930                break;
 931        }
 932        case BOOK3S_INTERRUPT_SYSCALL:
 933        {
 934                /* hcall - punt to userspace */
 935                int i;
 936
 937                /* hypercall with MSR_PR has already been handled in rmode,
 938                 * and never reaches here.
 939                 */
 940
 941                run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3);
 942                for (i = 0; i < 9; ++i)
 943                        run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i);
 944                run->exit_reason = KVM_EXIT_PAPR_HCALL;
 945                vcpu->arch.hcall_needed = 1;
 946                r = RESUME_HOST;
 947                break;
 948        }
 949        /*
 950         * We get these next two if the guest accesses a page which it thinks
 951         * it has mapped but which is not actually present, either because
 952         * it is for an emulated I/O device or because the corresponding
 953         * host page has been paged out.  Any other HDSI/HISI interrupts
 954         * have been handled already.
 955         */
 956        case BOOK3S_INTERRUPT_H_DATA_STORAGE:
 957                r = RESUME_PAGE_FAULT;
 958                break;
 959        case BOOK3S_INTERRUPT_H_INST_STORAGE:
 960                vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
 961                vcpu->arch.fault_dsisr = 0;
 962                r = RESUME_PAGE_FAULT;
 963                break;
 964        /*
 965         * This occurs if the guest executes an illegal instruction.
 966         * If guest debug is disabled, generate a program interrupt
 967         * to the guest.  If guest debug is enabled, check whether the
 968         * instruction is a software breakpoint and return to the guest
 969         * or the host accordingly.
 970         */
 971        case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
 972                if (vcpu->arch.emul_inst != KVM_INST_FETCH_FAILED)
 973                        vcpu->arch.last_inst = kvmppc_need_byteswap(vcpu) ?
 974                                swab32(vcpu->arch.emul_inst) :
 975                                vcpu->arch.emul_inst;
 976                if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
 977                        r = kvmppc_emulate_debug_inst(run, vcpu);
 978                } else {
 979                        kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
 980                        r = RESUME_GUEST;
 981                }
 982                break;
 983        /*
 984         * This occurs if the guest (kernel or userspace) does something that
 985         * is prohibited by HFSCR.  We just generate a program interrupt to
 986         * the guest.
 987         */
 988        case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
 989                kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
 990                r = RESUME_GUEST;
 991                break;
 992        default:
 993                kvmppc_dump_regs(vcpu);
 994                printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n",
 995                        vcpu->arch.trap, kvmppc_get_pc(vcpu),
 996                        vcpu->arch.shregs.msr);
 997                run->hw.hardware_exit_reason = vcpu->arch.trap;
 998                r = RESUME_HOST;
 999                break;
1000        }
1001
1002        return r;
1003}
1004
1005static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
1006                                            struct kvm_sregs *sregs)
1007{
1008        int i;
1009
1010        memset(sregs, 0, sizeof(struct kvm_sregs));
1011        sregs->pvr = vcpu->arch.pvr;
1012        for (i = 0; i < vcpu->arch.slb_max; i++) {
1013                sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige;
1014                sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv;
1015        }
1016
1017        return 0;
1018}
1019
1020static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu,
1021                                            struct kvm_sregs *sregs)
1022{
1023        int i, j;
1024
1025        /* Only accept the same PVR as the host's, since we can't spoof it */
1026        if (sregs->pvr != vcpu->arch.pvr)
1027                return -EINVAL;
1028
1029        j = 0;
1030        for (i = 0; i < vcpu->arch.slb_nr; i++) {
1031                if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) {
1032                        vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe;
1033                        vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv;
1034                        ++j;
1035                }
1036        }
1037        vcpu->arch.slb_max = j;
1038
1039        return 0;
1040}
1041
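/*
 * Update the vcore's LPCR.  Only the bits userspace is allowed to
 * change (DPFD, ILE, TC, and AIL on POWER8) are modified; if ILE
 * changes, the interrupt MSR of every vcpu in the vcore is updated
 * to match.
 */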
1042static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
1043                bool preserve_top32)
1044{
1045        struct kvm *kvm = vcpu->kvm;
1046        struct kvmppc_vcore *vc = vcpu->arch.vcore;
1047        u64 mask;
1048
1049        mutex_lock(&kvm->lock);
1050        spin_lock(&vc->lock);
1051        /*
1052         * If ILE (interrupt little-endian) has changed, update the
1053         * MSR_LE bit in the intr_msr for each vcpu in this vcore.
1054         */
1055        if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) {
1056                struct kvm_vcpu *vcpu;
1057                int i;
1058
1059                kvm_for_each_vcpu(i, vcpu, kvm) {
1060                        if (vcpu->arch.vcore != vc)
1061                                continue;
1062                        if (new_lpcr & LPCR_ILE)
1063                                vcpu->arch.intr_msr |= MSR_LE;
1064                        else
1065                                vcpu->arch.intr_msr &= ~MSR_LE;
1066                }
1067        }
1068
1069        /*
1070         * Userspace can only modify DPFD (default prefetch depth),
1071         * ILE (interrupt little-endian) and TC (translation control).
1072         * On POWER8 userspace can also modify AIL (alt. interrupt loc.)
1073         */
1074        mask = LPCR_DPFD | LPCR_ILE | LPCR_TC;
1075        if (cpu_has_feature(CPU_FTR_ARCH_207S))
1076                mask |= LPCR_AIL;
1077
1078        /* Broken 32-bit version of LPCR must not clear top bits */
1079        if (preserve_top32)
1080                mask &= 0xFFFFFFFF;
1081        vc->lpcr = (vc->lpcr & ~mask) | (new_lpcr & mask);
1082        spin_unlock(&vc->lock);
1083        mutex_unlock(&kvm->lock);
1084}
1085
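/*
 * Read an HV-specific register for the KVM_GET_ONE_REG ioctl.
 * Registers not handled here return -EINVAL.
 */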
1086static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
1087                                 union kvmppc_one_reg *val)
1088{
1089        int r = 0;
1090        long int i;
1091
1092        switch (id) {
1093        case KVM_REG_PPC_DEBUG_INST:
1094                *val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT);
1095                break;
1096        case KVM_REG_PPC_HIOR:
1097                *val = get_reg_val(id, 0);
1098                break;
1099        case KVM_REG_PPC_DABR:
1100                *val = get_reg_val(id, vcpu->arch.dabr);
1101                break;
1102        case KVM_REG_PPC_DABRX:
1103                *val = get_reg_val(id, vcpu->arch.dabrx);
1104                break;
1105        case KVM_REG_PPC_DSCR:
1106                *val = get_reg_val(id, vcpu->arch.dscr);
1107                break;
1108        case KVM_REG_PPC_PURR:
1109                *val = get_reg_val(id, vcpu->arch.purr);
1110                break;
1111        case KVM_REG_PPC_SPURR:
1112                *val = get_reg_val(id, vcpu->arch.spurr);
1113                break;
1114        case KVM_REG_PPC_AMR:
1115                *val = get_reg_val(id, vcpu->arch.amr);
1116                break;
1117        case KVM_REG_PPC_UAMOR:
1118                *val = get_reg_val(id, vcpu->arch.uamor);
1119                break;
1120        case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS:
1121                i = id - KVM_REG_PPC_MMCR0;
1122                *val = get_reg_val(id, vcpu->arch.mmcr[i]);
1123                break;
1124        case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
1125                i = id - KVM_REG_PPC_PMC1;
1126                *val = get_reg_val(id, vcpu->arch.pmc[i]);
1127                break;
1128        case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
1129                i = id - KVM_REG_PPC_SPMC1;
1130                *val = get_reg_val(id, vcpu->arch.spmc[i]);
1131                break;
1132        case KVM_REG_PPC_SIAR:
1133                *val = get_reg_val(id, vcpu->arch.siar);
1134                break;
1135        case KVM_REG_PPC_SDAR:
1136                *val = get_reg_val(id, vcpu->arch.sdar);
1137                break;
1138        case KVM_REG_PPC_SIER:
1139                *val = get_reg_val(id, vcpu->arch.sier);
1140                break;
1141        case KVM_REG_PPC_IAMR:
1142                *val = get_reg_val(id, vcpu->arch.iamr);
1143                break;
1144        case KVM_REG_PPC_PSPB:
1145                *val = get_reg_val(id, vcpu->arch.pspb);
1146                break;
1147        case KVM_REG_PPC_DPDES:
1148                *val = get_reg_val(id, vcpu->arch.vcore->dpdes);
1149                break;
1150        case KVM_REG_PPC_DAWR:
1151                *val = get_reg_val(id, vcpu->arch.dawr);
1152                break;
1153        case KVM_REG_PPC_DAWRX:
1154                *val = get_reg_val(id, vcpu->arch.dawrx);
1155                break;
1156        case KVM_REG_PPC_CIABR:
1157                *val = get_reg_val(id, vcpu->arch.ciabr);
1158                break;
1159        case KVM_REG_PPC_CSIGR:
1160                *val = get_reg_val(id, vcpu->arch.csigr);
1161                break;
1162        case KVM_REG_PPC_TACR:
1163                *val = get_reg_val(id, vcpu->arch.tacr);
1164                break;
1165        case KVM_REG_PPC_TCSCR:
1166                *val = get_reg_val(id, vcpu->arch.tcscr);
1167                break;
1168        case KVM_REG_PPC_PID:
1169                *val = get_reg_val(id, vcpu->arch.pid);
1170                break;
1171        case KVM_REG_PPC_ACOP:
1172                *val = get_reg_val(id, vcpu->arch.acop);
1173                break;
1174        case KVM_REG_PPC_WORT:
1175                *val = get_reg_val(id, vcpu->arch.wort);
1176                break;
1177        case KVM_REG_PPC_VPA_ADDR:
1178                spin_lock(&vcpu->arch.vpa_update_lock);
1179                *val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
1180                spin_unlock(&vcpu->arch.vpa_update_lock);
1181                break;
1182        case KVM_REG_PPC_VPA_SLB:
1183                spin_lock(&vcpu->arch.vpa_update_lock);
1184                val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa;
1185                val->vpaval.length = vcpu->arch.slb_shadow.len;
1186                spin_unlock(&vcpu->arch.vpa_update_lock);
1187                break;
1188        case KVM_REG_PPC_VPA_DTL:
1189                spin_lock(&vcpu->arch.vpa_update_lock);
1190                val->vpaval.addr = vcpu->arch.dtl.next_gpa;
1191                val->vpaval.length = vcpu->arch.dtl.len;
1192                spin_unlock(&vcpu->arch.vpa_update_lock);
1193                break;
1194        case KVM_REG_PPC_TB_OFFSET:
1195                *val = get_reg_val(id, vcpu->arch.vcore->tb_offset);
1196                break;
1197        case KVM_REG_PPC_LPCR:
1198        case KVM_REG_PPC_LPCR_64:
1199                *val = get_reg_val(id, vcpu->arch.vcore->lpcr);
1200                break;
1201        case KVM_REG_PPC_PPR:
1202                *val = get_reg_val(id, vcpu->arch.ppr);
1203                break;
1204#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
1205        case KVM_REG_PPC_TFHAR:
1206                *val = get_reg_val(id, vcpu->arch.tfhar);
1207                break;
1208        case KVM_REG_PPC_TFIAR:
1209                *val = get_reg_val(id, vcpu->arch.tfiar);
1210                break;
1211        case KVM_REG_PPC_TEXASR:
1212                *val = get_reg_val(id, vcpu->arch.texasr);
1213                break;
1214        case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
1215                i = id - KVM_REG_PPC_TM_GPR0;
1216                *val = get_reg_val(id, vcpu->arch.gpr_tm[i]);
1217                break;
1218        case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
1219        {
1220                int j;
1221                i = id - KVM_REG_PPC_TM_VSR0;
1222                if (i < 32)
1223                        for (j = 0; j < TS_FPRWIDTH; j++)
1224                                val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j];
1225                else {
1226                        if (cpu_has_feature(CPU_FTR_ALTIVEC))
1227                                val->vval = vcpu->arch.vr_tm.vr[i-32];
1228                        else
1229                                r = -ENXIO;
1230                }
1231                break;
1232        }
1233        case KVM_REG_PPC_TM_CR:
1234                *val = get_reg_val(id, vcpu->arch.cr_tm);
1235                break;
1236        case KVM_REG_PPC_TM_LR:
1237                *val = get_reg_val(id, vcpu->arch.lr_tm);
1238                break;
1239        case KVM_REG_PPC_TM_CTR:
1240                *val = get_reg_val(id, vcpu->arch.ctr_tm);
1241                break;
1242        case KVM_REG_PPC_TM_FPSCR:
1243                *val = get_reg_val(id, vcpu->arch.fp_tm.fpscr);
1244                break;
1245        case KVM_REG_PPC_TM_AMR:
1246                *val = get_reg_val(id, vcpu->arch.amr_tm);
1247                break;
1248        case KVM_REG_PPC_TM_PPR:
1249                *val = get_reg_val(id, vcpu->arch.ppr_tm);
1250                break;
1251        case KVM_REG_PPC_TM_VRSAVE:
1252                *val = get_reg_val(id, vcpu->arch.vrsave_tm);
1253                break;
1254        case KVM_REG_PPC_TM_VSCR:
1255                if (cpu_has_feature(CPU_FTR_ALTIVEC))
1256                        *val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]);
1257                else
1258                        r = -ENXIO;
1259                break;
1260        case KVM_REG_PPC_TM_DSCR:
1261                *val = get_reg_val(id, vcpu->arch.dscr_tm);
1262                break;
1263        case KVM_REG_PPC_TM_TAR:
1264                *val = get_reg_val(id, vcpu->arch.tar_tm);
1265                break;
1266#endif
1267        case KVM_REG_PPC_ARCH_COMPAT:
1268                *val = get_reg_val(id, vcpu->arch.vcore->arch_compat);
1269                break;
1270        default:
1271                r = -EINVAL;
1272                break;
1273        }
1274
1275        return r;
1276}
1277
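/*
 * Write an HV-specific register for the KVM_SET_ONE_REG ioctl.
 * Registers not handled here return -EINVAL.
 */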
1278static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
1279                                 union kvmppc_one_reg *val)
1280{
1281        int r = 0;
1282        long int i;
1283        unsigned long addr, len;
1284
1285        switch (id) {
1286        case KVM_REG_PPC_HIOR:
1287                /* Only allow this to be set to zero */
1288                if (set_reg_val(id, *val))
1289                        r = -EINVAL;
1290                break;
1291        case KVM_REG_PPC_DABR:
1292                vcpu->arch.dabr = set_reg_val(id, *val);
1293                break;
1294        case KVM_REG_PPC_DABRX:
1295                vcpu->arch.dabrx = set_reg_val(id, *val) & ~DABRX_HYP;
1296                break;
1297        case KVM_REG_PPC_DSCR:
1298                vcpu->arch.dscr = set_reg_val(id, *val);
1299                break;
1300        case KVM_REG_PPC_PURR:
1301                vcpu->arch.purr = set_reg_val(id, *val);
1302                break;
1303        case KVM_REG_PPC_SPURR:
1304                vcpu->arch.spurr = set_reg_val(id, *val);
1305                break;
1306        case KVM_REG_PPC_AMR:
1307                vcpu->arch.amr = set_reg_val(id, *val);
1308                break;
1309        case KVM_REG_PPC_UAMOR:
1310                vcpu->arch.uamor = set_reg_val(id, *val);
1311                break;
1312        case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS:
1313                i = id - KVM_REG_PPC_MMCR0;
1314                vcpu->arch.mmcr[i] = set_reg_val(id, *val);
1315                break;
1316        case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8:
1317                i = id - KVM_REG_PPC_PMC1;
1318                vcpu->arch.pmc[i] = set_reg_val(id, *val);
1319                break;
1320        case KVM_REG_PPC_SPMC1 ... KVM_REG_PPC_SPMC2:
1321                i = id - KVM_REG_PPC_SPMC1;
1322                vcpu->arch.spmc[i] = set_reg_val(id, *val);
1323                break;
1324        case KVM_REG_PPC_SIAR:
1325                vcpu->arch.siar = set_reg_val(id, *val);
1326                break;
1327        case KVM_REG_PPC_SDAR:
1328                vcpu->arch.sdar = set_reg_val(id, *val);
1329                break;
1330        case KVM_REG_PPC_SIER:
1331                vcpu->arch.sier = set_reg_val(id, *val);
1332                break;
1333        case KVM_REG_PPC_IAMR:
1334                vcpu->arch.iamr = set_reg_val(id, *val);
1335                break;
1336        case KVM_REG_PPC_PSPB:
1337                vcpu->arch.pspb = set_reg_val(id, *val);
1338                break;
1339        case KVM_REG_PPC_DPDES:
1340                vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
1341                break;
1342        case KVM_REG_PPC_DAWR:
1343                vcpu->arch.dawr = set_reg_val(id, *val);
1344                break;
1345        case KVM_REG_PPC_DAWRX:
1346                vcpu->arch.dawrx = set_reg_val(id, *val) & ~DAWRX_HYP;
1347                break;
1348        case KVM_REG_PPC_CIABR:
1349                vcpu->arch.ciabr = set_reg_val(id, *val);
1350                /* Don't allow setting breakpoints in hypervisor code */
1351                if ((vcpu->arch.ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
1352                        vcpu->arch.ciabr &= ~CIABR_PRIV;        /* disable */
1353                break;
1354        case KVM_REG_PPC_CSIGR:
1355                vcpu->arch.csigr = set_reg_val(id, *val);
1356                break;
1357        case KVM_REG_PPC_TACR:
1358                vcpu->arch.tacr = set_reg_val(id, *val);
1359                break;
1360        case KVM_REG_PPC_TCSCR:
1361                vcpu->arch.tcscr = set_reg_val(id, *val);
1362                break;
1363        case KVM_REG_PPC_PID:
1364                vcpu->arch.pid = set_reg_val(id, *val);
1365                break;
1366        case KVM_REG_PPC_ACOP:
1367                vcpu->arch.acop = set_reg_val(id, *val);
1368                break;
1369        case KVM_REG_PPC_WORT:
1370                vcpu->arch.wort = set_reg_val(id, *val);
1371                break;
1372        case KVM_REG_PPC_VPA_ADDR:
1373                addr = set_reg_val(id, *val);
1374                r = -EINVAL;
1375                if (!addr && (vcpu->arch.slb_shadow.next_gpa ||
1376                              vcpu->arch.dtl.next_gpa))
1377                        break;
1378                r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca));
1379                break;
1380        case KVM_REG_PPC_VPA_SLB:
1381                addr = val->vpaval.addr;
1382                len = val->vpaval.length;
1383                r = -EINVAL;
1384                if (addr && !vcpu->arch.vpa.next_gpa)
1385                        break;
1386                r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len);
1387                break;
1388        case KVM_REG_PPC_VPA_DTL:
1389                addr = val->vpaval.addr;
1390                len = val->vpaval.length;
1391                r = -EINVAL;
1392                if (addr && (len < sizeof(struct dtl_entry) ||
1393                             !vcpu->arch.vpa.next_gpa))
1394                        break;
1395                len -= len % sizeof(struct dtl_entry);
1396                r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
1397                break;
1398        case KVM_REG_PPC_TB_OFFSET:
1399                /* round up to multiple of 2^24 */
1400                vcpu->arch.vcore->tb_offset =
1401                        ALIGN(set_reg_val(id, *val), 1UL << 24);
1402                break;
1403        case KVM_REG_PPC_LPCR:
1404                kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), true);
1405                break;
1406        case KVM_REG_PPC_LPCR_64:
1407                kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), false);
1408                break;
1409        case KVM_REG_PPC_PPR:
1410                vcpu->arch.ppr = set_reg_val(id, *val);
1411                break;
1412#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
1413        case KVM_REG_PPC_TFHAR:
1414                vcpu->arch.tfhar = set_reg_val(id, *val);
1415                break;
1416        case KVM_REG_PPC_TFIAR:
1417                vcpu->arch.tfiar = set_reg_val(id, *val);
1418                break;
1419        case KVM_REG_PPC_TEXASR:
1420                vcpu->arch.texasr = set_reg_val(id, *val);
1421                break;
1422        case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31:
1423                i = id - KVM_REG_PPC_TM_GPR0;
1424                vcpu->arch.gpr_tm[i] = set_reg_val(id, *val);
1425                break;
1426        case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63:
1427        {
1428                int j;
1429                i = id - KVM_REG_PPC_TM_VSR0;
1430                if (i < 32)
1431                        for (j = 0; j < TS_FPRWIDTH; j++)
1432                                vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j];
1433                else
1434                        if (cpu_has_feature(CPU_FTR_ALTIVEC))
1435                                vcpu->arch.vr_tm.vr[i-32] = val->vval;
1436                        else
1437                                r = -ENXIO;
1438                break;
1439        }
1440        case KVM_REG_PPC_TM_CR:
1441                vcpu->arch.cr_tm = set_reg_val(id, *val);
1442                break;
1443        case KVM_REG_PPC_TM_LR:
1444                vcpu->arch.lr_tm = set_reg_val(id, *val);
1445                break;
1446        case KVM_REG_PPC_TM_CTR:
1447                vcpu->arch.ctr_tm = set_reg_val(id, *val);
1448                break;
1449        case KVM_REG_PPC_TM_FPSCR:
1450                vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val);
1451                break;
1452        case KVM_REG_PPC_TM_AMR:
1453                vcpu->arch.amr_tm = set_reg_val(id, *val);
1454                break;
1455        case KVM_REG_PPC_TM_PPR:
1456                vcpu->arch.ppr_tm = set_reg_val(id, *val);
1457                break;
1458        case KVM_REG_PPC_TM_VRSAVE:
1459                vcpu->arch.vrsave_tm = set_reg_val(id, *val);
1460                break;
1461        case KVM_REG_PPC_TM_VSCR:
1462                if (cpu_has_feature(CPU_FTR_ALTIVEC))
1463                        vcpu->arch.vr_tm.vscr.u[3] = set_reg_val(id, *val);
1464                else
1465                        r = -ENXIO;
1466                break;
1467        case KVM_REG_PPC_TM_DSCR:
1468                vcpu->arch.dscr_tm = set_reg_val(id, *val);
1469                break;
1470        case KVM_REG_PPC_TM_TAR:
1471                vcpu->arch.tar_tm = set_reg_val(id, *val);
1472                break;
1473#endif
1474        case KVM_REG_PPC_ARCH_COMPAT:
1475                r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));
1476                break;
1477        default:
1478                r = -EINVAL;
1479                break;
1480        }
1481
1482        return r;
1483}
1484
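    /*
     * Allocate and initialise a virtual core structure for the given
     * core number: set up its runnable-thread list, locks and wait
     * queue, and inherit the VM-wide LPCR value.
     */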
1485static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
1486{
1487        struct kvmppc_vcore *vcore;
1488
1489        vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL);
1490
1491        if (vcore == NULL)
1492                return NULL;
1493
1494        INIT_LIST_HEAD(&vcore->runnable_threads);
1495        spin_lock_init(&vcore->lock);
1496        spin_lock_init(&vcore->stoltb_lock);
1497        init_swait_queue_head(&vcore->wq);
1498        vcore->preempt_tb = TB_NIL;
1499        vcore->lpcr = kvm->arch.lpcr;
1500        vcore->first_vcpuid = core * threads_per_subcore;
1501        vcore->kvm = kvm;
1502        INIT_LIST_HEAD(&vcore->preempt_list);
1503
1504        return vcore;
1505}
1506
1507#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
1508static struct debugfs_timings_element {
1509        const char *name;
1510        size_t offset;
1511} timings[] = {
1512        {"rm_entry",    offsetof(struct kvm_vcpu, arch.rm_entry)},
1513        {"rm_intr",     offsetof(struct kvm_vcpu, arch.rm_intr)},
1514        {"rm_exit",     offsetof(struct kvm_vcpu, arch.rm_exit)},
1515        {"guest",       offsetof(struct kvm_vcpu, arch.guest_time)},
1516        {"cede",        offsetof(struct kvm_vcpu, arch.cede_time)},
1517};
1518
1519#define N_TIMINGS       (sizeof(timings) / sizeof(timings[0]))
1520
1521struct debugfs_timings_state {
1522        struct kvm_vcpu *vcpu;
1523        unsigned int    buflen;
1524        char            buf[N_TIMINGS * 100];
1525};
1526
1527static int debugfs_timings_open(struct inode *inode, struct file *file)
1528{
1529        struct kvm_vcpu *vcpu = inode->i_private;
1530        struct debugfs_timings_state *p;
1531
1532        p = kzalloc(sizeof(*p), GFP_KERNEL);
1533        if (!p)
1534                return -ENOMEM;
1535
1536        kvm_get_kvm(vcpu->kvm);
1537        p->vcpu = vcpu;
1538        file->private_data = p;
1539
1540        return nonseekable_open(inode, file);
1541}
1542
1543static int debugfs_timings_release(struct inode *inode, struct file *file)
1544{
1545        struct debugfs_timings_state *p = file->private_data;
1546
1547        kvm_put_kvm(p->vcpu->kvm);
1548        kfree(p);
1549        return 0;
1550}
1551
1552static ssize_t debugfs_timings_read(struct file *file, char __user *buf,
1553                                    size_t len, loff_t *ppos)
1554{
1555        struct debugfs_timings_state *p = file->private_data;
1556        struct kvm_vcpu *vcpu = p->vcpu;
1557        char *s, *buf_end;
1558        struct kvmhv_tb_accumulator tb;
1559        u64 count;
1560        loff_t pos;
1561        ssize_t n;
1562        int i, loops;
1563        bool ok;
1564
1565        if (!p->buflen) {
1566                s = p->buf;
1567                buf_end = s + sizeof(p->buf);
1568                for (i = 0; i < N_TIMINGS; ++i) {
1569                        struct kvmhv_tb_accumulator *acc;
1570
1571                        acc = (struct kvmhv_tb_accumulator *)
1572                                ((unsigned long)vcpu + timings[i].offset);
1573                        ok = false;
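                            /*
                             * The accumulator is updated under a seqcount-style
                             * protocol: an odd seqcount means an update is in
                             * progress.  Retry (briefly) until we get a
                             * consistent snapshot, or give up and report the
                             * entry as stuck.
                             */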
1574                        for (loops = 0; loops < 1000; ++loops) {
1575                                count = acc->seqcount;
1576                                if (!(count & 1)) {
1577                                        smp_rmb();
1578                                        tb = *acc;
1579                                        smp_rmb();
1580                                        if (count == acc->seqcount) {
1581                                                ok = true;
1582                                                break;
1583                                        }
1584                                }
1585                                udelay(1);
1586                        }
1587                        if (!ok)
1588                                snprintf(s, buf_end - s, "%s: stuck\n",
1589                                        timings[i].name);
1590                        else
1591                                snprintf(s, buf_end - s,
1592                                        "%s: %llu %llu %llu %llu\n",
1593                                        timings[i].name, count / 2,
1594                                        tb_to_ns(tb.tb_total),
1595                                        tb_to_ns(tb.tb_min),
1596                                        tb_to_ns(tb.tb_max));
1597                        s += strlen(s);
1598                }
1599                p->buflen = s - p->buf;
1600        }
1601
1602        pos = *ppos;
1603        if (pos >= p->buflen)
1604                return 0;
1605        if (len > p->buflen - pos)
1606                len = p->buflen - pos;
1607        n = copy_to_user(buf, p->buf + pos, len);
1608        if (n) {
1609                if (n == len)
1610                        return -EFAULT;
1611                len -= n;
1612        }
1613        *ppos = pos + len;
1614        return len;
1615}
1616
1617static ssize_t debugfs_timings_write(struct file *file, const char __user *buf,
1618                                     size_t len, loff_t *ppos)
1619{
1620        return -EACCES;
1621}
1622
1623static const struct file_operations debugfs_timings_ops = {
1624        .owner   = THIS_MODULE,
1625        .open    = debugfs_timings_open,
1626        .release = debugfs_timings_release,
1627        .read    = debugfs_timings_read,
1628        .write   = debugfs_timings_write,
1629        .llseek  = generic_file_llseek,
1630};
1631
1632/* Create a debugfs directory for the vcpu */
1633static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
1634{
1635        char buf[16];
1636        struct kvm *kvm = vcpu->kvm;
1637
1638        snprintf(buf, sizeof(buf), "vcpu%u", id);
1639        if (IS_ERR_OR_NULL(kvm->arch.debugfs_dir))
1640                return;
1641        vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir);
1642        if (IS_ERR_OR_NULL(vcpu->arch.debugfs_dir))
1643                return;
1644        vcpu->arch.debugfs_timings =
1645                debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir,
1646                                    vcpu, &debugfs_timings_ops);
1647}
1648
1649#else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
1650static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
1651{
1652}
1653#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
1654
1655static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
1656                                                   unsigned int id)
1657{
1658        struct kvm_vcpu *vcpu;
1659        int err = -EINVAL;
1660        int core;
1661        struct kvmppc_vcore *vcore;
1662
1663        core = id / threads_per_subcore;
1664        if (core >= KVM_MAX_VCORES)
1665                goto out;
1666
1667        err = -ENOMEM;
1668        vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
1669        if (!vcpu)
1670                goto out;
1671
1672        err = kvm_vcpu_init(vcpu, kvm, id);
1673        if (err)
1674                goto free_vcpu;
1675
1676        vcpu->arch.shared = &vcpu->arch.shregs;
1677#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
1678        /*
1679         * The shared struct is never shared on HV,
1680         * so we can always use host endianness
1681         */
1682#ifdef __BIG_ENDIAN__
1683        vcpu->arch.shared_big_endian = true;
1684#else
1685        vcpu->arch.shared_big_endian = false;
1686#endif
1687#endif
1688        vcpu->arch.mmcr[0] = MMCR0_FC;
1689        vcpu->arch.ctrl = CTRL_RUNLATCH;
1690        /* default to host PVR, since we can't spoof it */
1691        kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR));
1692        spin_lock_init(&vcpu->arch.vpa_update_lock);
1693        spin_lock_init(&vcpu->arch.tbacct_lock);
1694        vcpu->arch.busy_preempt = TB_NIL;
1695        vcpu->arch.intr_msr = MSR_SF | MSR_ME;
1696
1697        kvmppc_mmu_book3s_hv_init(vcpu);
1698
1699        vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
1700
1701        init_waitqueue_head(&vcpu->arch.cpu_run);
1702
1703        mutex_lock(&kvm->lock);
1704        vcore = kvm->arch.vcores[core];
1705        if (!vcore) {
1706                vcore = kvmppc_vcore_create(kvm, core);
1707                kvm->arch.vcores[core] = vcore;
1708                kvm->arch.online_vcores++;
1709        }
1710        mutex_unlock(&kvm->lock);
1711
1712        if (!vcore)
1713                goto free_vcpu;
1714
1715        spin_lock(&vcore->lock);
1716        ++vcore->num_threads;
1717        spin_unlock(&vcore->lock);
1718        vcpu->arch.vcore = vcore;
1719        vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
1720        vcpu->arch.thread_cpu = -1;
1721
1722        vcpu->arch.cpu_type = KVM_CPU_3S_64;
1723        kvmppc_sanity_check(vcpu);
1724
1725        debugfs_vcpu_init(vcpu, id);
1726
1727        return vcpu;
1728
1729free_vcpu:
1730        kmem_cache_free(kvm_vcpu_cache, vcpu);
1731out:
1732        return ERR_PTR(err);
1733}
1734
1735static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
1736{
1737        if (vpa->pinned_addr)
1738                kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa,
1739                                        vpa->dirty);
1740}
1741
1742static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu)
1743{
1744        spin_lock(&vcpu->arch.vpa_update_lock);
1745        unpin_vpa(vcpu->kvm, &vcpu->arch.dtl);
1746        unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
1747        unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
1748        spin_unlock(&vcpu->arch.vpa_update_lock);
1749        kvm_vcpu_uninit(vcpu);
1750        kmem_cache_free(kvm_vcpu_cache, vcpu);
1751}
1752
1753static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu)
1754{
1755        /* Indicate we want to get back into the guest */
1756        return 1;
1757}
1758
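    /*
     * Arm the vcpu's hrtimer to fire when the guest decrementer would
     * expire; if it has already expired, queue the decrementer
     * exception immediately instead.
     */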
1759static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
1760{
1761        unsigned long dec_nsec, now;
1762
1763        now = get_tb();
1764        if (now > vcpu->arch.dec_expires) {
1765                /* decrementer has already gone negative */
1766                kvmppc_core_queue_dec(vcpu);
1767                kvmppc_core_prepare_to_enter(vcpu);
1768                return;
1769        }
1770        dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
1771                   / tb_ticks_per_sec;
1772        hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec),
1773                      HRTIMER_MODE_REL);
1774        vcpu->arch.timer_running = 1;
1775}
1776
1777static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
1778{
1779        vcpu->arch.ceded = 0;
1780        if (vcpu->arch.timer_running) {
1781                hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
1782                vcpu->arch.timer_running = 0;
1783        }
1784}
1785
1786extern void __kvmppc_vcore_entry(void);
1787
1788static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
1789                                   struct kvm_vcpu *vcpu)
1790{
1791        u64 now;
1792
1793        if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
1794                return;
1795        spin_lock_irq(&vcpu->arch.tbacct_lock);
1796        now = mftb();
1797        vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) -
1798                vcpu->arch.stolen_logged;
1799        vcpu->arch.busy_preempt = now;
1800        vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
1801        spin_unlock_irq(&vcpu->arch.tbacct_lock);
1802        --vc->n_runnable;
1803        list_del(&vcpu->arch.run_list);
1804}
1805
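    /*
     * Claim an offline hardware thread for guest use: clear its KVM
     * state, set hwthread_req, and wait for the thread to get out of
     * the kernel and back to nap mode.
     */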
1806static int kvmppc_grab_hwthread(int cpu)
1807{
1808        struct paca_struct *tpaca;
1809        long timeout = 10000;
1810
1811        tpaca = &paca[cpu];
1812
1813        /* Ensure the thread won't go into the kernel if it wakes */
1814        tpaca->kvm_hstate.kvm_vcpu = NULL;
1815        tpaca->kvm_hstate.kvm_vcore = NULL;
1816        tpaca->kvm_hstate.napping = 0;
1817        smp_wmb();
1818        tpaca->kvm_hstate.hwthread_req = 1;
1819
1820        /*
1821         * If the thread is already executing in the kernel (e.g. handling
1822         * a stray interrupt), wait for it to get back to nap mode.
1823         * The smp_mb() is to ensure that our setting of hwthread_req
1824         * is visible before we look at hwthread_state, so if this
1825         * races with the code at system_reset_pSeries and the thread
1826         * misses our setting of hwthread_req, we are sure to see its
1827         * setting of hwthread_state, and vice versa.
1828         */
1829        smp_mb();
1830        while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) {
1831                if (--timeout <= 0) {
1832                        pr_err("KVM: couldn't grab cpu %d\n", cpu);
1833                        return -EBUSY;
1834                }
1835                udelay(1);
1836        }
1837        return 0;
1838}
1839
1840static void kvmppc_release_hwthread(int cpu)
1841{
1842        struct paca_struct *tpaca;
1843
1844        tpaca = &paca[cpu];
1845        tpaca->kvm_hstate.hwthread_req = 0;
1846        tpaca->kvm_hstate.kvm_vcpu = NULL;
1847        tpaca->kvm_hstate.kvm_vcore = NULL;
1848        tpaca->kvm_hstate.kvm_split_mode = NULL;
1849}
1850
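    /*
     * Point a hardware thread's PACA at the vcpu and master vcore it
     * is to run, then kick it with an IPI unless it is the thread we
     * are running on.
     */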
1851static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
1852{
1853        int cpu;
1854        struct paca_struct *tpaca;
1855        struct kvmppc_vcore *mvc = vc->master_vcore;
1856
1857        cpu = vc->pcpu;
1858        if (vcpu) {
1859                if (vcpu->arch.timer_running) {
1860                        hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
1861                        vcpu->arch.timer_running = 0;
1862                }
1863                cpu += vcpu->arch.ptid;
1864                vcpu->cpu = mvc->pcpu;
1865                vcpu->arch.thread_cpu = cpu;
1866        }
1867        tpaca = &paca[cpu];
1868        tpaca->kvm_hstate.kvm_vcpu = vcpu;
1869        tpaca->kvm_hstate.ptid = cpu - mvc->pcpu;
1870        /* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */
1871        smp_wmb();
1872        tpaca->kvm_hstate.kvm_vcore = mvc;
1873        if (cpu != smp_processor_id())
1874                kvmppc_ipi_thread(cpu);
1875}
1876
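    /*
     * Spin until all secondary threads of this subcore have cleared
     * their kvm_vcore pointer, i.e. have finished running the guest
     * and saved their state.
     */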
1877static void kvmppc_wait_for_nap(void)
1878{
1879        int cpu = smp_processor_id();
1880        int i, loops;
1881
1882        for (loops = 0; loops < 1000000; ++loops) {
1883                /*
1884                 * Check if all threads are finished.
1885                 * We set the vcore pointer when starting a thread
1886                 * and the thread clears it when finished, so we look
1887                 * for any threads that still have a non-NULL vcore ptr.
1888                 */
1889                for (i = 1; i < threads_per_subcore; ++i)
1890                        if (paca[cpu + i].kvm_hstate.kvm_vcore)
1891                                break;
1892                if (i == threads_per_subcore) {
1893                        HMT_medium();
1894                        return;
1895                }
1896                HMT_low();
1897        }
1898        HMT_medium();
1899        for (i = 1; i < threads_per_subcore; ++i)
1900                if (paca[cpu + i].kvm_hstate.kvm_vcore)
1901                        pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
1902}
1903
1904/*
1905 * Check that we are on thread 0 and that any other threads in
1906 * this core are off-line.  Then grab the threads so they can't
1907 * enter the kernel.
1908 */
1909static int on_primary_thread(void)
1910{
1911        int cpu = smp_processor_id();
1912        int thr;
1913
1914        /* Are we on a primary subcore? */
1915        if (cpu_thread_in_subcore(cpu))
1916                return 0;
1917
1918        thr = 0;
1919        while (++thr < threads_per_subcore)
1920                if (cpu_online(cpu + thr))
1921                        return 0;
1922
1923        /* Grab all hw threads so they can't go into the kernel */
1924        for (thr = 1; thr < threads_per_subcore; ++thr) {
1925                if (kvmppc_grab_hwthread(cpu + thr)) {
1926                        /* Couldn't grab one; let the others go */
1927                        do {
1928                                kvmppc_release_hwthread(cpu + thr);
1929                        } while (--thr > 0);
1930                        return 0;
1931                }
1932        }
1933        return 1;
1934}
1935
1936/*
1937 * A list of virtual cores for each physical CPU.
1938 * These are vcores that could run but their runner VCPU tasks are
1939 * (or may be) preempted.
1940 */
1941struct preempted_vcore_list {
1942        struct list_head        list;
1943        spinlock_t              lock;
1944};
1945
1946static DEFINE_PER_CPU(struct preempted_vcore_list, preempted_vcores);
1947
1948static void init_vcore_lists(void)
1949{
1950        int cpu;
1951
1952        for_each_possible_cpu(cpu) {
1953                struct preempted_vcore_list *lp = &per_cpu(preempted_vcores, cpu);
1954                spin_lock_init(&lp->lock);
1955                INIT_LIST_HEAD(&lp->list);
1956        }
1957}
1958
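    /*
     * The runner task for this vcore is being preempted or is about
     * to sleep: mark the vcore preempted, add it to this CPU's list
     * of preempted vcores (if it doesn't already fill a whole
     * subcore), and start accumulating stolen time.
     */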
1959static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
1960{
1961        struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
1962
1963        vc->vcore_state = VCORE_PREEMPT;
1964        vc->pcpu = smp_processor_id();
1965        if (vc->num_threads < threads_per_subcore) {
1966                spin_lock(&lp->lock);
1967                list_add_tail(&vc->preempt_list, &lp->list);
1968                spin_unlock(&lp->lock);
1969        }
1970
1971        /* Start accumulating stolen time */
1972        kvmppc_core_start_stolen(vc);
1973}
1974
1975static void kvmppc_vcore_end_preempt(struct kvmppc_vcore *vc)
1976{
1977        struct preempted_vcore_list *lp;
1978
1979        kvmppc_core_end_stolen(vc);
1980        if (!list_empty(&vc->preempt_list)) {
1981                lp = &per_cpu(preempted_vcores, vc->pcpu);
1982                spin_lock(&lp->lock);
1983                list_del_init(&vc->preempt_list);
1984                spin_unlock(&lp->lock);
1985        }
1986        vc->vcore_state = VCORE_INACTIVE;
1987}
1988
1989/*
1990 * This stores information about the virtual cores currently
1991 * assigned to a physical core.
1992 */
1993struct core_info {
1994        int             n_subcores;
1995        int             max_subcore_threads;
1996        int             total_threads;
1997        int             subcore_threads[MAX_SUBCORES];
1998        struct kvm      *subcore_vm[MAX_SUBCORES];
1999        struct list_head vcs[MAX_SUBCORES];
2000};
2001
2002/*
2003 * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
2004 * respectively in 2-way micro-threading (split-core) mode.
2005 */
2006static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
2007
2008static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
2009{
2010        int sub;
2011
2012        memset(cip, 0, sizeof(*cip));
2013        cip->n_subcores = 1;
2014        cip->max_subcore_threads = vc->num_threads;
2015        cip->total_threads = vc->num_threads;
2016        cip->subcore_threads[0] = vc->num_threads;
2017        cip->subcore_vm[0] = vc->kvm;
2018        for (sub = 0; sub < MAX_SUBCORES; ++sub)
2019                INIT_LIST_HEAD(&cip->vcs[sub]);
2020        list_add_tail(&vc->preempt_list, &cip->vcs[0]);
2021}
2022
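    /*
     * Check whether splitting the core into n_subcores subcores of up
     * to n_threads threads each is an allowed configuration, given the
     * hardware limits and the dynamic_mt_modes module parameter.
     */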
2023static bool subcore_config_ok(int n_subcores, int n_threads)
2024{
2025        /* Can only dynamically split if unsplit to begin with */
2026        if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS)
2027                return false;
2028        if (n_subcores > MAX_SUBCORES)
2029                return false;
2030        if (n_subcores > 1) {
2031                if (!(dynamic_mt_modes & 2))
2032                        n_subcores = 4;
2033                if (n_subcores > 2 && !(dynamic_mt_modes & 4))
2034                        return false;
2035        }
2036
2037        return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS;
2038}
2039
2040static void init_master_vcore(struct kvmppc_vcore *vc)
2041{
2042        vc->master_vcore = vc;
2043        vc->entry_exit_map = 0;
2044        vc->in_guest = 0;
2045        vc->napping_threads = 0;
2046        vc->conferring_threads = 0;
2047}
2048
2049/*
2050 * See if the existing subcores can be split into 3 (or fewer) subcores
2051 * of at most two threads each, so we can fit in another vcore.  This
2052 * assumes there are at most two subcores and at most 6 threads in total.
2053 */
2054static bool can_split_piggybacked_subcores(struct core_info *cip)
2055{
2056        int sub, new_sub;
2057        int large_sub = -1;
2058        int thr;
2059        int n_subcores = cip->n_subcores;
2060        struct kvmppc_vcore *vc, *vcnext;
2061        struct kvmppc_vcore *master_vc = NULL;
2062
2063        for (sub = 0; sub < cip->n_subcores; ++sub) {
2064                if (cip->subcore_threads[sub] <= 2)
2065                        continue;
2066                if (large_sub >= 0)
2067                        return false;
2068                large_sub = sub;
2069                vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore,
2070                                      preempt_list);
2071                if (vc->num_threads > 2)
2072                        return false;
2073                n_subcores += (cip->subcore_threads[sub] - 1) >> 1;
2074        }
2075        if (large_sub < 0 || !subcore_config_ok(n_subcores + 1, 2))
2076                return false;
2077
2078        /*
2079         * Seems feasible, so go through and move vcores to new subcores.
2080         * Note that when we have two or more vcores in one subcore,
2081         * all those vcores must have only one thread each.
2082         */
2083        new_sub = cip->n_subcores;
2084        thr = 0;
2085        sub = large_sub;
2086        list_for_each_entry_safe(vc, vcnext, &cip->vcs[sub], preempt_list) {
2087                if (thr >= 2) {
2088                        list_del(&vc->preempt_list);
2089                        list_add_tail(&vc->preempt_list, &cip->vcs[new_sub]);
2090                        /* vc->num_threads must be 1 */
2091                        if (++cip->subcore_threads[new_sub] == 1) {
2092                                cip->subcore_vm[new_sub] = vc->kvm;
2093                                init_master_vcore(vc);
2094                                master_vc = vc;
2095                                ++cip->n_subcores;
2096                        } else {
2097                                vc->master_vcore = master_vc;
2098                                ++new_sub;
2099                        }
2100                }
2101                thr += vc->num_threads;
2102        }
2103        cip->subcore_threads[large_sub] = 2;
2104        cip->max_subcore_threads = 2;
2105
2106        return true;
2107}
2108
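    /*
     * Try to add vcore vc to the physical core described by *cip as a
     * new subcore, using dynamic micro-threading (split-core) mode if
     * the resulting configuration is allowed.
     */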
2109static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
2110{
2111        int n_threads = vc->num_threads;
2112        int sub;
2113
2114        if (!cpu_has_feature(CPU_FTR_ARCH_207S))
2115                return false;
2116
2117        if (n_threads < cip->max_subcore_threads)
2118                n_threads = cip->max_subcore_threads;
2119        if (subcore_config_ok(cip->n_subcores + 1, n_threads)) {
2120                cip->max_subcore_threads = n_threads;
2121        } else if (cip->n_subcores <= 2 && cip->total_threads <= 6 &&
2122                   vc->num_threads <= 2) {
2123                /*
2124                 * We may be able to fit another subcore in by
2125                 * splitting an existing subcore with 3 or 4
2126                 * threads into two 2-thread subcores, or one
2127                 * with 5 or 6 threads into three subcores.
2128                 * We can only do this if those subcores have
2129                 * piggybacked virtual cores.
2130                 */
2131                if (!can_split_piggybacked_subcores(cip))
2132                        return false;
2133        } else {
2134                return false;
2135        }
2136
2137        sub = cip->n_subcores;
2138        ++cip->n_subcores;
2139        cip->total_threads += vc->num_threads;
2140        cip->subcore_threads[sub] = vc->num_threads;
2141        cip->subcore_vm[sub] = vc->kvm;
2142        init_master_vcore(vc);
2143        list_del(&vc->preempt_list);
2144        list_add_tail(&vc->preempt_list, &cip->vcs[sub]);
2145
2146        return true;
2147}
2148
2149static bool can_piggyback_subcore(struct kvmppc_vcore *pvc,
2150                                  struct core_info *cip, int sub)
2151{
2152        struct kvmppc_vcore *vc;
2153        int n_thr;
2154
2155        vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore,
2156                              preempt_list);
2157
2158        /* require same VM and same per-core reg values */
2159        if (pvc->kvm != vc->kvm ||
2160            pvc->tb_offset != vc->tb_offset ||
2161            pvc->pcr != vc->pcr ||
2162            pvc->lpcr != vc->lpcr)
2163                return false;
2164
2165        /* P8 guest with > 1 thread per core would see wrong TIR value */
2166        if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
2167            (vc->num_threads > 1 || pvc->num_threads > 1))
2168                return false;
2169
2170        n_thr = cip->subcore_threads[sub] + pvc->num_threads;
2171        if (n_thr > cip->max_subcore_threads) {
2172                if (!subcore_config_ok(cip->n_subcores, n_thr))
2173                        return false;
2174                cip->max_subcore_threads = n_thr;
2175        }
2176
2177        cip->total_threads += pvc->num_threads;
2178        cip->subcore_threads[sub] = n_thr;
2179        pvc->master_vcore = vc;
2180        list_del(&pvc->preempt_list);
2181        list_add_tail(&pvc->preempt_list, &cip->vcs[sub]);
2182
2183        return true;
2184}
2185
2186/*
2187 * Work out whether it is possible to piggyback the execution of
2188 * vcore *pvc onto the execution of the other vcores described in *cip.
2189 */
2190static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip,
2191                          int target_threads)
2192{
2193        int sub;
2194
2195        if (cip->total_threads + pvc->num_threads > target_threads)
2196                return false;
2197        for (sub = 0; sub < cip->n_subcores; ++sub)
2198                if (cip->subcore_threads[sub] &&
2199                    can_piggyback_subcore(pvc, cip, sub))
2200                        return true;
2201
2202        if (can_dynamic_split(pvc, cip))
2203                return true;
2204
2205        return false;
2206}
2207
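    /*
     * Take out of the runnable list any vcpu that has a signal pending
     * or a VPA update outstanding, and wake its task so it can deal
     * with that in the host.
     */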
2208static void prepare_threads(struct kvmppc_vcore *vc)
2209{
2210        struct kvm_vcpu *vcpu, *vnext;
2211
2212        list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
2213                                 arch.run_list) {
2214                if (signal_pending(vcpu->arch.run_task))
2215                        vcpu->arch.ret = -EINTR;
2216                else if (vcpu->arch.vpa.update_pending ||
2217                         vcpu->arch.slb_shadow.update_pending ||
2218                         vcpu->arch.dtl.update_pending)
2219                        vcpu->arch.ret = RESUME_GUEST;
2220                else
2221                        continue;
2222                kvmppc_remove_runnable(vc, vcpu);
2223                wake_up(&vcpu->arch.cpu_run);
2224        }
2225}
2226
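    /*
     * Scan this CPU's list of preempted vcores for ones that can be
     * run alongside the current vcore (piggybacked), up to
     * target_threads threads in total.
     */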
2227static void collect_piggybacks(struct core_info *cip, int target_threads)
2228{
2229        struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
2230        struct kvmppc_vcore *pvc, *vcnext;
2231
2232        spin_lock(&lp->lock);
2233        list_for_each_entry_safe(pvc, vcnext, &lp->list, preempt_list) {
2234                if (!spin_trylock(&pvc->lock))
2235                        continue;
2236                prepare_threads(pvc);
2237                if (!pvc->n_runnable) {
2238                        list_del_init(&pvc->preempt_list);
2239                        if (pvc->runner == NULL) {
2240                                pvc->vcore_state = VCORE_INACTIVE;
2241                                kvmppc_core_end_stolen(pvc);
2242                        }
2243                        spin_unlock(&pvc->lock);
2244                        continue;
2245                }
2246                if (!can_piggyback(pvc, cip, target_threads)) {
2247                        spin_unlock(&pvc->lock);
2248                        continue;
2249                }
2250                kvmppc_core_end_stolen(pvc);
2251                pvc->vcore_state = VCORE_PIGGYBACK;
2252                if (cip->total_threads >= target_threads)
2253                        break;
2254        }
2255        spin_unlock(&lp->lock);
2256}
2257
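    /*
     * Handle the threads of a vcore after a guest run: dispatch exit
     * handling for each vcpu, requeue or remove runnable threads as
     * appropriate, and update the vcore state.
     */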
2258static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
2259{
2260        int still_running = 0;
2261        u64 now;
2262        long ret;
2263        struct kvm_vcpu *vcpu, *vnext;
2264
2265        spin_lock(&vc->lock);
2266        now = get_tb();
2267        list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
2268                                 arch.run_list) {
2269                /* cancel pending dec exception if dec is positive */
2270                if (now < vcpu->arch.dec_expires &&
2271                    kvmppc_core_pending_dec(vcpu))
2272                        kvmppc_core_dequeue_dec(vcpu);
2273
2274                trace_kvm_guest_exit(vcpu);
2275
2276                ret = RESUME_GUEST;
2277                if (vcpu->arch.trap)
2278                        ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu,
2279                                                    vcpu->arch.run_task);
2280
2281                vcpu->arch.ret = ret;
2282                vcpu->arch.trap = 0;
2283
2284                if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
2285                        if (vcpu->arch.pending_exceptions)
2286                                kvmppc_core_prepare_to_enter(vcpu);
2287                        if (vcpu->arch.ceded)
2288                                kvmppc_set_timer(vcpu);
2289                        else
2290                                ++still_running;
2291                } else {
2292                        kvmppc_remove_runnable(vc, vcpu);
2293                        wake_up(&vcpu->arch.cpu_run);
2294                }
2295        }
2296        list_del_init(&vc->preempt_list);
2297        if (!is_master) {
2298                if (still_running > 0) {
2299                        kvmppc_vcore_preempt(vc);
2300                } else if (vc->runner) {
2301                        vc->vcore_state = VCORE_PREEMPT;
2302                        kvmppc_core_start_stolen(vc);
2303                } else {
2304                        vc->vcore_state = VCORE_INACTIVE;
2305                }
2306                if (vc->n_runnable > 0 && vc->runner == NULL) {
2307                        /* make sure there's a candidate runner awake */
2308                        vcpu = list_first_entry(&vc->runnable_threads,
2309                                                struct kvm_vcpu, arch.run_list);
2310                        wake_up(&vcpu->arch.cpu_run);
2311                }
2312        }
2313        spin_unlock(&vc->lock);
2314}
2315
2316/*
2317 * Clear core from the list of active host cores as we are about to
2318 * enter the guest. Only do this if it is the primary thread of the
2319 * core (not merely of a subcore) that is entering the guest.
2320 */
2321static inline void kvmppc_clear_host_core(int cpu)
2322{
2323        int core;
2324
2325        if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
2326                return;
2327        /*
2328         * Memory barrier can be omitted here as we will do a smp_wmb()
2329         * later in kvmppc_start_thread and we need to ensure that state is
2330         * visible to other CPUs only after we enter the guest.
2331         */
2332        core = cpu >> threads_shift;
2333        kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0;
2334}
2335
2336/*
2337 * Advertise this core as an active host core, since we exited the guest.
2338 * Only need to do this if it is the primary thread of the core that is
2339 * exiting.
2340 */
2341static inline void kvmppc_set_host_core(int cpu)
2342{
2343        int core;
2344
2345        if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
2346                return;
2347
2348        /*
2349         * Memory barrier can be omitted here because we do a spin_unlock
2350         * immediately after this which provides the memory barrier.
2351         */
2352        core = cpu >> threads_shift;
2353        kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1;
2354}
2355
2356/*
2357 * Run a set of guest threads on a physical core.
2358 * Called with vc->lock held.
2359 */
2360static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
2361{
2362        struct kvm_vcpu *vcpu, *vnext;
2363        int i;
2364        int srcu_idx;
2365        struct core_info core_info;
2366        struct kvmppc_vcore *pvc, *vcnext;
2367        struct kvm_split_mode split_info, *sip;
2368        int split, subcore_size, active;
2369        int sub;
2370        bool thr0_done;
2371        unsigned long cmd_bit, stat_bit;
2372        int pcpu, thr;
2373        int target_threads;
2374
2375        /*
2376         * Remove from the list any threads that have a signal pending
2377         * or need a VPA update done
2378         */
2379        prepare_threads(vc);
2380
2381        /* if the runner is no longer runnable, let the caller pick a new one */
2382        if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE)
2383                return;
2384
2385        /*
2386         * Initialize *vc.
2387         */
2388        init_master_vcore(vc);
2389        vc->preempt_tb = TB_NIL;
2390
2391        /*
2392         * Make sure we are running on primary threads, and that secondary
2393         * threads are offline.  Also bail out if the number of threads in this
2394         * vcore exceeds the number of threads per subcore.
2395         */
2396        if ((threads_per_core > 1) &&
2397            ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
2398                list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
2399                                         arch.run_list) {
2400                        vcpu->arch.ret = -EBUSY;
2401                        kvmppc_remove_runnable(vc, vcpu);
2402                        wake_up(&vcpu->arch.cpu_run);
2403                }
2404                goto out;
2405        }
2406
2407        /*
2408         * See if we could run any other vcores on the physical core
2409         * along with this one.
2410         */
2411        init_core_info(&core_info, vc);
2412        pcpu = smp_processor_id();
2413        target_threads = threads_per_subcore;
2414        if (target_smt_mode && target_smt_mode < target_threads)
2415                target_threads = target_smt_mode;
2416        if (vc->num_threads < target_threads)
2417                collect_piggybacks(&core_info, target_threads);
2418
2419        /* Decide on micro-threading (split-core) mode */
2420        subcore_size = threads_per_subcore;
2421        cmd_bit = stat_bit = 0;
2422        split = core_info.n_subcores;
2423        sip = NULL;
2424        if (split > 1) {
2425                /* threads_per_subcore must be MAX_SMT_THREADS (8) here */
2426                if (split == 2 && (dynamic_mt_modes & 2)) {
2427                        cmd_bit = HID0_POWER8_1TO2LPAR;
2428                        stat_bit = HID0_POWER8_2LPARMODE;
2429                } else {
2430                        split = 4;
2431                        cmd_bit = HID0_POWER8_1TO4LPAR;
2432                        stat_bit = HID0_POWER8_4LPARMODE;
2433                }
2434                subcore_size = MAX_SMT_THREADS / split;
2435                sip = &split_info;
2436                memset(&split_info, 0, sizeof(split_info));
2437                split_info.rpr = mfspr(SPRN_RPR);
2438                split_info.pmmar = mfspr(SPRN_PMMAR);
2439                split_info.ldbar = mfspr(SPRN_LDBAR);
2440                split_info.subcore_size = subcore_size;
2441                for (sub = 0; sub < core_info.n_subcores; ++sub)
2442                        split_info.master_vcs[sub] =
2443                                list_first_entry(&core_info.vcs[sub],
2444                                        struct kvmppc_vcore, preempt_list);
2445                /* order writes to split_info before kvm_split_mode pointer */
2446                smp_wmb();
2447        }
2448        pcpu = smp_processor_id();
2449        for (thr = 0; thr < threads_per_subcore; ++thr)
2450                paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
2451
2452        /* Initiate micro-threading (split-core) if required */
2453        if (cmd_bit) {
2454                unsigned long hid0 = mfspr(SPRN_HID0);
2455
2456                hid0 |= cmd_bit | HID0_POWER8_DYNLPARDIS;
2457                mb();
2458                mtspr(SPRN_HID0, hid0);
2459                isync();
2460                for (;;) {
2461                        hid0 = mfspr(SPRN_HID0);
2462                        if (hid0 & stat_bit)
2463                                break;
2464                        cpu_relax();
2465                }
2466        }
2467
2468        kvmppc_clear_host_core(pcpu);
2469
2470        /* Start all the threads */
2471        active = 0;
2472        for (sub = 0; sub < core_info.n_subcores; ++sub) {
2473                thr = subcore_thread_map[sub];
2474                thr0_done = false;
2475                active |= 1 << thr;
2476                list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) {
2477                        pvc->pcpu = pcpu + thr;
2478                        list_for_each_entry(vcpu, &pvc->runnable_threads,
2479                                            arch.run_list) {
2480                                kvmppc_start_thread(vcpu, pvc);
2481                                kvmppc_create_dtl_entry(vcpu, pvc);
2482                                trace_kvm_guest_enter(vcpu);
2483                                if (!vcpu->arch.ptid)
2484                                        thr0_done = true;
2485                                active |= 1 << (thr + vcpu->arch.ptid);
2486                        }
2487                        /*
2488                         * We need to start the first thread of each subcore
2489                         * even if it doesn't have a vcpu.
2490                         */
2491                        if (pvc->master_vcore == pvc && !thr0_done)
2492                                kvmppc_start_thread(NULL, pvc);
2493                        thr += pvc->num_threads;
2494                }
2495        }
2496
2497        /*
2498         * Ensure that split_info.do_nap is set after setting
2499         * the vcore pointer in the PACA of the secondaries.
2500         */
2501        smp_mb();
2502        if (cmd_bit)
2503                split_info.do_nap = 1;  /* ask secondaries to nap when done */
2504
2505        /*
2506         * When doing micro-threading, poke the inactive threads as well.
2507         * This gets them to the nap instruction after kvm_do_nap,
2508         * which reduces the time taken to unsplit later.
2509         */
2510        if (split > 1)
2511                for (thr = 1; thr < threads_per_subcore; ++thr)
2512                        if (!(active & (1 << thr)))
2513                                kvmppc_ipi_thread(pcpu + thr);
2514
2515        vc->vcore_state = VCORE_RUNNING;
2516        preempt_disable();
2517
2518        trace_kvmppc_run_core(vc, 0);
2519
2520        for (sub = 0; sub < core_info.n_subcores; ++sub)
2521                list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list)
2522                        spin_unlock(&pvc->lock);
2523
2524        kvm_guest_enter();
2525
2526        srcu_idx = srcu_read_lock(&vc->kvm->srcu);
2527
2528        __kvmppc_vcore_entry();
2529
2530        srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
2531
2532        spin_lock(&vc->lock);
2533        /* prevent other vcpu threads from doing kvmppc_start_thread() now */
2534        vc->vcore_state = VCORE_EXITING;
2535
2536        /* wait for secondary threads to finish writing their state to memory */
2537        kvmppc_wait_for_nap();
2538
2539        /* Return to whole-core mode if we split the core earlier */
2540        if (split > 1) {
2541                unsigned long hid0 = mfspr(SPRN_HID0);
2542                unsigned long loops = 0;
2543
2544                hid0 &= ~HID0_POWER8_DYNLPARDIS;
2545                stat_bit = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE;
2546                mb();
2547                mtspr(SPRN_HID0, hid0);
2548                isync();
2549                for (;;) {
2550                        hid0 = mfspr(SPRN_HID0);
2551                        if (!(hid0 & stat_bit))
2552                                break;
2553                        cpu_relax();
2554                        ++loops;
2555                }
2556                split_info.do_nap = 0;
2557        }
2558
2559        /* Let secondaries go back to the offline loop */
2560        for (i = 0; i < threads_per_subcore; ++i) {
2561                kvmppc_release_hwthread(pcpu + i);
2562                if (sip && sip->napped[i])
2563                        kvmppc_ipi_thread(pcpu + i);
2564        }
2565
2566        kvmppc_set_host_core(pcpu);
2567
2568        spin_unlock(&vc->lock);
2569
2570        /* make sure updates to secondary vcpu structs are visible now */
2571        smp_mb();
2572        kvm_guest_exit();
2573
2574        for (sub = 0; sub < core_info.n_subcores; ++sub)
2575                list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub],
2576                                         preempt_list)
2577                        post_guest_process(pvc, pvc == vc);
2578
2579        spin_lock(&vc->lock);
2580        preempt_enable();
2581
2582 out:
2583        vc->vcore_state = VCORE_INACTIVE;
2584        trace_kvmppc_run_core(vc, 1);
2585}
2586
2587/*
2588 * Wait for some other vcpu thread to execute us, and
2589 * wake us up when we need to handle something in the host.
2590 */
2591static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
2592                                 struct kvm_vcpu *vcpu, int wait_state)
2593{
2594        DEFINE_WAIT(wait);
2595
2596        prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
2597        if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
2598                spin_unlock(&vc->lock);
2599                schedule();
2600                spin_lock(&vc->lock);
2601        }
2602        finish_wait(&vcpu->arch.cpu_run, &wait);
2603}
2604
2605/*
2606 * All the vcpus in this vcore are idle, so wait for a decrementer
2607 * or external interrupt to one of the vcpus.  vc->lock is held.
2608 */
2609static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
2610{
2611        struct kvm_vcpu *vcpu;
2612        int do_sleep = 1;
2613        DECLARE_SWAITQUEUE(wait);
2614
2615        prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
2616
2617        /*
2618         * Check one last time for pending exceptions and ceded state after
2619         * we put ourselves on the wait queue
2620         */
2621        list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
2622                if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) {
2623                        do_sleep = 0;
2624                        break;
2625                }
2626        }
2627
2628        if (!do_sleep) {
2629                finish_swait(&vc->wq, &wait);
2630                return;
2631        }
2632
2633        vc->vcore_state = VCORE_SLEEPING;
2634        trace_kvmppc_vcore_blocked(vc, 0);
2635        spin_unlock(&vc->lock);
2636        schedule();
2637        finish_swait(&vc->wq, &wait);
2638        spin_lock(&vc->lock);
2639        vc->vcore_state = VCORE_INACTIVE;
2640        trace_kvmppc_vcore_blocked(vc, 1);
2641}
2642
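    /*
     * Make this vcpu runnable on its virtual core and either join a
     * run already in progress or act as the runner task, running the
     * whole core, until the vcpu needs attention from the host.
     */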
2643static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2644{
2645        int n_ceded;
2646        struct kvmppc_vcore *vc;
2647        struct kvm_vcpu *v, *vn;
2648
2649        trace_kvmppc_run_vcpu_enter(vcpu);
2650
2651        kvm_run->exit_reason = 0;
2652        vcpu->arch.ret = RESUME_GUEST;
2653        vcpu->arch.trap = 0;
2654        kvmppc_update_vpas(vcpu);
2655
2656        /*
2657         * Synchronize with other threads in this virtual core
2658         */
2659        vc = vcpu->arch.vcore;
2660        spin_lock(&vc->lock);
2661        vcpu->arch.ceded = 0;
2662        vcpu->arch.run_task = current;
2663        vcpu->arch.kvm_run = kvm_run;
2664        vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
2665        vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
2666        vcpu->arch.busy_preempt = TB_NIL;
2667        list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads);
2668        ++vc->n_runnable;
2669
2670        /*
2671         * This happens the first time this is called for a vcpu.
2672         * If the vcore is already running, we may be able to start
2673         * this thread straight away and have it join in.
2674         */
2675        if (!signal_pending(current)) {
2676                if (vc->vcore_state == VCORE_PIGGYBACK) {
2677                        struct kvmppc_vcore *mvc = vc->master_vcore;
2678                        if (spin_trylock(&mvc->lock)) {
2679                                if (mvc->vcore_state == VCORE_RUNNING &&
2680                                    !VCORE_IS_EXITING(mvc)) {
2681                                        kvmppc_create_dtl_entry(vcpu, vc);
2682                                        kvmppc_start_thread(vcpu, vc);
2683                                        trace_kvm_guest_enter(vcpu);
2684                                }
2685                                spin_unlock(&mvc->lock);
2686                        }
2687                } else if (vc->vcore_state == VCORE_RUNNING &&
2688                           !VCORE_IS_EXITING(vc)) {
2689                        kvmppc_create_dtl_entry(vcpu, vc);
2690                        kvmppc_start_thread(vcpu, vc);
2691                        trace_kvm_guest_enter(vcpu);
2692                } else if (vc->vcore_state == VCORE_SLEEPING) {
2693                        swake_up(&vc->wq);
2694                }
2695
2696        }
2697
2698        while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
2699               !signal_pending(current)) {
2700                if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
2701                        kvmppc_vcore_end_preempt(vc);
2702
2703                if (vc->vcore_state != VCORE_INACTIVE) {
2704                        kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE);
2705                        continue;
2706                }
2707                list_for_each_entry_safe(v, vn, &vc->runnable_threads,
2708                                         arch.run_list) {
2709                        kvmppc_core_prepare_to_enter(v);
2710                        if (signal_pending(v->arch.run_task)) {
2711                                kvmppc_remove_runnable(vc, v);
2712                                v->stat.signal_exits++;
2713                                v->arch.kvm_run->exit_reason = KVM_EXIT_INTR;
2714                                v->arch.ret = -EINTR;
2715                                wake_up(&v->arch.cpu_run);
2716                        }
2717                }
2718                if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
2719                        break;
2720                n_ceded = 0;
2721                list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
2722                        if (!v->arch.pending_exceptions)
2723                                n_ceded += v->arch.ceded;
2724                        else
2725                                v->arch.ceded = 0;
2726                }
2727                vc->runner = vcpu;
2728                if (n_ceded == vc->n_runnable) {
2729                        kvmppc_vcore_blocked(vc);
2730                } else if (need_resched()) {
2731                        kvmppc_vcore_preempt(vc);
2732                        /* Let something else run */
2733                        cond_resched_lock(&vc->lock);
2734                        if (vc->vcore_state == VCORE_PREEMPT)
2735                                kvmppc_vcore_end_preempt(vc);
2736                } else {
2737                        kvmppc_run_core(vc);
2738                }
2739                vc->runner = NULL;
2740        }
2741
2742        while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
2743               (vc->vcore_state == VCORE_RUNNING ||
2744                vc->vcore_state == VCORE_EXITING ||
2745                vc->vcore_state == VCORE_PIGGYBACK))
2746                kvmppc_wait_for_exec(vc, vcpu, TASK_UNINTERRUPTIBLE);
2747
2748        if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
2749                kvmppc_vcore_end_preempt(vc);
2750
2751        if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
2752                kvmppc_remove_runnable(vc, vcpu);
2753                vcpu->stat.signal_exits++;
2754                kvm_run->exit_reason = KVM_EXIT_INTR;
2755                vcpu->arch.ret = -EINTR;
2756        }
2757
2758        if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) {
2759                /* Wake up some vcpu to run the core */
2760                v = list_first_entry(&vc->runnable_threads,
2761                                     struct kvm_vcpu, arch.run_list);
2762                wake_up(&v->arch.cpu_run);
2763        }
2764
2765        trace_kvmppc_run_vcpu_exit(vcpu, kvm_run);
2766        spin_unlock(&vc->lock);
2767        return vcpu->arch.ret;
2768}
2769
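    /*
     * Top-level vcpu run function for HV mode: set up the HPT and VRMA
     * on first use, then run the vcpu, handling PAPR hypercalls and
     * guest page faults until we need to go back to userspace.
     */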
2770static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
2771{
2772        int r;
2773        int srcu_idx;
2774
2775        if (!vcpu->arch.sane) {
2776                run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2777                return -EINVAL;
2778        }
2779
2780        kvmppc_core_prepare_to_enter(vcpu);
2781
2782        /* No need to go into the guest when all we'll do is come back out */
2783        if (signal_pending(current)) {
2784                run->exit_reason = KVM_EXIT_INTR;
2785                return -EINTR;
2786        }
2787
2788        atomic_inc(&vcpu->kvm->arch.vcpus_running);
2789        /* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */
2790        smp_mb();
2791
2792        /* On the first time here, set up HTAB and VRMA */
2793        if (!vcpu->kvm->arch.hpte_setup_done) {
2794                r = kvmppc_hv_setup_htab_rma(vcpu);
2795                if (r)
2796                        goto out;
2797        }
2798
2799        flush_all_to_thread(current);
2800
2801        vcpu->arch.wqp = &vcpu->arch.vcore->wq;
2802        vcpu->arch.pgdir = current->mm->pgd;
2803        vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
2804
2805        do {
2806                r = kvmppc_run_vcpu(run, vcpu);
2807
2808                if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
2809                    !(vcpu->arch.shregs.msr & MSR_PR)) {
2810                        trace_kvm_hcall_enter(vcpu);
2811                        r = kvmppc_pseries_do_hcall(vcpu);
2812                        trace_kvm_hcall_exit(vcpu, r);
2813                        kvmppc_core_prepare_to_enter(vcpu);
2814                } else if (r == RESUME_PAGE_FAULT) {
2815                        srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
2816                        r = kvmppc_book3s_hv_page_fault(run, vcpu,
2817                                vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
2818                        srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
2819                }
2820        } while (is_kvmppc_resume_guest(r));
2821
2822 out:
2823        vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
2824        atomic_dec(&vcpu->kvm->arch.vcpus_running);
2825        return r;
2826}
2827
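    /*
     * Fill in one segment page-size entry for the get_smmu_info ioctl
     * from the host MMU page-size definitions, including the 16MB MPSS
     * encoding where the host supports it.
     */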
2828static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
2829                                     int linux_psize)
2830{
2831        struct mmu_psize_def *def = &mmu_psize_defs[linux_psize];
2832
2833        if (!def->shift)
2834                return;
2835        (*sps)->page_shift = def->shift;
2836        (*sps)->slb_enc = def->sllp;
2837        (*sps)->enc[0].page_shift = def->shift;
2838        (*sps)->enc[0].pte_enc = def->penc[linux_psize];
2839        /*
2840         * Add 16MB MPSS support if the host supports it
2841         */
2842        if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) {
2843                (*sps)->enc[1].page_shift = 24;
2844                (*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
2845        }
2846        (*sps)++;
2847}
2848
2849static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
2850                                         struct kvm_ppc_smmu_info *info)
2851{
2852        struct kvm_ppc_one_seg_page_size *sps;
2853
2854        info->flags = KVM_PPC_PAGE_SIZES_REAL;
2855        if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
2856                info->flags |= KVM_PPC_1T_SEGMENTS;
2857        info->slb_size = mmu_slb_size;
2858
2859        /* We only support these sizes for now, and no multi-size segments */
2860        sps = &info->sps[0];
2861        kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K);
2862        kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K);
2863        kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M);
2864
2865        return 0;
2866}
2867
2868/*
2869 * Get (and clear) the dirty memory log for a memory slot.
2870 */
2871static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
2872                                         struct kvm_dirty_log *log)
2873{
2874        struct kvm_memslots *slots;
2875        struct kvm_memory_slot *memslot;
2876        int r;
2877        unsigned long n;
2878
2879        mutex_lock(&kvm->slots_lock);
2880
2881        r = -EINVAL;
2882        if (log->slot >= KVM_USER_MEM_SLOTS)
2883                goto out;
2884
2885        slots = kvm_memslots(kvm);
2886        memslot = id_to_memslot(slots, log->slot);
2887        r = -ENOENT;
2888        if (!memslot->dirty_bitmap)
2889                goto out;
2890
2891        n = kvm_dirty_bitmap_bytes(memslot);
2892        memset(memslot->dirty_bitmap, 0, n);
2893
2894        r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap);
2895        if (r)
2896                goto out;
2897
2898        r = -EFAULT;
2899        if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
2900                goto out;
2901
2902        r = 0;
2903out:
2904        mutex_unlock(&kvm->slots_lock);
2905        return r;
2906}
2907
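/*
 * Free the reverse-mapping (rmap) array for a memslot, unless it is
 * shared with the slot described by 'dont'.
 */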
2908static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
2909                                        struct kvm_memory_slot *dont)
2910{
2911        if (!dont || free->arch.rmap != dont->arch.rmap) {
2912                vfree(free->arch.rmap);
2913                free->arch.rmap = NULL;
2914        }
2915}
2916
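/*
 * Allocate the rmap array for a new memslot: one zeroed entry per
 * guest page.
 */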
2917static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
2918                                         unsigned long npages)
2919{
2920        slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
2921        if (!slot->arch.rmap)
2922                return -ENOMEM;
2923
2924        return 0;
2925}
2926
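/* No preparation is needed for HV guests. */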
2927static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm,
2928                                        struct kvm_memory_slot *memslot,
2929                                        const struct kvm_userspace_memory_region *mem)
2930{
2931        return 0;
2932}
2933
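/*
 * Called after a memslot update has been committed.  For a modified
 * slot, harvest and clear any stale dirty bits in the rmap array.
 */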
2934static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
2935                                const struct kvm_userspace_memory_region *mem,
2936                                const struct kvm_memory_slot *old,
2937                                const struct kvm_memory_slot *new)
2938{
2939        unsigned long npages = mem->memory_size >> PAGE_SHIFT;
2940        struct kvm_memslots *slots;
2941        struct kvm_memory_slot *memslot;
2942
2943        if (npages && old->npages) {
2944                /*
2945                 * If modifying a memslot, reset all the rmap dirty bits.
2946                 * If this is a new memslot, we don't need to do anything
2947                 * since the rmap array starts out as all zeroes,
2948                 * i.e. no pages are dirty.
2949                 */
2950                slots = kvm_memslots(kvm);
2951                memslot = id_to_memslot(slots, mem->slot);
2952                kvmppc_hv_get_dirty_log(kvm, memslot, NULL);
2953        }
2954}
2955
2956/*
2957 * Update LPCR values in kvm->arch and in vcores.
2958 * Caller must hold kvm->lock.
2959 */
2960void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask)
2961{
2962        long int i;
2963        u32 cores_done = 0;
2964
2965        if ((kvm->arch.lpcr & mask) == lpcr)
2966                return;
2967
2968        kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr;
2969
2970        for (i = 0; i < KVM_MAX_VCORES; ++i) {
2971                struct kvmppc_vcore *vc = kvm->arch.vcores[i];
2972                if (!vc)
2973                        continue;
2974                spin_lock(&vc->lock);
2975                vc->lpcr = (vc->lpcr & ~mask) | lpcr;
2976                spin_unlock(&vc->lock);
2977                if (++cores_done >= kvm->arch.online_vcores)
2978                        break;
2979        }
2980}
2981
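/*
 * No per-vcpu MMU state to tear down here; the guest's hashed page
 * table belongs to the VM and is freed in kvmppc_core_destroy_vm_hv().
 */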
2982static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu)
2983{
2984        return;
2985}
2986
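/*
 * Set up the guest's hashed page table and virtual real mode area
 * (VRMA), under kvm->lock.  This allocates the HPT if necessary,
 * finds the memslot backing guest real address 0, maps the VRMA at
 * the page size backing that slot, and updates the VRMASD field in
 * the LPCR.  hpte_setup_done is set last, after an smp_wmb(), so
 * other vcpus don't enter the guest before setup is complete.
 */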
2987static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
2988{
2989        int err = 0;
2990        struct kvm *kvm = vcpu->kvm;
2991        unsigned long hva;
2992        struct kvm_memory_slot *memslot;
2993        struct vm_area_struct *vma;
2994        unsigned long lpcr = 0, senc;
2995        unsigned long psize, porder;
2996        int srcu_idx;
2997
2998        mutex_lock(&kvm->lock);
2999        if (kvm->arch.hpte_setup_done)
3000                goto out;       /* another vcpu beat us to it */
3001
3002        /* Allocate hashed page table (if not done already) and reset it */
3003        if (!kvm->arch.hpt_virt) {
3004                err = kvmppc_alloc_hpt(kvm, NULL);
3005                if (err) {
3006                        pr_err("KVM: Couldn't alloc HPT\n");
3007                        goto out;
3008                }
3009        }
3010
3011        /* Look up the memslot for guest physical address 0 */
3012        srcu_idx = srcu_read_lock(&kvm->srcu);
3013        memslot = gfn_to_memslot(kvm, 0);
3014
3015        /* We must have some memory at 0 by now */
3016        err = -EINVAL;
3017        if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
3018                goto out_srcu;
3019
3020        /* Look up the VMA for the start of this memory slot */
3021        hva = memslot->userspace_addr;
3022        down_read(&current->mm->mmap_sem);
3023        vma = find_vma(current->mm, hva);
3024        if (!vma || vma->vm_start > hva || (vma->vm_flags & VM_IO))
3025                goto up_out;
3026
3027        psize = vma_kernel_pagesize(vma);
3028        porder = __ilog2(psize);
3029
3030        up_read(&current->mm->mmap_sem);
3031
3032        /* We can handle 4k, 64k or 16M pages in the VRMA */
3033        err = -EINVAL;
3034        if (!(psize == 0x1000 || psize == 0x10000 ||
3035              psize == 0x1000000))
3036                goto out_srcu;
3037
3038        /* Update VRMASD field in the LPCR */
3039        senc = slb_pgsize_encoding(psize);
3040        kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
3041                (VRMA_VSID << SLB_VSID_SHIFT_1T);
3042        /* the -4 is to account for senc values starting at 0x10 */
3043        lpcr = senc << (LPCR_VRMASD_SH - 4);
3044
3045        /* Create HPTEs in the hash page table for the VRMA */
3046        kvmppc_map_vrma(vcpu, memslot, porder);
3047
3048        kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
3049
3050        /* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */
3051        smp_wmb();
3052        kvm->arch.hpte_setup_done = 1;
3053        err = 0;
3054 out_srcu:
3055        srcu_read_unlock(&kvm->srcu, srcu_idx);
3056 out:
3057        mutex_unlock(&kvm->lock);
3058        return err;
3059
3060 up_out:
3061        up_read(&current->mm->mmap_sem);
3062        goto out_srcu;
3063}
3064
3065#ifdef CONFIG_KVM_XICS
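/*
 * CPU hotplug notifier: keep the per-core "in host" state used for
 * real-mode H_IPI redirection up to date as threads come and go.
 */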
3066static int kvmppc_cpu_notify(struct notifier_block *self, unsigned long action,
3067                        void *hcpu)
3068{
3069        unsigned long cpu = (long)hcpu;
3070
3071        switch (action) {
3072        case CPU_UP_PREPARE:
3073        case CPU_UP_PREPARE_FROZEN:
3074                kvmppc_set_host_core(cpu);
3075                break;
3076
3077#ifdef CONFIG_HOTPLUG_CPU
3078        case CPU_DEAD:
3079        case CPU_DEAD_FROZEN:
3080        case CPU_UP_CANCELED:
3081        case CPU_UP_CANCELED_FROZEN:
3082                kvmppc_clear_host_core(cpu);
3083                break;
3084#endif
3085        default:
3086                break;
3087        }
3088
3089        return NOTIFY_OK;
3090}
3091
3092static struct notifier_block kvmppc_cpu_notifier = {
3093            .notifier_call = kvmppc_cpu_notify,
3094};
3095
3096/*
3097 * Allocate a per-core structure for managing state about which cores are
3098 * running in the host versus the guest and for exchanging data between
3099 * real-mode KVM and CPUs running in the host.
3100 * This is only done for the first VM.
3101 * The allocated structure stays even if all VMs have stopped.
3102 * It is only freed when the kvm-hv module is unloaded.
3103 * It's OK for this routine to fail; we just won't support host
3104 * core operations such as redirecting H_IPI wakeups.
3105 */
3106void kvmppc_alloc_host_rm_ops(void)
3107{
3108        struct kvmppc_host_rm_ops *ops;
3109        unsigned long l_ops;
3110        int cpu, core;
3111        int size;
3112
3113        /* Not the first time here? */
3114        if (kvmppc_host_rm_ops_hv != NULL)
3115                return;
3116
3117        ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL);
3118        if (!ops)
3119                return;
3120
3121        size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core);
3122        ops->rm_core = kzalloc(size, GFP_KERNEL);
3123
3124        if (!ops->rm_core) {
3125                kfree(ops);
3126                return;
3127        }
3128
3129        get_online_cpus();
3130
3131        for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) {
3132                if (!cpu_online(cpu))
3133                        continue;
3134
3135                core = cpu >> threads_shift;
3136                ops->rm_core[core].rm_state.in_host = 1;
3137        }
3138
3139        ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv;
3140
3141        /*
3142         * Make the contents of the kvmppc_host_rm_ops structure visible
3143         * to other CPUs before we assign it to the global variable.
3144         * Do an atomic assignment (no locks used here), but if someone
3145         * beats us to it, just free our copy and return.
3146         */
3147        smp_wmb();
3148        l_ops = (unsigned long) ops;
3149
3150        if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) {
3151                put_online_cpus();
3152                kfree(ops->rm_core);
3153                kfree(ops);
3154                return;
3155        }
3156
3157        register_cpu_notifier(&kvmppc_cpu_notifier);
3158
3159        put_online_cpus();
3160}
3161
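/*
 * Undo kvmppc_alloc_host_rm_ops(): unregister the hotplug notifier
 * and free the per-core state.  Called at module exit.
 */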
3162void kvmppc_free_host_rm_ops(void)
3163{
3164        if (kvmppc_host_rm_ops_hv) {
3165                unregister_cpu_notifier(&kvmppc_cpu_notifier);
3166                kfree(kvmppc_host_rm_ops_hv->rm_core);
3167                kfree(kvmppc_host_rm_ops_hv);
3168                kvmppc_host_rm_ops_hv = NULL;
3169        }
3170}
3171#endif
3172
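/*
 * Per-VM initialisation: allocate an LPID, force a TLB flush on each
 * core before the first guest entry, enable the default hcall set,
 * derive the guest LPCR from the host's, mark an HV VM as active,
 * and create the VM's debugfs directory.
 */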
3173static int kvmppc_core_init_vm_hv(struct kvm *kvm)
3174{
3175        unsigned long lpcr, lpid;
3176        char buf[32];
3177
3178        /* Allocate the guest's logical partition ID */
3179
3180        lpid = kvmppc_alloc_lpid();
3181        if ((long)lpid < 0)
3182                return -ENOMEM;
3183        kvm->arch.lpid = lpid;
3184
3185        kvmppc_alloc_host_rm_ops();
3186
3187        /*
3188         * Since we don't flush the TLB when tearing down a VM,
3189         * and this lpid might have previously been used,
3190         * make sure we flush on each core before running the new VM.
3191         */
3192        cpumask_setall(&kvm->arch.need_tlb_flush);
3193
3194        /* Start out with the default set of hcalls enabled */
3195        memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls,
3196               sizeof(kvm->arch.enabled_hcalls));
3197
3198        kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
3199
3200        /* Init LPCR for virtual RMA mode */
3201        kvm->arch.host_lpid = mfspr(SPRN_LPID);
3202        kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
3203        lpcr &= LPCR_PECE | LPCR_LPES;
3204        lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
3205                LPCR_VPM0 | LPCR_VPM1;
3206        kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
3207                (VRMA_VSID << SLB_VSID_SHIFT_1T);
3208        /* On POWER8 turn on online bit to enable PURR/SPURR */
3209        if (cpu_has_feature(CPU_FTR_ARCH_207S))
3210                lpcr |= LPCR_ONL;
3211        kvm->arch.lpcr = lpcr;
3212
3213        /*
3214         * Track that we now have a HV mode VM active. This blocks secondary
3215         * CPU threads from coming online.
3216         */
3217        kvm_hv_vm_activated();
3218
3219        /*
3220         * Create a debugfs directory for the VM
3221         */
3222        snprintf(buf, sizeof(buf), "vm%d", current->pid);
3223        kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
3224        if (!IS_ERR_OR_NULL(kvm->arch.debugfs_dir))
3225                kvmppc_mmu_debugfs_init(kvm);
3226
3227        return 0;
3228}
3229
3230static void kvmppc_free_vcores(struct kvm *kvm)
3231{
3232        long int i;
3233
3234        for (i = 0; i < KVM_MAX_VCORES; ++i)
3235                kfree(kvm->arch.vcores[i]);
3236        kvm->arch.online_vcores = 0;
3237}
3238
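/*
 * Tear the VM down: remove its debugfs entries, drop the active HV VM
 * count, and free the virtual cores and the guest hashed page table.
 */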
3239static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
3240{
3241        debugfs_remove_recursive(kvm->arch.debugfs_dir);
3242
3243        kvm_hv_vm_deactivated();
3244
3245        kvmppc_free_vcores(kvm);
3246
3247        kvmppc_free_hpt(kvm);
3248}
3249
3250/* We don't need to emulate any privileged instructions or dcbz */
3251static int kvmppc_core_emulate_op_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
3252                                     unsigned int inst, int *advance)
3253{
3254        return EMULATE_FAIL;
3255}
3256
3257static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn,
3258                                        ulong spr_val)
3259{
3260        return EMULATE_FAIL;
3261}
3262
3263static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn,
3264                                        ulong *spr_val)
3265{
3266        return EMULATE_FAIL;
3267}
3268
3269static int kvmppc_core_check_processor_compat_hv(void)
3270{
3271        if (!cpu_has_feature(CPU_FTR_HVMODE) ||
3272            !cpu_has_feature(CPU_FTR_ARCH_206))
3273                return -EIO;
3274        return 0;
3275}
3276
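/*
 * HV-specific VM ioctls.  KVM_PPC_ALLOCATE_HTAB allocates or resets
 * the guest HPT at the requested order and returns the order actually
 * used; KVM_PPC_GET_HTAB_FD returns a file descriptor giving access
 * to the guest HPT contents.
 */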
3277static long kvm_arch_vm_ioctl_hv(struct file *filp,
3278                                 unsigned int ioctl, unsigned long arg)
3279{
3280        struct kvm *kvm __maybe_unused = filp->private_data;
3281        void __user *argp = (void __user *)arg;
3282        long r;
3283
3284        switch (ioctl) {
3285
3286        case KVM_PPC_ALLOCATE_HTAB: {
3287                u32 htab_order;
3288
3289                r = -EFAULT;
3290                if (get_user(htab_order, (u32 __user *)argp))
3291                        break;
3292                r = kvmppc_alloc_reset_hpt(kvm, &htab_order);
3293                if (r)
3294                        break;
3295                r = -EFAULT;
3296                if (put_user(htab_order, (u32 __user *)argp))
3297                        break;
3298                r = 0;
3299                break;
3300        }
3301
3302        case KVM_PPC_GET_HTAB_FD: {
3303                struct kvm_get_htab_fd ghf;
3304
3305                r = -EFAULT;
3306                if (copy_from_user(&ghf, argp, sizeof(ghf)))
3307                        break;
3308                r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf);
3309                break;
3310        }
3311
3312        default:
3313                r = -ENOTTY;
3314        }
3315
3316        return r;
3317}
3318
3319/*
3320 * List of hcall numbers to enable by default.
3321 * For compatibility with old userspace, we enable by default
3322 * all hcalls that were implemented before the hcall-enabling
3323 * facility was added.  Note this list should not include H_RTAS.
3324 */
3325static unsigned int default_hcall_list[] = {
3326        H_REMOVE,
3327        H_ENTER,
3328        H_READ,
3329        H_PROTECT,
3330        H_BULK_REMOVE,
3331        H_GET_TCE,
3332        H_PUT_TCE,
3333        H_SET_DABR,
3334        H_SET_XDABR,
3335        H_CEDE,
3336        H_PROD,
3337        H_CONFER,
3338        H_REGISTER_VPA,
3339#ifdef CONFIG_KVM_XICS
3340        H_EOI,
3341        H_CPPR,
3342        H_IPI,
3343        H_IPOLL,
3344        H_XIRR,
3345        H_XIRR_X,
3346#endif
3347        0
3348};
3349
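/*
 * Seed the default_enabled_hcalls bitmap from the list above.  hcall
 * numbers are multiples of 4, so hcall / 4 is used as the bit index.
 */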
3350static void init_default_hcalls(void)
3351{
3352        int i;
3353        unsigned int hcall;
3354
3355        for (i = 0; default_hcall_list[i]; ++i) {
3356                hcall = default_hcall_list[i];
3357                WARN_ON(!kvmppc_hcall_impl_hv(hcall));
3358                __set_bit(hcall / 4, default_enabled_hcalls);
3359        }
3360}
3361
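/*
 * Ops table that plugs this HV implementation into the common Book3S
 * KVM code; installed as kvmppc_hv_ops by kvmppc_book3s_init_hv().
 */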
3362static struct kvmppc_ops kvm_ops_hv = {
3363        .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
3364        .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
3365        .get_one_reg = kvmppc_get_one_reg_hv,
3366        .set_one_reg = kvmppc_set_one_reg_hv,
3367        .vcpu_load   = kvmppc_core_vcpu_load_hv,
3368        .vcpu_put    = kvmppc_core_vcpu_put_hv,
3369        .set_msr     = kvmppc_set_msr_hv,
3370        .vcpu_run    = kvmppc_vcpu_run_hv,
3371        .vcpu_create = kvmppc_core_vcpu_create_hv,
3372        .vcpu_free   = kvmppc_core_vcpu_free_hv,
3373        .check_requests = kvmppc_core_check_requests_hv,
3374        .get_dirty_log  = kvm_vm_ioctl_get_dirty_log_hv,
3375        .flush_memslot  = kvmppc_core_flush_memslot_hv,
3376        .prepare_memory_region = kvmppc_core_prepare_memory_region_hv,
3377        .commit_memory_region  = kvmppc_core_commit_memory_region_hv,
3378        .unmap_hva = kvm_unmap_hva_hv,
3379        .unmap_hva_range = kvm_unmap_hva_range_hv,
3380        .age_hva  = kvm_age_hva_hv,
3381        .test_age_hva = kvm_test_age_hva_hv,
3382        .set_spte_hva = kvm_set_spte_hva_hv,
3383        .mmu_destroy  = kvmppc_mmu_destroy_hv,
3384        .free_memslot = kvmppc_core_free_memslot_hv,
3385        .create_memslot = kvmppc_core_create_memslot_hv,
3386        .init_vm =  kvmppc_core_init_vm_hv,
3387        .destroy_vm = kvmppc_core_destroy_vm_hv,
3388        .get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv,
3389        .emulate_op = kvmppc_core_emulate_op_hv,
3390        .emulate_mtspr = kvmppc_core_emulate_mtspr_hv,
3391        .emulate_mfspr = kvmppc_core_emulate_mfspr_hv,
3392        .fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv,
3393        .arch_vm_ioctl  = kvm_arch_vm_ioctl_hv,
3394        .hcall_implemented = kvmppc_hcall_impl_hv,
3395};
3396
3397static int kvmppc_book3s_init_hv(void)
3398{
3399        int r;
3400        /*
3401         * FIXME!! Do we need to check on all CPUs?
3402         */
3403        r = kvmppc_core_check_processor_compat_hv();
3404        if (r < 0)
3405                return -ENODEV;
3406
3407        kvm_ops_hv.owner = THIS_MODULE;
3408        kvmppc_hv_ops = &kvm_ops_hv;
3409
3410        init_default_hcalls();
3411
3412        init_vcore_lists();
3413
3414        r = kvmppc_mmu_hv_init();
3415        return r;
3416}
3417
3418static void kvmppc_book3s_exit_hv(void)
3419{
3420        kvmppc_free_host_rm_ops();
3421        kvmppc_hv_ops = NULL;
3422}
3423
3424module_init(kvmppc_book3s_init_hv);
3425module_exit(kvmppc_book3s_exit_hv);
3426MODULE_LICENSE("GPL");
3427MODULE_ALIAS_MISCDEV(KVM_MINOR);
3428MODULE_ALIAS("devname:kvm");
3429