qemu/target/arm/kvm64.c
   1/*
   2 * ARM implementation of KVM hooks, 64 bit specific code
   3 *
   4 * Copyright Mian-M. Hamayun 2013, Virtual Open Systems
   5 * Copyright Alex Bennée 2014, Linaro
   6 *
   7 * This work is licensed under the terms of the GNU GPL, version 2 or later.
   8 * See the COPYING file in the top-level directory.
   9 *
  10 */
  11
  12#include "qemu/osdep.h"
  13#include <sys/ioctl.h>
  14#include <sys/ptrace.h>
  15
  16#include <linux/elf.h>
  17#include <linux/kvm.h>
  18
  19#include "qapi/error.h"
  20#include "cpu.h"
  21#include "qemu/timer.h"
  22#include "qemu/error-report.h"
  23#include "qemu/host-utils.h"
  24#include "qemu/main-loop.h"
  25#include "exec/gdbstub.h"
  26#include "sysemu/runstate.h"
  27#include "sysemu/kvm.h"
  28#include "sysemu/kvm_int.h"
  29#include "kvm_arm.h"
  30#include "internals.h"
  31#include "hw/acpi/acpi.h"
  32#include "hw/acpi/ghes.h"
  33#include "hw/arm/virt.h"
  34
  35static bool have_guest_debug;
  36
  37/*
  38 * Although the ARM implementation of hardware assisted debugging
  39 * allows for different breakpoints per-core, the current GDB
  40 * interface treats them as a global pool of registers (which seems to
  41 * be the case for x86, ppc and s390). As a result we store one copy
  42 * of registers which is used for all active cores.
  43 *
  44 * Write access is serialised by virtue of the GDB protocol which
  45 * updates things. Read access (i.e. when the values are copied to the
  46 * vCPU) is also gated by GDB's run control.
  47 *
  48 * This is not unreasonable as most of the time when debugging kernels
  49 * you never know which core will eventually execute your code.
  50 */
  51
  52typedef struct {
  53    uint64_t bcr;
  54    uint64_t bvr;
  55} HWBreakpoint;
  56
  57/* The watchpoint registers can cover more area than the requested
  58 * watchpoint so we need to store the additional information
  59 * somewhere. We also need to supply a CPUWatchpoint to the GDB stub
  60 * when the watchpoint is hit.
  61 */
  62typedef struct {
  63    uint64_t wcr;
  64    uint64_t wvr;
  65    CPUWatchpoint details;
  66} HWWatchpoint;
  67
  68/* Maximum and current break/watch point counts */
  69int max_hw_bps, max_hw_wps;
  70GArray *hw_breakpoints, *hw_watchpoints;
  71
  72#define cur_hw_wps      (hw_watchpoints->len)
  73#define cur_hw_bps      (hw_breakpoints->len)
  74#define get_hw_bp(i)    (&g_array_index(hw_breakpoints, HWBreakpoint, i))
  75#define get_hw_wp(i)    (&g_array_index(hw_watchpoints, HWWatchpoint, i))
  76
  77/**
  78 * kvm_arm_init_debug() - check for guest debug capabilities
  79 * @cs: CPUState
  80 *
  81 * kvm_check_extension returns the number of debug registers we have
  82 * or 0 if we have none.
  83 *
  84 */
  85static void kvm_arm_init_debug(CPUState *cs)
  86{
  87    have_guest_debug = kvm_check_extension(cs->kvm_state,
  88                                           KVM_CAP_SET_GUEST_DEBUG);
  89
  90    max_hw_wps = kvm_check_extension(cs->kvm_state, KVM_CAP_GUEST_DEBUG_HW_WPS);
  91    hw_watchpoints = g_array_sized_new(true, true,
  92                                       sizeof(HWWatchpoint), max_hw_wps);
  93
  94    max_hw_bps = kvm_check_extension(cs->kvm_state, KVM_CAP_GUEST_DEBUG_HW_BPS);
  95    hw_breakpoints = g_array_sized_new(true, true,
  96                                       sizeof(HWBreakpoint), max_hw_bps);
  97    return;
  98}
  99
 100/**
 101 * insert_hw_breakpoint()
 102 * @addr: address of breakpoint
 103 *
 104 * See ARM ARM D2.9.1 for details but here we are only going to create
 105 * simple un-linked breakpoints (i.e. we don't chain breakpoints
 106 * together to match address and context or vmid). The hardware is
 107 * capable of fancier matching but that would require exposing that
 108 * fanciness to GDB's interface.
 109 *
 110 * DBGBCR<n>_EL1, Debug Breakpoint Control Registers
 111 *
 112 *  31  24 23  20 19   16 15 14  13  12   9 8   5 4    3 2   1  0
 113 * +------+------+-------+-----+----+------+-----+------+-----+---+
 114 * | RES0 |  BT  |  LBN  | SSC | HMC| RES0 | BAS | RES0 | PMC | E |
 115 * +------+------+-------+-----+----+------+-----+------+-----+---+
 116 *
 117 * BT: Breakpoint type (0 = unlinked address match)
 118 * LBN: Linked BP number (0 = unused)
 119 * SSC/HMC/PMC: Security, Higher and Priv access control (Table D-12)
 120 * BAS: Byte Address Select (RES1 for AArch64)
 121 * E: Enable bit
 122 *
 123 * DBGBVR<n>_EL1, Debug Breakpoint Value Registers
 124 *
 125 *  63  53 52       49 48       2  1 0
 126 * +------+-----------+----------+-----+
 127 * | RESS | VA[52:49] | VA[48:2] | 0 0 |
 128 * +------+-----------+----------+-----+
 129 *
 130 * Depending on the addressing mode bits the top bits of the register
 131 * are a sign extension of the highest applicable VA bit. Some
 132 * versions of GDB don't sign-extend correctly, so we fix up the value
 133 * here so that future PC comparisons will work properly.
 134 */
 135
 136static int insert_hw_breakpoint(target_ulong addr)
 137{
 138    HWBreakpoint brk = {
 139        .bcr = 0x1,                             /* BCR E=1, enable */
 140        .bvr = sextract64(addr, 0, 53)
 141    };
 142
 143    if (cur_hw_bps >= max_hw_bps) {
 144        return -ENOBUFS;
 145    }
 146
 147    brk.bcr = deposit32(brk.bcr, 1, 2, 0x3);   /* PMC = 11 */
 148    brk.bcr = deposit32(brk.bcr, 5, 4, 0xf);   /* BAS = RES1 */
 149
 150    g_array_append_val(hw_breakpoints, brk);
 151
 152    return 0;
 153}
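
/*
 * Worked example of the encoding built above, using an illustrative
 * breakpoint address: every breakpoint inserted here ends up with
 *
 *   bcr = 0x1          E   = 1      breakpoint enabled
 *       | (0x3 << 1)   PMC = 0b11   match at EL1 and EL0
 *       | (0xf << 5)   BAS = 0b1111 RES1 for AArch64
 *       = 0x1e7
 *
 * while bvr holds the PC with bits [63:53] replaced by a sign extension
 * of bit 52, so a canonical kernel address such as 0xffff800008123456
 * is stored unchanged.
 */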
 154
 155/**
 156 * delete_hw_breakpoint()
 157 * @pc: address of breakpoint
 158 *
 159 * Delete a breakpoint and shuffle any above down
 160 */
 161
 162static int delete_hw_breakpoint(target_ulong pc)
 163{
 164    int i;
 165    for (i = 0; i < hw_breakpoints->len; i++) {
 166        HWBreakpoint *brk = get_hw_bp(i);
 167        if (brk->bvr == pc) {
 168            g_array_remove_index(hw_breakpoints, i);
 169            return 0;
 170        }
 171    }
 172    return -ENOENT;
 173}
 174
 175/**
 176 * insert_hw_watchpoint()
 177 * @addr: address of watch point
 178 * @len: size of area
 179 * @type: type of watch point
 180 *
 181 * See ARM ARM D2.10. As with the breakpoints we can do some advanced
 182 * stuff if we want to. The watch points can be linked with the break
 183 * points above to make them context aware. However for simplicity
 184 * currently we only deal with simple read/write watch points.
 185 *
 186 * D7.3.11 DBGWCR<n>_EL1, Debug Watchpoint Control Registers
 187 *
 188 *  31  29 28   24 23  21  20  19 16 15 14  13   12  5 4   3 2   1  0
 189 * +------+-------+------+----+-----+-----+-----+-----+-----+-----+---+
 190 * | RES0 |  MASK | RES0 | WT | LBN | SSC | HMC | BAS | LSC | PAC | E |
 191 * +------+-------+------+----+-----+-----+-----+-----+-----+-----+---+
 192 *
 193 * MASK: num bits addr mask (0=none,01/10=res,11=3 bits (8 bytes))
 194 * WT: 0 - unlinked, 1 - linked (not currently used)
 195 * LBN: Linked BP number (not currently used)
 196 * SSC/HMC/PAC: Security, Higher and Priv access control (Table D2-11)
 197 * BAS: Byte Address Select
 198 * LSC: Load/Store control (01: load, 10: store, 11: both)
 199 * E: Enable
 200 *
 201 * The bottom 2 bits of the value register are masked. Therefore to
 202 * break on any size smaller than an unaligned word you need to set
 203 * MASK=0 and one BAS bit per byte in question. For larger, power-of-2
 204 * sized regions you need to mask the address as required and set BAS=0xff.
 205 */
 206
 207static int insert_hw_watchpoint(target_ulong addr,
 208                                target_ulong len, int type)
 209{
 210    HWWatchpoint wp = {
 211        .wcr = R_DBGWCR_E_MASK, /* E=1, enable */
 212        .wvr = addr & (~0x7ULL),
 213        .details = { .vaddr = addr, .len = len }
 214    };
 215
 216    if (cur_hw_wps >= max_hw_wps) {
 217        return -ENOBUFS;
 218    }
 219
 220    /*
 221     * HMC=0 SSC=0 PAC=3 will hit EL0 or EL1, any security state,
 222     * valid whether EL3 is implemented or not
 223     */
 224    wp.wcr = FIELD_DP64(wp.wcr, DBGWCR, PAC, 3);
 225
 226    switch (type) {
 227    case GDB_WATCHPOINT_READ:
 228        wp.wcr = FIELD_DP64(wp.wcr, DBGWCR, LSC, 1);
 229        wp.details.flags = BP_MEM_READ;
 230        break;
 231    case GDB_WATCHPOINT_WRITE:
 232        wp.wcr = FIELD_DP64(wp.wcr, DBGWCR, LSC, 2);
 233        wp.details.flags = BP_MEM_WRITE;
 234        break;
 235    case GDB_WATCHPOINT_ACCESS:
 236        wp.wcr = FIELD_DP64(wp.wcr, DBGWCR, LSC, 3);
 237        wp.details.flags = BP_MEM_ACCESS;
 238        break;
 239    default:
 240        g_assert_not_reached();
 241        break;
 242    }
 243    if (len <= 8) {
 244        /* we align the address and set the bits in BAS */
 245        int off = addr & 0x7;
 246        int bas = (1 << len) - 1;
 247
 248        wp.wcr = deposit32(wp.wcr, 5 + off, 8 - off, bas);
 249    } else {
 250        /* For ranges above 8 bytes the length must be a power of 2 */
 251        if (is_power_of_2(len)) {
 252            int bits = ctz64(len);
 253
 254            wp.wvr &= ~((1 << bits) - 1);
 255            wp.wcr = FIELD_DP64(wp.wcr, DBGWCR, MASK, bits);
 256            wp.wcr = FIELD_DP64(wp.wcr, DBGWCR, BAS, 0xff);
 257        } else {
 258            return -ENOBUFS;
 259        }
 260    }
 261
 262    g_array_append_val(hw_watchpoints, wp);
 263    return 0;
 264}
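
/*
 * Worked example of the BAS/MASK handling above (illustrative values):
 * a 2-byte watchpoint at 0x1003 gives off = 3 and bas = 0b11, so
 * deposit32(wcr, 8, 5, 0b11) selects bytes 3 and 4 of the doubleword at
 * wvr = 0x1000, i.e. exactly addresses 0x1003-0x1004. A 16-byte
 * watchpoint takes the power-of-2 branch instead: MASK = ctz64(16) = 4,
 * BAS = 0xff and wvr is aligned down to a 16-byte boundary, covering the
 * whole region.
 */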
 265
 266
 267static bool check_watchpoint_in_range(int i, target_ulong addr)
 268{
 269    HWWatchpoint *wp = get_hw_wp(i);
 270    uint64_t addr_top, addr_bottom = wp->wvr;
 271    int bas = extract32(wp->wcr, 5, 8);
 272    int mask = extract32(wp->wcr, 24, 4);
 273
 274    if (mask) {
 275        addr_top = addr_bottom + (1 << mask);
 276    } else {
 277        /* BAS must be contiguous but can be offset against the base
 278         * address in DBGWVR */
 279        addr_bottom = addr_bottom + ctz32(bas);
 280        addr_top = addr_bottom + clo32(bas);
 281    }
 282
 283    if (addr >= addr_bottom && addr <= addr_top) {
 284        return true;
 285    }
 286
 287    return false;
 288}
 289
 290/**
 291 * delete_hw_watchpoint()
 292 * @addr: address of watchpoint
 293 *
 294 * Delete a watchpoint and shuffle any above down
 295 */
 296
 297static int delete_hw_watchpoint(target_ulong addr,
 298                                target_ulong len, int type)
 299{
 300    int i;
 301    for (i = 0; i < cur_hw_wps; i++) {
 302        if (check_watchpoint_in_range(i, addr)) {
 303            g_array_remove_index(hw_watchpoints, i);
 304            return 0;
 305        }
 306    }
 307    return -ENOENT;
 308}
 309
 310
 311int kvm_arch_insert_hw_breakpoint(target_ulong addr,
 312                                  target_ulong len, int type)
 313{
 314    switch (type) {
 315    case GDB_BREAKPOINT_HW:
 316        return insert_hw_breakpoint(addr);
 317        break;
 318    case GDB_WATCHPOINT_READ:
 319    case GDB_WATCHPOINT_WRITE:
 320    case GDB_WATCHPOINT_ACCESS:
 321        return insert_hw_watchpoint(addr, len, type);
 322    default:
 323        return -ENOSYS;
 324    }
 325}
 326
 327int kvm_arch_remove_hw_breakpoint(target_ulong addr,
 328                                  target_ulong len, int type)
 329{
 330    switch (type) {
 331    case GDB_BREAKPOINT_HW:
 332        return delete_hw_breakpoint(addr);
 333    case GDB_WATCHPOINT_READ:
 334    case GDB_WATCHPOINT_WRITE:
 335    case GDB_WATCHPOINT_ACCESS:
 336        return delete_hw_watchpoint(addr, len, type);
 337    default:
 338        return -ENOSYS;
 339    }
 340}
 341
 342
 343void kvm_arch_remove_all_hw_breakpoints(void)
 344{
 345    if (cur_hw_wps > 0) {
 346        g_array_remove_range(hw_watchpoints, 0, cur_hw_wps);
 347    }
 348    if (cur_hw_bps > 0) {
 349        g_array_remove_range(hw_breakpoints, 0, cur_hw_bps);
 350    }
 351}
 352
 353void kvm_arm_copy_hw_debug_data(struct kvm_guest_debug_arch *ptr)
 354{
 355    int i;
 356    memset(ptr, 0, sizeof(struct kvm_guest_debug_arch));
 357
 358    for (i = 0; i < max_hw_wps; i++) {
 359        HWWatchpoint *wp = get_hw_wp(i);
 360        ptr->dbg_wcr[i] = wp->wcr;
 361        ptr->dbg_wvr[i] = wp->wvr;
 362    }
 363    for (i = 0; i < max_hw_bps; i++) {
 364        HWBreakpoint *bp = get_hw_bp(i);
 365        ptr->dbg_bcr[i] = bp->bcr;
 366        ptr->dbg_bvr[i] = bp->bvr;
 367    }
 368}
 369
 370bool kvm_arm_hw_debug_active(CPUState *cs)
 371{
 372    return ((cur_hw_wps > 0) || (cur_hw_bps > 0));
 373}
 374
 375static bool find_hw_breakpoint(CPUState *cpu, target_ulong pc)
 376{
 377    int i;
 378
 379    for (i = 0; i < cur_hw_bps; i++) {
 380        HWBreakpoint *bp = get_hw_bp(i);
 381        if (bp->bvr == pc) {
 382            return true;
 383        }
 384    }
 385    return false;
 386}
 387
 388static CPUWatchpoint *find_hw_watchpoint(CPUState *cpu, target_ulong addr)
 389{
 390    int i;
 391
 392    for (i = 0; i < cur_hw_wps; i++) {
 393        if (check_watchpoint_in_range(i, addr)) {
 394            return &get_hw_wp(i)->details;
 395        }
 396    }
 397    return NULL;
 398}
 399
 400static bool kvm_arm_set_device_attr(CPUState *cs, struct kvm_device_attr *attr,
 401                                    const char *name)
 402{
 403    int err;
 404
 405    err = kvm_vcpu_ioctl(cs, KVM_HAS_DEVICE_ATTR, attr);
 406    if (err != 0) {
 407        error_report("%s: KVM_HAS_DEVICE_ATTR: %s", name, strerror(-err));
 408        return false;
 409    }
 410
 411    err = kvm_vcpu_ioctl(cs, KVM_SET_DEVICE_ATTR, attr);
 412    if (err != 0) {
 413        error_report("%s: KVM_SET_DEVICE_ATTR: %s", name, strerror(-err));
 414        return false;
 415    }
 416
 417    return true;
 418}
 419
 420void kvm_arm_pmu_init(CPUState *cs)
 421{
 422    struct kvm_device_attr attr = {
 423        .group = KVM_ARM_VCPU_PMU_V3_CTRL,
 424        .attr = KVM_ARM_VCPU_PMU_V3_INIT,
 425    };
 426
 427    if (!ARM_CPU(cs)->has_pmu) {
 428        return;
 429    }
 430    if (!kvm_arm_set_device_attr(cs, &attr, "PMU")) {
 431        error_report("failed to init PMU");
 432        abort();
 433    }
 434}
 435
 436void kvm_arm_pmu_set_irq(CPUState *cs, int irq)
 437{
 438    struct kvm_device_attr attr = {
 439        .group = KVM_ARM_VCPU_PMU_V3_CTRL,
 440        .addr = (intptr_t)&irq,
 441        .attr = KVM_ARM_VCPU_PMU_V3_IRQ,
 442    };
 443
 444    if (!ARM_CPU(cs)->has_pmu) {
 445        return;
 446    }
 447    if (!kvm_arm_set_device_attr(cs, &attr, "PMU")) {
 448        error_report("failed to set irq for PMU");
 449        abort();
 450    }
 451}
 452
 453void kvm_arm_pvtime_init(CPUState *cs, uint64_t ipa)
 454{
 455    struct kvm_device_attr attr = {
 456        .group = KVM_ARM_VCPU_PVTIME_CTRL,
 457        .attr = KVM_ARM_VCPU_PVTIME_IPA,
 458        .addr = (uint64_t)&ipa,
 459    };
 460
 461    if (ARM_CPU(cs)->kvm_steal_time == ON_OFF_AUTO_OFF) {
 462        return;
 463    }
 464    if (!kvm_arm_set_device_attr(cs, &attr, "PVTIME IPA")) {
 465        error_report("failed to init PVTIME IPA");
 466        abort();
 467    }
 468}
 469
 470static int read_sys_reg32(int fd, uint32_t *pret, uint64_t id)
 471{
 472    uint64_t ret;
 473    struct kvm_one_reg idreg = { .id = id, .addr = (uintptr_t)&ret };
 474    int err;
 475
 476    assert((id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U64);
 477    err = ioctl(fd, KVM_GET_ONE_REG, &idreg);
 478    if (err < 0) {
 479        return -1;
 480    }
 481    *pret = ret;
 482    return 0;
 483}
 484
 485static int read_sys_reg64(int fd, uint64_t *pret, uint64_t id)
 486{
 487    struct kvm_one_reg idreg = { .id = id, .addr = (uintptr_t)pret };
 488
 489    assert((id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U64);
 490    return ioctl(fd, KVM_GET_ONE_REG, &idreg);
 491}
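
/*
 * The ARM64_SYS_REG() ids used below encode (op0, op1, CRn, CRm, op2) of
 * the system register to read; for example ARM64_SYS_REG(3, 0, 0, 4, 0)
 * is S3_0_C0_C4_0, i.e. ID_AA64PFR0_EL1, and ARM64_SYS_REG(3, 3, 9, 12, 0)
 * is PMCR_EL0.
 */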
 492
 493static bool kvm_arm_pauth_supported(void)
 494{
 495    return (kvm_check_extension(kvm_state, KVM_CAP_ARM_PTRAUTH_ADDRESS) &&
 496            kvm_check_extension(kvm_state, KVM_CAP_ARM_PTRAUTH_GENERIC));
 497}
 498
 499bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf)
 500{
 501    /* Identify the feature bits corresponding to the host CPU, and
 502     * fill out the ARMHostCPUFeatures fields accordingly. To do this
 503     * we have to create a scratch VM, create a single CPU inside it,
 504     * and then query that CPU for the relevant ID registers.
 505     */
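
    /*
     * fdarray[] is filled in by kvm_arm_create_scratch_host_vcpu() and is
     * expected to hold the KVM, VM and vCPU file descriptors in that
     * order, which is why all of the ID register reads below go through
     * fdarray[2].
     */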
 506    int fdarray[3];
 507    bool sve_supported;
 508    bool pmu_supported = false;
 509    uint64_t features = 0;
 510    int err;
 511
 512    /* Old kernels may not know about the PREFERRED_TARGET ioctl: however
 513     * we know they will only support creating one kind of guest CPU,
 514     * which is their preferred CPU type. Fortunately these old kernels
 515     * support only a very limited number of CPUs.
 516     */
 517    static const uint32_t cpus_to_try[] = {
 518        KVM_ARM_TARGET_AEM_V8,
 519        KVM_ARM_TARGET_FOUNDATION_V8,
 520        KVM_ARM_TARGET_CORTEX_A57,
 521        QEMU_KVM_ARM_TARGET_NONE
 522    };
 523    /*
 524     * target = -1 informs kvm_arm_create_scratch_host_vcpu()
 525     * to use the preferred target
 526     */
 527    struct kvm_vcpu_init init = { .target = -1, };
 528
 529    /*
 530     * Ask for SVE if supported, so that we can query ID_AA64ZFR0,
 531     * which is otherwise RAZ.
 532     */
 533    sve_supported = kvm_arm_sve_supported();
 534    if (sve_supported) {
 535        init.features[0] |= 1 << KVM_ARM_VCPU_SVE;
 536    }
 537
 538    /*
 539     * Ask for Pointer Authentication if supported, so that we get
 540     * the unsanitized field values for AA64ISAR1_EL1.
 541     */
 542    if (kvm_arm_pauth_supported()) {
 543        init.features[0] |= (1 << KVM_ARM_VCPU_PTRAUTH_ADDRESS |
 544                             1 << KVM_ARM_VCPU_PTRAUTH_GENERIC);
 545    }
 546
 547    if (kvm_arm_pmu_supported()) {
 548        init.features[0] |= 1 << KVM_ARM_VCPU_PMU_V3;
 549        pmu_supported = true;
 550    }
 551
 552    if (!kvm_arm_create_scratch_host_vcpu(cpus_to_try, fdarray, &init)) {
 553        return false;
 554    }
 555
 556    ahcf->target = init.target;
 557    ahcf->dtb_compatible = "arm,arm-v8";
 558
 559    err = read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64pfr0,
 560                         ARM64_SYS_REG(3, 0, 0, 4, 0));
 561    if (unlikely(err < 0)) {
 562        /*
 563         * Before v4.15, the kernel only exposed a limited number of system
 564         * registers, not including any of the interesting AArch64 ID regs.
 565         * For the most part we could leave these fields as zero with minimal
 566         * effect, since this does not affect the values seen by the guest.
 567         *
 568         * However, it could cause problems down the line for QEMU,
 569         * so provide a minimal v8.0 default.
 570         *
 571         * ??? Could read MIDR and use knowledge from cpu64.c.
 572         * ??? Could map a page of memory into our temp guest and
 573         *     run the tiniest of hand-crafted kernels to extract
 574         *     the values seen by the guest.
 575         * ??? Either of these sounds like too much effort just
 576         *     to work around running a modern host kernel.
 577         */
 578        ahcf->isar.id_aa64pfr0 = 0x00000011; /* EL1&0, AArch64 only */
 579        err = 0;
 580    } else {
 581        err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64pfr1,
 582                              ARM64_SYS_REG(3, 0, 0, 4, 1));
 583        err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64smfr0,
 584                              ARM64_SYS_REG(3, 0, 0, 4, 5));
 585        err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64dfr0,
 586                              ARM64_SYS_REG(3, 0, 0, 5, 0));
 587        err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64dfr1,
 588                              ARM64_SYS_REG(3, 0, 0, 5, 1));
 589        err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64isar0,
 590                              ARM64_SYS_REG(3, 0, 0, 6, 0));
 591        err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64isar1,
 592                              ARM64_SYS_REG(3, 0, 0, 6, 1));
 593        err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64mmfr0,
 594                              ARM64_SYS_REG(3, 0, 0, 7, 0));
 595        err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64mmfr1,
 596                              ARM64_SYS_REG(3, 0, 0, 7, 1));
 597        err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64mmfr2,
 598                              ARM64_SYS_REG(3, 0, 0, 7, 2));
 599
 600        /*
 601         * Note that if AArch32 support is not present in the host,
 602         * the AArch32 sysregs are present to be read, but will
 603         * return UNKNOWN values.  This is neither better nor worse
 604         * than skipping the reads and leaving 0, as we must avoid
 605         * considering the values in every case.
 606         */
 607        err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_pfr0,
 608                              ARM64_SYS_REG(3, 0, 0, 1, 0));
 609        err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_pfr1,
 610                              ARM64_SYS_REG(3, 0, 0, 1, 1));
 611        err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_pfr2,
 612                              ARM64_SYS_REG(3, 0, 0, 3, 4));
 613        err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_dfr0,
 614                              ARM64_SYS_REG(3, 0, 0, 1, 2));
 615        err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_mmfr0,
 616                              ARM64_SYS_REG(3, 0, 0, 1, 4));
 617        err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_mmfr1,
 618                              ARM64_SYS_REG(3, 0, 0, 1, 5));
 619        err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_mmfr2,
 620                              ARM64_SYS_REG(3, 0, 0, 1, 6));
 621        err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_mmfr3,
 622                              ARM64_SYS_REG(3, 0, 0, 1, 7));
 623        err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar0,
 624                              ARM64_SYS_REG(3, 0, 0, 2, 0));
 625        err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar1,
 626                              ARM64_SYS_REG(3, 0, 0, 2, 1));
 627        err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar2,
 628                              ARM64_SYS_REG(3, 0, 0, 2, 2));
 629        err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar3,
 630                              ARM64_SYS_REG(3, 0, 0, 2, 3));
 631        err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar4,
 632                              ARM64_SYS_REG(3, 0, 0, 2, 4));
 633        err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar5,
 634                              ARM64_SYS_REG(3, 0, 0, 2, 5));
 635        err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_mmfr4,
 636                              ARM64_SYS_REG(3, 0, 0, 2, 6));
 637        err |= read_sys_reg32(fdarray[2], &ahcf->isar.id_isar6,
 638                              ARM64_SYS_REG(3, 0, 0, 2, 7));
 639
 640        err |= read_sys_reg32(fdarray[2], &ahcf->isar.mvfr0,
 641                              ARM64_SYS_REG(3, 0, 0, 3, 0));
 642        err |= read_sys_reg32(fdarray[2], &ahcf->isar.mvfr1,
 643                              ARM64_SYS_REG(3, 0, 0, 3, 1));
 644        err |= read_sys_reg32(fdarray[2], &ahcf->isar.mvfr2,
 645                              ARM64_SYS_REG(3, 0, 0, 3, 2));
 646
 647        /*
 648         * DBGDIDR is a bit complicated because the kernel doesn't
 649         * provide an accessor for it in 64-bit mode, which is what this
 650         * scratch VM is in, and there's no architected "64-bit sysreg
 651         * which reads the same as the 32-bit register" the way there is
 652         * for other ID registers. Instead we synthesize a value from the
 653         * AArch64 ID_AA64DFR0, the same way the kernel code in
 654         * arch/arm64/kvm/sys_regs.c:trap_dbgidr() does.
 655         * We only do this if the CPU supports AArch32 at EL1.
 656         */
 657        if (FIELD_EX32(ahcf->isar.id_aa64pfr0, ID_AA64PFR0, EL1) >= 2) {
 658            int wrps = FIELD_EX64(ahcf->isar.id_aa64dfr0, ID_AA64DFR0, WRPS);
 659            int brps = FIELD_EX64(ahcf->isar.id_aa64dfr0, ID_AA64DFR0, BRPS);
 660            int ctx_cmps =
 661                FIELD_EX64(ahcf->isar.id_aa64dfr0, ID_AA64DFR0, CTX_CMPS);
 662            int version = 6; /* ARMv8 debug architecture */
 663            bool has_el3 =
 664                !!FIELD_EX32(ahcf->isar.id_aa64pfr0, ID_AA64PFR0, EL3);
 665            uint32_t dbgdidr = 0;
 666
 667            dbgdidr = FIELD_DP32(dbgdidr, DBGDIDR, WRPS, wrps);
 668            dbgdidr = FIELD_DP32(dbgdidr, DBGDIDR, BRPS, brps);
 669            dbgdidr = FIELD_DP32(dbgdidr, DBGDIDR, CTX_CMPS, ctx_cmps);
 670            dbgdidr = FIELD_DP32(dbgdidr, DBGDIDR, VERSION, version);
 671            dbgdidr = FIELD_DP32(dbgdidr, DBGDIDR, NSUHD_IMP, has_el3);
 672            dbgdidr = FIELD_DP32(dbgdidr, DBGDIDR, SE_IMP, has_el3);
 673            dbgdidr |= (1 << 15); /* RES1 bit */
 674            ahcf->isar.dbgdidr = dbgdidr;
 675        }
 676
 677        if (pmu_supported) {
 678            /* PMCR_EL0 is only accessible if the vCPU has feature PMU_V3 */
 679            err |= read_sys_reg64(fdarray[2], &ahcf->isar.reset_pmcr_el0,
 680                                  ARM64_SYS_REG(3, 3, 9, 12, 0));
 681        }
 682
 683        if (sve_supported) {
 684            /*
 685             * There is a range of kernels between kernel commit 73433762fcae
 686             * and f81cb2c3ad41 which have a bug where the kernel doesn't
 687             * expose SYS_ID_AA64ZFR0_EL1 via the ONE_REG API unless the VM has
 688             * enabled SVE support, resulting in an error rather than RAZ.
 689             * So only read the register if we set KVM_ARM_VCPU_SVE above.
 690             */
 691            err |= read_sys_reg64(fdarray[2], &ahcf->isar.id_aa64zfr0,
 692                                  ARM64_SYS_REG(3, 0, 0, 4, 4));
 693        }
 694    }
 695
 696    kvm_arm_destroy_scratch_host_vcpu(fdarray);
 697
 698    if (err < 0) {
 699        return false;
 700    }
 701
 702    /*
 703     * We can assume any KVM supporting CPU is at least a v8
 704     * with VFPv4+Neon; this in turn implies most of the other
 705     * feature bits.
 706     */
 707    features |= 1ULL << ARM_FEATURE_V8;
 708    features |= 1ULL << ARM_FEATURE_NEON;
 709    features |= 1ULL << ARM_FEATURE_AARCH64;
 710    features |= 1ULL << ARM_FEATURE_PMU;
 711    features |= 1ULL << ARM_FEATURE_GENERIC_TIMER;
 712
 713    ahcf->features = features;
 714
 715    return true;
 716}
 717
 718void kvm_arm_steal_time_finalize(ARMCPU *cpu, Error **errp)
 719{
 720    bool has_steal_time = kvm_arm_steal_time_supported();
 721
 722    if (cpu->kvm_steal_time == ON_OFF_AUTO_AUTO) {
 723        if (!has_steal_time || !arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) {
 724            cpu->kvm_steal_time = ON_OFF_AUTO_OFF;
 725        } else {
 726            cpu->kvm_steal_time = ON_OFF_AUTO_ON;
 727        }
 728    } else if (cpu->kvm_steal_time == ON_OFF_AUTO_ON) {
 729        if (!has_steal_time) {
 730            error_setg(errp, "'kvm-steal-time' cannot be enabled "
 731                             "on this host");
 732            return;
 733        } else if (!arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) {
 734            /*
 735             * DEN0057A chapter 2 says "This specification only covers
 736             * systems in which the Execution state of the hypervisor
 737             * as well as EL1 of virtual machines is AArch64.". And,
 738             * to ensure that, the smc/hvc calls are only specified as
 739             * smc64/hvc64.
 740             */
 741            error_setg(errp, "'kvm-steal-time' cannot be enabled "
 742                             "for AArch32 guests");
 743            return;
 744        }
 745    }
 746}
 747
 748bool kvm_arm_aarch32_supported(void)
 749{
 750    return kvm_check_extension(kvm_state, KVM_CAP_ARM_EL1_32BIT);
 751}
 752
 753bool kvm_arm_sve_supported(void)
 754{
 755    return kvm_check_extension(kvm_state, KVM_CAP_ARM_SVE);
 756}
 757
 758bool kvm_arm_steal_time_supported(void)
 759{
 760    return kvm_check_extension(kvm_state, KVM_CAP_STEAL_TIME);
 761}
 762
 763QEMU_BUILD_BUG_ON(KVM_ARM64_SVE_VQ_MIN != 1);
 764
 765uint32_t kvm_arm_sve_get_vls(CPUState *cs)
 766{
 767    /* Only call this function if kvm_arm_sve_supported() returns true. */
 768    static uint64_t vls[KVM_ARM64_SVE_VLS_WORDS];
 769    static bool probed;
 770    uint32_t vq = 0;
 771    int i;
 772
 773    /*
 774     * KVM ensures all host CPUs support the same set of vector lengths.
 775     * So we only need to create the scratch VCPUs once and then cache
 776     * the results.
 777     */
 778    if (!probed) {
 779        struct kvm_vcpu_init init = {
 780            .target = -1,
 781            .features[0] = (1 << KVM_ARM_VCPU_SVE),
 782        };
 783        struct kvm_one_reg reg = {
 784            .id = KVM_REG_ARM64_SVE_VLS,
 785            .addr = (uint64_t)&vls[0],
 786        };
 787        int fdarray[3], ret;
 788
 789        probed = true;
 790
 791        if (!kvm_arm_create_scratch_host_vcpu(NULL, fdarray, &init)) {
 792            error_report("failed to create scratch VCPU with SVE enabled");
 793            abort();
 794        }
 795        ret = ioctl(fdarray[2], KVM_GET_ONE_REG, &reg);
 796        kvm_arm_destroy_scratch_host_vcpu(fdarray);
 797        if (ret) {
 798            error_report("failed to get KVM_REG_ARM64_SVE_VLS: %s",
 799                         strerror(errno));
 800            abort();
 801        }
 802
 803        for (i = KVM_ARM64_SVE_VLS_WORDS - 1; i >= 0; --i) {
 804            if (vls[i]) {
 805                vq = 64 - clz64(vls[i]) + i * 64;
 806                break;
 807            }
 808        }
 809        if (vq > ARM_MAX_VQ) {
 810            warn_report("KVM supports vector lengths larger than "
 811                        "QEMU can enable");
 812            vls[0] &= MAKE_64BIT_MASK(0, ARM_MAX_VQ);
 813        }
 814    }
 815
 816    return vls[0];
 817}
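
/*
 * The value returned above is the first word of KVM's
 * KVM_REG_ARM64_SVE_VLS bitmap: bit (vq - 1) set means a vector length of
 * vq * 128 bits is supported. For example vls[0] == 0xf means 128, 256,
 * 384 and 512-bit vectors are available, and the probe loop computes
 * vq = 64 - clz64(0xf) = 4 as the maximum.
 */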
 818
 819static int kvm_arm_sve_set_vls(CPUState *cs)
 820{
 821    ARMCPU *cpu = ARM_CPU(cs);
 822    uint64_t vls[KVM_ARM64_SVE_VLS_WORDS] = { cpu->sve_vq.map };
 823    struct kvm_one_reg reg = {
 824        .id = KVM_REG_ARM64_SVE_VLS,
 825        .addr = (uint64_t)&vls[0],
 826    };
 827
 828    assert(cpu->sve_max_vq <= KVM_ARM64_SVE_VQ_MAX);
 829
 830    return kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
 831}
 832
 833#define ARM_CPU_ID_MPIDR       3, 0, 0, 0, 5
 834
 835int kvm_arch_init_vcpu(CPUState *cs)
 836{
 837    int ret;
 838    uint64_t mpidr;
 839    ARMCPU *cpu = ARM_CPU(cs);
 840    CPUARMState *env = &cpu->env;
 841    uint64_t psciver;
 842
 843    if (cpu->kvm_target == QEMU_KVM_ARM_TARGET_NONE ||
 844        !object_dynamic_cast(OBJECT(cpu), TYPE_AARCH64_CPU)) {
 845        error_report("KVM is not supported for this guest CPU type");
 846        return -EINVAL;
 847    }
 848
 849    qemu_add_vm_change_state_handler(kvm_arm_vm_state_change, cs);
 850
 851    /* Determine init features for this CPU */
 852    memset(cpu->kvm_init_features, 0, sizeof(cpu->kvm_init_features));
 853    if (cs->start_powered_off) {
 854        cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_POWER_OFF;
 855    }
 856    if (kvm_check_extension(cs->kvm_state, KVM_CAP_ARM_PSCI_0_2)) {
 857        cpu->psci_version = QEMU_PSCI_VERSION_0_2;
 858        cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_PSCI_0_2;
 859    }
 860    if (!arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) {
 861        cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_EL1_32BIT;
 862    }
 863    if (!kvm_check_extension(cs->kvm_state, KVM_CAP_ARM_PMU_V3)) {
 864        cpu->has_pmu = false;
 865    }
 866    if (cpu->has_pmu) {
 867        cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_PMU_V3;
 868    } else {
 869        env->features &= ~(1ULL << ARM_FEATURE_PMU);
 870    }
 871    if (cpu_isar_feature(aa64_sve, cpu)) {
 872        assert(kvm_arm_sve_supported());
 873        cpu->kvm_init_features[0] |= 1 << KVM_ARM_VCPU_SVE;
 874    }
 875    if (cpu_isar_feature(aa64_pauth, cpu)) {
 876        cpu->kvm_init_features[0] |= (1 << KVM_ARM_VCPU_PTRAUTH_ADDRESS |
 877                                      1 << KVM_ARM_VCPU_PTRAUTH_GENERIC);
 878    }
 879
 880    /* Do KVM_ARM_VCPU_INIT ioctl */
 881    ret = kvm_arm_vcpu_init(cs);
 882    if (ret) {
 883        return ret;
 884    }
 885
 886    if (cpu_isar_feature(aa64_sve, cpu)) {
 887        ret = kvm_arm_sve_set_vls(cs);
 888        if (ret) {
 889            return ret;
 890        }
 891        ret = kvm_arm_vcpu_finalize(cs, KVM_ARM_VCPU_SVE);
 892        if (ret) {
 893            return ret;
 894        }
 895    }
 896
 897    /*
 898     * KVM reports the exact PSCI version it is implementing via a
 899     * special sysreg. If it is present, use its contents to determine
 900     * what to report to the guest in the dtb (it is the PSCI version,
 901     * in the same 15-bits major 16-bits minor format that PSCI_VERSION
 902     * returns).
 903     */
 904    if (!kvm_get_one_reg(cs, KVM_REG_ARM_PSCI_VERSION, &psciver)) {
 905        cpu->psci_version = psciver;
 906    }
 907
 908    /*
 909     * When KVM is in use, PSCI is emulated in-kernel and not by qemu.
 910     * Currently KVM has its own idea about MPIDR assignment, so we
 911     * override our defaults with what we get from KVM.
 912     */
 913    ret = kvm_get_one_reg(cs, ARM64_SYS_REG(ARM_CPU_ID_MPIDR), &mpidr);
 914    if (ret) {
 915        return ret;
 916    }
 917    cpu->mp_affinity = mpidr & ARM64_AFFINITY_MASK;
 918
 919    kvm_arm_init_debug(cs);
 920
 921    /* Check whether user space can specify guest syndrome value */
 922    kvm_arm_init_serror_injection(cs);
 923
 924    return kvm_arm_init_cpreg_list(cpu);
 925}
 926
 927int kvm_arch_destroy_vcpu(CPUState *cs)
 928{
 929    return 0;
 930}
 931
 932bool kvm_arm_reg_syncs_via_cpreg_list(uint64_t regidx)
 933{
 934    /* Return true if the regidx is a register we should synchronize
 935     * via the cpreg_tuples array (i.e. is not a core or SVE reg that
 936     * we sync by hand in kvm_arch_get/put_registers())
 937     */
 938    switch (regidx & KVM_REG_ARM_COPROC_MASK) {
 939    case KVM_REG_ARM_CORE:
 940    case KVM_REG_ARM64_SVE:
 941        return false;
 942    default:
 943        return true;
 944    }
 945}
 946
 947typedef struct CPRegStateLevel {
 948    uint64_t regidx;
 949    int level;
 950} CPRegStateLevel;
 951
 952/* All system registers not listed in the following table are assumed to be
 953 * of the level KVM_PUT_RUNTIME_STATE. If a register should be written less
 954 * often, you must add it to this table with a state of either
 955 * KVM_PUT_RESET_STATE or KVM_PUT_FULL_STATE.
 956 */
 957static const CPRegStateLevel non_runtime_cpregs[] = {
 958    { KVM_REG_ARM_TIMER_CNT, KVM_PUT_FULL_STATE },
 959};
 960
 961int kvm_arm_cpreg_level(uint64_t regidx)
 962{
 963    int i;
 964
 965    for (i = 0; i < ARRAY_SIZE(non_runtime_cpregs); i++) {
 966        const CPRegStateLevel *l = &non_runtime_cpregs[i];
 967        if (l->regidx == regidx) {
 968            return l->level;
 969        }
 970    }
 971
 972    return KVM_PUT_RUNTIME_STATE;
 973}
 974
 975/* Callers must hold the iothread mutex lock */
 976static void kvm_inject_arm_sea(CPUState *c)
 977{
 978    ARMCPU *cpu = ARM_CPU(c);
 979    CPUARMState *env = &cpu->env;
 980    uint32_t esr;
 981    bool same_el;
 982
 983    c->exception_index = EXCP_DATA_ABORT;
 984    env->exception.target_el = 1;
 985
 986    /*
 987     * Set the DFSC to synchronous external abort and set FnV to not valid;
 988     * this tells the guest that FAR_ELx is UNKNOWN for this abort.
 989     */
 990    same_el = arm_current_el(env) == env->exception.target_el;
 991    esr = syn_data_abort_no_iss(same_el, 1, 0, 0, 0, 0, 0x10);
 992
 993    env->exception.syndrome = esr;
 994
 995    arm_cpu_do_interrupt(c);
 996}
 997
 998#define AARCH64_CORE_REG(x)   (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \
 999                 KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x))
1000
1001#define AARCH64_SIMD_CORE_REG(x)   (KVM_REG_ARM64 | KVM_REG_SIZE_U128 | \
1002                 KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x))
1003
1004#define AARCH64_SIMD_CTRL_REG(x)   (KVM_REG_ARM64 | KVM_REG_SIZE_U32 | \
1005                 KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x))
1006
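/*
 * KVM_REG_ARM_CORE_REG(x) is the offset of field x within struct kvm_regs
 * expressed in 32-bit words, so for example AARCH64_CORE_REG(regs.pc)
 * names the 64-bit core register holding the guest PC and
 * AARCH64_SIMD_CORE_REG(fp_regs.vregs[0]) the 128-bit V0 register.
 */
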
1007static int kvm_arch_put_fpsimd(CPUState *cs)
1008{
1009    CPUARMState *env = &ARM_CPU(cs)->env;
1010    struct kvm_one_reg reg;
1011    int i, ret;
1012
1013    for (i = 0; i < 32; i++) {
1014        uint64_t *q = aa64_vfp_qreg(env, i);
1015#if HOST_BIG_ENDIAN
1016        uint64_t fp_val[2] = { q[1], q[0] };
1017        reg.addr = (uintptr_t)fp_val;
1018#else
1019        reg.addr = (uintptr_t)q;
1020#endif
1021        reg.id = AARCH64_SIMD_CORE_REG(fp_regs.vregs[i]);
1022        ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1023        if (ret) {
1024            return ret;
1025        }
1026    }
1027
1028    return 0;
1029}
1030
1031/*
1032 * KVM SVE registers come in slices where ZREGs have a slice size of 2048 bits
1033 * and PREGS and the FFR have a slice size of 256 bits. However we simply hard
1034 * code the slice index to zero for now as it's unlikely we'll need more than
1035 * one slice for quite some time.
1036 */
1037static int kvm_arch_put_sve(CPUState *cs)
1038{
1039    ARMCPU *cpu = ARM_CPU(cs);
1040    CPUARMState *env = &cpu->env;
1041    uint64_t tmp[ARM_MAX_VQ * 2];
1042    uint64_t *r;
1043    struct kvm_one_reg reg;
1044    int n, ret;
1045
1046    for (n = 0; n < KVM_ARM64_SVE_NUM_ZREGS; ++n) {
1047        r = sve_bswap64(tmp, &env->vfp.zregs[n].d[0], cpu->sve_max_vq * 2);
1048        reg.addr = (uintptr_t)r;
1049        reg.id = KVM_REG_ARM64_SVE_ZREG(n, 0);
1050        ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1051        if (ret) {
1052            return ret;
1053        }
1054    }
1055
1056    for (n = 0; n < KVM_ARM64_SVE_NUM_PREGS; ++n) {
1057        r = sve_bswap64(tmp, &env->vfp.pregs[n].p[0],
1058                        DIV_ROUND_UP(cpu->sve_max_vq * 2, 8));
1059        reg.addr = (uintptr_t)r;
1060        reg.id = KVM_REG_ARM64_SVE_PREG(n, 0);
1061        ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1062        if (ret) {
1063            return ret;
1064        }
1065    }
1066
1067    r = sve_bswap64(tmp, &env->vfp.pregs[FFR_PRED_NUM].p[0],
1068                    DIV_ROUND_UP(cpu->sve_max_vq * 2, 8));
1069    reg.addr = (uintptr_t)r;
1070    reg.id = KVM_REG_ARM64_SVE_FFR(0);
1071    ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1072    if (ret) {
1073        return ret;
1074    }
1075
1076    return 0;
1077}
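
/*
 * Size note for the transfers above: with vq = cpu->sve_max_vq each ZREG
 * holds vq * 2 64-bit words (vq * 128 bits of vector state), while each
 * PREG and the FFR hold one predicate bit per vector byte, i.e.
 * DIV_ROUND_UP(vq * 2, 8) words, matching the sve_bswap64() lengths used
 * here and in kvm_arch_get_sve() below.
 */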
1078
1079int kvm_arch_put_registers(CPUState *cs, int level)
1080{
1081    struct kvm_one_reg reg;
1082    uint64_t val;
1083    uint32_t fpr;
1084    int i, ret;
1085    unsigned int el;
1086
1087    ARMCPU *cpu = ARM_CPU(cs);
1088    CPUARMState *env = &cpu->env;
1089
1090    /* If we are in AArch32 mode then we need to copy the AArch32 regs to the
1091     * AArch64 registers before pushing them out to 64-bit KVM.
1092     */
1093    if (!is_a64(env)) {
1094        aarch64_sync_32_to_64(env);
1095    }
1096
1097    for (i = 0; i < 31; i++) {
1098        reg.id = AARCH64_CORE_REG(regs.regs[i]);
1099        reg.addr = (uintptr_t) &env->xregs[i];
1100        ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1101        if (ret) {
1102            return ret;
1103        }
1104    }
1105
1106    /* KVM puts SP_EL0 in regs.sp and SP_EL1 in regs.sp_el1. On the
1107     * QEMU side we keep the current SP in xregs[31] as well.
1108     */
1109    aarch64_save_sp(env, 1);
1110
1111    reg.id = AARCH64_CORE_REG(regs.sp);
1112    reg.addr = (uintptr_t) &env->sp_el[0];
1113    ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1114    if (ret) {
1115        return ret;
1116    }
1117
1118    reg.id = AARCH64_CORE_REG(sp_el1);
1119    reg.addr = (uintptr_t) &env->sp_el[1];
1120    ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1121    if (ret) {
1122        return ret;
1123    }
1124
1125    /* Note that KVM thinks pstate is 64 bit but we use a uint32_t */
1126    if (is_a64(env)) {
1127        val = pstate_read(env);
1128    } else {
1129        val = cpsr_read(env);
1130    }
1131    reg.id = AARCH64_CORE_REG(regs.pstate);
1132    reg.addr = (uintptr_t) &val;
1133    ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1134    if (ret) {
1135        return ret;
1136    }
1137
1138    reg.id = AARCH64_CORE_REG(regs.pc);
1139    reg.addr = (uintptr_t) &env->pc;
1140    ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1141    if (ret) {
1142        return ret;
1143    }
1144
1145    reg.id = AARCH64_CORE_REG(elr_el1);
1146    reg.addr = (uintptr_t) &env->elr_el[1];
1147    ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1148    if (ret) {
1149        return ret;
1150    }
1151
1152    /* Saved Program State Registers
1153     *
1154     * Before we restore from the banked_spsr[] array we need to
1155     * ensure that any modifications to env->spsr are correctly
1156     * reflected in the banks.
1157     */
1158    el = arm_current_el(env);
1159    if (el > 0 && !is_a64(env)) {
1160        i = bank_number(env->uncached_cpsr & CPSR_M);
1161        env->banked_spsr[i] = env->spsr;
1162    }
1163
1164    /* KVM 0-4 map to QEMU banks 1-5 */
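    /*
     * i.e. KVM_SPSR_EL1/SVC, KVM_SPSR_ABT, KVM_SPSR_UND, KVM_SPSR_IRQ and
     * KVM_SPSR_FIQ correspond to banked_spsr[BANK_SVC]..banked_spsr[BANK_FIQ];
     * bank 0 (BANK_USRSYS) has no SPSR and is skipped.
     */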
1165    for (i = 0; i < KVM_NR_SPSR; i++) {
1166        reg.id = AARCH64_CORE_REG(spsr[i]);
1167        reg.addr = (uintptr_t) &env->banked_spsr[i + 1];
1168        ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1169        if (ret) {
1170            return ret;
1171        }
1172    }
1173
1174    if (cpu_isar_feature(aa64_sve, cpu)) {
1175        ret = kvm_arch_put_sve(cs);
1176    } else {
1177        ret = kvm_arch_put_fpsimd(cs);
1178    }
1179    if (ret) {
1180        return ret;
1181    }
1182
1183    reg.addr = (uintptr_t)(&fpr);
1184    fpr = vfp_get_fpsr(env);
1185    reg.id = AARCH64_SIMD_CTRL_REG(fp_regs.fpsr);
1186    ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1187    if (ret) {
1188        return ret;
1189    }
1190
1191    reg.addr = (uintptr_t)(&fpr);
1192    fpr = vfp_get_fpcr(env);
1193    reg.id = AARCH64_SIMD_CTRL_REG(fp_regs.fpcr);
1194    ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
1195    if (ret) {
1196        return ret;
1197    }
1198
1199    write_cpustate_to_list(cpu, true);
1200
1201    if (!write_list_to_kvmstate(cpu, level)) {
1202        return -EINVAL;
1203    }
1204
1205    /*
1206     * Setting VCPU events should be triggered after syncing the registers
1207     * to avoid overwriting potential changes made by KVM upon calling
1208     * the KVM_SET_VCPU_EVENTS ioctl.
1209     */
1210    ret = kvm_put_vcpu_events(cpu);
1211    if (ret) {
1212        return ret;
1213    }
1214
1215    kvm_arm_sync_mpstate_to_kvm(cpu);
1216
1217    return ret;
1218}
1219
1220static int kvm_arch_get_fpsimd(CPUState *cs)
1221{
1222    CPUARMState *env = &ARM_CPU(cs)->env;
1223    struct kvm_one_reg reg;
1224    int i, ret;
1225
1226    for (i = 0; i < 32; i++) {
1227        uint64_t *q = aa64_vfp_qreg(env, i);
1228        reg.id = AARCH64_SIMD_CORE_REG(fp_regs.vregs[i]);
1229        reg.addr = (uintptr_t)q;
1230        ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1231        if (ret) {
1232            return ret;
1233        } else {
1234#if HOST_BIG_ENDIAN
1235            uint64_t t;
1236            t = q[0], q[0] = q[1], q[1] = t;
1237#endif
1238        }
1239    }
1240
1241    return 0;
1242}
1243
1244/*
1245 * KVM SVE registers come in slices where ZREGs have a slice size of 2048 bits
1246 * and PREGS and the FFR have a slice size of 256 bits. However we simply hard
1247 * code the slice index to zero for now as it's unlikely we'll need more than
1248 * one slice for quite some time.
1249 */
1250static int kvm_arch_get_sve(CPUState *cs)
1251{
1252    ARMCPU *cpu = ARM_CPU(cs);
1253    CPUARMState *env = &cpu->env;
1254    struct kvm_one_reg reg;
1255    uint64_t *r;
1256    int n, ret;
1257
1258    for (n = 0; n < KVM_ARM64_SVE_NUM_ZREGS; ++n) {
1259        r = &env->vfp.zregs[n].d[0];
1260        reg.addr = (uintptr_t)r;
1261        reg.id = KVM_REG_ARM64_SVE_ZREG(n, 0);
1262        ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1263        if (ret) {
1264            return ret;
1265        }
1266        sve_bswap64(r, r, cpu->sve_max_vq * 2);
1267    }
1268
1269    for (n = 0; n < KVM_ARM64_SVE_NUM_PREGS; ++n) {
1270        r = &env->vfp.pregs[n].p[0];
1271        reg.addr = (uintptr_t)r;
1272        reg.id = KVM_REG_ARM64_SVE_PREG(n, 0);
1273        ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1274        if (ret) {
1275            return ret;
1276        }
1277        sve_bswap64(r, r, DIV_ROUND_UP(cpu->sve_max_vq * 2, 8));
1278    }
1279
1280    r = &env->vfp.pregs[FFR_PRED_NUM].p[0];
1281    reg.addr = (uintptr_t)r;
1282    reg.id = KVM_REG_ARM64_SVE_FFR(0);
1283    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1284    if (ret) {
1285        return ret;
1286    }
1287    sve_bswap64(r, r, DIV_ROUND_UP(cpu->sve_max_vq * 2, 8));
1288
1289    return 0;
1290}
1291
1292int kvm_arch_get_registers(CPUState *cs)
1293{
1294    struct kvm_one_reg reg;
1295    uint64_t val;
1296    unsigned int el;
1297    uint32_t fpr;
1298    int i, ret;
1299
1300    ARMCPU *cpu = ARM_CPU(cs);
1301    CPUARMState *env = &cpu->env;
1302
1303    for (i = 0; i < 31; i++) {
1304        reg.id = AARCH64_CORE_REG(regs.regs[i]);
1305        reg.addr = (uintptr_t) &env->xregs[i];
1306        ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1307        if (ret) {
1308            return ret;
1309        }
1310    }
1311
1312    reg.id = AARCH64_CORE_REG(regs.sp);
1313    reg.addr = (uintptr_t) &env->sp_el[0];
1314    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1315    if (ret) {
1316        return ret;
1317    }
1318
1319    reg.id = AARCH64_CORE_REG(sp_el1);
1320    reg.addr = (uintptr_t) &env->sp_el[1];
1321    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1322    if (ret) {
1323        return ret;
1324    }
1325
1326    reg.id = AARCH64_CORE_REG(regs.pstate);
1327    reg.addr = (uintptr_t) &val;
1328    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1329    if (ret) {
1330        return ret;
1331    }
1332
1333    env->aarch64 = ((val & PSTATE_nRW) == 0);
1334    if (is_a64(env)) {
1335        pstate_write(env, val);
1336    } else {
1337        cpsr_write(env, val, 0xffffffff, CPSRWriteRaw);
1338    }
1339
1340    /* KVM puts SP_EL0 in regs.sp and SP_EL1 in regs.sp_el1. On the
1341     * QEMU side we keep the current SP in xregs[31] as well.
1342     */
1343    aarch64_restore_sp(env, 1);
1344
1345    reg.id = AARCH64_CORE_REG(regs.pc);
1346    reg.addr = (uintptr_t) &env->pc;
1347    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1348    if (ret) {
1349        return ret;
1350    }
1351
1352    /* If we are in AArch32 mode then we need to sync the AArch32 regs with the
1353     * incoming AArch64 regs received from 64-bit KVM.
1354     * We must perform this after all of the registers have been acquired from
1355     * the kernel.
1356     */
1357    if (!is_a64(env)) {
1358        aarch64_sync_64_to_32(env);
1359    }
1360
1361    reg.id = AARCH64_CORE_REG(elr_el1);
1362    reg.addr = (uintptr_t) &env->elr_el[1];
1363    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1364    if (ret) {
1365        return ret;
1366    }
1367
1368    /* Fetch the SPSR registers
1369     *
1370     * KVM SPSRs 0-4 map to QEMU banks 1-5
1371     */
1372    for (i = 0; i < KVM_NR_SPSR; i++) {
1373        reg.id = AARCH64_CORE_REG(spsr[i]);
1374        reg.addr = (uintptr_t) &env->banked_spsr[i + 1];
1375        ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1376        if (ret) {
1377            return ret;
1378        }
1379    }
1380
1381    el = arm_current_el(env);
1382    if (el > 0 && !is_a64(env)) {
1383        i = bank_number(env->uncached_cpsr & CPSR_M);
1384        env->spsr = env->banked_spsr[i];
1385    }
1386
1387    if (cpu_isar_feature(aa64_sve, cpu)) {
1388        ret = kvm_arch_get_sve(cs);
1389    } else {
1390        ret = kvm_arch_get_fpsimd(cs);
1391    }
1392    if (ret) {
1393        return ret;
1394    }
1395
1396    reg.addr = (uintptr_t)(&fpr);
1397    reg.id = AARCH64_SIMD_CTRL_REG(fp_regs.fpsr);
1398    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1399    if (ret) {
1400        return ret;
1401    }
1402    vfp_set_fpsr(env, fpr);
1403
1404    reg.addr = (uintptr_t)(&fpr);
1405    reg.id = AARCH64_SIMD_CTRL_REG(fp_regs.fpcr);
1406    ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
1407    if (ret) {
1408        return ret;
1409    }
1410    vfp_set_fpcr(env, fpr);
1411
1412    ret = kvm_get_vcpu_events(cpu);
1413    if (ret) {
1414        return ret;
1415    }
1416
1417    if (!write_kvmstate_to_list(cpu)) {
1418        return -EINVAL;
1419    }
1420    /* Note that it's OK to have registers which aren't in CPUState,
1421     * so we can ignore a failure return here.
1422     */
1423    write_list_to_cpustate(cpu);
1424
1425    kvm_arm_sync_mpstate_to_qemu(cpu);
1426
1427    /* TODO: other registers */
1428    return ret;
1429}
1430
1431void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
1432{
1433    ram_addr_t ram_addr;
1434    hwaddr paddr;
1435
1436    assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
1437
1438    if (acpi_ghes_present() && addr) {
1439        ram_addr = qemu_ram_addr_from_host(addr);
1440        if (ram_addr != RAM_ADDR_INVALID &&
1441            kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
1442            kvm_hwpoison_page_add(ram_addr);
1443            /*
1444             * If this is a BUS_MCEERR_AR, we know we have been called
1445             * synchronously from the vCPU thread, so we can easily
1446             * synchronize the state and inject an error.
1447             *
1448             * TODO: we currently don't tell the guest at all about
1449             * BUS_MCEERR_AO. In that case we might either be being
1450             * called synchronously from the vCPU thread, or a bit
1451             * later from the main thread, so doing the injection of
1452             * the error would be more complicated.
1453             */
1454            if (code == BUS_MCEERR_AR) {
1455                kvm_cpu_synchronize_state(c);
1456                if (!acpi_ghes_record_errors(ACPI_HEST_SRC_ID_SEA, paddr)) {
1457                    kvm_inject_arm_sea(c);
1458                } else {
1459                    error_report("failed to record the error");
1460                    abort();
1461                }
1462            }
1463            return;
1464        }
1465        if (code == BUS_MCEERR_AO) {
1466            error_report("Hardware memory error at addr %p for memory used by "
1467                "QEMU itself instead of guest system!", addr);
1468        }
1469    }
1470
1471    if (code == BUS_MCEERR_AR) {
1472        error_report("Hardware memory error!");
1473        exit(1);
1474    }
1475}
1476
1477/* C6.6.29 BRK instruction: 0xd4200000 encodes BRK #0 */
1478static const uint32_t brk_insn = 0xd4200000;
1479
1480int kvm_arch_insert_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
1481{
1482    if (have_guest_debug) {
1483        if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 4, 0) ||
1484            cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&brk_insn, 4, 1)) {
1485            return -EINVAL;
1486        }
1487        return 0;
1488    } else {
1489        error_report("guest debug not supported on this kernel");
1490        return -EINVAL;
1491    }
1492}
1493
1494int kvm_arch_remove_sw_breakpoint(CPUState *cs, struct kvm_sw_breakpoint *bp)
1495{
1496    static uint32_t brk;
1497
1498    if (have_guest_debug) {
1499        if (cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&brk, 4, 0) ||
1500            brk != brk_insn ||
1501            cpu_memory_rw_debug(cs, bp->pc, (uint8_t *)&bp->saved_insn, 4, 1)) {
1502            return -EINVAL;
1503        }
1504        return 0;
1505    } else {
1506        error_report("guest debug not supported on this kernel");
1507        return -EINVAL;
1508    }
1509}
1510
1511/* See v8 ARM ARM D7.2.27 ESR_ELx, Exception Syndrome Register
1512 *
1513 * To minimise translating between kernel and user-space the kernel
1514 * ABI just provides user-space with the full exception syndrome
1515 * register value to be decoded in QEMU.
1516 */
1517
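/*
 * The exception classes handled below (ESR_ELx bits [31:26], as extracted
 * by syn_get_ec()) are EC_SOFTWARESTEP (0x32), EC_AA64_BKPT (0x3c, e.g. the
 * BRK planted by kvm_arch_insert_sw_breakpoint()), EC_BREAKPOINT (0x30) and
 * EC_WATCHPOINT (0x34).
 */
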
1518bool kvm_arm_handle_debug(CPUState *cs, struct kvm_debug_exit_arch *debug_exit)
1519{
1520    int hsr_ec = syn_get_ec(debug_exit->hsr);
1521    ARMCPU *cpu = ARM_CPU(cs);
1522    CPUARMState *env = &cpu->env;
1523
1524    /* Ensure PC is synchronised */
1525    kvm_cpu_synchronize_state(cs);
1526
1527    switch (hsr_ec) {
1528    case EC_SOFTWARESTEP:
1529        if (cs->singlestep_enabled) {
1530            return true;
1531        } else {
1532            /*
1533             * The kernel should have suppressed the guest's ability to
1534             * single step at this point so something has gone wrong.
1535             */
1536            error_report("%s: guest single-step while debugging unsupported"
1537                         " (%"PRIx64", %"PRIx32")",
1538                         __func__, env->pc, debug_exit->hsr);
1539            return false;
1540        }
1541        break;
1542    case EC_AA64_BKPT:
1543        if (kvm_find_sw_breakpoint(cs, env->pc)) {
1544            return true;
1545        }
1546        break;
1547    case EC_BREAKPOINT:
1548        if (find_hw_breakpoint(cs, env->pc)) {
1549            return true;
1550        }
1551        break;
1552    case EC_WATCHPOINT:
1553    {
1554        CPUWatchpoint *wp = find_hw_watchpoint(cs, debug_exit->far);
1555        if (wp) {
1556            cs->watchpoint_hit = wp;
1557            return true;
1558        }
1559        break;
1560    }
1561    default:
1562        error_report("%s: unhandled debug exit (%"PRIx32", %"PRIx64")",
1563                     __func__, debug_exit->hsr, env->pc);
1564    }
1565
1566    /* If we are not handling the debug exception it must belong to
1567     * the guest. Let's re-use the existing TCG interrupt code to set
1568     * everything up properly.
1569     */
1570    cs->exception_index = EXCP_BKPT;
1571    env->exception.syndrome = debug_exit->hsr;
1572    env->exception.vaddress = debug_exit->far;
1573    env->exception.target_el = 1;
1574    qemu_mutex_lock_iothread();
1575    arm_cpu_do_interrupt(cs);
1576    qemu_mutex_unlock_iothread();
1577
1578    return false;
1579}
1580
1581#define ARM64_REG_ESR_EL1 ARM64_SYS_REG(3, 0, 5, 2, 0)
1582#define ARM64_REG_TCR_EL1 ARM64_SYS_REG(3, 0, 2, 0, 2)
1583
1584/*
1585 * ESR_EL1
1586 * ISS encoding
1587 * AARCH64: DFSC,   bits [5:0]
1588 * AARCH32:
1589 *      TTBCR.EAE == 0
1590 *          FS[4]   - DFSR[10]
1591 *          FS[3:0] - DFSR[3:0]
1592 *      TTBCR.EAE == 1
1593 *          FS, bits [5:0]
1594 */
1595#define ESR_DFSC(aarch64, lpae, v)        \
1596    ((aarch64 || (lpae)) ? ((v) & 0x3F)   \
1597               : (((v) >> 6) | ((v) & 0x1F)))
1598
1599#define ESR_DFSC_EXTABT(aarch64, lpae) \
1600    ((aarch64) ? 0x10 : (lpae) ? 0x10 : 0x8)
1601
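/*
 * Example of the macros above: for an AArch64 (or LPAE AArch32) guest
 * ESR_DFSC() simply returns ESR_EL1[5:0] and the expected external abort
 * code is 0x10; for a short-descriptor AArch32 guest the fault status is
 * reassembled from DFSR[10] and DFSR[3:0] and the expected code is 0x8.
 */
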
1602bool kvm_arm_verify_ext_dabt_pending(CPUState *cs)
1603{
1604    uint64_t dfsr_val;
1605
1606    if (!kvm_get_one_reg(cs, ARM64_REG_ESR_EL1, &dfsr_val)) {
1607        ARMCPU *cpu = ARM_CPU(cs);
1608        CPUARMState *env = &cpu->env;
1609        int aarch64_mode = arm_feature(env, ARM_FEATURE_AARCH64);
1610        int lpae = 0;
1611
1612        if (!aarch64_mode) {
1613            uint64_t ttbcr;
1614
1615            if (!kvm_get_one_reg(cs, ARM64_REG_TCR_EL1, &ttbcr)) {
1616                lpae = arm_feature(env, ARM_FEATURE_LPAE)
1617                        && (ttbcr & TTBCR_EAE);
1618            }
1619        }
1620        /*
1621         * The verification here is based on the DFSC bits
1622         * of the ESR_EL1 reg only
1623         */
1624         return (ESR_DFSC(aarch64_mode, lpae, dfsr_val) ==
1625                ESR_DFSC_EXTABT(aarch64_mode, lpae));
1626    }
1627    return false;
1628}
1629