qemu/accel/kvm/kvm-all.c
   1/*
   2 * QEMU KVM support
   3 *
   4 * Copyright IBM, Corp. 2008
   5 *           Red Hat, Inc. 2008
   6 *
   7 * Authors:
   8 *  Anthony Liguori   <aliguori@us.ibm.com>
   9 *  Glauber Costa     <gcosta@redhat.com>
  10 *
  11 * This work is licensed under the terms of the GNU GPL, version 2 or later.
  12 * See the COPYING file in the top-level directory.
  13 *
  14 */
  15
  16#include "qemu/osdep.h"
  17#include <sys/ioctl.h>
  18
  19#include <linux/kvm.h>
  20
  21#include "qemu-common.h"
  22#include "qemu/atomic.h"
  23#include "qemu/option.h"
  24#include "qemu/config-file.h"
  25#include "qemu/error-report.h"
  26#include "qapi/error.h"
  27#include "hw/hw.h"
  28#include "hw/pci/msi.h"
  29#include "hw/pci/msix.h"
  30#include "hw/s390x/adapter.h"
  31#include "exec/gdbstub.h"
  32#include "sysemu/kvm_int.h"
  33#include "sysemu/cpus.h"
  34#include "qemu/bswap.h"
  35#include "exec/memory.h"
  36#include "exec/ram_addr.h"
  37#include "exec/address-spaces.h"
  38#include "qemu/event_notifier.h"
  39#include "trace.h"
  40#include "hw/irq.h"
  41#include "sysemu/sev.h"
  42
  43#include "hw/boards.h"
  44
  45/* This check must be after config-host.h is included */
  46#ifdef CONFIG_EVENTFD
  47#include <sys/eventfd.h>
  48#endif
  49
  50/* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
  51 * need to use the real host PAGE_SIZE, as that's what KVM will use.
  52 */
  53#define PAGE_SIZE getpagesize()
  54
  55//#define DEBUG_KVM
  56
  57#ifdef DEBUG_KVM
  58#define DPRINTF(fmt, ...) \
  59    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
  60#else
  61#define DPRINTF(fmt, ...) \
  62    do { } while (0)
  63#endif
  64
  65#define KVM_MSI_HASHTAB_SIZE    256
  66
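/* A vCPU fd "parked" by kvm_destroy_vcpu(): KVM provides no way to destroy
 * a vCPU, so the fd is kept and reused if a vCPU with the same id is
 * created again later (see kvm_get_vcpu()). */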
  67struct KVMParkedVcpu {
  68    unsigned long vcpu_id;
  69    int kvm_fd;
  70    QLIST_ENTRY(KVMParkedVcpu) node;
  71};
  72
  73struct KVMState
  74{
  75    AccelState parent_obj;
  76
  77    int nr_slots;
  78    int fd;
  79    int vmfd;
  80    int coalesced_mmio;
  81    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
  82    bool coalesced_flush_in_progress;
  83    int vcpu_events;
  84    int robust_singlestep;
  85    int debugregs;
  86#ifdef KVM_CAP_SET_GUEST_DEBUG
  87    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
  88#endif
  89    int many_ioeventfds;
  90    int intx_set_mask;
  91    bool sync_mmu;
  92    /* The man page (and posix) say ioctl numbers are signed int, but
  93     * they're not.  Linux, glibc and *BSD all treat ioctl numbers as
  94     * unsigned, and treating them as signed here can break things */
  95    unsigned irq_set_ioctl;
  96    unsigned int sigmask_len;
  97    GHashTable *gsimap;
  98#ifdef KVM_CAP_IRQ_ROUTING
  99    struct kvm_irq_routing *irq_routes;
 100    int nr_allocated_irq_routes;
 101    unsigned long *used_gsi_bitmap;
 102    unsigned int gsi_count;
 103    QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
 104#endif
 105    KVMMemoryListener memory_listener;
 106    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
 107
 108    /* memory encryption */
 109    void *memcrypt_handle;
 110    int (*memcrypt_encrypt_data)(void *handle, uint8_t *ptr, uint64_t len);
 111};
 112
 113KVMState *kvm_state;
 114bool kvm_kernel_irqchip;
 115bool kvm_split_irqchip;
 116bool kvm_async_interrupts_allowed;
 117bool kvm_halt_in_kernel_allowed;
 118bool kvm_eventfds_allowed;
 119bool kvm_irqfds_allowed;
 120bool kvm_resamplefds_allowed;
 121bool kvm_msi_via_irqfd_allowed;
 122bool kvm_gsi_routing_allowed;
 123bool kvm_gsi_direct_mapping;
 124bool kvm_allowed;
 125bool kvm_readonly_mem_allowed;
 126bool kvm_vm_attributes_allowed;
 127bool kvm_direct_msi_allowed;
 128bool kvm_ioeventfd_any_length_allowed;
 129bool kvm_msi_use_devid;
 130static bool kvm_immediate_exit;
 131
 132static const KVMCapabilityInfo kvm_required_capabilites[] = {
 133    KVM_CAP_INFO(USER_MEMORY),
 134    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
 135    KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
 136    KVM_CAP_LAST_INFO
 137};
 138
 139int kvm_get_max_memslots(void)
 140{
 141    KVMState *s = KVM_STATE(current_machine->accelerator);
 142
 143    return s->nr_slots;
 144}
 145
 146bool kvm_memcrypt_enabled(void)
 147{
 148    if (kvm_state && kvm_state->memcrypt_handle) {
 149        return true;
 150    }
 151
 152    return false;
 153}
 154
 155int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len)
 156{
 157    if (kvm_state->memcrypt_handle &&
 158        kvm_state->memcrypt_encrypt_data) {
 159        return kvm_state->memcrypt_encrypt_data(kvm_state->memcrypt_handle,
 160                                              ptr, len);
 161    }
 162
 163    return 1;
 164}
 165
 166static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
 167{
 168    KVMState *s = kvm_state;
 169    int i;
 170
 171    for (i = 0; i < s->nr_slots; i++) {
 172        if (kml->slots[i].memory_size == 0) {
 173            return &kml->slots[i];
 174        }
 175    }
 176
 177    return NULL;
 178}
 179
 180bool kvm_has_free_slot(MachineState *ms)
 181{
 182    KVMState *s = KVM_STATE(ms->accelerator);
 183
 184    return kvm_get_free_slot(&s->memory_listener);
 185}
 186
 187static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
 188{
 189    KVMSlot *slot = kvm_get_free_slot(kml);
 190
 191    if (slot) {
 192        return slot;
 193    }
 194
 195    fprintf(stderr, "%s: no free slot available\n", __func__);
 196    abort();
 197}
 198
 199static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
 200                                         hwaddr start_addr,
 201                                         hwaddr size)
 202{
 203    KVMState *s = kvm_state;
 204    int i;
 205
 206    for (i = 0; i < s->nr_slots; i++) {
 207        KVMSlot *mem = &kml->slots[i];
 208
 209        if (start_addr == mem->start_addr && size == mem->memory_size) {
 210            return mem;
 211        }
 212    }
 213
 214    return NULL;
 215}
 216
 217/*
 218 * Calculate and align the start address and the size of the section.
 219 * Return the size. If the size is 0, the aligned section is empty.
 220 */
 221static hwaddr kvm_align_section(MemoryRegionSection *section,
 222                                hwaddr *start)
 223{
 224    hwaddr size = int128_get64(section->size);
 225    hwaddr delta, aligned;
 226
 227    /* kvm works in page size chunks, but the function may be called
 228       with sub-page size and unaligned start address. Pad the start
 229       address to next and truncate size to previous page boundary. */
 230    aligned = ROUND_UP(section->offset_within_address_space,
 231                       qemu_real_host_page_size);
 232    delta = aligned - section->offset_within_address_space;
 233    *start = aligned;
 234    if (delta > size) {
 235        return 0;
 236    }
 237
 238    return (size - delta) & qemu_real_host_page_mask;
 239}
 240
 241int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
 242                                       hwaddr *phys_addr)
 243{
 244    KVMMemoryListener *kml = &s->memory_listener;
 245    int i;
 246
 247    for (i = 0; i < s->nr_slots; i++) {
 248        KVMSlot *mem = &kml->slots[i];
 249
 250        if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
 251            *phys_addr = mem->start_addr + (ram - mem->ram);
 252            return 1;
 253        }
 254    }
 255
 256    return 0;
 257}
 258
 259static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot)
 260{
 261    KVMState *s = kvm_state;
 262    struct kvm_userspace_memory_region mem;
 263    int ret;
 264
 265    mem.slot = slot->slot | (kml->as_id << 16);
 266    mem.guest_phys_addr = slot->start_addr;
 267    mem.userspace_addr = (unsigned long)slot->ram;
 268    mem.flags = slot->flags;
 269
 270    if (slot->memory_size && mem.flags & KVM_MEM_READONLY) {
 271        /* Set the slot size to 0 before setting the slot to the desired
 272         * value. This is needed based on KVM commit 75d61fbc. */
 273        mem.memory_size = 0;
 274        kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
 275    }
 276    mem.memory_size = slot->memory_size;
 277    ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
 278    trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr,
 279                              mem.memory_size, mem.userspace_addr, ret);
 280    return ret;
 281}
 282
 283int kvm_destroy_vcpu(CPUState *cpu)
 284{
 285    KVMState *s = kvm_state;
 286    long mmap_size;
 287    struct KVMParkedVcpu *vcpu = NULL;
 288    int ret = 0;
 289
 290    DPRINTF("kvm_destroy_vcpu\n");
 291
 292    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
 293    if (mmap_size < 0) {
 294        ret = mmap_size;
 295        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
 296        goto err;
 297    }
 298
 299    ret = munmap(cpu->kvm_run, mmap_size);
 300    if (ret < 0) {
 301        goto err;
 302    }
 303
 304    vcpu = g_malloc0(sizeof(*vcpu));
 305    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
 306    vcpu->kvm_fd = cpu->kvm_fd;
 307    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
 308err:
 309    return ret;
 310}
 311
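/* Return a vCPU fd for @vcpu_id, reusing a parked fd if one exists,
 * otherwise asking KVM to create a new vCPU. */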
 312static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
 313{
 314    struct KVMParkedVcpu *cpu;
 315
 316    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
 317        if (cpu->vcpu_id == vcpu_id) {
 318            int kvm_fd;
 319
 320            QLIST_REMOVE(cpu, node);
 321            kvm_fd = cpu->kvm_fd;
 322            g_free(cpu);
 323            return kvm_fd;
 324        }
 325    }
 326
 327    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
 328}
 329
 330int kvm_init_vcpu(CPUState *cpu)
 331{
 332    KVMState *s = kvm_state;
 333    long mmap_size;
 334    int ret;
 335
 336    DPRINTF("kvm_init_vcpu\n");
 337
 338    ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
 339    if (ret < 0) {
 340        DPRINTF("kvm_create_vcpu failed\n");
 341        goto err;
 342    }
 343
 344    cpu->kvm_fd = ret;
 345    cpu->kvm_state = s;
 346    cpu->vcpu_dirty = true;
 347
 348    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
 349    if (mmap_size < 0) {
 350        ret = mmap_size;
 351        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
 352        goto err;
 353    }
 354
 355    cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
 356                        cpu->kvm_fd, 0);
 357    if (cpu->kvm_run == MAP_FAILED) {
 358        ret = -errno;
 359        DPRINTF("mmap'ing vcpu state failed\n");
 360        goto err;
 361    }
 362
 363    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
 364        s->coalesced_mmio_ring =
 365            (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
 366    }
 367
 368    ret = kvm_arch_init_vcpu(cpu);
 369err:
 370    return ret;
 371}
 372
 373/*
 374 * dirty pages logging control
 375 */
 376
 377static int kvm_mem_flags(MemoryRegion *mr)
 378{
 379    bool readonly = mr->readonly || memory_region_is_romd(mr);
 380    int flags = 0;
 381
 382    if (memory_region_get_dirty_log_mask(mr) != 0) {
 383        flags |= KVM_MEM_LOG_DIRTY_PAGES;
 384    }
 385    if (readonly && kvm_readonly_mem_allowed) {
 386        flags |= KVM_MEM_READONLY;
 387    }
 388    return flags;
 389}
 390
 391static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
 392                                 MemoryRegion *mr)
 393{
 394    int old_flags;
 395
 396    old_flags = mem->flags;
 397    mem->flags = kvm_mem_flags(mr);
 398
 399    /* If nothing changed effectively, no need to issue ioctl */
 400    if (mem->flags == old_flags) {
 401        return 0;
 402    }
 403
 404    return kvm_set_user_memory_region(kml, mem);
 405}
 406
 407static int kvm_section_update_flags(KVMMemoryListener *kml,
 408                                    MemoryRegionSection *section)
 409{
 410    hwaddr start_addr, size;
 411    KVMSlot *mem;
 412
 413    size = kvm_align_section(section, &start_addr);
 414    if (!size) {
 415        return 0;
 416    }
 417
 418    mem = kvm_lookup_matching_slot(kml, start_addr, size);
 419    if (!mem) {
 420        /* We don't have a slot if we want to trap every access. */
 421        return 0;
 422    }
 423
 424    return kvm_slot_update_flags(kml, mem, section->mr);
 425}
 426
 427static void kvm_log_start(MemoryListener *listener,
 428                          MemoryRegionSection *section,
 429                          int old, int new)
 430{
 431    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
 432    int r;
 433
 434    if (old != 0) {
 435        return;
 436    }
 437
 438    r = kvm_section_update_flags(kml, section);
 439    if (r < 0) {
 440        abort();
 441    }
 442}
 443
 444static void kvm_log_stop(MemoryListener *listener,
 445                          MemoryRegionSection *section,
 446                          int old, int new)
 447{
 448    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
 449    int r;
 450
 451    if (new != 0) {
 452        return;
 453    }
 454
 455    r = kvm_section_update_flags(kml, section);
 456    if (r < 0) {
 457        abort();
 458    }
 459}
 460
 461/* get kvm's dirty pages bitmap and update qemu's */
 462static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section,
 463                                         unsigned long *bitmap)
 464{
 465    ram_addr_t start = section->offset_within_region +
 466                       memory_region_get_ram_addr(section->mr);
 467    ram_addr_t pages = int128_get64(section->size) / getpagesize();
 468
 469    cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages);
 470    return 0;
 471}
 472
 473#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
 474
 475/**
 476 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
 477 * This function updates qemu's dirty bitmap via
 478 * cpu_physical_memory_set_dirty_lebitmap(): every page the kernel
 479 * reports as dirty is marked dirty in qemu's bitmap.
 480 *
 481 * @kml: the KVM memory listener containing the slot to sync.
 482 * @section: the memory region section covered by the dirty log.
 483 */
 484static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
 485                                          MemoryRegionSection *section)
 486{
 487    KVMState *s = kvm_state;
 488    struct kvm_dirty_log d = {};
 489    KVMSlot *mem;
 490    hwaddr start_addr, size;
 491
 492    size = kvm_align_section(section, &start_addr);
 493    if (size) {
 494        mem = kvm_lookup_matching_slot(kml, start_addr, size);
 495        if (!mem) {
 496            /* We don't have a slot if we want to trap every access. */
 497            return 0;
 498        }
 499
 500        /* XXX bad kernel interface alert
 501         * For the dirty bitmap, the kernel allocates an array sized in
 502         * multiples of bits-per-long.  But when the kernel is 64-bit and
 503         * userspace is 32-bit, userspace cannot align to the same
 504         * bits-per-long, since sizeof(long) differs between kernel and
 505         * user space.  Userspace then provides a buffer that may be
 506         * 4 bytes smaller than what the kernel uses, resulting in
 507         * userspace memory corruption (which, in most cases, valgrind
 508         * cannot detect either).
 509         * So for now, align to 64 instead of HOST_LONG_BITS here, in
 510         * the hope that sizeof(long) won't become >8 any time soon.
 511         */
 512        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
 513                     /*HOST_LONG_BITS*/ 64) / 8;
 514        d.dirty_bitmap = g_malloc0(size);
 515
 516        d.slot = mem->slot | (kml->as_id << 16);
 517        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
 518            DPRINTF("ioctl failed %d\n", errno);
 519            g_free(d.dirty_bitmap);
 520            return -1;
 521        }
 522
 523        kvm_get_dirty_pages_log_range(section, d.dirty_bitmap);
 524        g_free(d.dirty_bitmap);
 525    }
 526
 527    return 0;
 528}
 529
 530static void kvm_coalesce_mmio_region(MemoryListener *listener,
 531                                     MemoryRegionSection *section,
 532                                     hwaddr start, hwaddr size)
 533{
 534    KVMState *s = kvm_state;
 535
 536    if (s->coalesced_mmio) {
 537        struct kvm_coalesced_mmio_zone zone;
 538
 539        zone.addr = start;
 540        zone.size = size;
 541        zone.pad = 0;
 542
 543        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
 544    }
 545}
 546
 547static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
 548                                       MemoryRegionSection *section,
 549                                       hwaddr start, hwaddr size)
 550{
 551    KVMState *s = kvm_state;
 552
 553    if (s->coalesced_mmio) {
 554        struct kvm_coalesced_mmio_zone zone;
 555
 556        zone.addr = start;
 557        zone.size = size;
 558        zone.pad = 0;
 559
 560        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
 561    }
 562}
 563
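/* Return the raw KVM_CHECK_EXTENSION value for @extension; 0 means the
 * extension is unsupported (ioctl errors are also reported as 0). */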
 564int kvm_check_extension(KVMState *s, unsigned int extension)
 565{
 566    int ret;
 567
 568    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
 569    if (ret < 0) {
 570        ret = 0;
 571    }
 572
 573    return ret;
 574}
 575
 576int kvm_vm_check_extension(KVMState *s, unsigned int extension)
 577{
 578    int ret;
 579
 580    ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
 581    if (ret < 0) {
 582        /* VM wide version not implemented, use global one instead */
 583        ret = kvm_check_extension(s, extension);
 584    }
 585
 586    return ret;
 587}
 588
 589static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
 590{
 591#if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN)
 592    /* The kernel expects ioeventfd values in HOST_WORDS_BIGENDIAN
 593     * endianness, but the memory core hands them in target endianness.
 594     * For example, PPC is always treated as big-endian even when running
 595     * under KVM on a ppc64le host.  Correct here.
 596     */
 597    switch (size) {
 598    case 2:
 599        val = bswap16(val);
 600        break;
 601    case 4:
 602        val = bswap32(val);
 603        break;
 604    }
 605#endif
 606    return val;
 607}
 608
 609static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
 610                                  bool assign, uint32_t size, bool datamatch)
 611{
 612    int ret;
 613    struct kvm_ioeventfd iofd = {
 614        .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
 615        .addr = addr,
 616        .len = size,
 617        .flags = 0,
 618        .fd = fd,
 619    };
 620
 621    if (!kvm_enabled()) {
 622        return -ENOSYS;
 623    }
 624
 625    if (datamatch) {
 626        iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
 627    }
 628    if (!assign) {
 629        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
 630    }
 631
 632    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);
 633
 634    if (ret < 0) {
 635        return -errno;
 636    }
 637
 638    return 0;
 639}
 640
 641static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
 642                                 bool assign, uint32_t size, bool datamatch)
 643{
 644    struct kvm_ioeventfd kick = {
 645        .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
 646        .addr = addr,
 647        .flags = KVM_IOEVENTFD_FLAG_PIO,
 648        .len = size,
 649        .fd = fd,
 650    };
 651    int r;
 652    if (!kvm_enabled()) {
 653        return -ENOSYS;
 654    }
 655    if (datamatch) {
 656        kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
 657    }
 658    if (!assign) {
 659        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
 660    }
 661    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
 662    if (r < 0) {
 663        return r;
 664    }
 665    return 0;
 666}
 667
 668
 669static int kvm_check_many_ioeventfds(void)
 670{
 671    /* Userspace can use ioeventfd for io notification.  This requires a host
 672     * that supports eventfd(2) and an I/O thread; since eventfd does not
 673     * support SIGIO it cannot interrupt the vcpu.
 674     *
 675     * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
 676     * can avoid creating too many ioeventfds.
 677     */
 678#if defined(CONFIG_EVENTFD)
 679    int ioeventfds[7];
 680    int i, ret = 0;
 681    for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) {
 682        ioeventfds[i] = eventfd(0, EFD_CLOEXEC);
 683        if (ioeventfds[i] < 0) {
 684            break;
 685        }
 686        ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true);
 687        if (ret < 0) {
 688            close(ioeventfds[i]);
 689            break;
 690        }
 691    }
 692
 693    /* Decide whether many devices are supported or not */
 694    ret = i == ARRAY_SIZE(ioeventfds);
 695
 696    while (i-- > 0) {
 697        kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true);
 698        close(ioeventfds[i]);
 699    }
 700    return ret;
 701#else
 702    return 0;
 703#endif
 704}
 705
 706static const KVMCapabilityInfo *
 707kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
 708{
 709    while (list->name) {
 710        if (!kvm_check_extension(s, list->value)) {
 711            return list;
 712        }
 713        list++;
 714    }
 715    return NULL;
 716}
 717
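/* Add (add=true) or remove (add=false) the KVM memory slot backing a memory
 * region section; the dirty log is synced back before a slot with dirty
 * logging enabled is dropped. */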
 718static void kvm_set_phys_mem(KVMMemoryListener *kml,
 719                             MemoryRegionSection *section, bool add)
 720{
 721    KVMSlot *mem;
 722    int err;
 723    MemoryRegion *mr = section->mr;
 724    bool writeable = !mr->readonly && !mr->rom_device;
 725    hwaddr start_addr, size;
 726    void *ram;
 727
 728    if (!memory_region_is_ram(mr)) {
 729        if (writeable || !kvm_readonly_mem_allowed) {
 730            return;
 731        } else if (!mr->romd_mode) {
 732            /* If the memory device is not in romd_mode, then we actually want
 733             * to remove the kvm memory slot so all accesses will trap. */
 734            add = false;
 735        }
 736    }
 737
 738    size = kvm_align_section(section, &start_addr);
 739    if (!size) {
 740        return;
 741    }
 742
 743    /* use aligned delta to align the ram address */
 744    ram = memory_region_get_ram_ptr(mr) + section->offset_within_region +
 745          (start_addr - section->offset_within_address_space);
 746
 747    if (!add) {
 748        mem = kvm_lookup_matching_slot(kml, start_addr, size);
 749        if (!mem) {
 750            return;
 751        }
 752        if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
 753            kvm_physical_sync_dirty_bitmap(kml, section);
 754        }
 755
 756        /* unregister the slot */
 757        mem->memory_size = 0;
 758        err = kvm_set_user_memory_region(kml, mem);
 759        if (err) {
 760            fprintf(stderr, "%s: error unregistering slot: %s\n",
 761                    __func__, strerror(-err));
 762            abort();
 763        }
 764        return;
 765    }
 766
 767    /* register the new slot */
 768    mem = kvm_alloc_slot(kml);
 769    mem->memory_size = size;
 770    mem->start_addr = start_addr;
 771    mem->ram = ram;
 772    mem->flags = kvm_mem_flags(mr);
 773
 774    err = kvm_set_user_memory_region(kml, mem);
 775    if (err) {
 776        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
 777                strerror(-err));
 778        abort();
 779    }
 780}
 781
 782static void kvm_region_add(MemoryListener *listener,
 783                           MemoryRegionSection *section)
 784{
 785    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
 786
 787    memory_region_ref(section->mr);
 788    kvm_set_phys_mem(kml, section, true);
 789}
 790
 791static void kvm_region_del(MemoryListener *listener,
 792                           MemoryRegionSection *section)
 793{
 794    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
 795
 796    kvm_set_phys_mem(kml, section, false);
 797    memory_region_unref(section->mr);
 798}
 799
 800static void kvm_log_sync(MemoryListener *listener,
 801                         MemoryRegionSection *section)
 802{
 803    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
 804    int r;
 805
 806    r = kvm_physical_sync_dirty_bitmap(kml, section);
 807    if (r < 0) {
 808        abort();
 809    }
 810}
 811
 812static void kvm_mem_ioeventfd_add(MemoryListener *listener,
 813                                  MemoryRegionSection *section,
 814                                  bool match_data, uint64_t data,
 815                                  EventNotifier *e)
 816{
 817    int fd = event_notifier_get_fd(e);
 818    int r;
 819
 820    r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
 821                               data, true, int128_get64(section->size),
 822                               match_data);
 823    if (r < 0) {
 824        fprintf(stderr, "%s: error adding ioeventfd: %s\n",
 825                __func__, strerror(-r));
 826        abort();
 827    }
 828}
 829
 830static void kvm_mem_ioeventfd_del(MemoryListener *listener,
 831                                  MemoryRegionSection *section,
 832                                  bool match_data, uint64_t data,
 833                                  EventNotifier *e)
 834{
 835    int fd = event_notifier_get_fd(e);
 836    int r;
 837
 838    r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
 839                               data, false, int128_get64(section->size),
 840                               match_data);
 841    if (r < 0) {
 842        abort();
 843    }
 844}
 845
 846static void kvm_io_ioeventfd_add(MemoryListener *listener,
 847                                 MemoryRegionSection *section,
 848                                 bool match_data, uint64_t data,
 849                                 EventNotifier *e)
 850{
 851    int fd = event_notifier_get_fd(e);
 852    int r;
 853
 854    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
 855                              data, true, int128_get64(section->size),
 856                              match_data);
 857    if (r < 0) {
 858        fprintf(stderr, "%s: error adding ioeventfd: %s\n",
 859                __func__, strerror(-r));
 860        abort();
 861    }
 862}
 863
 864static void kvm_io_ioeventfd_del(MemoryListener *listener,
 865                                 MemoryRegionSection *section,
 866                                 bool match_data, uint64_t data,
 867                                 EventNotifier *e)
 868
 869{
 870    int fd = event_notifier_get_fd(e);
 871    int r;
 872
 873    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
 874                              data, false, int128_get64(section->size),
 875                              match_data);
 876    if (r < 0) {
 877        abort();
 878    }
 879}
 880
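/* Hook @kml up as a MemoryListener on @as so that region and dirty-log
 * changes in the QEMU memory map are propagated to KVM memory slots. */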
 881void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
 882                                  AddressSpace *as, int as_id)
 883{
 884    int i;
 885
 886    kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot));
 887    kml->as_id = as_id;
 888
 889    for (i = 0; i < s->nr_slots; i++) {
 890        kml->slots[i].slot = i;
 891    }
 892
 893    kml->listener.region_add = kvm_region_add;
 894    kml->listener.region_del = kvm_region_del;
 895    kml->listener.log_start = kvm_log_start;
 896    kml->listener.log_stop = kvm_log_stop;
 897    kml->listener.log_sync = kvm_log_sync;
 898    kml->listener.priority = 10;
 899
 900    memory_listener_register(&kml->listener, as);
 901}
 902
 903static MemoryListener kvm_io_listener = {
 904    .eventfd_add = kvm_io_ioeventfd_add,
 905    .eventfd_del = kvm_io_ioeventfd_del,
 906    .priority = 10,
 907};
 908
 909int kvm_set_irq(KVMState *s, int irq, int level)
 910{
 911    struct kvm_irq_level event;
 912    int ret;
 913
 914    assert(kvm_async_interrupts_enabled());
 915
 916    event.level = level;
 917    event.irq = irq;
 918    ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
 919    if (ret < 0) {
 920        perror("kvm_set_irq");
 921        abort();
 922    }
 923
 924    return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
 925}
 926
 927#ifdef KVM_CAP_IRQ_ROUTING
 928typedef struct KVMMSIRoute {
 929    struct kvm_irq_routing_entry kroute;
 930    QTAILQ_ENTRY(KVMMSIRoute) entry;
 931} KVMMSIRoute;
 932
 933static void set_gsi(KVMState *s, unsigned int gsi)
 934{
 935    set_bit(gsi, s->used_gsi_bitmap);
 936}
 937
 938static void clear_gsi(KVMState *s, unsigned int gsi)
 939{
 940    clear_bit(gsi, s->used_gsi_bitmap);
 941}
 942
 943void kvm_init_irq_routing(KVMState *s)
 944{
 945    int gsi_count, i;
 946
 947    gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
 948    if (gsi_count > 0) {
 949        /* Round up so we can search ints using ffs */
 950        s->used_gsi_bitmap = bitmap_new(gsi_count);
 951        s->gsi_count = gsi_count;
 952    }
 953
 954    s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
 955    s->nr_allocated_irq_routes = 0;
 956
 957    if (!kvm_direct_msi_allowed) {
 958        for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
 959            QTAILQ_INIT(&s->msi_hashtab[i]);
 960        }
 961    }
 962
 963    kvm_arch_init_irq_routing(s);
 964}
 965
 966void kvm_irqchip_commit_routes(KVMState *s)
 967{
 968    int ret;
 969
 970    if (kvm_gsi_direct_mapping()) {
 971        return;
 972    }
 973
 974    if (!kvm_gsi_routing_enabled()) {
 975        return;
 976    }
 977
 978    s->irq_routes->flags = 0;
 979    trace_kvm_irqchip_commit_routes();
 980    ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
 981    assert(ret == 0);
 982}
 983
 984static void kvm_add_routing_entry(KVMState *s,
 985                                  struct kvm_irq_routing_entry *entry)
 986{
 987    struct kvm_irq_routing_entry *new;
 988    int n, size;
 989
 990    if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
 991        n = s->nr_allocated_irq_routes * 2;
 992        if (n < 64) {
 993            n = 64;
 994        }
 995        size = sizeof(struct kvm_irq_routing);
 996        size += n * sizeof(*new);
 997        s->irq_routes = g_realloc(s->irq_routes, size);
 998        s->nr_allocated_irq_routes = n;
 999    }
1000    n = s->irq_routes->nr++;
1001    new = &s->irq_routes->entries[n];
1002
1003    *new = *entry;
1004
1005    set_gsi(s, entry->gsi);
1006}
1007
1008static int kvm_update_routing_entry(KVMState *s,
1009                                    struct kvm_irq_routing_entry *new_entry)
1010{
1011    struct kvm_irq_routing_entry *entry;
1012    int n;
1013
1014    for (n = 0; n < s->irq_routes->nr; n++) {
1015        entry = &s->irq_routes->entries[n];
1016        if (entry->gsi != new_entry->gsi) {
1017            continue;
1018        }
1019
1020        if (!memcmp(entry, new_entry, sizeof *entry)) {
1021            return 0;
1022        }
1023
1024        *entry = *new_entry;
1025
1026        return 0;
1027    }
1028
1029    return -ESRCH;
1030}
1031
1032void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
1033{
1034    struct kvm_irq_routing_entry e = {};
1035
1036    assert(pin < s->gsi_count);
1037
1038    e.gsi = irq;
1039    e.type = KVM_IRQ_ROUTING_IRQCHIP;
1040    e.flags = 0;
1041    e.u.irqchip.irqchip = irqchip;
1042    e.u.irqchip.pin = pin;
1043    kvm_add_routing_entry(s, &e);
1044}
1045
1046void kvm_irqchip_release_virq(KVMState *s, int virq)
1047{
1048    struct kvm_irq_routing_entry *e;
1049    int i;
1050
1051    if (kvm_gsi_direct_mapping()) {
1052        return;
1053    }
1054
1055    for (i = 0; i < s->irq_routes->nr; i++) {
1056        e = &s->irq_routes->entries[i];
1057        if (e->gsi == virq) {
1058            s->irq_routes->nr--;
1059            *e = s->irq_routes->entries[s->irq_routes->nr];
1060        }
1061    }
1062    clear_gsi(s, virq);
1063    kvm_arch_release_virq_post(virq);
1064    trace_kvm_irqchip_release_virq(virq);
1065}
1066
1067static unsigned int kvm_hash_msi(uint32_t data)
1068{
1069    /* This is optimized for IA32 MSI layout. However, no other arch shall
1070     * repeat the mistake of not providing a direct MSI injection API. */
1071    return data & 0xff;
1072}
1073
1074static void kvm_flush_dynamic_msi_routes(KVMState *s)
1075{
1076    KVMMSIRoute *route, *next;
1077    unsigned int hash;
1078
1079    for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
1080        QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
1081            kvm_irqchip_release_virq(s, route->kroute.gsi);
1082            QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
1083            g_free(route);
1084        }
1085    }
1086}
1087
1088static int kvm_irqchip_get_virq(KVMState *s)
1089{
1090    int next_virq;
1091
1092    /*
1093     * PIC and IOAPIC share the first 16 GSI numbers, so there are more
1094     * available GSI numbers than IRQ route entries. Allocating a GSI
1095     * number can succeed even though a new route entry cannot be added.
1096     * When this happens, flush dynamic MSI entries to free IRQ route entries.
1097     */
1098    if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) {
1099        kvm_flush_dynamic_msi_routes(s);
1100    }
1101
1102    /* Return the lowest unused GSI in the bitmap */
1103    next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);
1104    if (next_virq >= s->gsi_count) {
1105        return -ENOSPC;
1106    } else {
1107        return next_virq;
1108    }
1109}
1110
1111static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg)
1112{
1113    unsigned int hash = kvm_hash_msi(msg.data);
1114    KVMMSIRoute *route;
1115
1116    QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) {
1117        if (route->kroute.u.msi.address_lo == (uint32_t)msg.address &&
1118            route->kroute.u.msi.address_hi == (msg.address >> 32) &&
1119            route->kroute.u.msi.data == le32_to_cpu(msg.data)) {
1120            return route;
1121        }
1122    }
1123    return NULL;
1124}
1125
1126int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
1127{
1128    struct kvm_msi msi;
1129    KVMMSIRoute *route;
1130
1131    if (kvm_direct_msi_allowed) {
1132        msi.address_lo = (uint32_t)msg.address;
1133        msi.address_hi = msg.address >> 32;
1134        msi.data = le32_to_cpu(msg.data);
1135        msi.flags = 0;
1136        memset(msi.pad, 0, sizeof(msi.pad));
1137
1138        return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
1139    }
1140
1141    route = kvm_lookup_msi_route(s, msg);
1142    if (!route) {
1143        int virq;
1144
1145        virq = kvm_irqchip_get_virq(s);
1146        if (virq < 0) {
1147            return virq;
1148        }
1149
1150        route = g_malloc0(sizeof(KVMMSIRoute));
1151        route->kroute.gsi = virq;
1152        route->kroute.type = KVM_IRQ_ROUTING_MSI;
1153        route->kroute.flags = 0;
1154        route->kroute.u.msi.address_lo = (uint32_t)msg.address;
1155        route->kroute.u.msi.address_hi = msg.address >> 32;
1156        route->kroute.u.msi.data = le32_to_cpu(msg.data);
1157
1158        kvm_add_routing_entry(s, &route->kroute);
1159        kvm_irqchip_commit_routes(s);
1160
1161        QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
1162                           entry);
1163    }
1164
1165    assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);
1166
1167    return kvm_set_irq(s, route->kroute.gsi, 1);
1168}
1169
1170int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
1171{
1172    struct kvm_irq_routing_entry kroute = {};
1173    int virq;
1174    MSIMessage msg = {0, 0};
1175
1176    if (pci_available && dev) {
1177        msg = pci_get_msi_message(dev, vector);
1178    }
1179
1180    if (kvm_gsi_direct_mapping()) {
1181        return kvm_arch_msi_data_to_gsi(msg.data);
1182    }
1183
1184    if (!kvm_gsi_routing_enabled()) {
1185        return -ENOSYS;
1186    }
1187
1188    virq = kvm_irqchip_get_virq(s);
1189    if (virq < 0) {
1190        return virq;
1191    }
1192
1193    kroute.gsi = virq;
1194    kroute.type = KVM_IRQ_ROUTING_MSI;
1195    kroute.flags = 0;
1196    kroute.u.msi.address_lo = (uint32_t)msg.address;
1197    kroute.u.msi.address_hi = msg.address >> 32;
1198    kroute.u.msi.data = le32_to_cpu(msg.data);
1199    if (pci_available && kvm_msi_devid_required()) {
1200        kroute.flags = KVM_MSI_VALID_DEVID;
1201        kroute.u.msi.devid = pci_requester_id(dev);
1202    }
1203    if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
1204        kvm_irqchip_release_virq(s, virq);
1205        return -EINVAL;
1206    }
1207
1208    trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
1209                                    vector, virq);
1210
1211    kvm_add_routing_entry(s, &kroute);
1212    kvm_arch_add_msi_route_post(&kroute, vector, dev);
1213    kvm_irqchip_commit_routes(s);
1214
1215    return virq;
1216}
1217
1218int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
1219                                 PCIDevice *dev)
1220{
1221    struct kvm_irq_routing_entry kroute = {};
1222
1223    if (kvm_gsi_direct_mapping()) {
1224        return 0;
1225    }
1226
1227    if (!kvm_irqchip_in_kernel()) {
1228        return -ENOSYS;
1229    }
1230
1231    kroute.gsi = virq;
1232    kroute.type = KVM_IRQ_ROUTING_MSI;
1233    kroute.flags = 0;
1234    kroute.u.msi.address_lo = (uint32_t)msg.address;
1235    kroute.u.msi.address_hi = msg.address >> 32;
1236    kroute.u.msi.data = le32_to_cpu(msg.data);
1237    if (pci_available && kvm_msi_devid_required()) {
1238        kroute.flags = KVM_MSI_VALID_DEVID;
1239        kroute.u.msi.devid = pci_requester_id(dev);
1240    }
1241    if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
1242        return -EINVAL;
1243    }
1244
1245    trace_kvm_irqchip_update_msi_route(virq);
1246
1247    return kvm_update_routing_entry(s, &kroute);
1248}
1249
1250static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq,
1251                                    bool assign)
1252{
1253    struct kvm_irqfd irqfd = {
1254        .fd = fd,
1255        .gsi = virq,
1256        .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
1257    };
1258
1259    if (rfd != -1) {
1260        irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
1261        irqfd.resamplefd = rfd;
1262    }
1263
1264    if (!kvm_irqfds_enabled()) {
1265        return -ENOSYS;
1266    }
1267
1268    return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
1269}
1270
1271int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
1272{
1273    struct kvm_irq_routing_entry kroute = {};
1274    int virq;
1275
1276    if (!kvm_gsi_routing_enabled()) {
1277        return -ENOSYS;
1278    }
1279
1280    virq = kvm_irqchip_get_virq(s);
1281    if (virq < 0) {
1282        return virq;
1283    }
1284
1285    kroute.gsi = virq;
1286    kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER;
1287    kroute.flags = 0;
1288    kroute.u.adapter.summary_addr = adapter->summary_addr;
1289    kroute.u.adapter.ind_addr = adapter->ind_addr;
1290    kroute.u.adapter.summary_offset = adapter->summary_offset;
1291    kroute.u.adapter.ind_offset = adapter->ind_offset;
1292    kroute.u.adapter.adapter_id = adapter->adapter_id;
1293
1294    kvm_add_routing_entry(s, &kroute);
1295
1296    return virq;
1297}
1298
1299int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
1300{
1301    struct kvm_irq_routing_entry kroute = {};
1302    int virq;
1303
1304    if (!kvm_gsi_routing_enabled()) {
1305        return -ENOSYS;
1306    }
1307    if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) {
1308        return -ENOSYS;
1309    }
1310    virq = kvm_irqchip_get_virq(s);
1311    if (virq < 0) {
1312        return virq;
1313    }
1314
1315    kroute.gsi = virq;
1316    kroute.type = KVM_IRQ_ROUTING_HV_SINT;
1317    kroute.flags = 0;
1318    kroute.u.hv_sint.vcpu = vcpu;
1319    kroute.u.hv_sint.sint = sint;
1320
1321    kvm_add_routing_entry(s, &kroute);
1322    kvm_irqchip_commit_routes(s);
1323
1324    return virq;
1325}
1326
1327#else /* !KVM_CAP_IRQ_ROUTING */
1328
1329void kvm_init_irq_routing(KVMState *s)
1330{
1331}
1332
1333void kvm_irqchip_release_virq(KVMState *s, int virq)
1334{
1335}
1336
1337int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
1338{
1339    abort();
1340}
1341
1342int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
1343{
1344    return -ENOSYS;
1345}
1346
1347int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
1348{
1349    return -ENOSYS;
1350}
1351
1352int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
1353{
1354    return -ENOSYS;
1355}
1356
1357static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign)
1358{
1359    abort();
1360}
1361
1362int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
1363{
1364    return -ENOSYS;
1365}
1366#endif /* !KVM_CAP_IRQ_ROUTING */
1367
1368int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
1369                                       EventNotifier *rn, int virq)
1370{
1371    return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n),
1372           rn ? event_notifier_get_fd(rn) : -1, virq, true);
1373}
1374
1375int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
1376                                          int virq)
1377{
1378    return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq,
1379           false);
1380}
1381
1382int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
1383                                   EventNotifier *rn, qemu_irq irq)
1384{
1385    gpointer key, gsi;
1386    gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
1387
1388    if (!found) {
1389        return -ENXIO;
1390    }
1391    return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
1392}
1393
1394int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
1395                                      qemu_irq irq)
1396{
1397    gpointer key, gsi;
1398    gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
1399
1400    if (!found) {
1401        return -ENXIO;
1402    }
1403    return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
1404}
1405
1406void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
1407{
1408    g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
1409}
1410
1411static void kvm_irqchip_create(MachineState *machine, KVMState *s)
1412{
1413    int ret;
1414
1415    if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
1416        ;
1417    } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
1418        ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
1419        if (ret < 0) {
1420            fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
1421            exit(1);
1422        }
1423    } else {
1424        return;
1425    }
1426
1427    /* First probe and see if there's an arch-specific hook to create the
1428     * in-kernel irqchip for us */
1429    ret = kvm_arch_irqchip_create(machine, s);
1430    if (ret == 0) {
1431        if (machine_kernel_irqchip_split(machine)) {
1432            perror("Split IRQ chip mode not supported.");
1433            exit(1);
1434        } else {
1435            ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
1436        }
1437    }
1438    if (ret < 0) {
1439        fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
1440        exit(1);
1441    }
1442
1443    kvm_kernel_irqchip = true;
1444    /* If we have an in-kernel IRQ chip then we must have asynchronous
1445     * interrupt delivery (though the reverse is not necessarily true)
1446     */
1447    kvm_async_interrupts_allowed = true;
1448    kvm_halt_in_kernel_allowed = true;
1449
1450    kvm_init_irq_routing(s);
1451
1452    s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
1453}
1454
1455/* Find number of supported CPUs using the recommended
1456 * procedure from the kernel API documentation to cope with
1457 * older kernels that may be missing capabilities.
1458 */
1459static int kvm_recommended_vcpus(KVMState *s)
1460{
1461    int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS);
1462    return (ret) ? ret : 4;
1463}
1464
1465static int kvm_max_vcpus(KVMState *s)
1466{
1467    int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
1468    return (ret) ? ret : kvm_recommended_vcpus(s);
1469}
1470
1471static int kvm_max_vcpu_id(KVMState *s)
1472{
1473    int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);
1474    return (ret) ? ret : kvm_max_vcpus(s);
1475}
1476
1477bool kvm_vcpu_id_is_valid(int vcpu_id)
1478{
1479    KVMState *s = KVM_STATE(current_machine->accelerator);
1480    return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
1481}
1482
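/* Accelerator setup: open /dev/kvm, create the VM, probe capabilities, and
 * register the KVM memory listeners. */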
1483static int kvm_init(MachineState *ms)
1484{
1485    MachineClass *mc = MACHINE_GET_CLASS(ms);
1486    static const char upgrade_note[] =
1487        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
1488        "(see http://sourceforge.net/projects/kvm).\n";
1489    struct {
1490        const char *name;
1491        int num;
1492    } num_cpus[] = {
1493        { "SMP",          smp_cpus },
1494        { "hotpluggable", max_cpus },
1495        { NULL, }
1496    }, *nc = num_cpus;
1497    int soft_vcpus_limit, hard_vcpus_limit;
1498    KVMState *s;
1499    const KVMCapabilityInfo *missing_cap;
1500    int ret;
1501    int type = 0;
1502    const char *kvm_type;
1503
1504    s = KVM_STATE(ms->accelerator);
1505
1506    /*
1507     * On systems where the kernel can support different base page
1508     * sizes, host page size may be different from TARGET_PAGE_SIZE,
1509     * even with KVM.  TARGET_PAGE_SIZE is assumed to be the minimum
1510     * page size for the system though.
1511     */
1512    assert(TARGET_PAGE_SIZE <= getpagesize());
1513
1514    s->sigmask_len = 8;
1515
1516#ifdef KVM_CAP_SET_GUEST_DEBUG
1517    QTAILQ_INIT(&s->kvm_sw_breakpoints);
1518#endif
1519    QLIST_INIT(&s->kvm_parked_vcpus);
1520    s->vmfd = -1;
1521    s->fd = qemu_open("/dev/kvm", O_RDWR);
1522    if (s->fd == -1) {
1523        fprintf(stderr, "Could not access KVM kernel module: %m\n");
1524        ret = -errno;
1525        goto err;
1526    }
1527
1528    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
1529    if (ret < KVM_API_VERSION) {
1530        if (ret >= 0) {
1531            ret = -EINVAL;
1532        }
1533        fprintf(stderr, "kvm version too old\n");
1534        goto err;
1535    }
1536
1537    if (ret > KVM_API_VERSION) {
1538        ret = -EINVAL;
1539        fprintf(stderr, "kvm version not supported\n");
1540        goto err;
1541    }
1542
1543    kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
1544    s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
1545
1546    /* If unspecified, use the default value */
1547    if (!s->nr_slots) {
1548        s->nr_slots = 32;
1549    }
1550
1551    kvm_type = qemu_opt_get(qemu_get_machine_opts(), "kvm-type");
1552    if (mc->kvm_type) {
1553        type = mc->kvm_type(kvm_type);
1554    } else if (kvm_type) {
1555        ret = -EINVAL;
1556        fprintf(stderr, "Invalid argument kvm-type=%s\n", kvm_type);
1557        goto err;
1558    }
1559
1560    do {
1561        ret = kvm_ioctl(s, KVM_CREATE_VM, type);
1562    } while (ret == -EINTR);
1563
1564    if (ret < 0) {
1565        fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret,
1566                strerror(-ret));
1567
1568#ifdef TARGET_S390X
1569        if (ret == -EINVAL) {
1570            fprintf(stderr,
1571                    "Host kernel setup problem detected. Please verify:\n");
1572            fprintf(stderr, "- for kernels supporting the switch_amode or"
1573                    " user_mode parameters, whether\n");
1574            fprintf(stderr,
1575                    "  user space is running in primary address space\n");
1576            fprintf(stderr,
1577                    "- for kernels supporting the vm.allocate_pgste sysctl, "
1578                    "whether it is enabled\n");
1579        }
1580#endif
1581        goto err;
1582    }
1583
1584    s->vmfd = ret;
1585
1586    /* check the vcpu limits */
1587    soft_vcpus_limit = kvm_recommended_vcpus(s);
1588    hard_vcpus_limit = kvm_max_vcpus(s);
1589
1590    while (nc->name) {
1591        if (nc->num > soft_vcpus_limit) {
1592            warn_report("Number of %s cpus requested (%d) exceeds "
1593                        "the recommended cpus supported by KVM (%d)",
1594                        nc->name, nc->num, soft_vcpus_limit);
1595
1596            if (nc->num > hard_vcpus_limit) {
1597                fprintf(stderr, "Number of %s cpus requested (%d) exceeds "
1598                        "the maximum cpus supported by KVM (%d)\n",
1599                        nc->name, nc->num, hard_vcpus_limit);
1600                exit(1);
1601            }
1602        }
1603        nc++;
1604    }
1605
1606    missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
1607    if (!missing_cap) {
1608        missing_cap =
1609            kvm_check_extension_list(s, kvm_arch_required_capabilities);
1610    }
1611    if (missing_cap) {
1612        ret = -EINVAL;
1613        fprintf(stderr, "kvm does not support %s\n%s",
1614                missing_cap->name, upgrade_note);
1615        goto err;
1616    }
1617
1618    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
1619
1620#ifdef KVM_CAP_VCPU_EVENTS
1621    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
1622#endif
1623
1624    s->robust_singlestep =
1625        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
1626
1627#ifdef KVM_CAP_DEBUGREGS
1628    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
1629#endif
1630
1631#ifdef KVM_CAP_IRQ_ROUTING
1632    kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
1633#endif
1634
1635    s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3);
1636
1637    s->irq_set_ioctl = KVM_IRQ_LINE;
1638    if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
1639        s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
1640    }
1641
1642#ifdef KVM_CAP_READONLY_MEM
1643    kvm_readonly_mem_allowed =
1644        (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
1645#endif
1646
1647    kvm_eventfds_allowed =
1648        (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0);
1649
1650    kvm_irqfds_allowed =
1651        (kvm_check_extension(s, KVM_CAP_IRQFD) > 0);
1652
1653    kvm_resamplefds_allowed =
1654        (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);
1655
1656    kvm_vm_attributes_allowed =
1657        (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);
1658
1659    kvm_ioeventfd_any_length_allowed =
1660        (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0);
1661
1662    kvm_state = s;
1663
1664    /*
1665     * if memory encryption object is specified then initialize the memory
1666     * encryption context.
1667     */
1668    if (ms->memory_encryption) {
1669        kvm_state->memcrypt_handle = sev_guest_init(ms->memory_encryption);
1670        if (!kvm_state->memcrypt_handle) {
1671            ret = -1;
1672            goto err;
1673        }
1674
1675        kvm_state->memcrypt_encrypt_data = sev_encrypt_data;
1676    }
1677
1678    ret = kvm_arch_init(ms, s);
1679    if (ret < 0) {
1680        goto err;
1681    }
1682
1683    if (machine_kernel_irqchip_allowed(ms)) {
1684        kvm_irqchip_create(ms, s);
1685    }
1686
1687    if (kvm_eventfds_allowed) {
1688        s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
1689        s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
1690    }
1691    s->memory_listener.listener.coalesced_mmio_add = kvm_coalesce_mmio_region;
1692    s->memory_listener.listener.coalesced_mmio_del = kvm_uncoalesce_mmio_region;
1693
1694    kvm_memory_listener_register(s, &s->memory_listener,
1695                                 &address_space_memory, 0);
1696    memory_listener_register(&kvm_io_listener,
1697                             &address_space_io);
1698
1699    s->many_ioeventfds = kvm_check_many_ioeventfds();
1700
1701    s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
1702
1703    return 0;
1704
1705err:
1706    assert(ret < 0);
1707    if (s->vmfd >= 0) {
1708        close(s->vmfd);
1709    }
1710    if (s->fd != -1) {
1711        close(s->fd);
1712    }
1713    g_free(s->memory_listener.slots);
1714
1715    return ret;
1716}
1717
1718void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
1719{
1720    s->sigmask_len = sigmask_len;
1721}
1722
1723static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
1724                          int size, uint32_t count)
1725{
1726    int i;
1727    uint8_t *ptr = data;
1728
1729    for (i = 0; i < count; i++) {
1730        address_space_rw(&address_space_io, port, attrs,
1731                         ptr, size,
1732                         direction == KVM_EXIT_IO_OUT);
1733        ptr += size;
1734    }
1735}
1736
1737static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
1738{
1739    fprintf(stderr, "KVM internal error. Suberror: %d\n",
1740            run->internal.suberror);
1741
1742    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
1743        int i;
1744
1745        for (i = 0; i < run->internal.ndata; ++i) {
1746            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
1747                    i, (uint64_t)run->internal.data[i]);
1748        }
1749    }
1750    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
1751        fprintf(stderr, "emulation failure\n");
1752        if (!kvm_arch_stop_on_emulation_error(cpu)) {
1753            cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
1754            return EXCP_INTERRUPT;
1755        }
1756    }
1757    /* FIXME: Should trigger a qmp message to let management know
1758     * something went wrong.
1759     */
1760    return -1;
1761}
1762
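/* Drain the coalesced MMIO ring shared with the kernel, replaying each
 * buffered write into the guest's memory/MMIO address space. */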
1763void kvm_flush_coalesced_mmio_buffer(void)
1764{
1765    KVMState *s = kvm_state;
1766
1767    if (s->coalesced_flush_in_progress) {
1768        return;
1769    }
1770
1771    s->coalesced_flush_in_progress = true;
1772
1773    if (s->coalesced_mmio_ring) {
1774        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
1775        while (ring->first != ring->last) {
1776            struct kvm_coalesced_mmio *ent;
1777
1778            ent = &ring->coalesced_mmio[ring->first];
1779
1780            cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
1781            smp_wmb();
1782            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
1783        }
1784    }
1785
1786    s->coalesced_flush_in_progress = false;
1787}
1788
1789static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
1790{
1791    if (!cpu->vcpu_dirty) {
1792        kvm_arch_get_registers(cpu);
1793        cpu->vcpu_dirty = true;
1794    }
1795}
1796
1797void kvm_cpu_synchronize_state(CPUState *cpu)
1798{
1799    if (!cpu->vcpu_dirty) {
1800        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
1801    }
1802}
1803
1804static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
1805{
1806    kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
1807    cpu->vcpu_dirty = false;
1808}
1809
1810void kvm_cpu_synchronize_post_reset(CPUState *cpu)
1811{
1812    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
1813}
1814
1815static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
1816{
1817    kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
1818    cpu->vcpu_dirty = false;
1819}
1820
1821void kvm_cpu_synchronize_post_init(CPUState *cpu)
1822{
1823    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
1824}
1825
1826static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
1827{
1828    cpu->vcpu_dirty = true;
1829}
1830
1831void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
1832{
1833    run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
1834}
1835
1836#ifdef KVM_HAVE_MCE_INJECTION
1837static __thread void *pending_sigbus_addr;
1838static __thread int pending_sigbus_code;
1839static __thread bool have_sigbus_pending;
1840#endif
1841
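/* Make the next KVM_RUN return immediately by setting
 * kvm_run->immediate_exit (requires KVM_CAP_IMMEDIATE_EXIT). */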
1842static void kvm_cpu_kick(CPUState *cpu)
1843{
1844    atomic_set(&cpu->kvm_run->immediate_exit, 1);
1845}
1846
1847static void kvm_cpu_kick_self(void)
1848{
1849    if (kvm_immediate_exit) {
1850        kvm_cpu_kick(current_cpu);
1851    } else {
1852        qemu_cpu_kick_self();
1853    }
1854}
1855
1856static void kvm_eat_signals(CPUState *cpu)
1857{
1858    struct timespec ts = { 0, 0 };
1859    siginfo_t siginfo;
1860    sigset_t waitset;
1861    sigset_t chkset;
1862    int r;
1863
1864    if (kvm_immediate_exit) {
1865        atomic_set(&cpu->kvm_run->immediate_exit, 0);
1866        /* Write kvm_run->immediate_exit before the cpu->exit_request
1867         * write in kvm_cpu_exec.
1868         */
1869        smp_wmb();
1870        return;
1871    }
1872
1873    sigemptyset(&waitset);
1874    sigaddset(&waitset, SIG_IPI);
1875
1876    do {
1877        r = sigtimedwait(&waitset, &siginfo, &ts);
1878        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
1879            perror("sigtimedwait");
1880            exit(1);
1881        }
1882
1883        r = sigpending(&chkset);
1884        if (r == -1) {
1885            perror("sigpending");
1886            exit(1);
1887        }
1888    } while (sigismember(&chkset, SIG_IPI));
1889}
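
/*
 * Editor's note (summary, not from the upstream file): there are two ways a
 * vcpu is forced out of KVM_RUN.  With KVM_CAP_IMMEDIATE_EXIT, kvm_cpu_kick()
 * sets run->immediate_exit and the current or next KVM_RUN returns -EINTR
 * without a signal being delivered, so kvm_eat_signals() only has to clear
 * the flag (and order that store, hence the smp_wmb()).  Without the
 * capability, a SIG_IPI is left pending for the vcpu thread; KVM_RUN returns
 * -EINTR because the mask installed via KVM_SET_SIGNAL_MASK (see
 * kvm_init_cpu_signals() below) unblocks SIG_IPI only while inside KVM_RUN,
 * and the sigtimedwait() loop above then drains the pending signal.
 */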
1890
1891int kvm_cpu_exec(CPUState *cpu)
1892{
1893    struct kvm_run *run = cpu->kvm_run;
1894    int ret, run_ret;
1895
1896    DPRINTF("kvm_cpu_exec()\n");
1897
1898    if (kvm_arch_process_async_events(cpu)) {
1899        atomic_set(&cpu->exit_request, 0);
1900        return EXCP_HLT;
1901    }
1902
1903    qemu_mutex_unlock_iothread();
1904    cpu_exec_start(cpu);
1905
1906    do {
1907        MemTxAttrs attrs;
1908
1909        if (cpu->vcpu_dirty) {
1910            kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
1911            cpu->vcpu_dirty = false;
1912        }
1913
1914        kvm_arch_pre_run(cpu, run);
1915        if (atomic_read(&cpu->exit_request)) {
1916            DPRINTF("interrupt exit requested\n");
1917            /*
1918             * KVM requires us to reenter the kernel after IO exits to complete
1919             * instruction emulation. This self-signal will ensure that we
1920             * drop back to userspace again as soon as possible.
1921             */
1922            kvm_cpu_kick_self();
1923        }
1924
1925        /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
1926         * Matching barrier in kvm_eat_signals.
1927         */
1928        smp_rmb();
1929
1930        run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
1931
1932        attrs = kvm_arch_post_run(cpu, run);
1933
1934#ifdef KVM_HAVE_MCE_INJECTION
1935        if (unlikely(have_sigbus_pending)) {
1936            qemu_mutex_lock_iothread();
1937            kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
1938                                    pending_sigbus_addr);
1939            have_sigbus_pending = false;
1940            qemu_mutex_unlock_iothread();
1941        }
1942#endif
1943
1944        if (run_ret < 0) {
1945            if (run_ret == -EINTR || run_ret == -EAGAIN) {
1946                DPRINTF("io window exit\n");
1947                kvm_eat_signals(cpu);
1948                ret = EXCP_INTERRUPT;
1949                break;
1950            }
1951            fprintf(stderr, "error: kvm run failed %s\n",
1952                    strerror(-run_ret));
1953#ifdef TARGET_PPC
1954            if (run_ret == -EBUSY) {
1955                fprintf(stderr,
1956                        "This is probably because your SMT is enabled.\n"
1957                        "VCPU can only run on primary threads with all "
1958                        "secondary threads offline.\n");
1959            }
1960#endif
1961            ret = -1;
1962            break;
1963        }
1964
1965        trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
1966        switch (run->exit_reason) {
1967        case KVM_EXIT_IO:
1968            DPRINTF("handle_io\n");
1969            /* Called outside BQL */
1970            kvm_handle_io(run->io.port, attrs,
1971                          (uint8_t *)run + run->io.data_offset,
1972                          run->io.direction,
1973                          run->io.size,
1974                          run->io.count);
1975            ret = 0;
1976            break;
1977        case KVM_EXIT_MMIO:
1978            DPRINTF("handle_mmio\n");
1979            /* Called outside BQL */
1980            address_space_rw(&address_space_memory,
1981                             run->mmio.phys_addr, attrs,
1982                             run->mmio.data,
1983                             run->mmio.len,
1984                             run->mmio.is_write);
1985            ret = 0;
1986            break;
1987        case KVM_EXIT_IRQ_WINDOW_OPEN:
1988            DPRINTF("irq_window_open\n");
1989            ret = EXCP_INTERRUPT;
1990            break;
1991        case KVM_EXIT_SHUTDOWN:
1992            DPRINTF("shutdown\n");
1993            qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
1994            ret = EXCP_INTERRUPT;
1995            break;
1996        case KVM_EXIT_UNKNOWN:
1997            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
1998                    (uint64_t)run->hw.hardware_exit_reason);
1999            ret = -1;
2000            break;
2001        case KVM_EXIT_INTERNAL_ERROR:
2002            ret = kvm_handle_internal_error(cpu, run);
2003            break;
2004        case KVM_EXIT_SYSTEM_EVENT:
2005            switch (run->system_event.type) {
2006            case KVM_SYSTEM_EVENT_SHUTDOWN:
2007                qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
2008                ret = EXCP_INTERRUPT;
2009                break;
2010            case KVM_SYSTEM_EVENT_RESET:
2011                qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
2012                ret = EXCP_INTERRUPT;
2013                break;
2014            case KVM_SYSTEM_EVENT_CRASH:
2015                kvm_cpu_synchronize_state(cpu);
2016                qemu_mutex_lock_iothread();
2017                qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2018                qemu_mutex_unlock_iothread();
2019                ret = 0;
2020                break;
2021            default:
2022                DPRINTF("kvm_arch_handle_exit\n");
2023                ret = kvm_arch_handle_exit(cpu, run);
2024                break;
2025            }
2026            break;
2027        default:
2028            DPRINTF("kvm_arch_handle_exit\n");
2029            ret = kvm_arch_handle_exit(cpu, run);
2030            break;
2031        }
2032    } while (ret == 0);
2033
2034    cpu_exec_end(cpu);
2035    qemu_mutex_lock_iothread();
2036
2037    if (ret < 0) {
2038        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
2039        vm_stop(RUN_STATE_INTERNAL_ERROR);
2040    }
2041
2042    atomic_set(&cpu->exit_request, 0);
2043    return ret;
2044}
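
/*
 * Editor's sketch (caller-side, names approximate, not from the upstream
 * file): the per-vcpu thread loop in cpus.c consumes the return value
 * roughly like this, treating EXCP_HLT/EXCP_INTERRUPT as "go back to
 * waiting for work" and relying on the vm_stop() above for fatal errors:
 *
 *     do {
 *         if (cpu_can_run(cpu)) {
 *             r = kvm_cpu_exec(cpu);
 *             if (r == EXCP_DEBUG) {
 *                 cpu_handle_guest_debug(cpu);
 *             }
 *         }
 *         qemu_wait_io_event(cpu);
 *     } while (!cpu->unplug || cpu_can_run(cpu));
 */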
2045
2046int kvm_ioctl(KVMState *s, int type, ...)
2047{
2048    int ret;
2049    void *arg;
2050    va_list ap;
2051
2052    va_start(ap, type);
2053    arg = va_arg(ap, void *);
2054    va_end(ap);
2055
2056    trace_kvm_ioctl(type, arg);
2057    ret = ioctl(s->fd, type, arg);
2058    if (ret == -1) {
2059        ret = -errno;
2060    }
2061    return ret;
2062}
2063
2064int kvm_vm_ioctl(KVMState *s, int type, ...)
2065{
2066    int ret;
2067    void *arg;
2068    va_list ap;
2069
2070    va_start(ap, type);
2071    arg = va_arg(ap, void *);
2072    va_end(ap);
2073
2074    trace_kvm_vm_ioctl(type, arg);
2075    ret = ioctl(s->vmfd, type, arg);
2076    if (ret == -1) {
2077        ret = -errno;
2078    }
2079    return ret;
2080}
2081
2082int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
2083{
2084    int ret;
2085    void *arg;
2086    va_list ap;
2087
2088    va_start(ap, type);
2089    arg = va_arg(ap, void *);
2090    va_end(ap);
2091
2092    trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
2093    ret = ioctl(cpu->kvm_fd, type, arg);
2094    if (ret == -1) {
2095        ret = -errno;
2096    }
2097    return ret;
2098}
2099
2100int kvm_device_ioctl(int fd, int type, ...)
2101{
2102    int ret;
2103    void *arg;
2104    va_list ap;
2105
2106    va_start(ap, type);
2107    arg = va_arg(ap, void *);
2108    va_end(ap);
2109
2110    trace_kvm_device_ioctl(fd, type, arg);
2111    ret = ioctl(fd, type, arg);
2112    if (ret == -1) {
2113        ret = -errno;
2114    }
2115    return ret;
2116}
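
/*
 * Editor's note (illustrative call, not from the upstream file): all four
 * wrappers above take at most one argument and convert the raw ioctl()
 * convention (-1 with errno set) into a negative errno return value, so
 * callers can test the result directly:
 *
 *     ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_IOEVENTFD);
 *     if (ret < 0) {
 *         // -errno from the kernel; >= 0 is the extension's answer
 *     }
 */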
2117
2118int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
2119{
2120    int ret;
2121    struct kvm_device_attr attribute = {
2122        .group = group,
2123        .attr = attr,
2124    };
2125
2126    if (!kvm_vm_attributes_allowed) {
2127        return 0;
2128    }
2129
2130    ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
2131    /* KVM returns 0 on success for KVM_HAS_DEVICE_ATTR */
2132    return ret ? 0 : 1;
2133}
2134
2135int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
2136{
2137    struct kvm_device_attr attribute = {
2138        .group = group,
2139        .attr = attr,
2140        .flags = 0,
2141    };
2142
2143    return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
2144}
2145
2146int kvm_device_access(int fd, int group, uint64_t attr,
2147                      void *val, bool write, Error **errp)
2148{
2149    struct kvm_device_attr kvmattr;
2150    int err;
2151
2152    kvmattr.flags = 0;
2153    kvmattr.group = group;
2154    kvmattr.attr = attr;
2155    kvmattr.addr = (uintptr_t)val;
2156
2157    err = kvm_device_ioctl(fd,
2158                           write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
2159                           &kvmattr);
2160    if (err < 0) {
2161        error_setg_errno(errp, -err,
2162                         "KVM_%s_DEVICE_ATTR failed: Group %d "
2163                         "attr 0x%016" PRIx64,
2164                         write ? "SET" : "GET", group, attr);
2165    }
2166    return err;
2167}
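
/*
 * Editor's sketch (hypothetical group/attr values, not from the upstream
 * file): device frontends typically probe with the _check_ helper first and
 * only then read or write the attribute, letting kvm_device_access() report
 * failures through an Error object:
 *
 *     uint64_t val;
 *     if (kvm_device_check_attr(dev_fd, group, attr)) {
 *         kvm_device_access(dev_fd, group, attr, &val, false, &error_abort);
 *     }
 */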
2168
2169bool kvm_has_sync_mmu(void)
2170{
2171    return kvm_state->sync_mmu;
2172}
2173
2174int kvm_has_vcpu_events(void)
2175{
2176    return kvm_state->vcpu_events;
2177}
2178
2179int kvm_has_robust_singlestep(void)
2180{
2181    return kvm_state->robust_singlestep;
2182}
2183
2184int kvm_has_debugregs(void)
2185{
2186    return kvm_state->debugregs;
2187}
2188
2189int kvm_has_many_ioeventfds(void)
2190{
2191    if (!kvm_enabled()) {
2192        return 0;
2193    }
2194    return kvm_state->many_ioeventfds;
2195}
2196
2197int kvm_has_gsi_routing(void)
2198{
2199#ifdef KVM_CAP_IRQ_ROUTING
2200    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
2201#else
2202    return false;
2203#endif
2204}
2205
2206int kvm_has_intx_set_mask(void)
2207{
2208    return kvm_state->intx_set_mask;
2209}
2210
2211bool kvm_arm_supports_user_irq(void)
2212{
2213    return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
2214}
2215
2216#ifdef KVM_CAP_SET_GUEST_DEBUG
2217struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
2218                                                 target_ulong pc)
2219{
2220    struct kvm_sw_breakpoint *bp;
2221
2222    QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
2223        if (bp->pc == pc) {
2224            return bp;
2225        }
2226    }
2227    return NULL;
2228}
2229
2230int kvm_sw_breakpoints_active(CPUState *cpu)
2231{
2232    return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
2233}
2234
2235struct kvm_set_guest_debug_data {
2236    struct kvm_guest_debug dbg;
2237    int err;
2238};
2239
2240static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
2241{
2242    struct kvm_set_guest_debug_data *dbg_data =
2243        (struct kvm_set_guest_debug_data *) data.host_ptr;
2244
2245    dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
2246                                   &dbg_data->dbg);
2247}
2248
2249int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
2250{
2251    struct kvm_set_guest_debug_data data;
2252
2253    data.dbg.control = reinject_trap;
2254
2255    if (cpu->singlestep_enabled) {
2256        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
2257    }
2258    kvm_arch_update_guest_debug(cpu, &data.dbg);
2259
2260    run_on_cpu(cpu, kvm_invoke_set_guest_debug,
2261               RUN_ON_CPU_HOST_PTR(&data));
2262    return data.err;
2263}
2264
2265int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
2266                          target_ulong len, int type)
2267{
2268    struct kvm_sw_breakpoint *bp;
2269    int err;
2270
2271    if (type == GDB_BREAKPOINT_SW) {
2272        bp = kvm_find_sw_breakpoint(cpu, addr);
2273        if (bp) {
2274            bp->use_count++;
2275            return 0;
2276        }
2277
2278        bp = g_malloc(sizeof(struct kvm_sw_breakpoint));
2279        bp->pc = addr;
2280        bp->use_count = 1;
2281        err = kvm_arch_insert_sw_breakpoint(cpu, bp);
2282        if (err) {
2283            g_free(bp);
2284            return err;
2285        }
2286
2287        QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
2288    } else {
2289        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
2290        if (err) {
2291            return err;
2292        }
2293    }
2294
2295    CPU_FOREACH(cpu) {
2296        err = kvm_update_guest_debug(cpu, 0);
2297        if (err) {
2298            return err;
2299        }
2300    }
2301    return 0;
2302}
2303
2304int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
2305                          target_ulong len, int type)
2306{
2307    struct kvm_sw_breakpoint *bp;
2308    int err;
2309
2310    if (type == GDB_BREAKPOINT_SW) {
2311        bp = kvm_find_sw_breakpoint(cpu, addr);
2312        if (!bp) {
2313            return -ENOENT;
2314        }
2315
2316        if (bp->use_count > 1) {
2317            bp->use_count--;
2318            return 0;
2319        }
2320
2321        err = kvm_arch_remove_sw_breakpoint(cpu, bp);
2322        if (err) {
2323            return err;
2324        }
2325
2326        QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
2327        g_free(bp);
2328    } else {
2329        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
2330        if (err) {
2331            return err;
2332        }
2333    }
2334
2335    CPU_FOREACH(cpu) {
2336        err = kvm_update_guest_debug(cpu, 0);
2337        if (err) {
2338            return err;
2339        }
2340    }
2341    return 0;
2342}
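
/*
 * Editor's note (sketch, not from the upstream file): software breakpoints
 * are reference counted per guest address, so repeated insertions at the
 * same pc patch the guest text only once:
 *
 *     kvm_insert_breakpoint(cpu, pc, 1, GDB_BREAKPOINT_SW);  // use_count = 1
 *     kvm_insert_breakpoint(cpu, pc, 1, GDB_BREAKPOINT_SW);  // use_count = 2
 *     kvm_remove_breakpoint(cpu, pc, 1, GDB_BREAKPOINT_SW);  // still armed
 *     kvm_remove_breakpoint(cpu, pc, 1, GDB_BREAKPOINT_SW);  // original insn
 *                                                            // restored
 */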
2343
2344void kvm_remove_all_breakpoints(CPUState *cpu)
2345{
2346    struct kvm_sw_breakpoint *bp, *next;
2347    KVMState *s = cpu->kvm_state;
2348    CPUState *tmpcpu;
2349
2350    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
2351        if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
2352            /* Try harder to find a CPU that currently sees the breakpoint. */
2353            CPU_FOREACH(tmpcpu) {
2354                if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
2355                    break;
2356                }
2357            }
2358        }
2359        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
2360        g_free(bp);
2361    }
2362    kvm_arch_remove_all_hw_breakpoints();
2363
2364    CPU_FOREACH(cpu) {
2365        kvm_update_guest_debug(cpu, 0);
2366    }
2367}
2368
2369#else /* !KVM_CAP_SET_GUEST_DEBUG */
2370
2371int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
2372{
2373    return -EINVAL;
2374}
2375
2376int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
2377                          target_ulong len, int type)
2378{
2379    return -EINVAL;
2380}
2381
2382int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
2383                          target_ulong len, int type)
2384{
2385    return -EINVAL;
2386}
2387
2388void kvm_remove_all_breakpoints(CPUState *cpu)
2389{
2390}
2391#endif /* !KVM_CAP_SET_GUEST_DEBUG */
2392
2393static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
2394{
2395    KVMState *s = kvm_state;
2396    struct kvm_signal_mask *sigmask;
2397    int r;
2398
2399    sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));
2400
2401    sigmask->len = s->sigmask_len;
2402    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
2403    r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
2404    g_free(sigmask);
2405
2406    return r;
2407}
2408
2409static void kvm_ipi_signal(int sig)
2410{
2411    if (current_cpu) {
2412        assert(kvm_immediate_exit);
2413        kvm_cpu_kick(current_cpu);
2414    }
2415}
2416
2417void kvm_init_cpu_signals(CPUState *cpu)
2418{
2419    int r;
2420    sigset_t set;
2421    struct sigaction sigact;
2422
2423    memset(&sigact, 0, sizeof(sigact));
2424    sigact.sa_handler = kvm_ipi_signal;
2425    sigaction(SIG_IPI, &sigact, NULL);
2426
2427    pthread_sigmask(SIG_BLOCK, NULL, &set);
2428#if defined KVM_HAVE_MCE_INJECTION
2429    sigdelset(&set, SIGBUS);
2430    pthread_sigmask(SIG_SETMASK, &set, NULL);
2431#endif
2432    sigdelset(&set, SIG_IPI);
2433    if (kvm_immediate_exit) {
2434        r = pthread_sigmask(SIG_SETMASK, &set, NULL);
2435    } else {
2436        r = kvm_set_signal_mask(cpu, &set);
2437    }
2438    if (r) {
2439        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
2440        exit(1);
2441    }
2442}
2443
2444/* Called asynchronously in VCPU thread.  */
2445int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
2446{
2447#ifdef KVM_HAVE_MCE_INJECTION
2448    if (have_sigbus_pending) {
2449        return 1;
2450    }
2451    have_sigbus_pending = true;
2452    pending_sigbus_addr = addr;
2453    pending_sigbus_code = code;
2454    atomic_set(&cpu->exit_request, 1);
2455    return 0;
2456#else
2457    return 1;
2458#endif
2459}
2460
2461/* Called synchronously (via signalfd) in main thread.  */
2462int kvm_on_sigbus(int code, void *addr)
2463{
2464#ifdef KVM_HAVE_MCE_INJECTION
2465    /* An action-required MCE kills the process if SIGBUS is blocked, and
2466     * SIGBUS is blocked in the I/O thread, where we handle MCE via signalfd.
2467     * Therefore only action-optional MCEs can reach this path.
2468     */
2469    assert(code != BUS_MCEERR_AR);
2470    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
2471    return 0;
2472#else
2473    return 1;
2474#endif
2475}
2476
2477int kvm_create_device(KVMState *s, uint64_t type, bool test)
2478{
2479    int ret;
2480    struct kvm_create_device create_dev;
2481
2482    create_dev.type = type;
2483    create_dev.fd = -1;
2484    create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;
2485
2486    if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
2487        return -ENOTSUP;
2488    }
2489
2490    ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
2491    if (ret) {
2492        return ret;
2493    }
2494
2495    return test ? 0 : create_dev.fd;
2496}
2497
2498bool kvm_device_supported(int vmfd, uint64_t type)
2499{
2500    struct kvm_create_device create_dev = {
2501        .type = type,
2502        .fd = -1,
2503        .flags = KVM_CREATE_DEVICE_TEST,
2504    };
2505
2506    if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
2507        return false;
2508    }
2509
2510    return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
2511}
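
/*
 * Editor's sketch (illustrative device type, not from the upstream file):
 * callers usually probe with test=true (or kvm_device_supported()) before
 * creating a device for real, because a successful non-test call hands back
 * a file descriptor the caller then owns:
 *
 *     if (kvm_create_device(s, KVM_DEV_TYPE_VFIO, true) == 0) {
 *         int dev_fd = kvm_create_device(s, KVM_DEV_TYPE_VFIO, false);
 *         // configure the device via kvm_device_access(dev_fd, ...)
 *     }
 */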
2512
2513int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
2514{
2515    struct kvm_one_reg reg;
2516    int r;
2517
2518    reg.id = id;
2519    reg.addr = (uintptr_t) source;
2520    r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
2521    if (r) {
2522        trace_kvm_failed_reg_set(id, strerror(-r));
2523    }
2524    return r;
2525}
2526
2527int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
2528{
2529    struct kvm_one_reg reg;
2530    int r;
2531
2532    reg.id = id;
2533    reg.addr = (uintptr_t) target;
2534    r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
2535    if (r) {
2536        trace_kvm_failed_reg_get(id, strerror(-r));
2537    }
2538    return r;
2539}
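
/*
 * Editor's sketch (hypothetical register ID, not from the upstream file):
 * the ONE_REG interface names a register with a 64-bit architecture-specific
 * ID (which also encodes the register's size) and copies the value through a
 * caller-supplied buffer:
 *
 *     uint64_t val;
 *     if (kvm_get_one_reg(cs, some_arch_reg_id, &val) == 0) {
 *         val |= 1;
 *         kvm_set_one_reg(cs, some_arch_reg_id, &val);
 *     }
 */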
2540
2541static void kvm_accel_class_init(ObjectClass *oc, void *data)
2542{
2543    AccelClass *ac = ACCEL_CLASS(oc);
2544    ac->name = "KVM";
2545    ac->init_machine = kvm_init;
2546    ac->allowed = &kvm_allowed;
2547}
2548
2549static const TypeInfo kvm_accel_type = {
2550    .name = TYPE_KVM_ACCEL,
2551    .parent = TYPE_ACCEL,
2552    .class_init = kvm_accel_class_init,
2553    .instance_size = sizeof(KVMState),
2554};
2555
2556static void kvm_type_init(void)
2557{
2558    type_register_static(&kvm_accel_type);
2559}
2560
2561type_init(kvm_type_init);
2562