qemu/accel/kvm/kvm-all.c
/*
 * QEMU KVM support
 *
 * Copyright IBM, Corp. 2008
 *           Red Hat, Inc. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *  Glauber Costa     <gcosta@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include <sys/ioctl.h>

#include <linux/kvm.h>

#include "qemu-common.h"
#include "qemu/atomic.h"
#include "qemu/option.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "hw/hw.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/s390x/adapter.h"
#include "exec/gdbstub.h"
#include "sysemu/kvm_int.h"
#include "sysemu/cpus.h"
#include "qemu/bswap.h"
#include "exec/memory.h"
#include "exec/ram_addr.h"
#include "exec/address-spaces.h"
#include "qemu/event_notifier.h"
#include "trace.h"
#include "hw/irq.h"
#include "sysemu/sev.h"

#include "hw/boards.h"

/* This check must be after config-host.h is included */
#ifdef CONFIG_EVENTFD
#include <sys/eventfd.h>
#endif

/* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
 * need to use the real host PAGE_SIZE, as that's what KVM will use.
 */
#define PAGE_SIZE getpagesize()

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

#define KVM_MSI_HASHTAB_SIZE    256

struct KVMParkedVcpu {
    unsigned long vcpu_id;
    int kvm_fd;
    QLIST_ENTRY(KVMParkedVcpu) node;
};

struct KVMState
{
    AccelState parent_obj;

    int nr_slots;
    int fd;
    int vmfd;
    int coalesced_mmio;
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
    bool coalesced_flush_in_progress;
    int vcpu_events;
    int robust_singlestep;
    int debugregs;
#ifdef KVM_CAP_SET_GUEST_DEBUG
    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
#endif
    int many_ioeventfds;
    int intx_set_mask;
    bool sync_mmu;
    /* The man page (and POSIX) say ioctl numbers are signed int, but
     * they're not.  Linux, glibc and *BSD all treat ioctl numbers as
     * unsigned, and treating them as signed here can break things. */
    unsigned irq_set_ioctl;
    unsigned int sigmask_len;
    GHashTable *gsimap;
#ifdef KVM_CAP_IRQ_ROUTING
    struct kvm_irq_routing *irq_routes;
    int nr_allocated_irq_routes;
    unsigned long *used_gsi_bitmap;
    unsigned int gsi_count;
    QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
#endif
    KVMMemoryListener memory_listener;
    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;

    /* memory encryption */
    void *memcrypt_handle;
    int (*memcrypt_encrypt_data)(void *handle, uint8_t *ptr, uint64_t len);
};

KVMState *kvm_state;
bool kvm_kernel_irqchip;
bool kvm_split_irqchip;
bool kvm_async_interrupts_allowed;
bool kvm_halt_in_kernel_allowed;
bool kvm_eventfds_allowed;
bool kvm_irqfds_allowed;
bool kvm_resamplefds_allowed;
bool kvm_msi_via_irqfd_allowed;
bool kvm_gsi_routing_allowed;
bool kvm_gsi_direct_mapping;
bool kvm_allowed;
bool kvm_readonly_mem_allowed;
bool kvm_vm_attributes_allowed;
bool kvm_direct_msi_allowed;
bool kvm_ioeventfd_any_length_allowed;
bool kvm_msi_use_devid;
static bool kvm_immediate_exit;

static const KVMCapabilityInfo kvm_required_capabilites[] = {
    KVM_CAP_INFO(USER_MEMORY),
    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
    KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
    KVM_CAP_LAST_INFO
};

int kvm_get_max_memslots(void)
{
    KVMState *s = KVM_STATE(current_machine->accelerator);

    return s->nr_slots;
}

bool kvm_memcrypt_enabled(void)
{
    if (kvm_state && kvm_state->memcrypt_handle) {
        return true;
    }

    return false;
}

int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len)
{
    if (kvm_state->memcrypt_handle &&
        kvm_state->memcrypt_encrypt_data) {
        return kvm_state->memcrypt_encrypt_data(kvm_state->memcrypt_handle,
                                              ptr, len);
    }

    return 1;
}

static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
{
    KVMState *s = kvm_state;
    int i;

    for (i = 0; i < s->nr_slots; i++) {
        if (kml->slots[i].memory_size == 0) {
            return &kml->slots[i];
        }
    }

    return NULL;
}

bool kvm_has_free_slot(MachineState *ms)
{
    KVMState *s = KVM_STATE(ms->accelerator);

    return kvm_get_free_slot(&s->memory_listener);
}

static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
{
    KVMSlot *slot = kvm_get_free_slot(kml);

    if (slot) {
        return slot;
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}

static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
                                         hwaddr start_addr,
                                         hwaddr size)
{
    KVMState *s = kvm_state;
    int i;

    for (i = 0; i < s->nr_slots; i++) {
        KVMSlot *mem = &kml->slots[i];

        if (start_addr == mem->start_addr && size == mem->memory_size) {
            return mem;
        }
    }

    return NULL;
}

/*
 * Calculate and align the start address and the size of the section.
 * Return the size. If the size is 0, the aligned section is empty.
 */
static hwaddr kvm_align_section(MemoryRegionSection *section,
                                hwaddr *start)
{
    hwaddr size = int128_get64(section->size);
    hwaddr delta, aligned;

    /* kvm works in page size chunks, but the function may be called
       with sub-page size and unaligned start address. Pad the start
       address to the next page boundary and truncate the size to the
       previous one. */
    aligned = ROUND_UP(section->offset_within_address_space,
                       qemu_real_host_page_size);
    delta = aligned - section->offset_within_address_space;
    *start = aligned;
    if (delta > size) {
        return 0;
    }

    return (size - delta) & qemu_real_host_page_mask;
}
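
/*
 * Example, assuming 4 KiB host pages: for a section covering
 * [0x1234, 0x4234), the aligned start is 0x2000 (so delta = 0xdcc) and
 * the returned size is (0x3000 - 0xdcc) & ~0xfff = 0x2000; only the
 * fully covered pages [0x2000, 0x4000) are handed to KVM.
 */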

int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
                                       hwaddr *phys_addr)
{
    KVMMemoryListener *kml = &s->memory_listener;
    int i;

    for (i = 0; i < s->nr_slots; i++) {
        KVMSlot *mem = &kml->slots[i];

        if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
            *phys_addr = mem->start_addr + (ram - mem->ram);
            return 1;
        }
    }

    return 0;
}

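/*
 * In the kernel ABI the memslot number encodes the KVM address space id
 * in its upper 16 bits (hence "slot | (as_id << 16)" below).  Note also
 * that toggling KVM_MEM_READONLY on an existing slot requires deleting
 * the slot first (KVM commit 75d61fbc), which the zero-sized
 * KVM_SET_USER_MEMORY_REGION call below takes care of.
 */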
static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new)
{
    KVMState *s = kvm_state;
    struct kvm_userspace_memory_region mem;
    int ret;

    mem.slot = slot->slot | (kml->as_id << 16);
    mem.guest_phys_addr = slot->start_addr;
    mem.userspace_addr = (unsigned long)slot->ram;
    mem.flags = slot->flags;

    if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) {
        /* Set the slot size to 0 before setting the slot to the desired
         * value. This is needed based on KVM commit 75d61fbc. */
        mem.memory_size = 0;
        kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
    }
    mem.memory_size = slot->memory_size;
    ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
    slot->old_flags = mem.flags;
    trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr,
                              mem.memory_size, mem.userspace_addr, ret);
    return ret;
}

int kvm_destroy_vcpu(CPUState *cpu)
{
    KVMState *s = kvm_state;
    long mmap_size;
    struct KVMParkedVcpu *vcpu = NULL;
    int ret = 0;

    DPRINTF("kvm_destroy_vcpu\n");

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        goto err;
    }

    ret = munmap(cpu->kvm_run, mmap_size);
    if (ret < 0) {
        goto err;
    }

    vcpu = g_malloc0(sizeof(*vcpu));
    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
    vcpu->kvm_fd = cpu->kvm_fd;
    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
err:
    return ret;
}

static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
{
    struct KVMParkedVcpu *cpu;

    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
        if (cpu->vcpu_id == vcpu_id) {
            int kvm_fd;

            QLIST_REMOVE(cpu, node);
            kvm_fd = cpu->kvm_fd;
            g_free(cpu);
            return kvm_fd;
        }
    }

    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
}

int kvm_init_vcpu(CPUState *cpu)
{
    KVMState *s = kvm_state;
    long mmap_size;
    int ret;

    DPRINTF("kvm_init_vcpu\n");

    ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
    if (ret < 0) {
        DPRINTF("kvm_create_vcpu failed\n");
        goto err;
    }

    cpu->kvm_fd = ret;
    cpu->kvm_state = s;
    cpu->vcpu_dirty = true;

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        goto err;
    }

    cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        cpu->kvm_fd, 0);
    if (cpu->kvm_run == MAP_FAILED) {
        ret = -errno;
        DPRINTF("mmap'ing vcpu state failed\n");
        goto err;
    }

    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring =
            (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
    }

    ret = kvm_arch_init_vcpu(cpu);
err:
    return ret;
}

/*
 * dirty pages logging control
 */

static int kvm_mem_flags(MemoryRegion *mr)
{
    bool readonly = mr->readonly || memory_region_is_romd(mr);
    int flags = 0;

    if (memory_region_get_dirty_log_mask(mr) != 0) {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    if (readonly && kvm_readonly_mem_allowed) {
        flags |= KVM_MEM_READONLY;
    }
    return flags;
}

static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
                                 MemoryRegion *mr)
{
    mem->flags = kvm_mem_flags(mr);

    /* If nothing changed effectively, no need to issue ioctl */
    if (mem->flags == mem->old_flags) {
        return 0;
    }

    return kvm_set_user_memory_region(kml, mem, false);
}

static int kvm_section_update_flags(KVMMemoryListener *kml,
                                    MemoryRegionSection *section)
{
    hwaddr start_addr, size;
    KVMSlot *mem;

    size = kvm_align_section(section, &start_addr);
    if (!size) {
        return 0;
    }

    mem = kvm_lookup_matching_slot(kml, start_addr, size);
    if (!mem) {
        /* We don't have a slot if we want to trap every access. */
        return 0;
    }

    return kvm_slot_update_flags(kml, mem, section->mr);
}

static void kvm_log_start(MemoryListener *listener,
                          MemoryRegionSection *section,
                          int old, int new)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    int r;

    if (old != 0) {
        return;
    }

    r = kvm_section_update_flags(kml, section);
    if (r < 0) {
        abort();
    }
}

static void kvm_log_stop(MemoryListener *listener,
                          MemoryRegionSection *section,
                          int old, int new)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    int r;

    if (new != 0) {
        return;
    }

    r = kvm_section_update_flags(kml, section);
    if (r < 0) {
        abort();
    }
}

/* get kvm's dirty pages bitmap and update qemu's */
static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section,
                                         unsigned long *bitmap)
{
    ram_addr_t start = section->offset_within_region +
                       memory_region_get_ram_addr(section->mr);
    ram_addr_t pages = int128_get64(section->size) / getpagesize();

    cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages);
    return 0;
}

#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
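/* Round x up to the next multiple of y; y must be a power of two,
 * e.g. ALIGN(130, 64) == 192. */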

/**
 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
 *
 * This function updates qemu's dirty bitmap using
 * cpu_physical_memory_set_dirty_lebitmap().  This means all bits are set
 * to dirty.
 *
 * @kml: the KVM memory listener object
 * @section: the memory section to sync the dirty bitmap with
 */
static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
                                          MemoryRegionSection *section)
{
    KVMState *s = kvm_state;
    struct kvm_dirty_log d = {};
    KVMSlot *mem;
    hwaddr start_addr, size;

    size = kvm_align_section(section, &start_addr);
    if (size) {
        mem = kvm_lookup_matching_slot(kml, start_addr, size);
        if (!mem) {
            /* We don't have a slot if we want to trap every access. */
            return 0;
        }

        /* XXX bad kernel interface alert
         * For the dirty bitmap, the kernel allocates an array of size
         * aligned to bits-per-long.  But when the kernel is 64 bits and
         * userspace is 32 bits, userspace can't align to the same
         * bits-per-long, since sizeof(long) differs between kernel and
         * user space.  Userspace would then provide a buffer that may be
         * 4 bytes smaller than what the kernel uses, resulting in
         * userspace memory corruption (which is not detectable, in most
         * cases, even by valgrind).
         * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
         * the hope that sizeof(long) won't become >8 any time soon.
         */
        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
                     /*HOST_LONG_BITS*/ 64) / 8;
        d.dirty_bitmap = g_malloc0(size);

        d.slot = mem->slot | (kml->as_id << 16);
        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
            DPRINTF("ioctl failed %d\n", errno);
            g_free(d.dirty_bitmap);
            return -1;
        }

        kvm_get_dirty_pages_log_range(section, d.dirty_bitmap);
        g_free(d.dirty_bitmap);
    }

    return 0;
}
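
/*
 * Sizing example, assuming 4 KiB target pages: a 1 MiB slot covers 256
 * pages, so ALIGN(256, 64) / 8 = 32 bytes of bitmap are allocated and
 * passed to KVM_GET_DIRTY_LOG above.
 */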

static void kvm_coalesce_mmio_region(MemoryListener *listener,
                                     MemoryRegionSection *section,
                                     hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pad = 0;

        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }
}

static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
                                       MemoryRegionSection *section,
                                       hwaddr start, hwaddr size)
{
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;
        zone.pad = 0;

        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }
}

int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        ret = 0;
    }

    return ret;
}

int kvm_vm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        /* VM wide version not implemented, use global one instead */
        ret = kvm_check_extension(s, extension);
    }

    return ret;
}

static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
{
#if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN)
    /* The kernel expects ioeventfd values in HOST_WORDS_BIGENDIAN
     * endianness, but the memory core hands them in target endianness.
     * For example, PPC is always treated as big-endian even if running
     * on KVM and on PPC64LE.  Correct here.
     */
    switch (size) {
    case 2:
        val = bswap16(val);
        break;
    case 4:
        val = bswap32(val);
        break;
    }
#endif
    return val;
}
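
/*
 * Example for adjust_ioeventfd_endianness(): with a big-endian target on
 * a little-endian host (or vice versa), a 16-bit datamatch value of
 * 0x0102 is byte-swapped to 0x0201, i.e. into the host byte order the
 * kernel expects.
 */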

static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
                                  bool assign, uint32_t size, bool datamatch)
{
    int ret;
    struct kvm_ioeventfd iofd = {
        .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
        .addr = addr,
        .len = size,
        .flags = 0,
        .fd = fd,
    };

    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (datamatch) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
    }
    if (!assign) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);

    if (ret < 0) {
        return -errno;
    }

    return 0;
}

static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
                                 bool assign, uint32_t size, bool datamatch)
{
    struct kvm_ioeventfd kick = {
        .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
        .addr = addr,
        .flags = KVM_IOEVENTFD_FLAG_PIO,
        .len = size,
        .fd = fd,
    };
    int r;
    if (!kvm_enabled()) {
        return -ENOSYS;
    }
    if (datamatch) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
    }
    if (!assign) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
    if (r < 0) {
        return r;
    }
    return 0;
}


static int kvm_check_many_ioeventfds(void)
{
    /* Userspace can use ioeventfd for io notification.  This requires a host
     * that supports eventfd(2) and an I/O thread; since eventfd does not
     * support SIGIO it cannot interrupt the vcpu.
     *
     * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
     * can avoid creating too many ioeventfds.
     */
#if defined(CONFIG_EVENTFD)
    int ioeventfds[7];
    int i, ret = 0;
    for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) {
        ioeventfds[i] = eventfd(0, EFD_CLOEXEC);
        if (ioeventfds[i] < 0) {
            break;
        }
        ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true);
        if (ret < 0) {
            close(ioeventfds[i]);
            break;
        }
    }

    /* Decide whether many devices are supported or not */
    ret = i == ARRAY_SIZE(ioeventfds);

    while (i-- > 0) {
        kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true);
        close(ioeventfds[i]);
    }
    return ret;
#else
    return 0;
#endif
}

static const KVMCapabilityInfo *
kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
{
    while (list->name) {
        if (!kvm_check_extension(s, list->value)) {
            return list;
        }
        list++;
    }
    return NULL;
}

static void kvm_set_phys_mem(KVMMemoryListener *kml,
                             MemoryRegionSection *section, bool add)
{
    KVMSlot *mem;
    int err;
    MemoryRegion *mr = section->mr;
    bool writeable = !mr->readonly && !mr->rom_device;
    hwaddr start_addr, size;
    void *ram;

    if (!memory_region_is_ram(mr)) {
        if (writeable || !kvm_readonly_mem_allowed) {
            return;
        } else if (!mr->romd_mode) {
            /* If the memory device is not in romd_mode, then we actually want
             * to remove the kvm memory slot so all accesses will trap. */
            add = false;
        }
    }

    size = kvm_align_section(section, &start_addr);
    if (!size) {
        return;
    }

    /* use aligned delta to align the ram address */
    ram = memory_region_get_ram_ptr(mr) + section->offset_within_region +
          (start_addr - section->offset_within_address_space);

    if (!add) {
        mem = kvm_lookup_matching_slot(kml, start_addr, size);
        if (!mem) {
            return;
        }
        if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
            kvm_physical_sync_dirty_bitmap(kml, section);
        }

        /* unregister the slot */
        mem->memory_size = 0;
        mem->flags = 0;
        err = kvm_set_user_memory_region(kml, mem, false);
        if (err) {
            fprintf(stderr, "%s: error unregistering slot: %s\n",
                    __func__, strerror(-err));
            abort();
        }
        return;
    }

    /* register the new slot */
    mem = kvm_alloc_slot(kml);
    mem->memory_size = size;
    mem->start_addr = start_addr;
    mem->ram = ram;
    mem->flags = kvm_mem_flags(mr);

    err = kvm_set_user_memory_region(kml, mem, true);
    if (err) {
        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                strerror(-err));
        abort();
    }
}

static void kvm_region_add(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);

    memory_region_ref(section->mr);
    kvm_set_phys_mem(kml, section, true);
}

static void kvm_region_del(MemoryListener *listener,
                           MemoryRegionSection *section)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);

    kvm_set_phys_mem(kml, section, false);
    memory_region_unref(section->mr);
}

static void kvm_log_sync(MemoryListener *listener,
                         MemoryRegionSection *section)
{
    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
    int r;

    r = kvm_physical_sync_dirty_bitmap(kml, section);
    if (r < 0) {
        abort();
    }
}

static void kvm_mem_ioeventfd_add(MemoryListener *listener,
                                  MemoryRegionSection *section,
                                  bool match_data, uint64_t data,
                                  EventNotifier *e)
{
    int fd = event_notifier_get_fd(e);
    int r;

    r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
                               data, true, int128_get64(section->size),
                               match_data);
    if (r < 0) {
        fprintf(stderr, "%s: error adding ioeventfd: %s\n",
                __func__, strerror(-r));
        abort();
    }
}

static void kvm_mem_ioeventfd_del(MemoryListener *listener,
                                  MemoryRegionSection *section,
                                  bool match_data, uint64_t data,
                                  EventNotifier *e)
{
    int fd = event_notifier_get_fd(e);
    int r;

    r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
                               data, false, int128_get64(section->size),
                               match_data);
    if (r < 0) {
        abort();
    }
}

static void kvm_io_ioeventfd_add(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 bool match_data, uint64_t data,
                                 EventNotifier *e)
{
    int fd = event_notifier_get_fd(e);
    int r;

    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
                              data, true, int128_get64(section->size),
                              match_data);
    if (r < 0) {
        fprintf(stderr, "%s: error adding ioeventfd: %s\n",
                __func__, strerror(-r));
        abort();
    }
}

static void kvm_io_ioeventfd_del(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 bool match_data, uint64_t data,
                                 EventNotifier *e)
{
    int fd = event_notifier_get_fd(e);
    int r;

    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
                              data, false, int128_get64(section->size),
                              match_data);
    if (r < 0) {
        abort();
    }
}

void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
                                  AddressSpace *as, int as_id)
{
    int i;

    kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot));
    kml->as_id = as_id;

    for (i = 0; i < s->nr_slots; i++) {
        kml->slots[i].slot = i;
    }

    kml->listener.region_add = kvm_region_add;
    kml->listener.region_del = kvm_region_del;
    kml->listener.log_start = kvm_log_start;
    kml->listener.log_stop = kvm_log_stop;
    kml->listener.log_sync = kvm_log_sync;
    kml->listener.priority = 10;

    memory_listener_register(&kml->listener, as);
}

static MemoryListener kvm_io_listener = {
    .eventfd_add = kvm_io_ioeventfd_add,
    .eventfd_del = kvm_io_ioeventfd_del,
    .priority = 10,
};

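/*
 * kvm_set_irq() can report whether the interrupt was delivered or
 * coalesced only when the kernel supports KVM_IRQ_LINE_STATUS (selected
 * in kvm_init()); with the older KVM_IRQ_LINE ioctl, a successful
 * injection is simply reported as "delivered" (1).
 */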
int kvm_set_irq(KVMState *s, int irq, int level)
{
    struct kvm_irq_level event;
    int ret;

    assert(kvm_async_interrupts_enabled());

    event.level = level;
    event.irq = irq;
    ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
    if (ret < 0) {
        perror("kvm_set_irq");
        abort();
    }

    return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
}

#ifdef KVM_CAP_IRQ_ROUTING
typedef struct KVMMSIRoute {
    struct kvm_irq_routing_entry kroute;
    QTAILQ_ENTRY(KVMMSIRoute) entry;
} KVMMSIRoute;

static void set_gsi(KVMState *s, unsigned int gsi)
{
    set_bit(gsi, s->used_gsi_bitmap);
}

static void clear_gsi(KVMState *s, unsigned int gsi)
{
    clear_bit(gsi, s->used_gsi_bitmap);
}

void kvm_init_irq_routing(KVMState *s)
{
    int gsi_count, i;

    gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
    if (gsi_count > 0) {
        /* Round up so we can search ints using ffs */
        s->used_gsi_bitmap = bitmap_new(gsi_count);
        s->gsi_count = gsi_count;
    }

    s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
    s->nr_allocated_irq_routes = 0;

    if (!kvm_direct_msi_allowed) {
        for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
            QTAILQ_INIT(&s->msi_hashtab[i]);
        }
    }

    kvm_arch_init_irq_routing(s);
}

void kvm_irqchip_commit_routes(KVMState *s)
{
    int ret;

    if (kvm_gsi_direct_mapping()) {
        return;
    }

    if (!kvm_gsi_routing_enabled()) {
        return;
    }

    s->irq_routes->flags = 0;
    trace_kvm_irqchip_commit_routes();
    ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
    assert(ret == 0);
}

static void kvm_add_routing_entry(KVMState *s,
                                  struct kvm_irq_routing_entry *entry)
{
    struct kvm_irq_routing_entry *new;
    int n, size;

    if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
        n = s->nr_allocated_irq_routes * 2;
        if (n < 64) {
            n = 64;
        }
        size = sizeof(struct kvm_irq_routing);
        size += n * sizeof(*new);
        s->irq_routes = g_realloc(s->irq_routes, size);
        s->nr_allocated_irq_routes = n;
    }
    n = s->irq_routes->nr++;
    new = &s->irq_routes->entries[n];

    *new = *entry;

    set_gsi(s, entry->gsi);
}

static int kvm_update_routing_entry(KVMState *s,
                                    struct kvm_irq_routing_entry *new_entry)
{
    struct kvm_irq_routing_entry *entry;
    int n;

    for (n = 0; n < s->irq_routes->nr; n++) {
        entry = &s->irq_routes->entries[n];
        if (entry->gsi != new_entry->gsi) {
            continue;
        }

        if (!memcmp(entry, new_entry, sizeof(*entry))) {
            return 0;
        }

        *entry = *new_entry;

        return 0;
    }

    return -ESRCH;
}

void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
{
    struct kvm_irq_routing_entry e = {};

    assert(pin < s->gsi_count);

    e.gsi = irq;
    e.type = KVM_IRQ_ROUTING_IRQCHIP;
    e.flags = 0;
    e.u.irqchip.irqchip = irqchip;
    e.u.irqchip.pin = pin;
    kvm_add_routing_entry(s, &e);
}

void kvm_irqchip_release_virq(KVMState *s, int virq)
{
    struct kvm_irq_routing_entry *e;
    int i;

    if (kvm_gsi_direct_mapping()) {
        return;
    }

    for (i = 0; i < s->irq_routes->nr; i++) {
        e = &s->irq_routes->entries[i];
        if (e->gsi == virq) {
            s->irq_routes->nr--;
            *e = s->irq_routes->entries[s->irq_routes->nr];
        }
    }
    clear_gsi(s, virq);
    kvm_arch_release_virq_post(virq);
    trace_kvm_irqchip_release_virq(virq);
}

static unsigned int kvm_hash_msi(uint32_t data)
{
    /* This is optimized for IA32 MSI layout. However, no other arch shall
     * repeat the mistake of not providing a direct MSI injection API. */
    return data & 0xff;
}

static void kvm_flush_dynamic_msi_routes(KVMState *s)
{
    KVMMSIRoute *route, *next;
    unsigned int hash;

    for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
        QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
            kvm_irqchip_release_virq(s, route->kroute.gsi);
            QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
            g_free(route);
        }
    }
}

static int kvm_irqchip_get_virq(KVMState *s)
{
    int next_virq;

    /*
     * PIC and IOAPIC share the first 16 GSI numbers, thus the available
     * GSI numbers are more than the number of IRQ routes. Allocating a GSI
     * number can succeed even though a new route entry cannot be added.
     * When this happens, flush dynamic MSI entries to free IRQ route entries.
     */
    if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) {
        kvm_flush_dynamic_msi_routes(s);
    }

    /* Return the lowest unused GSI in the bitmap */
    next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);
    if (next_virq >= s->gsi_count) {
        return -ENOSPC;
    } else {
        return next_virq;
    }
}

static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg)
{
    unsigned int hash = kvm_hash_msi(msg.data);
    KVMMSIRoute *route;

    QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) {
        if (route->kroute.u.msi.address_lo == (uint32_t)msg.address &&
            route->kroute.u.msi.address_hi == (msg.address >> 32) &&
            route->kroute.u.msi.data == le32_to_cpu(msg.data)) {
            return route;
        }
    }
    return NULL;
}

int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
{
    struct kvm_msi msi;
    KVMMSIRoute *route;

    if (kvm_direct_msi_allowed) {
        msi.address_lo = (uint32_t)msg.address;
        msi.address_hi = msg.address >> 32;
        msi.data = le32_to_cpu(msg.data);
        msi.flags = 0;
        memset(msi.pad, 0, sizeof(msi.pad));

        return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
    }

    route = kvm_lookup_msi_route(s, msg);
    if (!route) {
        int virq;

        virq = kvm_irqchip_get_virq(s);
        if (virq < 0) {
            return virq;
        }

        route = g_malloc0(sizeof(KVMMSIRoute));
        route->kroute.gsi = virq;
        route->kroute.type = KVM_IRQ_ROUTING_MSI;
        route->kroute.flags = 0;
        route->kroute.u.msi.address_lo = (uint32_t)msg.address;
        route->kroute.u.msi.address_hi = msg.address >> 32;
        route->kroute.u.msi.data = le32_to_cpu(msg.data);

        kvm_add_routing_entry(s, &route->kroute);
        kvm_irqchip_commit_routes(s);

        QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
                           entry);
    }

    assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);

    return kvm_set_irq(s, route->kroute.gsi, 1);
}

int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
{
    struct kvm_irq_routing_entry kroute = {};
    int virq;
    MSIMessage msg = {0, 0};

    if (pci_available && dev) {
        msg = pci_get_msi_message(dev, vector);
    }

    if (kvm_gsi_direct_mapping()) {
        return kvm_arch_msi_data_to_gsi(msg.data);
    }

    if (!kvm_gsi_routing_enabled()) {
        return -ENOSYS;
    }

    virq = kvm_irqchip_get_virq(s);
    if (virq < 0) {
        return virq;
    }

    kroute.gsi = virq;
    kroute.type = KVM_IRQ_ROUTING_MSI;
    kroute.flags = 0;
    kroute.u.msi.address_lo = (uint32_t)msg.address;
    kroute.u.msi.address_hi = msg.address >> 32;
    kroute.u.msi.data = le32_to_cpu(msg.data);
    if (pci_available && kvm_msi_devid_required()) {
        kroute.flags = KVM_MSI_VALID_DEVID;
        kroute.u.msi.devid = pci_requester_id(dev);
    }
    if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
        kvm_irqchip_release_virq(s, virq);
        return -EINVAL;
    }

    trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
                                    vector, virq);

    kvm_add_routing_entry(s, &kroute);
    kvm_arch_add_msi_route_post(&kroute, vector, dev);
    kvm_irqchip_commit_routes(s);

    return virq;
}

int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
                                 PCIDevice *dev)
{
    struct kvm_irq_routing_entry kroute = {};

    if (kvm_gsi_direct_mapping()) {
        return 0;
    }

    if (!kvm_irqchip_in_kernel()) {
        return -ENOSYS;
    }

    kroute.gsi = virq;
    kroute.type = KVM_IRQ_ROUTING_MSI;
    kroute.flags = 0;
    kroute.u.msi.address_lo = (uint32_t)msg.address;
    kroute.u.msi.address_hi = msg.address >> 32;
    kroute.u.msi.data = le32_to_cpu(msg.data);
    if (pci_available && kvm_msi_devid_required()) {
        kroute.flags = KVM_MSI_VALID_DEVID;
        kroute.u.msi.devid = pci_requester_id(dev);
    }
    if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
        return -EINVAL;
    }

    trace_kvm_irqchip_update_msi_route(virq);

    return kvm_update_routing_entry(s, &kroute);
}

static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq,
                                    bool assign)
{
    struct kvm_irqfd irqfd = {
        .fd = fd,
        .gsi = virq,
        .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
    };

    if (rfd != -1) {
        irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
        irqfd.resamplefd = rfd;
    }

    if (!kvm_irqfds_enabled()) {
        return -ENOSYS;
    }

    return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
}

int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
{
    struct kvm_irq_routing_entry kroute = {};
    int virq;

    if (!kvm_gsi_routing_enabled()) {
        return -ENOSYS;
    }

    virq = kvm_irqchip_get_virq(s);
    if (virq < 0) {
        return virq;
    }

    kroute.gsi = virq;
    kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER;
    kroute.flags = 0;
    kroute.u.adapter.summary_addr = adapter->summary_addr;
    kroute.u.adapter.ind_addr = adapter->ind_addr;
    kroute.u.adapter.summary_offset = adapter->summary_offset;
    kroute.u.adapter.ind_offset = adapter->ind_offset;
    kroute.u.adapter.adapter_id = adapter->adapter_id;

    kvm_add_routing_entry(s, &kroute);

    return virq;
}

int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
{
    struct kvm_irq_routing_entry kroute = {};
    int virq;

    if (!kvm_gsi_routing_enabled()) {
        return -ENOSYS;
    }
    if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) {
        return -ENOSYS;
    }
    virq = kvm_irqchip_get_virq(s);
    if (virq < 0) {
        return virq;
    }

    kroute.gsi = virq;
    kroute.type = KVM_IRQ_ROUTING_HV_SINT;
    kroute.flags = 0;
    kroute.u.hv_sint.vcpu = vcpu;
    kroute.u.hv_sint.sint = sint;

    kvm_add_routing_entry(s, &kroute);
    kvm_irqchip_commit_routes(s);

    return virq;
}

#else /* !KVM_CAP_IRQ_ROUTING */

void kvm_init_irq_routing(KVMState *s)
{
}

void kvm_irqchip_release_virq(KVMState *s, int virq)
{
}

int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
{
    abort();
}

int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
{
    return -ENOSYS;
}

int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
{
    return -ENOSYS;
}

int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
{
    return -ENOSYS;
}

static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign)
{
    abort();
}

int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
{
    return -ENOSYS;
}
#endif /* !KVM_CAP_IRQ_ROUTING */

int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
                                       EventNotifier *rn, int virq)
{
    return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n),
           rn ? event_notifier_get_fd(rn) : -1, virq, true);
}

int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
                                          int virq)
{
    return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq,
           false);
}

int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
                                   EventNotifier *rn, qemu_irq irq)
{
    gpointer key, gsi;
    gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);

    if (!found) {
        return -ENXIO;
    }
    return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
}

int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
                                      qemu_irq irq)
{
    gpointer key, gsi;
    gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);

    if (!found) {
        return -ENXIO;
    }
    return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
}

void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
{
    g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
}

static void kvm_irqchip_create(MachineState *machine, KVMState *s)
{
    int ret;

    if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
        ;
    } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
        ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
        if (ret < 0) {
            fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
            exit(1);
        }
    } else {
        return;
    }

    /* First probe and see if there's an arch-specific hook to create the
     * in-kernel irqchip for us */
    ret = kvm_arch_irqchip_create(machine, s);
    if (ret == 0) {
        if (machine_kernel_irqchip_split(machine)) {
            perror("Split IRQ chip mode not supported.");
            exit(1);
        } else {
            ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
        }
    }
    if (ret < 0) {
        fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
        exit(1);
    }

    kvm_kernel_irqchip = true;
    /* If we have an in-kernel IRQ chip then we must have asynchronous
     * interrupt delivery (though the reverse is not necessarily true)
     */
    kvm_async_interrupts_allowed = true;
    kvm_halt_in_kernel_allowed = true;

    kvm_init_irq_routing(s);

    s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
}

/* Find number of supported CPUs using the recommended
 * procedure from the kernel API documentation to cope with
 * older kernels that may be missing capabilities.
 */
static int kvm_recommended_vcpus(KVMState *s)
{
    int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS);
    return (ret) ? ret : 4;
}

static int kvm_max_vcpus(KVMState *s)
{
    int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
    return (ret) ? ret : kvm_recommended_vcpus(s);
}

static int kvm_max_vcpu_id(KVMState *s)
{
    int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);
    return (ret) ? ret : kvm_max_vcpus(s);
}

bool kvm_vcpu_id_is_valid(int vcpu_id)
{
    KVMState *s = KVM_STATE(current_machine->accelerator);
    return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
}

static int kvm_init(MachineState *ms)
{
    MachineClass *mc = MACHINE_GET_CLASS(ms);
    static const char upgrade_note[] =
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
        "(see http://sourceforge.net/projects/kvm).\n";
    struct {
        const char *name;
        int num;
    } num_cpus[] = {
        { "SMP",          smp_cpus },
        { "hotpluggable", max_cpus },
        { NULL, }
    }, *nc = num_cpus;
    int soft_vcpus_limit, hard_vcpus_limit;
    KVMState *s;
    const KVMCapabilityInfo *missing_cap;
    int ret;
    int type = 0;
    const char *kvm_type;

    s = KVM_STATE(ms->accelerator);

    /*
     * On systems where the kernel can support different base page
     * sizes, host page size may be different from TARGET_PAGE_SIZE,
     * even with KVM.  TARGET_PAGE_SIZE is assumed to be the minimum
     * page size for the system though.
     */
    assert(TARGET_PAGE_SIZE <= getpagesize());

    s->sigmask_len = 8;

#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
#endif
    QLIST_INIT(&s->kvm_parked_vcpus);
    s->vmfd = -1;
    s->fd = qemu_open("/dev/kvm", O_RDWR);
    if (s->fd == -1) {
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
        ret = -errno;
        goto err;
    }

    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
    if (ret < KVM_API_VERSION) {
        if (ret >= 0) {
            ret = -EINVAL;
        }
        fprintf(stderr, "kvm version too old\n");
        goto err;
    }

    if (ret > KVM_API_VERSION) {
        ret = -EINVAL;
        fprintf(stderr, "kvm version not supported\n");
        goto err;
    }

    kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
    s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);

    /* If unspecified, use the default value */
    if (!s->nr_slots) {
        s->nr_slots = 32;
    }

    kvm_type = qemu_opt_get(qemu_get_machine_opts(), "kvm-type");
    if (mc->kvm_type) {
        type = mc->kvm_type(kvm_type);
    } else if (kvm_type) {
        ret = -EINVAL;
        fprintf(stderr, "Invalid argument kvm-type=%s\n", kvm_type);
        goto err;
    }

    do {
        ret = kvm_ioctl(s, KVM_CREATE_VM, type);
    } while (ret == -EINTR);

    if (ret < 0) {
        fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret,
                strerror(-ret));

#ifdef TARGET_S390X
        if (ret == -EINVAL) {
            fprintf(stderr,
                    "Host kernel setup problem detected. Please verify:\n");
            fprintf(stderr, "- for kernels supporting the switch_amode or"
                    " user_mode parameters, whether\n");
            fprintf(stderr,
                    "  user space is running in primary address space\n");
            fprintf(stderr,
                    "- for kernels supporting the vm.allocate_pgste sysctl, "
                    "whether it is enabled\n");
        }
#endif
        goto err;
    }

    s->vmfd = ret;

    /* check the vcpu limits */
    soft_vcpus_limit = kvm_recommended_vcpus(s);
    hard_vcpus_limit = kvm_max_vcpus(s);

    while (nc->name) {
        if (nc->num > soft_vcpus_limit) {
            warn_report("Number of %s cpus requested (%d) exceeds "
                        "the recommended cpus supported by KVM (%d)",
                        nc->name, nc->num, soft_vcpus_limit);

            if (nc->num > hard_vcpus_limit) {
                fprintf(stderr, "Number of %s cpus requested (%d) exceeds "
                        "the maximum cpus supported by KVM (%d)\n",
                        nc->name, nc->num, hard_vcpus_limit);
                exit(1);
            }
        }
        nc++;
    }

    missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
    if (!missing_cap) {
        missing_cap =
            kvm_check_extension_list(s, kvm_arch_required_capabilities);
    }
    if (missing_cap) {
        ret = -EINVAL;
        fprintf(stderr, "kvm does not support %s\n%s",
                missing_cap->name, upgrade_note);
        goto err;
    }

    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);

#ifdef KVM_CAP_VCPU_EVENTS
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
#endif

    s->robust_singlestep =
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);

#ifdef KVM_CAP_DEBUGREGS
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
#endif

#ifdef KVM_CAP_IRQ_ROUTING
    kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
#endif

    s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3);

    s->irq_set_ioctl = KVM_IRQ_LINE;
    if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
        s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
    }

#ifdef KVM_CAP_READONLY_MEM
    kvm_readonly_mem_allowed =
        (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
#endif

    kvm_eventfds_allowed =
        (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0);

    kvm_irqfds_allowed =
        (kvm_check_extension(s, KVM_CAP_IRQFD) > 0);

    kvm_resamplefds_allowed =
        (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);

    kvm_vm_attributes_allowed =
        (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);

    kvm_ioeventfd_any_length_allowed =
        (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0);

    kvm_state = s;

    /*
     * if memory encryption object is specified then initialize the memory
     * encryption context.
     */
    if (ms->memory_encryption) {
        kvm_state->memcrypt_handle = sev_guest_init(ms->memory_encryption);
        if (!kvm_state->memcrypt_handle) {
            ret = -1;
            goto err;
        }

        kvm_state->memcrypt_encrypt_data = sev_encrypt_data;
    }

    ret = kvm_arch_init(ms, s);
    if (ret < 0) {
        goto err;
    }

    if (machine_kernel_irqchip_allowed(ms)) {
        kvm_irqchip_create(ms, s);
    }

    if (kvm_eventfds_allowed) {
        s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
        s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
    }
    s->memory_listener.listener.coalesced_mmio_add = kvm_coalesce_mmio_region;
    s->memory_listener.listener.coalesced_mmio_del = kvm_uncoalesce_mmio_region;

    kvm_memory_listener_register(s, &s->memory_listener,
                                 &address_space_memory, 0);
    memory_listener_register(&kvm_io_listener,
                             &address_space_io);

    s->many_ioeventfds = kvm_check_many_ioeventfds();

    s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);

    return 0;

err:
    assert(ret < 0);
    if (s->vmfd >= 0) {
        close(s->vmfd);
    }
    if (s->fd != -1) {
        close(s->fd);
    }
    g_free(s->memory_listener.slots);

    return ret;
}

void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
{
    s->sigmask_len = sigmask_len;
}

static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
                          int size, uint32_t count)
{
    int i;
    uint8_t *ptr = data;

    for (i = 0; i < count; i++) {
        address_space_rw(&address_space_io, port, attrs,
                         ptr, size,
                         direction == KVM_EXIT_IO_OUT);
        ptr += size;
    }
}

static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
{
    fprintf(stderr, "KVM internal error. Suberror: %d\n",
            run->internal.suberror);

    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        int i;

        for (i = 0; i < run->internal.ndata; ++i) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    i, (uint64_t)run->internal.data[i]);
        }
    }
    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
        fprintf(stderr, "emulation failure\n");
        if (!kvm_arch_stop_on_emulation_error(cpu)) {
            cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
            return EXCP_INTERRUPT;
        }
    }
    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    return -1;
}

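/*
 * Drain the coalesced MMIO ring shared with the kernel: replay each
 * buffered write through the memory API, and only then publish the
 * advanced 'first' index (the smp_wmb() below orders the two).
 */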
1762void kvm_flush_coalesced_mmio_buffer(void)
1763{
1764    KVMState *s = kvm_state;
1765
1766    if (s->coalesced_flush_in_progress) {
1767        return;
1768    }
1769
1770    s->coalesced_flush_in_progress = true;
1771
1772    if (s->coalesced_mmio_ring) {
1773        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
1774        while (ring->first != ring->last) {
1775            struct kvm_coalesced_mmio *ent;
1776
1777            ent = &ring->coalesced_mmio[ring->first];
1778
1779            cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
1780            smp_wmb();
1781            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
1782        }
1783    }
1784
1785    s->coalesced_flush_in_progress = false;
1786}
1787
1788static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
1789{
1790    if (!cpu->vcpu_dirty) {
1791        kvm_arch_get_registers(cpu);
1792        cpu->vcpu_dirty = true;
1793    }
1794}
1795
1796void kvm_cpu_synchronize_state(CPUState *cpu)
1797{
1798    if (!cpu->vcpu_dirty) {
1799        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
1800    }
1801}
1802
1803static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
1804{
1805    kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
1806    cpu->vcpu_dirty = false;
1807}
1808
1809void kvm_cpu_synchronize_post_reset(CPUState *cpu)
1810{
1811    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
1812}
1813
1814static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
1815{
1816    kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
1817    cpu->vcpu_dirty = false;
1818}
1819
1820void kvm_cpu_synchronize_post_init(CPUState *cpu)
1821{
1822    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
1823}
1824
1825static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
1826{
1827    cpu->vcpu_dirty = true;
1828}
1829
1830void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
1831{
1832    run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
1833}
1834
1835#ifdef KVM_HAVE_MCE_INJECTION
1836static __thread void *pending_sigbus_addr;
1837static __thread int pending_sigbus_code;
1838static __thread bool have_sigbus_pending;
1839#endif
1840
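/*
 * Kick a vCPU out of KVM_RUN.  With KVM_CAP_IMMEDIATE_EXIT we set
 * run->immediate_exit, so a KVM_RUN that has not yet entered the kernel
 * returns -EINTR right away instead of racing with the signal; without
 * it we rely on a pending SIG_IPI interrupting the ioctl (see
 * kvm_init_cpu_signals() below).
 */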
1841static void kvm_cpu_kick(CPUState *cpu)
1842{
1843    atomic_set(&cpu->kvm_run->immediate_exit, 1);
1844}
1845
1846static void kvm_cpu_kick_self(void)
1847{
1848    if (kvm_immediate_exit) {
1849        kvm_cpu_kick(current_cpu);
1850    } else {
1851        qemu_cpu_kick_self();
1852    }
1853}
1854
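/*
 * Consume any pending SIG_IPI so that a kick aimed at the KVM_RUN that
 * just returned does not spuriously interrupt the next one.  With
 * immediate_exit there is no signal to eat: the flag is simply cleared,
 * and the barrier pairs with the smp_rmb() in kvm_cpu_exec().
 */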
1855static void kvm_eat_signals(CPUState *cpu)
1856{
1857    struct timespec ts = { 0, 0 };
1858    siginfo_t siginfo;
1859    sigset_t waitset;
1860    sigset_t chkset;
1861    int r;
1862
1863    if (kvm_immediate_exit) {
1864        atomic_set(&cpu->kvm_run->immediate_exit, 0);
1865        /* Write kvm_run->immediate_exit before the cpu->exit_request
1866         * write in kvm_cpu_exec.
1867         */
1868        smp_wmb();
1869        return;
1870    }
1871
1872    sigemptyset(&waitset);
1873    sigaddset(&waitset, SIG_IPI);
1874
1875    do {
1876        r = sigtimedwait(&waitset, &siginfo, &ts);
1877        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
1878            perror("sigtimedwait");
1879            exit(1);
1880        }
1881
1882        r = sigpending(&chkset);
1883        if (r == -1) {
1884            perror("sigpending");
1885            exit(1);
1886        }
1887    } while (sigismember(&chkset, SIG_IPI));
1888}
1889
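/*
 * The per-vCPU run loop: write back dirty registers, enter KVM_RUN and
 * dispatch on the exit reason until something needs attention.  Runs
 * outside the BQL, so I/O and MMIO exits are handled without it.  Returns
 * EXCP_INTERRUPT or EXCP_HLT to the caller's loop, or a negative value
 * after stopping the VM in RUN_STATE_INTERNAL_ERROR on a fatal error.
 */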
1890int kvm_cpu_exec(CPUState *cpu)
1891{
1892    struct kvm_run *run = cpu->kvm_run;
1893    int ret, run_ret;
1894
1895    DPRINTF("kvm_cpu_exec()\n");
1896
1897    if (kvm_arch_process_async_events(cpu)) {
1898        atomic_set(&cpu->exit_request, 0);
1899        return EXCP_HLT;
1900    }
1901
1902    qemu_mutex_unlock_iothread();
1903    cpu_exec_start(cpu);
1904
1905    do {
1906        MemTxAttrs attrs;
1907
1908        if (cpu->vcpu_dirty) {
1909            kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
1910            cpu->vcpu_dirty = false;
1911        }
1912
1913        kvm_arch_pre_run(cpu, run);
1914        if (atomic_read(&cpu->exit_request)) {
1915            DPRINTF("interrupt exit requested\n");
1916            /*
1917             * KVM requires us to reenter the kernel after IO exits to complete
1918             * instruction emulation. This self-signal will ensure that we
1919             * leave the kernel again as soon as possible.
1920             */
1921            kvm_cpu_kick_self();
1922        }
1923
1924        /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
1925         * Matching barrier in kvm_eat_signals.
1926         */
1927        smp_rmb();
1928
1929        run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
1930
1931        attrs = kvm_arch_post_run(cpu, run);
1932
1933#ifdef KVM_HAVE_MCE_INJECTION
1934        if (unlikely(have_sigbus_pending)) {
1935            qemu_mutex_lock_iothread();
1936            kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
1937                                    pending_sigbus_addr);
1938            have_sigbus_pending = false;
1939            qemu_mutex_unlock_iothread();
1940        }
1941#endif
1942
1943        if (run_ret < 0) {
1944            if (run_ret == -EINTR || run_ret == -EAGAIN) {
1945                DPRINTF("io window exit\n");
1946                kvm_eat_signals(cpu);
1947                ret = EXCP_INTERRUPT;
1948                break;
1949            }
1950            fprintf(stderr, "error: kvm run failed %s\n",
1951                    strerror(-run_ret));
1952#ifdef TARGET_PPC
1953            if (run_ret == -EBUSY) {
1954                fprintf(stderr,
1955                        "This is probably because SMT is enabled on the host.\n"
1956                        "VCPUs can only run on primary threads while all "
1957                        "secondary threads are offline.\n");
1958            }
1959#endif
1960            ret = -1;
1961            break;
1962        }
1963
1964        trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
1965        switch (run->exit_reason) {
1966        case KVM_EXIT_IO:
1967            DPRINTF("handle_io\n");
1968            /* Called outside BQL */
1969            kvm_handle_io(run->io.port, attrs,
1970                          (uint8_t *)run + run->io.data_offset,
1971                          run->io.direction,
1972                          run->io.size,
1973                          run->io.count);
1974            ret = 0;
1975            break;
1976        case KVM_EXIT_MMIO:
1977            DPRINTF("handle_mmio\n");
1978            /* Called outside BQL */
1979            address_space_rw(&address_space_memory,
1980                             run->mmio.phys_addr, attrs,
1981                             run->mmio.data,
1982                             run->mmio.len,
1983                             run->mmio.is_write);
1984            ret = 0;
1985            break;
1986        case KVM_EXIT_IRQ_WINDOW_OPEN:
1987            DPRINTF("irq_window_open\n");
1988            ret = EXCP_INTERRUPT;
1989            break;
1990        case KVM_EXIT_SHUTDOWN:
1991            DPRINTF("shutdown\n");
1992            qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
1993            ret = EXCP_INTERRUPT;
1994            break;
1995        case KVM_EXIT_UNKNOWN:
1996            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
1997                    (uint64_t)run->hw.hardware_exit_reason);
1998            ret = -1;
1999            break;
2000        case KVM_EXIT_INTERNAL_ERROR:
2001            ret = kvm_handle_internal_error(cpu, run);
2002            break;
2003        case KVM_EXIT_SYSTEM_EVENT:
2004            switch (run->system_event.type) {
2005            case KVM_SYSTEM_EVENT_SHUTDOWN:
2006                qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
2007                ret = EXCP_INTERRUPT;
2008                break;
2009            case KVM_SYSTEM_EVENT_RESET:
2010                qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
2011                ret = EXCP_INTERRUPT;
2012                break;
2013            case KVM_SYSTEM_EVENT_CRASH:
2014                kvm_cpu_synchronize_state(cpu);
2015                qemu_mutex_lock_iothread();
2016                qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2017                qemu_mutex_unlock_iothread();
2018                ret = 0;
2019                break;
2020            default:
2021                DPRINTF("kvm_arch_handle_exit\n");
2022                ret = kvm_arch_handle_exit(cpu, run);
2023                break;
2024            }
2025            break;
2026        default:
2027            DPRINTF("kvm_arch_handle_exit\n");
2028            ret = kvm_arch_handle_exit(cpu, run);
2029            break;
2030        }
2031    } while (ret == 0);
2032
2033    cpu_exec_end(cpu);
2034    qemu_mutex_lock_iothread();
2035
2036    if (ret < 0) {
2037        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
2038        vm_stop(RUN_STATE_INTERNAL_ERROR);
2039    }
2040
2041    atomic_set(&cpu->exit_request, 0);
2042    return ret;
2043}
2044
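/*
 * Thin wrappers around ioctl(2) on the /dev/kvm, VM, vCPU and device file
 * descriptors.  At most one pointer-sized argument is supported, and
 * failures are mapped to -errno so callers can simply test for ret < 0,
 * e.g. kvm_vcpu_ioctl(cpu, KVM_RUN, 0) above.
 */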
2045int kvm_ioctl(KVMState *s, int type, ...)
2046{
2047    int ret;
2048    void *arg;
2049    va_list ap;
2050
2051    va_start(ap, type);
2052    arg = va_arg(ap, void *);
2053    va_end(ap);
2054
2055    trace_kvm_ioctl(type, arg);
2056    ret = ioctl(s->fd, type, arg);
2057    if (ret == -1) {
2058        ret = -errno;
2059    }
2060    return ret;
2061}
2062
2063int kvm_vm_ioctl(KVMState *s, int type, ...)
2064{
2065    int ret;
2066    void *arg;
2067    va_list ap;
2068
2069    va_start(ap, type);
2070    arg = va_arg(ap, void *);
2071    va_end(ap);
2072
2073    trace_kvm_vm_ioctl(type, arg);
2074    ret = ioctl(s->vmfd, type, arg);
2075    if (ret == -1) {
2076        ret = -errno;
2077    }
2078    return ret;
2079}
2080
2081int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
2082{
2083    int ret;
2084    void *arg;
2085    va_list ap;
2086
2087    va_start(ap, type);
2088    arg = va_arg(ap, void *);
2089    va_end(ap);
2090
2091    trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
2092    ret = ioctl(cpu->kvm_fd, type, arg);
2093    if (ret == -1) {
2094        ret = -errno;
2095    }
2096    return ret;
2097}
2098
2099int kvm_device_ioctl(int fd, int type, ...)
2100{
2101    int ret;
2102    void *arg;
2103    va_list ap;
2104
2105    va_start(ap, type);
2106    arg = va_arg(ap, void *);
2107    va_end(ap);
2108
2109    trace_kvm_device_ioctl(fd, type, arg);
2110    ret = ioctl(fd, type, arg);
2111    if (ret == -1) {
2112        ret = -errno;
2113    }
2114    return ret;
2115}
2116
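/*
 * Probe for a device attribute group/attr pair with KVM_HAS_DEVICE_ATTR
 * on the VM fd; returns 1 if the attribute exists and 0 otherwise
 * (including when KVM_CAP_VM_ATTRIBUTES is unavailable).
 */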
2117int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
2118{
2119    int ret;
2120    struct kvm_device_attr attribute = {
2121        .group = group,
2122        .attr = attr,
2123    };
2124
2125    if (!kvm_vm_attributes_allowed) {
2126        return 0;
2127    }
2128
2129    ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
2130    /* KVM returns 0 on success for KVM_HAS_DEVICE_ATTR */
2131    return ret ? 0 : 1;
2132}
2133
2134int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
2135{
2136    struct kvm_device_attr attribute = {
2137        .group = group,
2138        .attr = attr,
2139        .flags = 0,
2140    };
2141
2142    return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
2143}
2144
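/*
 * Read (write=false) or write (write=true) a device attribute; val points
 * at the caller's buffer for the attribute payload.  Returns a negative
 * errno and fills errp on failure.
 */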
2145int kvm_device_access(int fd, int group, uint64_t attr,
2146                      void *val, bool write, Error **errp)
2147{
2148    struct kvm_device_attr kvmattr;
2149    int err;
2150
2151    kvmattr.flags = 0;
2152    kvmattr.group = group;
2153    kvmattr.attr = attr;
2154    kvmattr.addr = (uintptr_t)val;
2155
2156    err = kvm_device_ioctl(fd,
2157                           write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
2158                           &kvmattr);
2159    if (err < 0) {
2160        error_setg_errno(errp, -err,
2161                         "KVM_%s_DEVICE_ATTR failed: Group %d "
2162                         "attr 0x%016" PRIx64,
2163                         write ? "SET" : "GET", group, attr);
2164    }
2165    return err;
2166}
2167
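/*
 * Capability accessors.  Most of these return values probed once while
 * the accelerator initializes and cached in KVMState; kvm_has_gsi_routing()
 * and kvm_arm_supports_user_irq() query the kernel directly.
 */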
2168bool kvm_has_sync_mmu(void)
2169{
2170    return kvm_state->sync_mmu;
2171}
2172
2173int kvm_has_vcpu_events(void)
2174{
2175    return kvm_state->vcpu_events;
2176}
2177
2178int kvm_has_robust_singlestep(void)
2179{
2180    return kvm_state->robust_singlestep;
2181}
2182
2183int kvm_has_debugregs(void)
2184{
2185    return kvm_state->debugregs;
2186}
2187
2188int kvm_has_many_ioeventfds(void)
2189{
2190    if (!kvm_enabled()) {
2191        return 0;
2192    }
2193    return kvm_state->many_ioeventfds;
2194}
2195
2196int kvm_has_gsi_routing(void)
2197{
2198#ifdef KVM_CAP_IRQ_ROUTING
2199    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
2200#else
2201    return false;
2202#endif
2203}
2204
2205int kvm_has_intx_set_mask(void)
2206{
2207    return kvm_state->intx_set_mask;
2208}
2209
2210bool kvm_arm_supports_user_irq(void)
2211{
2212    return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
2213}
2214
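/*
 * Guest debugging support for the gdbstub.  Software breakpoints are
 * tracked in a per-VM list with use counts and patched into guest memory
 * by arch code; hardware breakpoints are delegated to the architecture
 * entirely.  Every change re-issues KVM_SET_GUEST_DEBUG on all vCPUs via
 * kvm_update_guest_debug().
 */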
2215#ifdef KVM_CAP_SET_GUEST_DEBUG
2216struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
2217                                                 target_ulong pc)
2218{
2219    struct kvm_sw_breakpoint *bp;
2220
2221    QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
2222        if (bp->pc == pc) {
2223            return bp;
2224        }
2225    }
2226    return NULL;
2227}
2228
2229int kvm_sw_breakpoints_active(CPUState *cpu)
2230{
2231    return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
2232}
2233
2234struct kvm_set_guest_debug_data {
2235    struct kvm_guest_debug dbg;
2236    int err;
2237};
2238
2239static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
2240{
2241    struct kvm_set_guest_debug_data *dbg_data =
2242        (struct kvm_set_guest_debug_data *) data.host_ptr;
2243
2244    dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
2245                                   &dbg_data->dbg);
2246}
2247
2248int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
2249{
2250    struct kvm_set_guest_debug_data data;
2251
2252    data.dbg.control = reinject_trap;
2253
2254    if (cpu->singlestep_enabled) {
2255        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
2256    }
2257    kvm_arch_update_guest_debug(cpu, &data.dbg);
2258
2259    run_on_cpu(cpu, kvm_invoke_set_guest_debug,
2260               RUN_ON_CPU_HOST_PTR(&data));
2261    return data.err;
2262}
2263
2264int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
2265                          target_ulong len, int type)
2266{
2267    struct kvm_sw_breakpoint *bp;
2268    int err;
2269
2270    if (type == GDB_BREAKPOINT_SW) {
2271        bp = kvm_find_sw_breakpoint(cpu, addr);
2272        if (bp) {
2273            bp->use_count++;
2274            return 0;
2275        }
2276
2277        bp = g_malloc(sizeof(struct kvm_sw_breakpoint));
2278        bp->pc = addr;
2279        bp->use_count = 1;
2280        err = kvm_arch_insert_sw_breakpoint(cpu, bp);
2281        if (err) {
2282            g_free(bp);
2283            return err;
2284        }
2285
2286        QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
2287    } else {
2288        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
2289        if (err) {
2290            return err;
2291        }
2292    }
2293
2294    CPU_FOREACH(cpu) {
2295        err = kvm_update_guest_debug(cpu, 0);
2296        if (err) {
2297            return err;
2298        }
2299    }
2300    return 0;
2301}
2302
2303int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
2304                          target_ulong len, int type)
2305{
2306    struct kvm_sw_breakpoint *bp;
2307    int err;
2308
2309    if (type == GDB_BREAKPOINT_SW) {
2310        bp = kvm_find_sw_breakpoint(cpu, addr);
2311        if (!bp) {
2312            return -ENOENT;
2313        }
2314
2315        if (bp->use_count > 1) {
2316            bp->use_count--;
2317            return 0;
2318        }
2319
2320        err = kvm_arch_remove_sw_breakpoint(cpu, bp);
2321        if (err) {
2322            return err;
2323        }
2324
2325        QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
2326        g_free(bp);
2327    } else {
2328        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
2329        if (err) {
2330            return err;
2331        }
2332    }
2333
2334    CPU_FOREACH(cpu) {
2335        err = kvm_update_guest_debug(cpu, 0);
2336        if (err) {
2337            return err;
2338        }
2339    }
2340    return 0;
2341}
2342
2343void kvm_remove_all_breakpoints(CPUState *cpu)
2344{
2345    struct kvm_sw_breakpoint *bp, *next;
2346    KVMState *s = cpu->kvm_state;
2347    CPUState *tmpcpu;
2348
2349    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
2350        if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
2351            /* Try harder to find a CPU that currently sees the breakpoint. */
2352            CPU_FOREACH(tmpcpu) {
2353                if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
2354                    break;
2355                }
2356            }
2357        }
2358        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
2359        g_free(bp);
2360    }
2361    kvm_arch_remove_all_hw_breakpoints();
2362
2363    CPU_FOREACH(cpu) {
2364        kvm_update_guest_debug(cpu, 0);
2365    }
2366}
2367
2368#else /* !KVM_CAP_SET_GUEST_DEBUG */
2369
2370int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
2371{
2372    return -EINVAL;
2373}
2374
2375int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
2376                          target_ulong len, int type)
2377{
2378    return -EINVAL;
2379}
2380
2381int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
2382                          target_ulong len, int type)
2383{
2384    return -EINVAL;
2385}
2386
2387void kvm_remove_all_breakpoints(CPUState *cpu)
2388{
2389}
2390#endif /* !KVM_CAP_SET_GUEST_DEBUG */
2391
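/*
 * Install the signal mask that the kernel applies atomically while this
 * vCPU is inside KVM_RUN (KVM_SET_SIGNAL_MASK), so SIG_IPI can stay
 * blocked in the thread itself and still interrupt the ioctl.
 */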
2392static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
2393{
2394    KVMState *s = kvm_state;
2395    struct kvm_signal_mask *sigmask;
2396    int r;
2397
2398    sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));
2399
2400    sigmask->len = s->sigmask_len;
2401    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
2402    r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
2403    g_free(sigmask);
2404
2405    return r;
2406}
2407
2408static void kvm_ipi_signal(int sig)
2409{
2410    if (current_cpu) {
2411        assert(kvm_immediate_exit);
2412        kvm_cpu_kick(current_cpu);
2413    }
2414}
2415
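/*
 * Per-vCPU-thread signal setup: install the SIG_IPI handler, unblock
 * SIGBUS when MCE injection is supported so machine checks are delivered
 * synchronously to this thread, and arrange for SIG_IPI delivery.  With
 * immediate_exit SIG_IPI is unblocked in the thread mask directly;
 * otherwise it is unblocked only while inside KVM_RUN, via
 * kvm_set_signal_mask().
 */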
2416void kvm_init_cpu_signals(CPUState *cpu)
2417{
2418    int r;
2419    sigset_t set;
2420    struct sigaction sigact;
2421
2422    memset(&sigact, 0, sizeof(sigact));
2423    sigact.sa_handler = kvm_ipi_signal;
2424    sigaction(SIG_IPI, &sigact, NULL);
2425
2426    pthread_sigmask(SIG_BLOCK, NULL, &set);
2427#if defined KVM_HAVE_MCE_INJECTION
2428    sigdelset(&set, SIGBUS);
2429    pthread_sigmask(SIG_SETMASK, &set, NULL);
2430#endif
2431    sigdelset(&set, SIG_IPI);
2432    if (kvm_immediate_exit) {
2433        r = pthread_sigmask(SIG_SETMASK, &set, NULL);
2434    } else {
2435        r = kvm_set_signal_mask(cpu, &set);
2436    }
2437    if (r) {
2438        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
2439        exit(1);
2440    }
2441}
2442
2443/* Called asynchronously in VCPU thread.  */
2444int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
2445{
2446#ifdef KVM_HAVE_MCE_INJECTION
2447    if (have_sigbus_pending) {
2448        return 1;
2449    }
2450    have_sigbus_pending = true;
2451    pending_sigbus_addr = addr;
2452    pending_sigbus_code = code;
2453    atomic_set(&cpu->exit_request, 1);
2454    return 0;
2455#else
2456    return 1;
2457#endif
2458}
2459
2460/* Called synchronously (via signalfd) in main thread.  */
2461int kvm_on_sigbus(int code, void *addr)
2462{
2463#ifdef KVM_HAVE_MCE_INJECTION
2464    /* An Action Required MCE would kill the process if SIGBUS were blocked,
2465     * and SIGBUS is blocked in the I/O thread, where we handle MCEs via
2466     * signalfd; therefore only Action Optional MCEs can reach this point.
2467     */
2468    assert(code != BUS_MCEERR_AR);
2469    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
2470    return 0;
2471#else
2472    return 1;
2473#endif
2474}
2475
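/*
 * Create an in-kernel device of the given type (KVM_CREATE_DEVICE).  With
 * test=true only KVM_CREATE_DEVICE_TEST is performed and 0 is returned on
 * success; otherwise the new device fd is returned.  Returns -ENOTSUP when
 * the kernel lacks KVM_CAP_DEVICE_CTRL.
 */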
2476int kvm_create_device(KVMState *s, uint64_t type, bool test)
2477{
2478    int ret;
2479    struct kvm_create_device create_dev;
2480
2481    create_dev.type = type;
2482    create_dev.fd = -1;
2483    create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;
2484
2485    if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
2486        return -ENOTSUP;
2487    }
2488
2489    ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
2490    if (ret) {
2491        return ret;
2492    }
2493
2494    return test ? 0 : create_dev.fd;
2495}
2496
2497bool kvm_device_supported(int vmfd, uint64_t type)
2498{
2499    struct kvm_create_device create_dev = {
2500        .type = type,
2501        .fd = -1,
2502        .flags = KVM_CREATE_DEVICE_TEST,
2503    };
2504
2505    if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
2506        return false;
2507    }
2508
2509    return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
2510}
2511
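/*
 * ONE_REG accessors: id encodes the architecture, size and register as
 * defined by the KVM API, and source/target point at a buffer of the
 * encoded size.  Used by targets (e.g. ARM and PPC) whose register state
 * is exposed through KVM_GET/SET_ONE_REG rather than bulk state ioctls.
 */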
2512int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
2513{
2514    struct kvm_one_reg reg;
2515    int r;
2516
2517    reg.id = id;
2518    reg.addr = (uintptr_t) source;
2519    r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
2520    if (r) {
2521        trace_kvm_failed_reg_set(id, strerror(-r));
2522    }
2523    return r;
2524}
2525
2526int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
2527{
2528    struct kvm_one_reg reg;
2529    int r;
2530
2531    reg.id = id;
2532    reg.addr = (uintptr_t) target;
2533    r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
2534    if (r) {
2535        trace_kvm_failed_reg_get(id, strerror(-r));
2536    }
2537    return r;
2538}
2539
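/*
 * QOM boilerplate registering KVM as an accelerator type, so that
 * -machine accel=kvm (or -enable-kvm) resolves to kvm_init() at machine
 * creation time.
 */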
2540static void kvm_accel_class_init(ObjectClass *oc, void *data)
2541{
2542    AccelClass *ac = ACCEL_CLASS(oc);
2543    ac->name = "KVM";
2544    ac->init_machine = kvm_init;
2545    ac->allowed = &kvm_allowed;
2546}
2547
2548static const TypeInfo kvm_accel_type = {
2549    .name = TYPE_KVM_ACCEL,
2550    .parent = TYPE_ACCEL,
2551    .class_init = kvm_accel_class_init,
2552    .instance_size = sizeof(KVMState),
2553};
2554
2555static void kvm_type_init(void)
2556{
2557    type_register_static(&kvm_accel_type);
2558}
2559
2560type_init(kvm_type_init);
2561