qemu/accel/kvm/kvm-all.c
   1/*
   2 * QEMU KVM support
   3 *
   4 * Copyright IBM, Corp. 2008
   5 *           Red Hat, Inc. 2008
   6 *
   7 * Authors:
   8 *  Anthony Liguori   <aliguori@us.ibm.com>
   9 *  Glauber Costa     <gcosta@redhat.com>
  10 *
  11 * This work is licensed under the terms of the GNU GPL, version 2 or later.
  12 * See the COPYING file in the top-level directory.
  13 *
  14 */
  15
  16#include "qemu/osdep.h"
  17#include <sys/ioctl.h>
  18
  19#include <linux/kvm.h>
  20
  21#include "qemu/atomic.h"
  22#include "qemu/option.h"
  23#include "qemu/config-file.h"
  24#include "qemu/error-report.h"
  25#include "qapi/error.h"
  26#include "hw/pci/msi.h"
  27#include "hw/pci/msix.h"
  28#include "hw/s390x/adapter.h"
  29#include "exec/gdbstub.h"
  30#include "sysemu/kvm_int.h"
  31#include "sysemu/runstate.h"
  32#include "sysemu/cpus.h"
  33#include "sysemu/sysemu.h"
  34#include "qemu/bswap.h"
  35#include "exec/memory.h"
  36#include "exec/ram_addr.h"
  37#include "exec/address-spaces.h"
  38#include "qemu/event_notifier.h"
  39#include "qemu/main-loop.h"
  40#include "trace.h"
  41#include "hw/irq.h"
  42#include "sysemu/sev.h"
  43#include "sysemu/balloon.h"
  44
  45#include "hw/boards.h"
  46
  47/* This check must be after config-host.h is included */
  48#ifdef CONFIG_EVENTFD
  49#include <sys/eventfd.h>
  50#endif
  51
  52/* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
  53 * need to use the real host PAGE_SIZE, as that's what KVM will use.
  54 */
  55#define PAGE_SIZE qemu_real_host_page_size
  56
  57//#define DEBUG_KVM
  58
  59#ifdef DEBUG_KVM
  60#define DPRINTF(fmt, ...) \
  61    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
  62#else
  63#define DPRINTF(fmt, ...) \
  64    do { } while (0)
  65#endif
  66
  67#define KVM_MSI_HASHTAB_SIZE    256
  68
  69struct KVMParkedVcpu {
  70    unsigned long vcpu_id;
  71    int kvm_fd;
  72    QLIST_ENTRY(KVMParkedVcpu) node;
  73};
  74
  75struct KVMState
  76{
  77    AccelState parent_obj;
  78
  79    int nr_slots;
  80    int fd;
  81    int vmfd;
  82    int coalesced_mmio;
  83    int coalesced_pio;
  84    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
  85    bool coalesced_flush_in_progress;
  86    int vcpu_events;
  87    int robust_singlestep;
  88    int debugregs;
  89#ifdef KVM_CAP_SET_GUEST_DEBUG
  90    QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
  91#endif
  92    int max_nested_state_len;
  93    int many_ioeventfds;
  94    int intx_set_mask;
  95    bool sync_mmu;
  96    bool manual_dirty_log_protect;
   97    /* The man page (and POSIX) say ioctl numbers are signed int, but
  98     * they're not.  Linux, glibc and *BSD all treat ioctl numbers as
  99     * unsigned, and treating them as signed here can break things */
 100    unsigned irq_set_ioctl;
 101    unsigned int sigmask_len;
 102    GHashTable *gsimap;
 103#ifdef KVM_CAP_IRQ_ROUTING
 104    struct kvm_irq_routing *irq_routes;
 105    int nr_allocated_irq_routes;
 106    unsigned long *used_gsi_bitmap;
 107    unsigned int gsi_count;
 108    QTAILQ_HEAD(, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
 109#endif
 110    KVMMemoryListener memory_listener;
 111    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
 112
 113    /* memory encryption */
 114    void *memcrypt_handle;
 115    int (*memcrypt_encrypt_data)(void *handle, uint8_t *ptr, uint64_t len);
 116
 117    /* For "info mtree -f" to tell if an MR is registered in KVM */
 118    int nr_as;
 119    struct KVMAs {
 120        KVMMemoryListener *ml;
 121        AddressSpace *as;
 122    } *as;
 123};
 124
 125KVMState *kvm_state;
 126bool kvm_kernel_irqchip;
 127bool kvm_split_irqchip;
 128bool kvm_async_interrupts_allowed;
 129bool kvm_halt_in_kernel_allowed;
 130bool kvm_eventfds_allowed;
 131bool kvm_irqfds_allowed;
 132bool kvm_resamplefds_allowed;
 133bool kvm_msi_via_irqfd_allowed;
 134bool kvm_gsi_routing_allowed;
 135bool kvm_gsi_direct_mapping;
 136bool kvm_allowed;
 137bool kvm_readonly_mem_allowed;
 138bool kvm_vm_attributes_allowed;
 139bool kvm_direct_msi_allowed;
 140bool kvm_ioeventfd_any_length_allowed;
 141bool kvm_msi_use_devid;
 142static bool kvm_immediate_exit;
 143static hwaddr kvm_max_slot_size = ~0;
 144
 145static const KVMCapabilityInfo kvm_required_capabilites[] = {
 146    KVM_CAP_INFO(USER_MEMORY),
 147    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
 148    KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
 149    KVM_CAP_LAST_INFO
 150};
 151
 152static NotifierList kvm_irqchip_change_notifiers =
 153    NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);
 154
 155#define kvm_slots_lock(kml)      qemu_mutex_lock(&(kml)->slots_lock)
 156#define kvm_slots_unlock(kml)    qemu_mutex_unlock(&(kml)->slots_lock)
 157
 158int kvm_get_max_memslots(void)
 159{
 160    KVMState *s = KVM_STATE(current_machine->accelerator);
 161
 162    return s->nr_slots;
 163}
 164
 165bool kvm_memcrypt_enabled(void)
 166{
 167    if (kvm_state && kvm_state->memcrypt_handle) {
 168        return true;
 169    }
 170
 171    return false;
 172}
 173
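/*
 * Encrypt @len bytes at @ptr in place through the registered memory
 * encryption backend (e.g. SEV).  Returns 1 if no backend is registered,
 * otherwise whatever the backend's encrypt callback returns.
 */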
 174int kvm_memcrypt_encrypt_data(uint8_t *ptr, uint64_t len)
 175{
 176    if (kvm_state->memcrypt_handle &&
 177        kvm_state->memcrypt_encrypt_data) {
 178        return kvm_state->memcrypt_encrypt_data(kvm_state->memcrypt_handle,
 179                                              ptr, len);
 180    }
 181
 182    return 1;
 183}
 184
 185/* Called with KVMMemoryListener.slots_lock held */
 186static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
 187{
 188    KVMState *s = kvm_state;
 189    int i;
 190
 191    for (i = 0; i < s->nr_slots; i++) {
 192        if (kml->slots[i].memory_size == 0) {
 193            return &kml->slots[i];
 194        }
 195    }
 196
 197    return NULL;
 198}
 199
 200bool kvm_has_free_slot(MachineState *ms)
 201{
 202    KVMState *s = KVM_STATE(ms->accelerator);
 203    bool result;
 204    KVMMemoryListener *kml = &s->memory_listener;
 205
 206    kvm_slots_lock(kml);
 207    result = !!kvm_get_free_slot(kml);
 208    kvm_slots_unlock(kml);
 209
 210    return result;
 211}
 212
 213/* Called with KVMMemoryListener.slots_lock held */
 214static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
 215{
 216    KVMSlot *slot = kvm_get_free_slot(kml);
 217
 218    if (slot) {
 219        return slot;
 220    }
 221
 222    fprintf(stderr, "%s: no free slot available\n", __func__);
 223    abort();
 224}
 225
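/*
 * Return the slot that exactly matches [start_addr, start_addr + size),
 * or NULL if there is none.  Partial overlaps do not count as a match.
 */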
 226static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
 227                                         hwaddr start_addr,
 228                                         hwaddr size)
 229{
 230    KVMState *s = kvm_state;
 231    int i;
 232
 233    for (i = 0; i < s->nr_slots; i++) {
 234        KVMSlot *mem = &kml->slots[i];
 235
 236        if (start_addr == mem->start_addr && size == mem->memory_size) {
 237            return mem;
 238        }
 239    }
 240
 241    return NULL;
 242}
 243
 244/*
 245 * Calculate and align the start address and the size of the section.
 246 * Return the size. If the size is 0, the aligned section is empty.
 247 */
 248static hwaddr kvm_align_section(MemoryRegionSection *section,
 249                                hwaddr *start)
 250{
 251    hwaddr size = int128_get64(section->size);
 252    hwaddr delta, aligned;
 253
  254    /* KVM works in host-page-size chunks, but this function may be called
  255       with a sub-page size and an unaligned start address. Round the start
  256       address up and truncate the size down to the host page boundary. */
 257    aligned = ROUND_UP(section->offset_within_address_space,
 258                       qemu_real_host_page_size);
 259    delta = aligned - section->offset_within_address_space;
 260    *start = aligned;
 261    if (delta > size) {
 262        return 0;
 263    }
 264
 265    return (size - delta) & qemu_real_host_page_mask;
 266}
 267
 268int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
 269                                       hwaddr *phys_addr)
 270{
 271    KVMMemoryListener *kml = &s->memory_listener;
 272    int i, ret = 0;
 273
 274    kvm_slots_lock(kml);
 275    for (i = 0; i < s->nr_slots; i++) {
 276        KVMSlot *mem = &kml->slots[i];
 277
 278        if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
 279            *phys_addr = mem->start_addr + (ram - mem->ram);
 280            ret = 1;
 281            break;
 282        }
 283    }
 284    kvm_slots_unlock(kml);
 285
 286    return ret;
 287}
 288
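/*
 * Push one slot to the kernel with KVM_SET_USER_MEMORY_REGION.  @new is
 * true when the slot has just been allocated and has never been seen by
 * the kernel; toggling KVM_MEM_READONLY on an existing slot requires
 * deleting and re-creating it first (see the workaround below).
 */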
 289static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new)
 290{
 291    KVMState *s = kvm_state;
 292    struct kvm_userspace_memory_region mem;
 293    int ret;
 294
 295    mem.slot = slot->slot | (kml->as_id << 16);
 296    mem.guest_phys_addr = slot->start_addr;
 297    mem.userspace_addr = (unsigned long)slot->ram;
 298    mem.flags = slot->flags;
 299
 300    if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) {
 301        /* Set the slot size to 0 before setting the slot to the desired
 302         * value. This is needed based on KVM commit 75d61fbc. */
 303        mem.memory_size = 0;
 304        kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
 305    }
 306    mem.memory_size = slot->memory_size;
 307    ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
 308    slot->old_flags = mem.flags;
 309    trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr,
 310                              mem.memory_size, mem.userspace_addr, ret);
 311    return ret;
 312}
 313
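/*
 * Tear down a vCPU's userspace state.  KVM provides no ioctl to destroy
 * a vCPU, so the vcpu fd is not closed here; it is parked on
 * kvm_parked_vcpus and reused if a vCPU with the same id is created
 * again (see kvm_get_vcpu()).
 */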
 314int kvm_destroy_vcpu(CPUState *cpu)
 315{
 316    KVMState *s = kvm_state;
 317    long mmap_size;
 318    struct KVMParkedVcpu *vcpu = NULL;
 319    int ret = 0;
 320
 321    DPRINTF("kvm_destroy_vcpu\n");
 322
 323    ret = kvm_arch_destroy_vcpu(cpu);
 324    if (ret < 0) {
 325        goto err;
 326    }
 327
 328    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
 329    if (mmap_size < 0) {
 330        ret = mmap_size;
 331        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
 332        goto err;
 333    }
 334
 335    ret = munmap(cpu->kvm_run, mmap_size);
 336    if (ret < 0) {
 337        goto err;
 338    }
 339
 340    vcpu = g_malloc0(sizeof(*vcpu));
 341    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
 342    vcpu->kvm_fd = cpu->kvm_fd;
 343    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
 344err:
 345    return ret;
 346}
 347
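/* Reuse a parked vcpu fd for this vcpu_id if there is one, otherwise
 * ask the kernel to create a new vCPU. */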
 348static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
 349{
 350    struct KVMParkedVcpu *cpu;
 351
 352    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
 353        if (cpu->vcpu_id == vcpu_id) {
 354            int kvm_fd;
 355
 356            QLIST_REMOVE(cpu, node);
 357            kvm_fd = cpu->kvm_fd;
 358            g_free(cpu);
 359            return kvm_fd;
 360        }
 361    }
 362
 363    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
 364}
 365
 366int kvm_init_vcpu(CPUState *cpu)
 367{
 368    KVMState *s = kvm_state;
 369    long mmap_size;
 370    int ret;
 371
 372    DPRINTF("kvm_init_vcpu\n");
 373
 374    ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
 375    if (ret < 0) {
 376        DPRINTF("kvm_create_vcpu failed\n");
 377        goto err;
 378    }
 379
 380    cpu->kvm_fd = ret;
 381    cpu->kvm_state = s;
 382    cpu->vcpu_dirty = true;
 383
 384    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
 385    if (mmap_size < 0) {
 386        ret = mmap_size;
 387        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
 388        goto err;
 389    }
 390
 391    cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
 392                        cpu->kvm_fd, 0);
 393    if (cpu->kvm_run == MAP_FAILED) {
 394        ret = -errno;
 395        DPRINTF("mmap'ing vcpu state failed\n");
 396        goto err;
 397    }
 398
 399    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
 400        s->coalesced_mmio_ring =
 401            (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
 402    }
 403
 404    ret = kvm_arch_init_vcpu(cpu);
 405err:
 406    return ret;
 407}
 408
 409/*
 410 * dirty pages logging control
 411 */
 412
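/*
 * Compute the memslot flags for a region: request dirty page logging
 * whenever any dirty-log client is active, and map read-only and
 * ROM-device regions as KVM_MEM_READONLY when the host supports it.
 */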
 413static int kvm_mem_flags(MemoryRegion *mr)
 414{
 415    bool readonly = mr->readonly || memory_region_is_romd(mr);
 416    int flags = 0;
 417
 418    if (memory_region_get_dirty_log_mask(mr) != 0) {
 419        flags |= KVM_MEM_LOG_DIRTY_PAGES;
 420    }
 421    if (readonly && kvm_readonly_mem_allowed) {
 422        flags |= KVM_MEM_READONLY;
 423    }
 424    return flags;
 425}
 426
 427/* Called with KVMMemoryListener.slots_lock held */
 428static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
 429                                 MemoryRegion *mr)
 430{
 431    mem->flags = kvm_mem_flags(mr);
 432
 433    /* If nothing changed effectively, no need to issue ioctl */
 434    if (mem->flags == mem->old_flags) {
 435        return 0;
 436    }
 437
 438    return kvm_set_user_memory_region(kml, mem, false);
 439}
 440
 441static int kvm_section_update_flags(KVMMemoryListener *kml,
 442                                    MemoryRegionSection *section)
 443{
 444    hwaddr start_addr, size, slot_size;
 445    KVMSlot *mem;
 446    int ret = 0;
 447
 448    size = kvm_align_section(section, &start_addr);
 449    if (!size) {
 450        return 0;
 451    }
 452
 453    kvm_slots_lock(kml);
 454
 455    while (size && !ret) {
 456        slot_size = MIN(kvm_max_slot_size, size);
 457        mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
 458        if (!mem) {
 459            /* We don't have a slot if we want to trap every access. */
 460            goto out;
 461        }
 462
 463        ret = kvm_slot_update_flags(kml, mem, section->mr);
 464        start_addr += slot_size;
 465        size -= slot_size;
 466    }
 467
 468out:
 469    kvm_slots_unlock(kml);
 470    return ret;
 471}
 472
 473static void kvm_log_start(MemoryListener *listener,
 474                          MemoryRegionSection *section,
 475                          int old, int new)
 476{
 477    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
 478    int r;
 479
 480    if (old != 0) {
 481        return;
 482    }
 483
 484    r = kvm_section_update_flags(kml, section);
 485    if (r < 0) {
 486        abort();
 487    }
 488}
 489
 490static void kvm_log_stop(MemoryListener *listener,
 491                          MemoryRegionSection *section,
 492                          int old, int new)
 493{
 494    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
 495    int r;
 496
 497    if (new != 0) {
 498        return;
 499    }
 500
 501    r = kvm_section_update_flags(kml, section);
 502    if (r < 0) {
 503        abort();
 504    }
 505}
 506
 507/* get kvm's dirty pages bitmap and update qemu's */
 508static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section,
 509                                         unsigned long *bitmap)
 510{
 511    ram_addr_t start = section->offset_within_region +
 512                       memory_region_get_ram_addr(section->mr);
 513    ram_addr_t pages = int128_get64(section->size) / qemu_real_host_page_size;
 514
 515    cpu_physical_memory_set_dirty_lebitmap(bitmap, start, pages);
 516    return 0;
 517}
 518
 519#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
 520
 521/* Allocate the dirty bitmap for a slot  */
 522static void kvm_memslot_init_dirty_bitmap(KVMSlot *mem)
 523{
 524    /*
 525     * XXX bad kernel interface alert
  526     * For the dirty bitmap, the kernel allocates an array whose size is
  527     * aligned to bits-per-long.  But when the kernel is 64-bit and
  528     * userspace is 32-bit, userspace cannot align to the same
  529     * bits-per-long, since sizeof(long) differs between kernel and
  530     * user space.  Userspace would then provide a buffer that may be
  531     * 4 bytes smaller than what the kernel will use, resulting in
  532     * userspace memory corruption (which is not detectable by valgrind
  533     * either, in most cases).
  534     * So for now, align to 64 instead of HOST_LONG_BITS here, in the
  535     * hope that sizeof(long) won't become >8 any time soon.
 536     */
 537    hwaddr bitmap_size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
 538                                        /*HOST_LONG_BITS*/ 64) / 8;
 539    mem->dirty_bmap = g_malloc0(bitmap_size);
 540}
 541
 542/**
 543 * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space
 544 *
 545 * This function will first try to fetch dirty bitmap from the kernel,
 546 * and then updates qemu's dirty bitmap.
 547 *
 548 * NOTE: caller must be with kml->slots_lock held.
 549 *
 550 * @kml: the KVM memory listener object
 551 * @section: the memory section to sync the dirty bitmap with
 552 */
 553static int kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
 554                                          MemoryRegionSection *section)
 555{
 556    KVMState *s = kvm_state;
 557    struct kvm_dirty_log d = {};
 558    KVMSlot *mem;
 559    hwaddr start_addr, size;
 560    hwaddr slot_size, slot_offset = 0;
 561    int ret = 0;
 562
 563    size = kvm_align_section(section, &start_addr);
 564    while (size) {
 565        MemoryRegionSection subsection = *section;
 566
 567        slot_size = MIN(kvm_max_slot_size, size);
 568        mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
 569        if (!mem) {
 570            /* We don't have a slot if we want to trap every access. */
 571            goto out;
 572        }
 573
 574        if (!mem->dirty_bmap) {
 575            /* Allocate on the first log_sync, once and for all */
 576            kvm_memslot_init_dirty_bitmap(mem);
 577        }
 578
 579        d.dirty_bitmap = mem->dirty_bmap;
 580        d.slot = mem->slot | (kml->as_id << 16);
 581        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
 582            DPRINTF("ioctl failed %d\n", errno);
 583            ret = -1;
 584            goto out;
 585        }
 586
 587        subsection.offset_within_region += slot_offset;
 588        subsection.size = int128_make64(slot_size);
 589        kvm_get_dirty_pages_log_range(&subsection, d.dirty_bitmap);
 590
 591        slot_offset += slot_size;
 592        start_addr += slot_size;
 593        size -= slot_size;
 594    }
 595out:
 596    return ret;
 597}
 598
 599/* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */
 600#define KVM_CLEAR_LOG_SHIFT  6
 601#define KVM_CLEAR_LOG_ALIGN  (qemu_real_host_page_size << KVM_CLEAR_LOG_SHIFT)
 602#define KVM_CLEAR_LOG_MASK   (-KVM_CLEAR_LOG_ALIGN)
 603
 604static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start,
 605                                  uint64_t size)
 606{
 607    KVMState *s = kvm_state;
 608    uint64_t end, bmap_start, start_delta, bmap_npages;
 609    struct kvm_clear_dirty_log d;
 610    unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size;
 611    int ret;
 612
 613    /*
 614     * We need to extend either the start or the size or both to
 615     * satisfy the KVM interface requirement.  Firstly, do the start
 616     * page alignment on 64 host pages
 617     */
 618    bmap_start = start & KVM_CLEAR_LOG_MASK;
 619    start_delta = start - bmap_start;
 620    bmap_start /= psize;
 621
 622    /*
 623     * The kernel interface has restriction on the size too, that either:
 624     *
 625     * (1) the size is 64 host pages aligned (just like the start), or
 626     * (2) the size fills up until the end of the KVM memslot.
 627     */
 628    bmap_npages = DIV_ROUND_UP(size + start_delta, KVM_CLEAR_LOG_ALIGN)
 629        << KVM_CLEAR_LOG_SHIFT;
 630    end = mem->memory_size / psize;
 631    if (bmap_npages > end - bmap_start) {
 632        bmap_npages = end - bmap_start;
 633    }
 634    start_delta /= psize;
 635
 636    /*
 637     * Prepare the bitmap to clear dirty bits.  Here we must guarantee
 638     * that we won't clear any unknown dirty bits otherwise we might
 639     * accidentally clear some set bits which are not yet synced from
 640     * the kernel into QEMU's bitmap, then we'll lose track of the
 641     * guest modifications upon those pages (which can directly lead
 642     * to guest data loss or panic after migration).
 643     *
 644     * Layout of the KVMSlot.dirty_bmap:
 645     *
 646     *                   |<-------- bmap_npages -----------..>|
 647     *                                                     [1]
 648     *                     start_delta         size
 649     *  |----------------|-------------|------------------|------------|
 650     *  ^                ^             ^                               ^
 651     *  |                |             |                               |
 652     * start          bmap_start     (start)                         end
 653     * of memslot                                             of memslot
 654     *
 655     * [1] bmap_npages can be aligned to either 64 pages or the end of slot
 656     */
 657
 658    assert(bmap_start % BITS_PER_LONG == 0);
 659    /* We should never do log_clear before log_sync */
 660    assert(mem->dirty_bmap);
 661    if (start_delta) {
 662        /* Slow path - we need to manipulate a temp bitmap */
 663        bmap_clear = bitmap_new(bmap_npages);
 664        bitmap_copy_with_src_offset(bmap_clear, mem->dirty_bmap,
 665                                    bmap_start, start_delta + size / psize);
 666        /*
 667         * We need to fill the holes at start because that was not
 668         * specified by the caller and we extended the bitmap only for
 669         * 64 pages alignment
 670         */
 671        bitmap_clear(bmap_clear, 0, start_delta);
 672        d.dirty_bitmap = bmap_clear;
 673    } else {
 674        /* Fast path - start address aligns well with BITS_PER_LONG */
 675        d.dirty_bitmap = mem->dirty_bmap + BIT_WORD(bmap_start);
 676    }
 677
 678    d.first_page = bmap_start;
 679    /* It should never overflow.  If it happens, say something */
 680    assert(bmap_npages <= UINT32_MAX);
 681    d.num_pages = bmap_npages;
 682    d.slot = mem->slot | (as_id << 16);
 683
 684    if (kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d) == -1) {
 685        ret = -errno;
 686        error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, "
 687                     "start=0x%"PRIx64", size=0x%"PRIx32", errno=%d",
 688                     __func__, d.slot, (uint64_t)d.first_page,
 689                     (uint32_t)d.num_pages, ret);
 690    } else {
 691        ret = 0;
 692        trace_kvm_clear_dirty_log(d.slot, d.first_page, d.num_pages);
 693    }
 694
 695    /*
 696     * After we have updated the remote dirty bitmap, we update the
 697     * cached bitmap as well for the memslot, then if another user
 698     * clears the same region we know we shouldn't clear it again on
 699     * the remote otherwise it's data loss as well.
 700     */
 701    bitmap_clear(mem->dirty_bmap, bmap_start + start_delta,
 702                 size / psize);
 703    /* This handles the NULL case well */
 704    g_free(bmap_clear);
 705    return ret;
 706}
 707
 708
 709/**
 710 * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range
 711 *
 712 * NOTE: this will be a no-op if we haven't enabled manual dirty log
 713 * protection in the host kernel because in that case this operation
 714 * will be done within log_sync().
 715 *
 716 * @kml:     the kvm memory listener
 717 * @section: the memory range to clear dirty bitmap
 718 */
 719static int kvm_physical_log_clear(KVMMemoryListener *kml,
 720                                  MemoryRegionSection *section)
 721{
 722    KVMState *s = kvm_state;
 723    uint64_t start, size, offset, count;
 724    KVMSlot *mem;
 725    int ret = 0, i;
 726
 727    if (!s->manual_dirty_log_protect) {
 728        /* No need to do explicit clear */
 729        return ret;
 730    }
 731
 732    start = section->offset_within_address_space;
 733    size = int128_get64(section->size);
 734
 735    if (!size) {
 736        /* Nothing more we can do... */
 737        return ret;
 738    }
 739
 740    kvm_slots_lock(kml);
 741
 742    for (i = 0; i < s->nr_slots; i++) {
 743        mem = &kml->slots[i];
 744        /* Discard slots that are empty or do not overlap the section */
 745        if (!mem->memory_size ||
 746            mem->start_addr > start + size - 1 ||
 747            start > mem->start_addr + mem->memory_size - 1) {
 748            continue;
 749        }
 750
 751        if (start >= mem->start_addr) {
 752            /* The slot starts before section or is aligned to it.  */
 753            offset = start - mem->start_addr;
 754            count = MIN(mem->memory_size - offset, size);
 755        } else {
 756            /* The slot starts after section.  */
 757            offset = 0;
 758            count = MIN(mem->memory_size, size - (mem->start_addr - start));
 759        }
 760        ret = kvm_log_clear_one_slot(mem, kml->as_id, offset, count);
 761        if (ret < 0) {
 762            break;
 763        }
 764    }
 765
 766    kvm_slots_unlock(kml);
 767
 768    return ret;
 769}
 770
 771static void kvm_coalesce_mmio_region(MemoryListener *listener,
  772                                     MemoryRegionSection *section,
 773                                     hwaddr start, hwaddr size)
 774{
 775    KVMState *s = kvm_state;
 776
 777    if (s->coalesced_mmio) {
 778        struct kvm_coalesced_mmio_zone zone;
 779
 780        zone.addr = start;
 781        zone.size = size;
 782        zone.pad = 0;
 783
 784        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
 785    }
 786}
 787
 788static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
  789                                       MemoryRegionSection *section,
 790                                       hwaddr start, hwaddr size)
 791{
 792    KVMState *s = kvm_state;
 793
 794    if (s->coalesced_mmio) {
 795        struct kvm_coalesced_mmio_zone zone;
 796
 797        zone.addr = start;
 798        zone.size = size;
 799        zone.pad = 0;
 800
 801        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
 802    }
 803}
 804
 805static void kvm_coalesce_pio_add(MemoryListener *listener,
 806                                MemoryRegionSection *section,
 807                                hwaddr start, hwaddr size)
 808{
 809    KVMState *s = kvm_state;
 810
 811    if (s->coalesced_pio) {
 812        struct kvm_coalesced_mmio_zone zone;
 813
 814        zone.addr = start;
 815        zone.size = size;
 816        zone.pio = 1;
 817
 818        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
 819    }
 820}
 821
 822static void kvm_coalesce_pio_del(MemoryListener *listener,
 823                                MemoryRegionSection *section,
 824                                hwaddr start, hwaddr size)
 825{
 826    KVMState *s = kvm_state;
 827
 828    if (s->coalesced_pio) {
 829        struct kvm_coalesced_mmio_zone zone;
 830
 831        zone.addr = start;
 832        zone.size = size;
 833        zone.pio = 1;
 834
 835        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
  836    }
 837}
 838
 839static MemoryListener kvm_coalesced_pio_listener = {
 840    .coalesced_io_add = kvm_coalesce_pio_add,
 841    .coalesced_io_del = kvm_coalesce_pio_del,
 842};
 843
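/* Query a KVM capability on the system fd (/dev/kvm); returns 0 when the
 * extension is unsupported (or the query fails), otherwise the value
 * reported by the kernel. */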
 844int kvm_check_extension(KVMState *s, unsigned int extension)
 845{
 846    int ret;
 847
 848    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
 849    if (ret < 0) {
 850        ret = 0;
 851    }
 852
 853    return ret;
 854}
 855
 856int kvm_vm_check_extension(KVMState *s, unsigned int extension)
 857{
 858    int ret;
 859
 860    ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
 861    if (ret < 0) {
 862        /* VM wide version not implemented, use global one instead */
 863        ret = kvm_check_extension(s, extension);
 864    }
 865
 866    return ret;
 867}
 868
 869static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
 870{
 871#if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN)
 872    /* The kernel expects ioeventfd values in HOST_WORDS_BIGENDIAN
 873     * endianness, but the memory core hands them in target endianness.
 874     * For example, PPC is always treated as big-endian even if running
 875     * on KVM and on PPC64LE.  Correct here.
 876     */
 877    switch (size) {
 878    case 2:
 879        val = bswap16(val);
 880        break;
 881    case 4:
 882        val = bswap32(val);
 883        break;
 884    }
 885#endif
 886    return val;
 887}
 888
 889static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
 890                                  bool assign, uint32_t size, bool datamatch)
 891{
 892    int ret;
 893    struct kvm_ioeventfd iofd = {
 894        .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
 895        .addr = addr,
 896        .len = size,
 897        .flags = 0,
 898        .fd = fd,
 899    };
 900
 901    trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size,
 902                                 datamatch);
 903    if (!kvm_enabled()) {
 904        return -ENOSYS;
 905    }
 906
 907    if (datamatch) {
 908        iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
 909    }
 910    if (!assign) {
 911        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
 912    }
 913
 914    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);
 915
 916    if (ret < 0) {
 917        return -errno;
 918    }
 919
 920    return 0;
 921}
 922
 923static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
 924                                 bool assign, uint32_t size, bool datamatch)
 925{
 926    struct kvm_ioeventfd kick = {
 927        .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
 928        .addr = addr,
 929        .flags = KVM_IOEVENTFD_FLAG_PIO,
 930        .len = size,
 931        .fd = fd,
 932    };
 933    int r;
 934    trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch);
 935    if (!kvm_enabled()) {
 936        return -ENOSYS;
 937    }
 938    if (datamatch) {
 939        kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
 940    }
 941    if (!assign) {
 942        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
 943    }
 944    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
 945    if (r < 0) {
 946        return r;
 947    }
 948    return 0;
 949}
 950
 951
 952static int kvm_check_many_ioeventfds(void)
 953{
 954    /* Userspace can use ioeventfd for io notification.  This requires a host
 955     * that supports eventfd(2) and an I/O thread; since eventfd does not
 956     * support SIGIO it cannot interrupt the vcpu.
 957     *
 958     * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
 959     * can avoid creating too many ioeventfds.
 960     */
 961#if defined(CONFIG_EVENTFD)
 962    int ioeventfds[7];
 963    int i, ret = 0;
 964    for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) {
 965        ioeventfds[i] = eventfd(0, EFD_CLOEXEC);
 966        if (ioeventfds[i] < 0) {
 967            break;
 968        }
 969        ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true);
 970        if (ret < 0) {
 971            close(ioeventfds[i]);
 972            break;
 973        }
 974    }
 975
 976    /* Decide whether many devices are supported or not */
 977    ret = i == ARRAY_SIZE(ioeventfds);
 978
 979    while (i-- > 0) {
 980        kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true);
 981        close(ioeventfds[i]);
 982    }
 983    return ret;
 984#else
 985    return 0;
 986#endif
 987}
 988
 989static const KVMCapabilityInfo *
 990kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
 991{
 992    while (list->name) {
 993        if (!kvm_check_extension(s, list->value)) {
 994            return list;
 995        }
 996        list++;
 997    }
 998    return NULL;
 999}
1000
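/*
 * Cap the size of a single KVM memslot.  Memory sections larger than
 * this are split across multiple slots (see kvm_set_phys_mem).  The
 * limit must be a multiple of the host page size.
 */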
1001void kvm_set_max_memslot_size(hwaddr max_slot_size)
1002{
1003    g_assert(
1004        ROUND_UP(max_slot_size, qemu_real_host_page_size) == max_slot_size
1005    );
1006    kvm_max_slot_size = max_slot_size;
1007}
1008
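/*
 * Register (@add = true) or unregister the KVM memslot(s) backing a
 * memory section, splitting it into chunks of at most kvm_max_slot_size.
 * When removing a slot that has dirty logging enabled, its dirty bitmap
 * is synced one last time before the slot is dropped.
 */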
1009static void kvm_set_phys_mem(KVMMemoryListener *kml,
1010                             MemoryRegionSection *section, bool add)
1011{
1012    KVMSlot *mem;
1013    int err;
1014    MemoryRegion *mr = section->mr;
1015    bool writeable = !mr->readonly && !mr->rom_device;
1016    hwaddr start_addr, size, slot_size;
1017    void *ram;
1018
1019    if (!memory_region_is_ram(mr)) {
1020        if (writeable || !kvm_readonly_mem_allowed) {
1021            return;
1022        } else if (!mr->romd_mode) {
1023            /* If the memory device is not in romd_mode, then we actually want
1024             * to remove the kvm memory slot so all accesses will trap. */
1025            add = false;
1026        }
1027    }
1028
1029    size = kvm_align_section(section, &start_addr);
1030    if (!size) {
1031        return;
1032    }
1033
1034    /* use aligned delta to align the ram address */
1035    ram = memory_region_get_ram_ptr(mr) + section->offset_within_region +
1036          (start_addr - section->offset_within_address_space);
1037
1038    kvm_slots_lock(kml);
1039
1040    if (!add) {
1041        do {
1042            slot_size = MIN(kvm_max_slot_size, size);
1043            mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
1044            if (!mem) {
1045                goto out;
1046            }
1047            if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1048                kvm_physical_sync_dirty_bitmap(kml, section);
1049            }
1050
1051            /* unregister the slot */
1052            g_free(mem->dirty_bmap);
1053            mem->dirty_bmap = NULL;
1054            mem->memory_size = 0;
1055            mem->flags = 0;
1056            err = kvm_set_user_memory_region(kml, mem, false);
1057            if (err) {
1058                fprintf(stderr, "%s: error unregistering slot: %s\n",
1059                        __func__, strerror(-err));
1060                abort();
1061            }
1062            start_addr += slot_size;
1063            size -= slot_size;
1064        } while (size);
1065        goto out;
1066    }
1067
1068    /* register the new slot */
1069    do {
1070        slot_size = MIN(kvm_max_slot_size, size);
1071        mem = kvm_alloc_slot(kml);
1072        mem->memory_size = slot_size;
1073        mem->start_addr = start_addr;
1074        mem->ram = ram;
1075        mem->flags = kvm_mem_flags(mr);
1076
1077        if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1078            /*
 1079             * Reallocate the dirty bitmap here so that it does not disappear
 1080             * in the middle of a migration.
1081             */
1082            kvm_memslot_init_dirty_bitmap(mem);
1083        }
1084        err = kvm_set_user_memory_region(kml, mem, true);
1085        if (err) {
1086            fprintf(stderr, "%s: error registering slot: %s\n", __func__,
1087                    strerror(-err));
1088            abort();
1089        }
1090        start_addr += slot_size;
1091        ram += slot_size;
1092        size -= slot_size;
1093    } while (size);
1094
1095out:
1096    kvm_slots_unlock(kml);
1097}
1098
1099static void kvm_region_add(MemoryListener *listener,
1100                           MemoryRegionSection *section)
1101{
1102    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1103
1104    memory_region_ref(section->mr);
1105    kvm_set_phys_mem(kml, section, true);
1106}
1107
1108static void kvm_region_del(MemoryListener *listener,
1109                           MemoryRegionSection *section)
1110{
1111    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1112
1113    kvm_set_phys_mem(kml, section, false);
1114    memory_region_unref(section->mr);
1115}
1116
1117static void kvm_log_sync(MemoryListener *listener,
1118                         MemoryRegionSection *section)
1119{
1120    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1121    int r;
1122
1123    kvm_slots_lock(kml);
1124    r = kvm_physical_sync_dirty_bitmap(kml, section);
1125    kvm_slots_unlock(kml);
1126    if (r < 0) {
1127        abort();
1128    }
1129}
1130
1131static void kvm_log_clear(MemoryListener *listener,
1132                          MemoryRegionSection *section)
1133{
1134    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1135    int r;
1136
1137    r = kvm_physical_log_clear(kml, section);
1138    if (r < 0) {
1139        error_report_once("%s: kvm log clear failed: mr=%s "
1140                          "offset=%"HWADDR_PRIx" size=%"PRIx64, __func__,
1141                          section->mr->name, section->offset_within_region,
1142                          int128_get64(section->size));
1143        abort();
1144    }
1145}
1146
1147static void kvm_mem_ioeventfd_add(MemoryListener *listener,
1148                                  MemoryRegionSection *section,
1149                                  bool match_data, uint64_t data,
1150                                  EventNotifier *e)
1151{
1152    int fd = event_notifier_get_fd(e);
1153    int r;
1154
1155    r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
1156                               data, true, int128_get64(section->size),
1157                               match_data);
1158    if (r < 0) {
1159        fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
1160                __func__, strerror(-r), -r);
1161        abort();
1162    }
1163}
1164
1165static void kvm_mem_ioeventfd_del(MemoryListener *listener,
1166                                  MemoryRegionSection *section,
1167                                  bool match_data, uint64_t data,
1168                                  EventNotifier *e)
1169{
1170    int fd = event_notifier_get_fd(e);
1171    int r;
1172
1173    r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
1174                               data, false, int128_get64(section->size),
1175                               match_data);
1176    if (r < 0) {
1177        fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
1178                __func__, strerror(-r), -r);
1179        abort();
1180    }
1181}
1182
1183static void kvm_io_ioeventfd_add(MemoryListener *listener,
1184                                 MemoryRegionSection *section,
1185                                 bool match_data, uint64_t data,
1186                                 EventNotifier *e)
1187{
1188    int fd = event_notifier_get_fd(e);
1189    int r;
1190
1191    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
1192                              data, true, int128_get64(section->size),
1193                              match_data);
1194    if (r < 0) {
1195        fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
1196                __func__, strerror(-r), -r);
1197        abort();
1198    }
1199}
1200
1201static void kvm_io_ioeventfd_del(MemoryListener *listener,
1202                                 MemoryRegionSection *section,
1203                                 bool match_data, uint64_t data,
1204                                 EventNotifier *e)
1205
1206{
1207    int fd = event_notifier_get_fd(e);
1208    int r;
1209
1210    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
1211                              data, false, int128_get64(section->size),
1212                              match_data);
1213    if (r < 0) {
1214        fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
1215                __func__, strerror(-r), -r);
1216        abort();
1217    }
1218}
1219
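/*
 * Initialize a KVMMemoryListener for address space @as: allocate its
 * slot array, wire up the region/log callbacks, register it with the
 * memory core, and record the mapping in s->as[] for "info mtree -f".
 */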
1220void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
1221                                  AddressSpace *as, int as_id)
1222{
1223    int i;
1224
1225    qemu_mutex_init(&kml->slots_lock);
1226    kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot));
1227    kml->as_id = as_id;
1228
1229    for (i = 0; i < s->nr_slots; i++) {
1230        kml->slots[i].slot = i;
1231    }
1232
1233    kml->listener.region_add = kvm_region_add;
1234    kml->listener.region_del = kvm_region_del;
1235    kml->listener.log_start = kvm_log_start;
1236    kml->listener.log_stop = kvm_log_stop;
1237    kml->listener.log_sync = kvm_log_sync;
1238    kml->listener.log_clear = kvm_log_clear;
1239    kml->listener.priority = 10;
1240
1241    memory_listener_register(&kml->listener, as);
1242
1243    for (i = 0; i < s->nr_as; ++i) {
1244        if (!s->as[i].as) {
1245            s->as[i].as = as;
1246            s->as[i].ml = kml;
1247            break;
1248        }
1249    }
1250}
1251
1252static MemoryListener kvm_io_listener = {
1253    .eventfd_add = kvm_io_ioeventfd_add,
1254    .eventfd_del = kvm_io_ioeventfd_del,
1255    .priority = 10,
1256};
1257
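/*
 * Raise or lower an interrupt line via the in-kernel irqchip.  Returns
 * the status reported by the kernel, or 1 when the older KVM_IRQ_LINE
 * ioctl (which reports no status) is in use.
 */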
1258int kvm_set_irq(KVMState *s, int irq, int level)
1259{
1260    struct kvm_irq_level event;
1261    int ret;
1262
1263    assert(kvm_async_interrupts_enabled());
1264
1265    event.level = level;
1266    event.irq = irq;
1267    ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
1268    if (ret < 0) {
1269        perror("kvm_set_irq");
1270        abort();
1271    }
1272
1273    return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
1274}
1275
1276#ifdef KVM_CAP_IRQ_ROUTING
1277typedef struct KVMMSIRoute {
1278    struct kvm_irq_routing_entry kroute;
1279    QTAILQ_ENTRY(KVMMSIRoute) entry;
1280} KVMMSIRoute;
1281
1282static void set_gsi(KVMState *s, unsigned int gsi)
1283{
1284    set_bit(gsi, s->used_gsi_bitmap);
1285}
1286
1287static void clear_gsi(KVMState *s, unsigned int gsi)
1288{
1289    clear_bit(gsi, s->used_gsi_bitmap);
1290}
1291
1292void kvm_init_irq_routing(KVMState *s)
1293{
1294    int gsi_count, i;
1295
1296    gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
1297    if (gsi_count > 0) {
1298        /* Round up so we can search ints using ffs */
1299        s->used_gsi_bitmap = bitmap_new(gsi_count);
1300        s->gsi_count = gsi_count;
1301    }
1302
1303    s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
1304    s->nr_allocated_irq_routes = 0;
1305
1306    if (!kvm_direct_msi_allowed) {
1307        for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
1308            QTAILQ_INIT(&s->msi_hashtab[i]);
1309        }
1310    }
1311
1312    kvm_arch_init_irq_routing(s);
1313}
1314
1315void kvm_irqchip_commit_routes(KVMState *s)
1316{
1317    int ret;
1318
1319    if (kvm_gsi_direct_mapping()) {
1320        return;
1321    }
1322
1323    if (!kvm_gsi_routing_enabled()) {
1324        return;
1325    }
1326
1327    s->irq_routes->flags = 0;
1328    trace_kvm_irqchip_commit_routes();
1329    ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
1330    assert(ret == 0);
1331}
1332
1333static void kvm_add_routing_entry(KVMState *s,
1334                                  struct kvm_irq_routing_entry *entry)
1335{
1336    struct kvm_irq_routing_entry *new;
1337    int n, size;
1338
1339    if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
1340        n = s->nr_allocated_irq_routes * 2;
1341        if (n < 64) {
1342            n = 64;
1343        }
1344        size = sizeof(struct kvm_irq_routing);
1345        size += n * sizeof(*new);
1346        s->irq_routes = g_realloc(s->irq_routes, size);
1347        s->nr_allocated_irq_routes = n;
1348    }
1349    n = s->irq_routes->nr++;
1350    new = &s->irq_routes->entries[n];
1351
1352    *new = *entry;
1353
1354    set_gsi(s, entry->gsi);
1355}
1356
1357static int kvm_update_routing_entry(KVMState *s,
1358                                    struct kvm_irq_routing_entry *new_entry)
1359{
1360    struct kvm_irq_routing_entry *entry;
1361    int n;
1362
1363    for (n = 0; n < s->irq_routes->nr; n++) {
1364        entry = &s->irq_routes->entries[n];
1365        if (entry->gsi != new_entry->gsi) {
1366            continue;
1367        }
1368
1369        if(!memcmp(entry, new_entry, sizeof *entry)) {
1370            return 0;
1371        }
1372
1373        *entry = *new_entry;
1374
1375        return 0;
1376    }
1377
1378    return -ESRCH;
1379}
1380
1381void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
1382{
1383    struct kvm_irq_routing_entry e = {};
1384
1385    assert(pin < s->gsi_count);
1386
1387    e.gsi = irq;
1388    e.type = KVM_IRQ_ROUTING_IRQCHIP;
1389    e.flags = 0;
1390    e.u.irqchip.irqchip = irqchip;
1391    e.u.irqchip.pin = pin;
1392    kvm_add_routing_entry(s, &e);
1393}
1394
1395void kvm_irqchip_release_virq(KVMState *s, int virq)
1396{
1397    struct kvm_irq_routing_entry *e;
1398    int i;
1399
1400    if (kvm_gsi_direct_mapping()) {
1401        return;
1402    }
1403
1404    for (i = 0; i < s->irq_routes->nr; i++) {
1405        e = &s->irq_routes->entries[i];
1406        if (e->gsi == virq) {
1407            s->irq_routes->nr--;
1408            *e = s->irq_routes->entries[s->irq_routes->nr];
1409        }
1410    }
1411    clear_gsi(s, virq);
1412    kvm_arch_release_virq_post(virq);
1413    trace_kvm_irqchip_release_virq(virq);
1414}
1415
1416void kvm_irqchip_add_change_notifier(Notifier *n)
1417{
1418    notifier_list_add(&kvm_irqchip_change_notifiers, n);
1419}
1420
1421void kvm_irqchip_remove_change_notifier(Notifier *n)
1422{
1423    notifier_remove(n);
1424}
1425
1426void kvm_irqchip_change_notify(void)
1427{
1428    notifier_list_notify(&kvm_irqchip_change_notifiers, NULL);
1429}
1430
1431static unsigned int kvm_hash_msi(uint32_t data)
1432{
1433    /* This is optimized for IA32 MSI layout. However, no other arch shall
1434     * repeat the mistake of not providing a direct MSI injection API. */
1435    return data & 0xff;
1436}
1437
1438static void kvm_flush_dynamic_msi_routes(KVMState *s)
1439{
1440    KVMMSIRoute *route, *next;
1441    unsigned int hash;
1442
1443    for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
1444        QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
1445            kvm_irqchip_release_virq(s, route->kroute.gsi);
1446            QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
1447            g_free(route);
1448        }
1449    }
1450}
1451
1452static int kvm_irqchip_get_virq(KVMState *s)
1453{
1454    int next_virq;
1455
1456    /*
1457     * PIC and IOAPIC share the first 16 GSI numbers, thus the available
1458     * GSI numbers are more than the number of IRQ route. Allocating a GSI
1459     * number can succeed even though a new route entry cannot be added.
1460     * When this happens, flush dynamic MSI entries to free IRQ route entries.
1461     */
1462    if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) {
1463        kvm_flush_dynamic_msi_routes(s);
1464    }
1465
1466    /* Return the lowest unused GSI in the bitmap */
1467    next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);
1468    if (next_virq >= s->gsi_count) {
1469        return -ENOSPC;
1470    } else {
1471        return next_virq;
1472    }
1473}
1474
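/* Find a previously installed dynamic MSI route matching @msg, or NULL.
 * Only used when direct MSI injection (kvm_direct_msi_allowed) is not
 * available. */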
1475static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg)
1476{
1477    unsigned int hash = kvm_hash_msi(msg.data);
1478    KVMMSIRoute *route;
1479
1480    QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) {
1481        if (route->kroute.u.msi.address_lo == (uint32_t)msg.address &&
1482            route->kroute.u.msi.address_hi == (msg.address >> 32) &&
1483            route->kroute.u.msi.data == le32_to_cpu(msg.data)) {
1484            return route;
1485        }
1486    }
1487    return NULL;
1488}
1489
1490int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
1491{
1492    struct kvm_msi msi;
1493    KVMMSIRoute *route;
1494
1495    if (kvm_direct_msi_allowed) {
1496        msi.address_lo = (uint32_t)msg.address;
1497        msi.address_hi = msg.address >> 32;
1498        msi.data = le32_to_cpu(msg.data);
1499        msi.flags = 0;
1500        memset(msi.pad, 0, sizeof(msi.pad));
1501
1502        return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
1503    }
1504
1505    route = kvm_lookup_msi_route(s, msg);
1506    if (!route) {
1507        int virq;
1508
1509        virq = kvm_irqchip_get_virq(s);
1510        if (virq < 0) {
1511            return virq;
1512        }
1513
1514        route = g_malloc0(sizeof(KVMMSIRoute));
1515        route->kroute.gsi = virq;
1516        route->kroute.type = KVM_IRQ_ROUTING_MSI;
1517        route->kroute.flags = 0;
1518        route->kroute.u.msi.address_lo = (uint32_t)msg.address;
1519        route->kroute.u.msi.address_hi = msg.address >> 32;
1520        route->kroute.u.msi.data = le32_to_cpu(msg.data);
1521
1522        kvm_add_routing_entry(s, &route->kroute);
1523        kvm_irqchip_commit_routes(s);
1524
1525        QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
1526                           entry);
1527    }
1528
1529    assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);
1530
1531    return kvm_set_irq(s, route->kroute.gsi, 1);
1532}
1533
1534int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
1535{
1536    struct kvm_irq_routing_entry kroute = {};
1537    int virq;
1538    MSIMessage msg = {0, 0};
1539
1540    if (pci_available && dev) {
1541        msg = pci_get_msi_message(dev, vector);
1542    }
1543
1544    if (kvm_gsi_direct_mapping()) {
1545        return kvm_arch_msi_data_to_gsi(msg.data);
1546    }
1547
1548    if (!kvm_gsi_routing_enabled()) {
1549        return -ENOSYS;
1550    }
1551
1552    virq = kvm_irqchip_get_virq(s);
1553    if (virq < 0) {
1554        return virq;
1555    }
1556
1557    kroute.gsi = virq;
1558    kroute.type = KVM_IRQ_ROUTING_MSI;
1559    kroute.flags = 0;
1560    kroute.u.msi.address_lo = (uint32_t)msg.address;
1561    kroute.u.msi.address_hi = msg.address >> 32;
1562    kroute.u.msi.data = le32_to_cpu(msg.data);
1563    if (pci_available && kvm_msi_devid_required()) {
1564        kroute.flags = KVM_MSI_VALID_DEVID;
1565        kroute.u.msi.devid = pci_requester_id(dev);
1566    }
1567    if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
1568        kvm_irqchip_release_virq(s, virq);
1569        return -EINVAL;
1570    }
1571
1572    trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
1573                                    vector, virq);
1574
1575    kvm_add_routing_entry(s, &kroute);
1576    kvm_arch_add_msi_route_post(&kroute, vector, dev);
1577    kvm_irqchip_commit_routes(s);
1578
1579    return virq;
1580}
1581
1582int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
1583                                 PCIDevice *dev)
1584{
1585    struct kvm_irq_routing_entry kroute = {};
1586
1587    if (kvm_gsi_direct_mapping()) {
1588        return 0;
1589    }
1590
1591    if (!kvm_irqchip_in_kernel()) {
1592        return -ENOSYS;
1593    }
1594
1595    kroute.gsi = virq;
1596    kroute.type = KVM_IRQ_ROUTING_MSI;
1597    kroute.flags = 0;
1598    kroute.u.msi.address_lo = (uint32_t)msg.address;
1599    kroute.u.msi.address_hi = msg.address >> 32;
1600    kroute.u.msi.data = le32_to_cpu(msg.data);
1601    if (pci_available && kvm_msi_devid_required()) {
1602        kroute.flags = KVM_MSI_VALID_DEVID;
1603        kroute.u.msi.devid = pci_requester_id(dev);
1604    }
1605    if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
1606        return -EINVAL;
1607    }
1608
1609    trace_kvm_irqchip_update_msi_route(virq);
1610
1611    return kvm_update_routing_entry(s, &kroute);
1612}
1613
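/*
 * Assign or deassign an irqfd: the kernel injects @virq whenever @fd is
 * signalled.  If @rfd is not -1 it is registered as the resample fd used
 * to re-arm level-triggered interrupts.
 */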
1614static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq,
1615                                    bool assign)
1616{
1617    struct kvm_irqfd irqfd = {
1618        .fd = fd,
1619        .gsi = virq,
1620        .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
1621    };
1622
1623    if (rfd != -1) {
1624        irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
1625        irqfd.resamplefd = rfd;
1626    }
1627
1628    if (!kvm_irqfds_enabled()) {
1629        return -ENOSYS;
1630    }
1631
1632    return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
1633}
1634
1635int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
1636{
1637    struct kvm_irq_routing_entry kroute = {};
1638    int virq;
1639
1640    if (!kvm_gsi_routing_enabled()) {
1641        return -ENOSYS;
1642    }
1643
1644    virq = kvm_irqchip_get_virq(s);
1645    if (virq < 0) {
1646        return virq;
1647    }
1648
1649    kroute.gsi = virq;
1650    kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER;
1651    kroute.flags = 0;
1652    kroute.u.adapter.summary_addr = adapter->summary_addr;
1653    kroute.u.adapter.ind_addr = adapter->ind_addr;
1654    kroute.u.adapter.summary_offset = adapter->summary_offset;
1655    kroute.u.adapter.ind_offset = adapter->ind_offset;
1656    kroute.u.adapter.adapter_id = adapter->adapter_id;
1657
1658    kvm_add_routing_entry(s, &kroute);
1659
1660    return virq;
1661}
1662
1663int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
1664{
1665    struct kvm_irq_routing_entry kroute = {};
1666    int virq;
1667
1668    if (!kvm_gsi_routing_enabled()) {
1669        return -ENOSYS;
1670    }
1671    if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) {
1672        return -ENOSYS;
1673    }
1674    virq = kvm_irqchip_get_virq(s);
1675    if (virq < 0) {
1676        return virq;
1677    }
1678
1679    kroute.gsi = virq;
1680    kroute.type = KVM_IRQ_ROUTING_HV_SINT;
1681    kroute.flags = 0;
1682    kroute.u.hv_sint.vcpu = vcpu;
1683    kroute.u.hv_sint.sint = sint;
1684
1685    kvm_add_routing_entry(s, &kroute);
1686    kvm_irqchip_commit_routes(s);
1687
1688    return virq;
1689}
1690
1691#else /* !KVM_CAP_IRQ_ROUTING */
1692
1693void kvm_init_irq_routing(KVMState *s)
1694{
1695}
1696
1697void kvm_irqchip_release_virq(KVMState *s, int virq)
1698{
1699}
1700
1701int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
1702{
1703    abort();
1704}
1705
1706int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
1707{
1708    return -ENOSYS;
1709}
1710
1711int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
1712{
1713    return -ENOSYS;
1714}
1715
1716int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
1717{
1718    return -ENOSYS;
1719}
1720
1721static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign)
1722{
1723    abort();
1724}
1725
1726int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
1727{
1728    return -ENOSYS;
1729}
1730#endif /* !KVM_CAP_IRQ_ROUTING */
1731
1732int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
1733                                       EventNotifier *rn, int virq)
1734{
1735    return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n),
1736           rn ? event_notifier_get_fd(rn) : -1, virq, true);
1737}
1738
1739int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
1740                                          int virq)
1741{
1742    return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq,
1743           false);
1744}
1745
1746int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
1747                                   EventNotifier *rn, qemu_irq irq)
1748{
1749    gpointer key, gsi;
1750    gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
1751
1752    if (!found) {
1753        return -ENXIO;
1754    }
1755    return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
1756}
1757
1758int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
1759                                      qemu_irq irq)
1760{
1761    gpointer key, gsi;
1762    gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
1763
1764    if (!found) {
1765        return -ENXIO;
1766    }
1767    return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
1768}
1769
1770void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
1771{
1772    g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
1773}
1774
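/*
 * Create the in-kernel irqchip if the host advertises one (enabling the
 * s390 flavour via KVM_CAP_S390_IRQCHIP when needed), then initialize
 * GSI routing and the qemu_irq -> GSI hash table.
 */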
1775static void kvm_irqchip_create(MachineState *machine, KVMState *s)
1776{
1777    int ret;
1778
1779    if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
1780        ;
1781    } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
1782        ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
1783        if (ret < 0) {
1784            fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
1785            exit(1);
1786        }
1787    } else {
1788        return;
1789    }
1790
1791    /* First probe and see if there's a arch-specific hook to create the
1792     * in-kernel irqchip for us */
1793    ret = kvm_arch_irqchip_create(machine, s);
1794    if (ret == 0) {
1795        if (machine_kernel_irqchip_split(machine)) {
1796            fprintf(stderr, "Split IRQ chip mode not supported.\n");
1797            exit(1);
1798        } else {
1799            ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
1800        }
1801    }
1802    if (ret < 0) {
1803        fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
1804        exit(1);
1805    }
1806
1807    kvm_kernel_irqchip = true;
1808    /* If we have an in-kernel IRQ chip then we must have asynchronous
1809     * interrupt delivery (though the reverse is not necessarily true)
1810     */
1811    kvm_async_interrupts_allowed = true;
1812    kvm_halt_in_kernel_allowed = true;
1813
1814    kvm_init_irq_routing(s);
1815
1816    s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
1817}
1818
1819/* Find the number of supported CPUs using the recommended
1820 * procedure from the kernel API documentation, to cope with
1821 * older kernels that may be missing capabilities.
1822 */
1823static int kvm_recommended_vcpus(KVMState *s)
1824{
1825    int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS);
1826    return (ret) ? ret : 4;
1827}
1828
1829static int kvm_max_vcpus(KVMState *s)
1830{
1831    int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
1832    return (ret) ? ret : kvm_recommended_vcpus(s);
1833}
1834
1835static int kvm_max_vcpu_id(KVMState *s)
1836{
1837    int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);
1838    return (ret) ? ret : kvm_max_vcpus(s);
1839}
1840
1841bool kvm_vcpu_id_is_valid(int vcpu_id)
1842{
1843    KVMState *s = KVM_STATE(current_machine->accelerator);
1844    return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
1845}
1846
1847static int kvm_init(MachineState *ms)
1848{
1849    MachineClass *mc = MACHINE_GET_CLASS(ms);
1850    static const char upgrade_note[] =
1851        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
1852        "(see http://sourceforge.net/projects/kvm).\n";
1853    struct {
1854        const char *name;
1855        int num;
1856    } num_cpus[] = {
1857        { "SMP",          ms->smp.cpus },
1858        { "hotpluggable", ms->smp.max_cpus },
1859        { NULL, }
1860    }, *nc = num_cpus;
1861    int soft_vcpus_limit, hard_vcpus_limit;
1862    KVMState *s;
1863    const KVMCapabilityInfo *missing_cap;
1864    int ret;
1865    int type = 0;
1866    const char *kvm_type;
1867
1868    s = KVM_STATE(ms->accelerator);
1869
1870    /*
1871     * On systems where the kernel can support different base page
1872     * sizes, host page size may be different from TARGET_PAGE_SIZE,
1873     * even with KVM.  TARGET_PAGE_SIZE is assumed to be the minimum
1874     * page size for the system though.
1875     */
1876    assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size);
1877
1878    s->sigmask_len = 8;
1879
1880#ifdef KVM_CAP_SET_GUEST_DEBUG
1881    QTAILQ_INIT(&s->kvm_sw_breakpoints);
1882#endif
1883    QLIST_INIT(&s->kvm_parked_vcpus);
1884    s->vmfd = -1;
1885    s->fd = qemu_open("/dev/kvm", O_RDWR);
1886    if (s->fd == -1) {
1887        fprintf(stderr, "Could not access KVM kernel module: %m\n");
1888        ret = -errno;
1889        goto err;
1890    }
1891
1892    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
1893    if (ret < KVM_API_VERSION) {
1894        if (ret >= 0) {
1895            ret = -EINVAL;
1896        }
1897        fprintf(stderr, "kvm version too old\n");
1898        goto err;
1899    }
1900
1901    if (ret > KVM_API_VERSION) {
1902        ret = -EINVAL;
1903        fprintf(stderr, "kvm version not supported\n");
1904        goto err;
1905    }
1906
1907    kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
1908    s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
1909
1910    /* If unspecified, use the default value */
1911    if (!s->nr_slots) {
1912        s->nr_slots = 32;
1913    }
1914
1915    s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE);
1916    if (s->nr_as <= 1) {
1917        s->nr_as = 1;
1918    }
1919    s->as = g_new0(struct KVMAs, s->nr_as);
1920
1921    kvm_type = qemu_opt_get(qemu_get_machine_opts(), "kvm-type");
1922    if (mc->kvm_type) {
1923        type = mc->kvm_type(ms, kvm_type);
1924    } else if (kvm_type) {
1925        ret = -EINVAL;
1926        fprintf(stderr, "Invalid argument kvm-type=%s\n", kvm_type);
1927        goto err;
1928    }
1929
1930    do {
1931        ret = kvm_ioctl(s, KVM_CREATE_VM, type);
1932    } while (ret == -EINTR);
1933
1934    if (ret < 0) {
1935        fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret,
1936                strerror(-ret));
1937
1938#ifdef TARGET_S390X
1939        if (ret == -EINVAL) {
1940            fprintf(stderr,
1941                    "Host kernel setup problem detected. Please verify:\n");
1942            fprintf(stderr, "- for kernels supporting the switch_amode or"
1943                    " user_mode parameters, whether\n");
1944            fprintf(stderr,
1945                    "  user space is running in primary address space\n");
1946            fprintf(stderr,
1947                    "- for kernels supporting the vm.allocate_pgste sysctl, "
1948                    "whether it is enabled\n");
1949        }
1950#endif
1951        goto err;
1952    }
1953
1954    s->vmfd = ret;
1955
1956    /* check the vcpu limits */
1957    soft_vcpus_limit = kvm_recommended_vcpus(s);
1958    hard_vcpus_limit = kvm_max_vcpus(s);
1959
1960    while (nc->name) {
1961        if (nc->num > soft_vcpus_limit) {
1962            warn_report("Number of %s cpus requested (%d) exceeds "
1963                        "the recommended cpus supported by KVM (%d)",
1964                        nc->name, nc->num, soft_vcpus_limit);
1965
1966            if (nc->num > hard_vcpus_limit) {
1967                fprintf(stderr, "Number of %s cpus requested (%d) exceeds "
1968                        "the maximum cpus supported by KVM (%d)\n",
1969                        nc->name, nc->num, hard_vcpus_limit);
1970                exit(1);
1971            }
1972        }
1973        nc++;
1974    }
1975
1976    missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
1977    if (!missing_cap) {
1978        missing_cap =
1979            kvm_check_extension_list(s, kvm_arch_required_capabilities);
1980    }
1981    if (missing_cap) {
1982        ret = -EINVAL;
1983        fprintf(stderr, "kvm does not support %s\n%s",
1984                missing_cap->name, upgrade_note);
1985        goto err;
1986    }
1987
1988    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
1989    s->coalesced_pio = s->coalesced_mmio &&
1990                       kvm_check_extension(s, KVM_CAP_COALESCED_PIO);
1991
1992    s->manual_dirty_log_protect =
1993        kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
1994    if (s->manual_dirty_log_protect) {
1995        ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0, 1);
1996        if (ret) {
1997            warn_report("Trying to enable KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 "
1998                        "but failed.  Falling back to the legacy mode. ");
1999            s->manual_dirty_log_protect = false;
2000        }
2001    }
2002
2003#ifdef KVM_CAP_VCPU_EVENTS
2004    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
2005#endif
2006
2007    s->robust_singlestep =
2008        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
2009
2010#ifdef KVM_CAP_DEBUGREGS
2011    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
2012#endif
2013
2014    s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);
2015
2016#ifdef KVM_CAP_IRQ_ROUTING
2017    kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
2018#endif
2019
2020    s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3);
2021
2022    s->irq_set_ioctl = KVM_IRQ_LINE;
2023    if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
2024        s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
2025    }
2026
2027    kvm_readonly_mem_allowed =
2028        (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
2029
2030    kvm_eventfds_allowed =
2031        (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0);
2032
2033    kvm_irqfds_allowed =
2034        (kvm_check_extension(s, KVM_CAP_IRQFD) > 0);
2035
2036    kvm_resamplefds_allowed =
2037        (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);
2038
2039    kvm_vm_attributes_allowed =
2040        (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);
2041
2042    kvm_ioeventfd_any_length_allowed =
2043        (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0);
2044
2045    kvm_state = s;
2046
2047    /*
2048     * If a memory encryption object is specified, initialize the memory
2049     * encryption context.
2050     */
2051    if (ms->memory_encryption) {
2052        kvm_state->memcrypt_handle = sev_guest_init(ms->memory_encryption);
2053        if (!kvm_state->memcrypt_handle) {
2054            ret = -1;
2055            goto err;
2056        }
2057
2058        kvm_state->memcrypt_encrypt_data = sev_encrypt_data;
2059    }
2060
2061    ret = kvm_arch_init(ms, s);
2062    if (ret < 0) {
2063        goto err;
2064    }
2065
2066    if (machine_kernel_irqchip_allowed(ms)) {
2067        kvm_irqchip_create(ms, s);
2068    }
2069
2070    if (kvm_eventfds_allowed) {
2071        s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
2072        s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
2073    }
2074    s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region;
2075    s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region;
2076
2077    kvm_memory_listener_register(s, &s->memory_listener,
2078                                 &address_space_memory, 0);
2079    memory_listener_register(&kvm_io_listener,
2080                             &address_space_io);
2081    memory_listener_register(&kvm_coalesced_pio_listener,
2082                             &address_space_io);
2083
2084    s->many_ioeventfds = kvm_check_many_ioeventfds();
2085
2086    s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
2087    if (!s->sync_mmu) {
2088        qemu_balloon_inhibit(true);
2089    }
2090
2091    return 0;
2092
2093err:
2094    assert(ret < 0);
2095    if (s->vmfd >= 0) {
2096        close(s->vmfd);
2097    }
2098    if (s->fd != -1) {
2099        close(s->fd);
2100    }
2101    g_free(s->memory_listener.slots);
2102
2103    return ret;
2104}
2105
2106void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
2107{
2108    s->sigmask_len = sigmask_len;
2109}
2110
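/*
 * Replay a KVM_EXIT_IO exit.  The data for port I/O lives inside the shared
 * kvm_run mapping at run->io.data_offset; for string instructions the kernel
 * batches run->io.count consecutive elements of run->io.size bytes, which is
 * why this helper walks the buffer element by element.
 */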
2111static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
2112                          int size, uint32_t count)
2113{
2114    int i;
2115    uint8_t *ptr = data;
2116
2117    for (i = 0; i < count; i++) {
2118        address_space_rw(&address_space_io, port, attrs,
2119                         ptr, size,
2120                         direction == KVM_EXIT_IO_OUT);
2121        ptr += size;
2122    }
2123}
2124
2125static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
2126{
2127    fprintf(stderr, "KVM internal error. Suberror: %d\n",
2128            run->internal.suberror);
2129
2130    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
2131        int i;
2132
2133        for (i = 0; i < run->internal.ndata; ++i) {
2134            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
2135                    i, (uint64_t)run->internal.data[i]);
2136        }
2137    }
2138    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
2139        fprintf(stderr, "emulation failure\n");
2140        if (!kvm_arch_stop_on_emulation_error(cpu)) {
2141            cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2142            return EXCP_INTERRUPT;
2143        }
2144    }
2145    /* FIXME: Should trigger a QMP message to let management know
2146     * something went wrong.
2147     */
2148    return -1;
2149}
2150
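/*
 * Drain the coalesced MMIO/PIO ring shared with the kernel: KVM produces
 * entries at ring->last, QEMU consumes at ring->first.  The smp_wmb() in the
 * loop ensures each entry has been fully replayed before the slot is handed
 * back to the kernel by advancing ring->first.
 */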
2151void kvm_flush_coalesced_mmio_buffer(void)
2152{
2153    KVMState *s = kvm_state;
2154
2155    if (s->coalesced_flush_in_progress) {
2156        return;
2157    }
2158
2159    s->coalesced_flush_in_progress = true;
2160
2161    if (s->coalesced_mmio_ring) {
2162        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
2163        while (ring->first != ring->last) {
2164            struct kvm_coalesced_mmio *ent;
2165
2166            ent = &ring->coalesced_mmio[ring->first];
2167
2168            if (ent->pio == 1) {
2169                address_space_rw(&address_space_io, ent->phys_addr,
2170                                 MEMTXATTRS_UNSPECIFIED, ent->data,
2171                                 ent->len, true);
2172            } else {
2173                cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
2174            }
2175            smp_wmb();
2176            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
2177        }
2178    }
2179
2180    s->coalesced_flush_in_progress = false;
2181}
2182
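/*
 * Register synchronization protocol: cpu->vcpu_dirty set means the QEMU-side
 * copy of the vcpu registers is current (and possibly modified), so it must
 * be written back with kvm_arch_put_registers() before the next KVM_RUN;
 * when it is clear, the in-kernel state is authoritative and has to be
 * fetched with kvm_arch_get_registers() before it can be inspected.
 */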
2183static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
2184{
2185    if (!cpu->vcpu_dirty) {
2186        kvm_arch_get_registers(cpu);
2187        cpu->vcpu_dirty = true;
2188    }
2189}
2190
2191void kvm_cpu_synchronize_state(CPUState *cpu)
2192{
2193    if (!cpu->vcpu_dirty) {
2194        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
2195    }
2196}
2197
2198static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
2199{
2200    kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
2201    cpu->vcpu_dirty = false;
2202}
2203
2204void kvm_cpu_synchronize_post_reset(CPUState *cpu)
2205{
2206    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
2207}
2208
2209static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
2210{
2211    kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
2212    cpu->vcpu_dirty = false;
2213}
2214
2215void kvm_cpu_synchronize_post_init(CPUState *cpu)
2216{
2217    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
2218}
2219
2220static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
2221{
2222    cpu->vcpu_dirty = true;
2223}
2224
2225void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
2226{
2227    run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
2228}
2229
2230#ifdef KVM_HAVE_MCE_INJECTION
2231static __thread void *pending_sigbus_addr;
2232static __thread int pending_sigbus_code;
2233static __thread bool have_sigbus_pending;
2234#endif
2235
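/*
 * Kicking a vcpu out of KVM_RUN: with KVM_CAP_IMMEDIATE_EXIT we set
 * run->immediate_exit, which makes KVM_RUN return -EINTR without entering
 * the guest; SIG_IPI is left unblocked so an in-flight KVM_RUN is
 * interrupted as well.  Without the capability, SIG_IPI is blocked in user
 * mode, unblocked only while inside KVM_RUN via KVM_SET_SIGNAL_MASK, and
 * pending instances are drained by kvm_eat_signals() below.
 */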
2236static void kvm_cpu_kick(CPUState *cpu)
2237{
2238    atomic_set(&cpu->kvm_run->immediate_exit, 1);
2239}
2240
2241static void kvm_cpu_kick_self(void)
2242{
2243    if (kvm_immediate_exit) {
2244        kvm_cpu_kick(current_cpu);
2245    } else {
2246        qemu_cpu_kick_self();
2247    }
2248}
2249
2250static void kvm_eat_signals(CPUState *cpu)
2251{
2252    struct timespec ts = { 0, 0 };
2253    siginfo_t siginfo;
2254    sigset_t waitset;
2255    sigset_t chkset;
2256    int r;
2257
2258    if (kvm_immediate_exit) {
2259        atomic_set(&cpu->kvm_run->immediate_exit, 0);
2260        /* Write kvm_run->immediate_exit before the cpu->exit_request
2261         * write in kvm_cpu_exec.
2262         */
2263        smp_wmb();
2264        return;
2265    }
2266
2267    sigemptyset(&waitset);
2268    sigaddset(&waitset, SIG_IPI);
2269
2270    do {
2271        r = sigtimedwait(&waitset, &siginfo, &ts);
2272        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
2273            perror("sigtimedwait");
2274            exit(1);
2275        }
2276
2277        r = sigpending(&chkset);
2278        if (r == -1) {
2279            perror("sigpending");
2280            exit(1);
2281        }
2282    } while (sigismember(&chkset, SIG_IPI));
2283}
2284
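/*
 * Main vcpu run loop: push dirty register state to the kernel, enter
 * KVM_RUN with the BQL dropped, then dispatch on run->exit_reason until a
 * handler requests a return to the main loop (ret != 0).  A negative ret
 * stops the VM in RUN_STATE_INTERNAL_ERROR.
 */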
2285int kvm_cpu_exec(CPUState *cpu)
2286{
2287    struct kvm_run *run = cpu->kvm_run;
2288    int ret, run_ret;
2289
2290    DPRINTF("kvm_cpu_exec()\n");
2291
2292    if (kvm_arch_process_async_events(cpu)) {
2293        atomic_set(&cpu->exit_request, 0);
2294        return EXCP_HLT;
2295    }
2296
2297    qemu_mutex_unlock_iothread();
2298    cpu_exec_start(cpu);
2299
2300    do {
2301        MemTxAttrs attrs;
2302
2303        if (cpu->vcpu_dirty) {
2304            kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
2305            cpu->vcpu_dirty = false;
2306        }
2307
2308        kvm_arch_pre_run(cpu, run);
2309        if (atomic_read(&cpu->exit_request)) {
2310            DPRINTF("interrupt exit requested\n");
2311            /*
2312             * KVM requires us to reenter the kernel after IO exits to complete
2313             * instruction emulation. This self-signal will ensure that we
2314             * leave ASAP again.
2315             */
2316            kvm_cpu_kick_self();
2317        }
2318
2319        /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
2320         * Matching barrier in kvm_eat_signals.
2321         */
2322        smp_rmb();
2323
2324        run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
2325
2326        attrs = kvm_arch_post_run(cpu, run);
2327
2328#ifdef KVM_HAVE_MCE_INJECTION
2329        if (unlikely(have_sigbus_pending)) {
2330            qemu_mutex_lock_iothread();
2331            kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
2332                                    pending_sigbus_addr);
2333            have_sigbus_pending = false;
2334            qemu_mutex_unlock_iothread();
2335        }
2336#endif
2337
2338        if (run_ret < 0) {
2339            if (run_ret == -EINTR || run_ret == -EAGAIN) {
2340                DPRINTF("io window exit\n");
2341                kvm_eat_signals(cpu);
2342                ret = EXCP_INTERRUPT;
2343                break;
2344            }
2345            fprintf(stderr, "error: kvm run failed %s\n",
2346                    strerror(-run_ret));
2347#ifdef TARGET_PPC
2348            if (run_ret == -EBUSY) {
2349                fprintf(stderr,
2350                        "This is probably because your SMT is enabled.\n"
2351                        "VCPU can only run on primary threads with all "
2352                        "secondary threads offline.\n");
2353            }
2354#endif
2355            ret = -1;
2356            break;
2357        }
2358
2359        trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
2360        switch (run->exit_reason) {
2361        case KVM_EXIT_IO:
2362            DPRINTF("handle_io\n");
2363            /* Called outside BQL */
2364            kvm_handle_io(run->io.port, attrs,
2365                          (uint8_t *)run + run->io.data_offset,
2366                          run->io.direction,
2367                          run->io.size,
2368                          run->io.count);
2369            ret = 0;
2370            break;
2371        case KVM_EXIT_MMIO:
2372            DPRINTF("handle_mmio\n");
2373            /* Called outside BQL */
2374            address_space_rw(&address_space_memory,
2375                             run->mmio.phys_addr, attrs,
2376                             run->mmio.data,
2377                             run->mmio.len,
2378                             run->mmio.is_write);
2379            ret = 0;
2380            break;
2381        case KVM_EXIT_IRQ_WINDOW_OPEN:
2382            DPRINTF("irq_window_open\n");
2383            ret = EXCP_INTERRUPT;
2384            break;
2385        case KVM_EXIT_SHUTDOWN:
2386            DPRINTF("shutdown\n");
2387            qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
2388            ret = EXCP_INTERRUPT;
2389            break;
2390        case KVM_EXIT_UNKNOWN:
2391            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
2392                    (uint64_t)run->hw.hardware_exit_reason);
2393            ret = -1;
2394            break;
2395        case KVM_EXIT_INTERNAL_ERROR:
2396            ret = kvm_handle_internal_error(cpu, run);
2397            break;
2398        case KVM_EXIT_SYSTEM_EVENT:
2399            switch (run->system_event.type) {
2400            case KVM_SYSTEM_EVENT_SHUTDOWN:
2401                qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
2402                ret = EXCP_INTERRUPT;
2403                break;
2404            case KVM_SYSTEM_EVENT_RESET:
2405                qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
2406                ret = EXCP_INTERRUPT;
2407                break;
2408            case KVM_SYSTEM_EVENT_CRASH:
2409                kvm_cpu_synchronize_state(cpu);
2410                qemu_mutex_lock_iothread();
2411                qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2412                qemu_mutex_unlock_iothread();
2413                ret = 0;
2414                break;
2415            default:
2416                DPRINTF("kvm_arch_handle_exit\n");
2417                ret = kvm_arch_handle_exit(cpu, run);
2418                break;
2419            }
2420            break;
2421        default:
2422            DPRINTF("kvm_arch_handle_exit\n");
2423            ret = kvm_arch_handle_exit(cpu, run);
2424            break;
2425        }
2426    } while (ret == 0);
2427
2428    cpu_exec_end(cpu);
2429    qemu_mutex_lock_iothread();
2430
2431    if (ret < 0) {
2432        cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2433        vm_stop(RUN_STATE_INTERNAL_ERROR);
2434    }
2435
2436    atomic_set(&cpu->exit_request, 0);
2437    return ret;
2438}
2439
2440int kvm_ioctl(KVMState *s, int type, ...)
2441{
2442    int ret;
2443    void *arg;
2444    va_list ap;
2445
2446    va_start(ap, type);
2447    arg = va_arg(ap, void *);
2448    va_end(ap);
2449
2450    trace_kvm_ioctl(type, arg);
2451    ret = ioctl(s->fd, type, arg);
2452    if (ret == -1) {
2453        ret = -errno;
2454    }
2455    return ret;
2456}
2457
2458int kvm_vm_ioctl(KVMState *s, int type, ...)
2459{
2460    int ret;
2461    void *arg;
2462    va_list ap;
2463
2464    va_start(ap, type);
2465    arg = va_arg(ap, void *);
2466    va_end(ap);
2467
2468    trace_kvm_vm_ioctl(type, arg);
2469    ret = ioctl(s->vmfd, type, arg);
2470    if (ret == -1) {
2471        ret = -errno;
2472    }
2473    return ret;
2474}
2475
2476int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
2477{
2478    int ret;
2479    void *arg;
2480    va_list ap;
2481
2482    va_start(ap, type);
2483    arg = va_arg(ap, void *);
2484    va_end(ap);
2485
2486    trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
2487    ret = ioctl(cpu->kvm_fd, type, arg);
2488    if (ret == -1) {
2489        ret = -errno;
2490    }
2491    return ret;
2492}
2493
2494int kvm_device_ioctl(int fd, int type, ...)
2495{
2496    int ret;
2497    void *arg;
2498    va_list ap;
2499
2500    va_start(ap, type);
2501    arg = va_arg(ap, void *);
2502    va_end(ap);
2503
2504    trace_kvm_device_ioctl(fd, type, arg);
2505    ret = ioctl(fd, type, arg);
2506    if (ret == -1) {
2507        ret = -errno;
2508    }
2509    return ret;
2510}
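
/*
 * The four wrappers above share one pattern: the optional pointer argument
 * is forwarded to ioctl() and failures are folded into a negative errno
 * value.  A minimal usage sketch (the specific request here is only an
 * example of the calling convention):
 *
 *     struct kvm_mp_state mp_state;
 *     int r = kvm_vcpu_ioctl(cpu, KVM_GET_MP_STATE, &mp_state);
 *     if (r < 0) {
 *         fprintf(stderr, "KVM_GET_MP_STATE failed: %s\n", strerror(-r));
 *     }
 */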
2511
2512int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
2513{
2514    int ret;
2515    struct kvm_device_attr attribute = {
2516        .group = group,
2517        .attr = attr,
2518    };
2519
2520    if (!kvm_vm_attributes_allowed) {
2521        return 0;
2522    }
2523
2524    ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
2525    /* kvm returns 0 on success for HAS_DEVICE_ATTR */
2526    return ret ? 0 : 1;
2527}
2528
2529int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
2530{
2531    struct kvm_device_attr attribute = {
2532        .group = group,
2533        .attr = attr,
2534        .flags = 0,
2535    };
2536
2537    return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
2538}
2539
2540int kvm_device_access(int fd, int group, uint64_t attr,
2541                      void *val, bool write, Error **errp)
2542{
2543    struct kvm_device_attr kvmattr;
2544    int err;
2545
2546    kvmattr.flags = 0;
2547    kvmattr.group = group;
2548    kvmattr.attr = attr;
2549    kvmattr.addr = (uintptr_t)val;
2550
2551    err = kvm_device_ioctl(fd,
2552                           write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
2553                           &kvmattr);
2554    if (err < 0) {
2555        error_setg_errno(errp, -err,
2556                         "KVM_%s_DEVICE_ATTR failed: Group %d "
2557                         "attr 0x%016" PRIx64,
2558                         write ? "SET" : "GET", group, attr);
2559    }
2560    return err;
2561}
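
/*
 * Usage sketch for kvm_device_access() (dev_fd, MY_GROUP and MY_ATTR are
 * hypothetical placeholders for a real in-kernel device and its attribute):
 *
 *     uint64_t val;
 *     Error *err = NULL;
 *
 *     if (kvm_device_access(dev_fd, MY_GROUP, MY_ATTR, &val, false, &err) < 0) {
 *         error_report_err(err);
 *     }
 *
 * Passing write == true stores *val into the attribute instead of reading it.
 */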
2562
2563bool kvm_has_sync_mmu(void)
2564{
2565    return kvm_state->sync_mmu;
2566}
2567
2568int kvm_has_vcpu_events(void)
2569{
2570    return kvm_state->vcpu_events;
2571}
2572
2573int kvm_has_robust_singlestep(void)
2574{
2575    return kvm_state->robust_singlestep;
2576}
2577
2578int kvm_has_debugregs(void)
2579{
2580    return kvm_state->debugregs;
2581}
2582
2583int kvm_max_nested_state_length(void)
2584{
2585    return kvm_state->max_nested_state_len;
2586}
2587
2588int kvm_has_many_ioeventfds(void)
2589{
2590    if (!kvm_enabled()) {
2591        return 0;
2592    }
2593    return kvm_state->many_ioeventfds;
2594}
2595
2596int kvm_has_gsi_routing(void)
2597{
2598#ifdef KVM_CAP_IRQ_ROUTING
2599    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
2600#else
2601    return false;
2602#endif
2603}
2604
2605int kvm_has_intx_set_mask(void)
2606{
2607    return kvm_state->intx_set_mask;
2608}
2609
2610bool kvm_arm_supports_user_irq(void)
2611{
2612    return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
2613}
2614
2615#ifdef KVM_CAP_SET_GUEST_DEBUG
2616struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
2617                                                 target_ulong pc)
2618{
2619    struct kvm_sw_breakpoint *bp;
2620
2621    QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
2622        if (bp->pc == pc) {
2623            return bp;
2624        }
2625    }
2626    return NULL;
2627}
2628
2629int kvm_sw_breakpoints_active(CPUState *cpu)
2630{
2631    return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
2632}
2633
2634struct kvm_set_guest_debug_data {
2635    struct kvm_guest_debug dbg;
2636    int err;
2637};
2638
2639static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
2640{
2641    struct kvm_set_guest_debug_data *dbg_data =
2642        (struct kvm_set_guest_debug_data *) data.host_ptr;
2643
2644    dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
2645                                   &dbg_data->dbg);
2646}
2647
2648int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
2649{
2650    struct kvm_set_guest_debug_data data;
2651
2652    data.dbg.control = reinject_trap;
2653
2654    if (cpu->singlestep_enabled) {
2655        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
2656    }
2657    kvm_arch_update_guest_debug(cpu, &data.dbg);
2658
2659    run_on_cpu(cpu, kvm_invoke_set_guest_debug,
2660               RUN_ON_CPU_HOST_PTR(&data));
2661    return data.err;
2662}
2663
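/*
 * Breakpoint bookkeeping: software breakpoints are reference-counted per VM
 * in kvm_sw_breakpoints and installed by kvm_arch_insert_sw_breakpoint()
 * (typically by patching a breakpoint instruction into guest memory), while
 * hardware breakpoints are tracked entirely by the architecture code.  Any
 * change must be followed by kvm_update_guest_debug() on every vcpu so the
 * new KVM_SET_GUEST_DEBUG state reaches the kernel, which is what the
 * CPU_FOREACH() loops below do.
 */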
2664int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
2665                          target_ulong len, int type)
2666{
2667    struct kvm_sw_breakpoint *bp;
2668    int err;
2669
2670    if (type == GDB_BREAKPOINT_SW) {
2671        bp = kvm_find_sw_breakpoint(cpu, addr);
2672        if (bp) {
2673            bp->use_count++;
2674            return 0;
2675        }
2676
2677        bp = g_malloc(sizeof(struct kvm_sw_breakpoint));
2678        bp->pc = addr;
2679        bp->use_count = 1;
2680        err = kvm_arch_insert_sw_breakpoint(cpu, bp);
2681        if (err) {
2682            g_free(bp);
2683            return err;
2684        }
2685
2686        QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
2687    } else {
2688        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
2689        if (err) {
2690            return err;
2691        }
2692    }
2693
2694    CPU_FOREACH(cpu) {
2695        err = kvm_update_guest_debug(cpu, 0);
2696        if (err) {
2697            return err;
2698        }
2699    }
2700    return 0;
2701}
2702
2703int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
2704                          target_ulong len, int type)
2705{
2706    struct kvm_sw_breakpoint *bp;
2707    int err;
2708
2709    if (type == GDB_BREAKPOINT_SW) {
2710        bp = kvm_find_sw_breakpoint(cpu, addr);
2711        if (!bp) {
2712            return -ENOENT;
2713        }
2714
2715        if (bp->use_count > 1) {
2716            bp->use_count--;
2717            return 0;
2718        }
2719
2720        err = kvm_arch_remove_sw_breakpoint(cpu, bp);
2721        if (err) {
2722            return err;
2723        }
2724
2725        QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
2726        g_free(bp);
2727    } else {
2728        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
2729        if (err) {
2730            return err;
2731        }
2732    }
2733
2734    CPU_FOREACH(cpu) {
2735        err = kvm_update_guest_debug(cpu, 0);
2736        if (err) {
2737            return err;
2738        }
2739    }
2740    return 0;
2741}
2742
2743void kvm_remove_all_breakpoints(CPUState *cpu)
2744{
2745    struct kvm_sw_breakpoint *bp, *next;
2746    KVMState *s = cpu->kvm_state;
2747    CPUState *tmpcpu;
2748
2749    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
2750        if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
2751            /* Try harder to find a CPU that currently sees the breakpoint. */
2752            CPU_FOREACH(tmpcpu) {
2753                if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
2754                    break;
2755                }
2756            }
2757        }
2758        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
2759        g_free(bp);
2760    }
2761    kvm_arch_remove_all_hw_breakpoints();
2762
2763    CPU_FOREACH(cpu) {
2764        kvm_update_guest_debug(cpu, 0);
2765    }
2766}
2767
2768#else /* !KVM_CAP_SET_GUEST_DEBUG */
2769
2770int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
2771{
2772    return -EINVAL;
2773}
2774
2775int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
2776                          target_ulong len, int type)
2777{
2778    return -EINVAL;
2779}
2780
2781int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
2782                          target_ulong len, int type)
2783{
2784    return -EINVAL;
2785}
2786
2787void kvm_remove_all_breakpoints(CPUState *cpu)
2788{
2789}
2790#endif /* !KVM_CAP_SET_GUEST_DEBUG */
2791
2792static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
2793{
2794    KVMState *s = kvm_state;
2795    struct kvm_signal_mask *sigmask;
2796    int r;
2797
2798    sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));
2799
2800    sigmask->len = s->sigmask_len;
2801    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
2802    r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
2803    g_free(sigmask);
2804
2805    return r;
2806}
2807
2808static void kvm_ipi_signal(int sig)
2809{
2810    if (current_cpu) {
2811        assert(kvm_immediate_exit);
2812        kvm_cpu_kick(current_cpu);
2813    }
2814}
2815
2816void kvm_init_cpu_signals(CPUState *cpu)
2817{
2818    int r;
2819    sigset_t set;
2820    struct sigaction sigact;
2821
2822    memset(&sigact, 0, sizeof(sigact));
2823    sigact.sa_handler = kvm_ipi_signal;
2824    sigaction(SIG_IPI, &sigact, NULL);
2825
2826    pthread_sigmask(SIG_BLOCK, NULL, &set);
2827#if defined KVM_HAVE_MCE_INJECTION
2828    sigdelset(&set, SIGBUS);
2829    pthread_sigmask(SIG_SETMASK, &set, NULL);
2830#endif
2831    sigdelset(&set, SIG_IPI);
2832    if (kvm_immediate_exit) {
2833        r = pthread_sigmask(SIG_SETMASK, &set, NULL);
2834    } else {
2835        r = kvm_set_signal_mask(cpu, &set);
2836    }
2837    if (r) {
2838        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
2839        exit(1);
2840    }
2841}
2842
2843/* Called asynchronously in VCPU thread.  */
2844int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
2845{
2846#ifdef KVM_HAVE_MCE_INJECTION
2847    if (have_sigbus_pending) {
2848        return 1;
2849    }
2850    have_sigbus_pending = true;
2851    pending_sigbus_addr = addr;
2852    pending_sigbus_code = code;
2853    atomic_set(&cpu->exit_request, 1);
2854    return 0;
2855#else
2856    return 1;
2857#endif
2858}
2859
2860/* Called synchronously (via signalfd) in main thread.  */
2861int kvm_on_sigbus(int code, void *addr)
2862{
2863#ifdef KVM_HAVE_MCE_INJECTION
2864    /* Action required MCE kills the process if SIGBUS is blocked.  Because
2865     * that's what happens in the I/O thread, where we handle MCE via signalfd,
2866     * we can only get action optional here.
2867     */
2868    assert(code != BUS_MCEERR_AR);
2869    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
2870    return 0;
2871#else
2872    return 1;
2873#endif
2874}
2875
2876int kvm_create_device(KVMState *s, uint64_t type, bool test)
2877{
2878    int ret;
2879    struct kvm_create_device create_dev;
2880
2881    create_dev.type = type;
2882    create_dev.fd = -1;
2883    create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;
2884
2885    if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
2886        return -ENOTSUP;
2887    }
2888
2889    ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
2890    if (ret) {
2891        return ret;
2892    }
2893
2894    return test ? 0 : create_dev.fd;
2895}
2896
2897bool kvm_device_supported(int vmfd, uint64_t type)
2898{
2899    struct kvm_create_device create_dev = {
2900        .type = type,
2901        .fd = -1,
2902        .flags = KVM_CREATE_DEVICE_TEST,
2903    };
2904
2905    if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
2906        return false;
2907    }
2908
2909    return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
2910}
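
/*
 * Typical creation pattern (sketch; MY_DEV_TYPE stands for a real device
 * type such as KVM_DEV_TYPE_VFIO or an in-kernel interrupt controller):
 *
 *     if (kvm_create_device(kvm_state, MY_DEV_TYPE, true) == 0) {
 *         int dev_fd = kvm_create_device(kvm_state, MY_DEV_TYPE, false);
 *         ...
 *     }
 *
 * The first call only probes for support (KVM_CREATE_DEVICE_TEST); the
 * second actually creates the device and returns its file descriptor.
 */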
2911
2912int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
2913{
2914    struct kvm_one_reg reg;
2915    int r;
2916
2917    reg.id = id;
2918    reg.addr = (uintptr_t) source;
2919    r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
2920    if (r) {
2921        trace_kvm_failed_reg_set(id, strerror(-r));
2922    }
2923    return r;
2924}
2925
2926int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
2927{
2928    struct kvm_one_reg reg;
2929    int r;
2930
2931    reg.id = id;
2932    reg.addr = (uintptr_t) target;
2933    r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
2934    if (r) {
2935        trace_kvm_failed_reg_get(id, strerror(-r));
2936    }
2937    return r;
2938}
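
/*
 * Minimal sketch of the ONE_REG accessors above (MY_REG_ID is a
 * hypothetical placeholder; real ids use the architecture-specific
 * KVM_REG_* encodings):
 *
 *     uint64_t value = 0;
 *
 *     if (kvm_get_one_reg(cs, MY_REG_ID, &value) == 0) {
 *         value |= 1;
 *         kvm_set_one_reg(cs, MY_REG_ID, &value);
 *     }
 */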
2939
2940static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as,
2941                                 hwaddr start_addr, hwaddr size)
2942{
2943    KVMState *kvm = KVM_STATE(ms->accelerator);
2944    int i;
2945
2946    for (i = 0; i < kvm->nr_as; ++i) {
2947        if (kvm->as[i].as == as && kvm->as[i].ml) {
2948            size = MIN(kvm_max_slot_size, size);
2949            return NULL != kvm_lookup_matching_slot(kvm->as[i].ml,
2950                                                    start_addr, size);
2951        }
2952    }
2953
2954    return false;
2955}
2956
2957static void kvm_accel_class_init(ObjectClass *oc, void *data)
2958{
2959    AccelClass *ac = ACCEL_CLASS(oc);
2960    ac->name = "KVM";
2961    ac->init_machine = kvm_init;
2962    ac->has_memory = kvm_accel_has_memory;
2963    ac->allowed = &kvm_allowed;
2964}
2965
2966static const TypeInfo kvm_accel_type = {
2967    .name = TYPE_KVM_ACCEL,
2968    .parent = TYPE_ACCEL,
2969    .class_init = kvm_accel_class_init,
2970    .instance_size = sizeof(KVMState),
2971};
2972
2973static void kvm_type_init(void)
2974{
2975    type_register_static(&kvm_accel_type);
2976}
2977
2978type_init(kvm_type_init);
2979