qemu/accel/kvm/kvm-all.c
   1/*
   2 * QEMU KVM support
   3 *
   4 * Copyright IBM, Corp. 2008
   5 *           Red Hat, Inc. 2008
   6 *
   7 * Authors:
   8 *  Anthony Liguori   <aliguori@us.ibm.com>
   9 *  Glauber Costa     <gcosta@redhat.com>
  10 *
  11 * This work is licensed under the terms of the GNU GPL, version 2 or later.
  12 * See the COPYING file in the top-level directory.
  13 *
  14 */
  15
  16#include "qemu/osdep.h"
  17#include <sys/ioctl.h>
  18#include <poll.h>
  19
  20#include <linux/kvm.h>
  21
  22#include "qemu/atomic.h"
  23#include "qemu/option.h"
  24#include "qemu/config-file.h"
  25#include "qemu/error-report.h"
  26#include "qapi/error.h"
  27#include "hw/pci/msi.h"
  28#include "hw/pci/msix.h"
  29#include "hw/s390x/adapter.h"
  30#include "exec/gdbstub.h"
  31#include "sysemu/kvm_int.h"
  32#include "sysemu/runstate.h"
  33#include "sysemu/cpus.h"
  34#include "qemu/bswap.h"
  35#include "exec/memory.h"
  36#include "exec/ram_addr.h"
  37#include "qemu/event_notifier.h"
  38#include "qemu/main-loop.h"
  39#include "trace.h"
  40#include "hw/irq.h"
  41#include "qapi/visitor.h"
  42#include "qapi/qapi-types-common.h"
  43#include "qapi/qapi-visit-common.h"
  44#include "sysemu/reset.h"
  45#include "qemu/guest-random.h"
  46#include "sysemu/hw_accel.h"
  47#include "kvm-cpus.h"
  48
  49#include "hw/boards.h"
  50
  51/* This check must be after config-host.h is included */
  52#ifdef CONFIG_EVENTFD
  53#include <sys/eventfd.h>
  54#endif
  55
  56/* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
  57 * need to use the real host PAGE_SIZE, as that's what KVM will use.
  58 */
  59#ifdef PAGE_SIZE
  60#undef PAGE_SIZE
  61#endif
  62#define PAGE_SIZE qemu_real_host_page_size
  63
  64//#define DEBUG_KVM
  65
  66#ifdef DEBUG_KVM
  67#define DPRINTF(fmt, ...) \
  68    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
  69#else
  70#define DPRINTF(fmt, ...) \
  71    do { } while (0)
  72#endif
  73
  74#define KVM_MSI_HASHTAB_SIZE    256
  75
  76struct KVMParkedVcpu {
  77    unsigned long vcpu_id;
  78    int kvm_fd;
  79    QLIST_ENTRY(KVMParkedVcpu) node;
  80};
  81
  82enum KVMDirtyRingReaperState {
  83    KVM_DIRTY_RING_REAPER_NONE = 0,
  84    /* The reaper is sleeping */
  85    KVM_DIRTY_RING_REAPER_WAIT,
  86    /* The reaper is reaping for dirty pages */
  87    KVM_DIRTY_RING_REAPER_REAPING,
  88};
  89
  90/*
  91 * KVM reaper instance, responsible for collecting the KVM dirty bits
  92 * via the dirty ring.
  93 */
  94struct KVMDirtyRingReaper {
  95    /* The reaper thread */
  96    QemuThread reaper_thr;
  97    volatile uint64_t reaper_iteration; /* iteration number of reaper thr */
  98    volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */
  99};
 100
 101struct KVMState
 102{
 103    AccelState parent_obj;
 104
 105    int nr_slots;
 106    int fd;
 107    int vmfd;
 108    int coalesced_mmio;
 109    int coalesced_pio;
 110    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
 111    bool coalesced_flush_in_progress;
 112    int vcpu_events;
 113    int robust_singlestep;
 114    int debugregs;
 115#ifdef KVM_CAP_SET_GUEST_DEBUG
 116    QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
 117#endif
 118    int max_nested_state_len;
 119    int many_ioeventfds;
 120    int intx_set_mask;
 121    int kvm_shadow_mem;
 122    bool kernel_irqchip_allowed;
 123    bool kernel_irqchip_required;
 124    OnOffAuto kernel_irqchip_split;
 125    bool sync_mmu;
 126    uint64_t manual_dirty_log_protect;
 127    /* The man page (and POSIX) say ioctl numbers are signed int, but
 128     * they're not.  Linux, glibc and *BSD all treat ioctl numbers as
 129     * unsigned, and treating them as signed here can break things */
 130    unsigned irq_set_ioctl;
 131    unsigned int sigmask_len;
 132    GHashTable *gsimap;
 133#ifdef KVM_CAP_IRQ_ROUTING
 134    struct kvm_irq_routing *irq_routes;
 135    int nr_allocated_irq_routes;
 136    unsigned long *used_gsi_bitmap;
 137    unsigned int gsi_count;
 138    QTAILQ_HEAD(, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
 139#endif
 140    KVMMemoryListener memory_listener;
 141    QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
 142
 143    /* For "info mtree -f" to tell if an MR is registered in KVM */
 144    int nr_as;
 145    struct KVMAs {
 146        KVMMemoryListener *ml;
 147        AddressSpace *as;
 148    } *as;
 149    uint64_t kvm_dirty_ring_bytes;  /* Size of the per-vcpu dirty ring */
 150    uint32_t kvm_dirty_ring_size;   /* Number of dirty GFNs per ring */
 151    struct KVMDirtyRingReaper reaper;
 152};
 153
 154KVMState *kvm_state;
 155bool kvm_kernel_irqchip;
 156bool kvm_split_irqchip;
 157bool kvm_async_interrupts_allowed;
 158bool kvm_halt_in_kernel_allowed;
 159bool kvm_eventfds_allowed;
 160bool kvm_irqfds_allowed;
 161bool kvm_resamplefds_allowed;
 162bool kvm_msi_via_irqfd_allowed;
 163bool kvm_gsi_routing_allowed;
 164bool kvm_gsi_direct_mapping;
 165bool kvm_allowed;
 166bool kvm_readonly_mem_allowed;
 167bool kvm_vm_attributes_allowed;
 168bool kvm_direct_msi_allowed;
 169bool kvm_ioeventfd_any_length_allowed;
 170bool kvm_msi_use_devid;
 171static bool kvm_immediate_exit;
 172static hwaddr kvm_max_slot_size = ~0;
 173
 174static const KVMCapabilityInfo kvm_required_capabilites[] = {
 175    KVM_CAP_INFO(USER_MEMORY),
 176    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
 177    KVM_CAP_INFO(JOIN_MEMORY_REGIONS_WORKS),
 178    KVM_CAP_LAST_INFO
 179};
 180
 181static NotifierList kvm_irqchip_change_notifiers =
 182    NOTIFIER_LIST_INITIALIZER(kvm_irqchip_change_notifiers);
 183
 184struct KVMResampleFd {
 185    int gsi;
 186    EventNotifier *resample_event;
 187    QLIST_ENTRY(KVMResampleFd) node;
 188};
 189typedef struct KVMResampleFd KVMResampleFd;
 190
 191/*
 192 * Only used with split irqchip where we need to do the resample fd
 193 * kick for the kernel from userspace.
 194 */
 195static QLIST_HEAD(, KVMResampleFd) kvm_resample_fd_list =
 196    QLIST_HEAD_INITIALIZER(kvm_resample_fd_list);
 197
 198static QemuMutex kml_slots_lock;
 199
 200#define kvm_slots_lock()    qemu_mutex_lock(&kml_slots_lock)
 201#define kvm_slots_unlock()  qemu_mutex_unlock(&kml_slots_lock)
 202
 203static void kvm_slot_init_dirty_bitmap(KVMSlot *mem);
 204
 205static inline void kvm_resample_fd_remove(int gsi)
 206{
 207    KVMResampleFd *rfd;
 208
 209    QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
 210        if (rfd->gsi == gsi) {
 211            QLIST_REMOVE(rfd, node);
 212            g_free(rfd);
 213            break;
 214        }
 215    }
 216}
 217
 218static inline void kvm_resample_fd_insert(int gsi, EventNotifier *event)
 219{
 220    KVMResampleFd *rfd = g_new0(KVMResampleFd, 1);
 221
 222    rfd->gsi = gsi;
 223    rfd->resample_event = event;
 224
 225    QLIST_INSERT_HEAD(&kvm_resample_fd_list, rfd, node);
 226}
 227
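    /*
     * Kick the resample eventfd registered for @gsi, if any.  Only used with
     * a split irqchip, where the resample kick for the kernel has to be done
     * from userspace (see kvm_resample_fd_list above).
     */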
 228void kvm_resample_fd_notify(int gsi)
 229{
 230    KVMResampleFd *rfd;
 231
 232    QLIST_FOREACH(rfd, &kvm_resample_fd_list, node) {
 233        if (rfd->gsi == gsi) {
 234            event_notifier_set(rfd->resample_event);
 235            trace_kvm_resample_fd_notify(gsi);
 236            return;
 237        }
 238    }
 239}
 240
 241int kvm_get_max_memslots(void)
 242{
 243    KVMState *s = KVM_STATE(current_accel());
 244
 245    return s->nr_slots;
 246}
 247
 248/* Called with KVMMemoryListener.slots_lock held */
 249static KVMSlot *kvm_get_free_slot(KVMMemoryListener *kml)
 250{
 251    KVMState *s = kvm_state;
 252    int i;
 253
 254    for (i = 0; i < s->nr_slots; i++) {
 255        if (kml->slots[i].memory_size == 0) {
 256            return &kml->slots[i];
 257        }
 258    }
 259
 260    return NULL;
 261}
 262
 263bool kvm_has_free_slot(MachineState *ms)
 264{
 265    KVMState *s = KVM_STATE(ms->accelerator);
 266    bool result;
 267    KVMMemoryListener *kml = &s->memory_listener;
 268
 269    kvm_slots_lock();
 270    result = !!kvm_get_free_slot(kml);
 271    kvm_slots_unlock();
 272
 273    return result;
 274}
 275
 276/* Called with KVMMemoryListener.slots_lock held */
 277static KVMSlot *kvm_alloc_slot(KVMMemoryListener *kml)
 278{
 279    KVMSlot *slot = kvm_get_free_slot(kml);
 280
 281    if (slot) {
 282        return slot;
 283    }
 284
 285    fprintf(stderr, "%s: no free slot available\n", __func__);
 286    abort();
 287}
 288
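    /*
     * Find the KVM memslot that exactly matches [start_addr, start_addr + size);
     * returns NULL if no such slot exists.  Called with kvm_slots_lock() held.
     */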
 289static KVMSlot *kvm_lookup_matching_slot(KVMMemoryListener *kml,
 290                                         hwaddr start_addr,
 291                                         hwaddr size)
 292{
 293    KVMState *s = kvm_state;
 294    int i;
 295
 296    for (i = 0; i < s->nr_slots; i++) {
 297        KVMSlot *mem = &kml->slots[i];
 298
 299        if (start_addr == mem->start_addr && size == mem->memory_size) {
 300            return mem;
 301        }
 302    }
 303
 304    return NULL;
 305}
 306
 307/*
 308 * Calculate and align the start address and the size of the section.
 309 * Return the size. If the size is 0, the aligned section is empty.
 310 */
 311static hwaddr kvm_align_section(MemoryRegionSection *section,
 312                                hwaddr *start)
 313{
 314    hwaddr size = int128_get64(section->size);
 315    hwaddr delta, aligned;
 316
 317    /* kvm works in page size chunks, but the function may be called
 318       with a sub-page size and an unaligned start address. Round the start
 319       address up to the next page boundary and truncate the size down to the previous one. */
 320    aligned = ROUND_UP(section->offset_within_address_space,
 321                       qemu_real_host_page_size);
 322    delta = aligned - section->offset_within_address_space;
 323    *start = aligned;
 324    if (delta > size) {
 325        return 0;
 326    }
 327
 328    return (size - delta) & qemu_real_host_page_mask;
 329}
 330
 331int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
 332                                       hwaddr *phys_addr)
 333{
 334    KVMMemoryListener *kml = &s->memory_listener;
 335    int i, ret = 0;
 336
 337    kvm_slots_lock();
 338    for (i = 0; i < s->nr_slots; i++) {
 339        KVMSlot *mem = &kml->slots[i];
 340
 341        if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
 342            *phys_addr = mem->start_addr + (ram - mem->ram);
 343            ret = 1;
 344            break;
 345        }
 346    }
 347    kvm_slots_unlock();
 348
 349    return ret;
 350}
 351
 352static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new)
 353{
 354    KVMState *s = kvm_state;
 355    struct kvm_userspace_memory_region mem;
 356    int ret;
 357
 358    mem.slot = slot->slot | (kml->as_id << 16);
 359    mem.guest_phys_addr = slot->start_addr;
 360    mem.userspace_addr = (unsigned long)slot->ram;
 361    mem.flags = slot->flags;
 362
 363    if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) {
 364        /* Set the slot size to 0 before setting the slot to the desired
 365         * value. This is needed based on KVM commit 75d61fbc. */
 366        mem.memory_size = 0;
 367        ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
 368        if (ret < 0) {
 369            goto err;
 370        }
 371    }
 372    mem.memory_size = slot->memory_size;
 373    ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
 374    slot->old_flags = mem.flags;
 375err:
 376    trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr,
 377                              mem.memory_size, mem.userspace_addr, ret);
 378    if (ret < 0) {
 379        error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d,"
 380                     " start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s",
 381                     __func__, mem.slot, slot->start_addr,
 382                     (uint64_t)mem.memory_size, strerror(errno));
 383    }
 384    return ret;
 385}
 386
 387static int do_kvm_destroy_vcpu(CPUState *cpu)
 388{
 389    KVMState *s = kvm_state;
 390    long mmap_size;
 391    struct KVMParkedVcpu *vcpu = NULL;
 392    int ret = 0;
 393
 394    DPRINTF("kvm_destroy_vcpu\n");
 395
 396    ret = kvm_arch_destroy_vcpu(cpu);
 397    if (ret < 0) {
 398        goto err;
 399    }
 400
 401    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
 402    if (mmap_size < 0) {
 403        ret = mmap_size;
 404        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
 405        goto err;
 406    }
 407
 408    ret = munmap(cpu->kvm_run, mmap_size);
 409    if (ret < 0) {
 410        goto err;
 411    }
 412
 413    if (cpu->kvm_dirty_gfns) {
 414        ret = munmap(cpu->kvm_dirty_gfns, s->kvm_dirty_ring_bytes);
 415        if (ret < 0) {
 416            goto err;
 417        }
 418    }
 419
 420    vcpu = g_malloc0(sizeof(*vcpu));
 421    vcpu->vcpu_id = kvm_arch_vcpu_id(cpu);
 422    vcpu->kvm_fd = cpu->kvm_fd;
 423    QLIST_INSERT_HEAD(&kvm_state->kvm_parked_vcpus, vcpu, node);
 424err:
 425    return ret;
 426}
 427
 428void kvm_destroy_vcpu(CPUState *cpu)
 429{
 430    if (do_kvm_destroy_vcpu(cpu) < 0) {
 431        error_report("kvm_destroy_vcpu failed");
 432        exit(EXIT_FAILURE);
 433    }
 434}
 435
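    /*
     * Return a vcpu fd for @vcpu_id: reuse the fd of a previously parked vcpu
     * if one exists, otherwise create a new vcpu via KVM_CREATE_VCPU.
     */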
 436static int kvm_get_vcpu(KVMState *s, unsigned long vcpu_id)
 437{
 438    struct KVMParkedVcpu *cpu;
 439
 440    QLIST_FOREACH(cpu, &s->kvm_parked_vcpus, node) {
 441        if (cpu->vcpu_id == vcpu_id) {
 442            int kvm_fd;
 443
 444            QLIST_REMOVE(cpu, node);
 445            kvm_fd = cpu->kvm_fd;
 446            g_free(cpu);
 447            return kvm_fd;
 448        }
 449    }
 450
 451    return kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)vcpu_id);
 452}
 453
 454int kvm_init_vcpu(CPUState *cpu, Error **errp)
 455{
 456    KVMState *s = kvm_state;
 457    long mmap_size;
 458    int ret;
 459
 460    trace_kvm_init_vcpu(cpu->cpu_index, kvm_arch_vcpu_id(cpu));
 461
 462    ret = kvm_get_vcpu(s, kvm_arch_vcpu_id(cpu));
 463    if (ret < 0) {
 464        error_setg_errno(errp, -ret, "kvm_init_vcpu: kvm_get_vcpu failed (%lu)",
 465                         kvm_arch_vcpu_id(cpu));
 466        goto err;
 467    }
 468
 469    cpu->kvm_fd = ret;
 470    cpu->kvm_state = s;
 471    cpu->vcpu_dirty = true;
 472
 473    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
 474    if (mmap_size < 0) {
 475        ret = mmap_size;
 476        error_setg_errno(errp, -mmap_size,
 477                         "kvm_init_vcpu: KVM_GET_VCPU_MMAP_SIZE failed");
 478        goto err;
 479    }
 480
 481    cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
 482                        cpu->kvm_fd, 0);
 483    if (cpu->kvm_run == MAP_FAILED) {
 484        ret = -errno;
 485        error_setg_errno(errp, ret,
 486                         "kvm_init_vcpu: mmap'ing vcpu state failed (%lu)",
 487                         kvm_arch_vcpu_id(cpu));
 488        goto err;
 489    }
 490
 491    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
 492        s->coalesced_mmio_ring =
 493            (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
 494    }
 495
 496    if (s->kvm_dirty_ring_size) {
 497        /* Use MAP_SHARED to share pages with the kernel */
 498        cpu->kvm_dirty_gfns = mmap(NULL, s->kvm_dirty_ring_bytes,
 499                                   PROT_READ | PROT_WRITE, MAP_SHARED,
 500                                   cpu->kvm_fd,
 501                                   PAGE_SIZE * KVM_DIRTY_LOG_PAGE_OFFSET);
 502        if (cpu->kvm_dirty_gfns == MAP_FAILED) {
 503            ret = -errno;
 504            DPRINTF("mmap'ing vcpu dirty gfns failed: %d\n", ret);
 505            goto err;
 506        }
 507    }
 508
 509    ret = kvm_arch_init_vcpu(cpu);
 510    if (ret < 0) {
 511        error_setg_errno(errp, -ret,
 512                         "kvm_init_vcpu: kvm_arch_init_vcpu failed (%lu)",
 513                         kvm_arch_vcpu_id(cpu));
 514    }
 515err:
 516    return ret;
 517}
 518
 519/*
 520 * dirty pages logging control
 521 */
 522
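    /*
     * Compute the KVM memslot flags for a memory region: KVM_MEM_LOG_DIRTY_PAGES
     * if any dirty log mask is set, and KVM_MEM_READONLY if the region is
     * read-only (or in ROMD mode) and read-only memslots are allowed.
     */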
 523static int kvm_mem_flags(MemoryRegion *mr)
 524{
 525    bool readonly = mr->readonly || memory_region_is_romd(mr);
 526    int flags = 0;
 527
 528    if (memory_region_get_dirty_log_mask(mr) != 0) {
 529        flags |= KVM_MEM_LOG_DIRTY_PAGES;
 530    }
 531    if (readonly && kvm_readonly_mem_allowed) {
 532        flags |= KVM_MEM_READONLY;
 533    }
 534    return flags;
 535}
 536
 537/* Called with KVMMemoryListener.slots_lock held */
 538static int kvm_slot_update_flags(KVMMemoryListener *kml, KVMSlot *mem,
 539                                 MemoryRegion *mr)
 540{
 541    mem->flags = kvm_mem_flags(mr);
 542
 543    /* If nothing changed effectively, no need to issue ioctl */
 544    if (mem->flags == mem->old_flags) {
 545        return 0;
 546    }
 547
 548    kvm_slot_init_dirty_bitmap(mem);
 549    return kvm_set_user_memory_region(kml, mem, false);
 550}
 551
 552static int kvm_section_update_flags(KVMMemoryListener *kml,
 553                                    MemoryRegionSection *section)
 554{
 555    hwaddr start_addr, size, slot_size;
 556    KVMSlot *mem;
 557    int ret = 0;
 558
 559    size = kvm_align_section(section, &start_addr);
 560    if (!size) {
 561        return 0;
 562    }
 563
 564    kvm_slots_lock();
 565
 566    while (size && !ret) {
 567        slot_size = MIN(kvm_max_slot_size, size);
 568        mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
 569        if (!mem) {
 570            /* We don't have a slot if we want to trap every access. */
 571            goto out;
 572        }
 573
 574        ret = kvm_slot_update_flags(kml, mem, section->mr);
 575        start_addr += slot_size;
 576        size -= slot_size;
 577    }
 578
 579out:
 580    kvm_slots_unlock();
 581    return ret;
 582}
 583
 584static void kvm_log_start(MemoryListener *listener,
 585                          MemoryRegionSection *section,
 586                          int old, int new)
 587{
 588    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
 589    int r;
 590
 591    if (old != 0) {
 592        return;
 593    }
 594
 595    r = kvm_section_update_flags(kml, section);
 596    if (r < 0) {
 597        abort();
 598    }
 599}
 600
 601static void kvm_log_stop(MemoryListener *listener,
 602                          MemoryRegionSection *section,
 603                          int old, int new)
 604{
 605    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
 606    int r;
 607
 608    if (new != 0) {
 609        return;
 610    }
 611
 612    r = kvm_section_update_flags(kml, section);
 613    if (r < 0) {
 614        abort();
 615    }
 616}
 617
 618/* get kvm's dirty pages bitmap and update qemu's */
 619static void kvm_slot_sync_dirty_pages(KVMSlot *slot)
 620{
 621    ram_addr_t start = slot->ram_start_offset;
 622    ram_addr_t pages = slot->memory_size / qemu_real_host_page_size;
 623
 624    cpu_physical_memory_set_dirty_lebitmap(slot->dirty_bmap, start, pages);
 625}
 626
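    /* Clear the slot's cached dirty bitmap after its contents have been synced */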
 627static void kvm_slot_reset_dirty_pages(KVMSlot *slot)
 628{
 629    memset(slot->dirty_bmap, 0, slot->dirty_bmap_size);
 630}
 631
 632#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
 633
 634/* Allocate the dirty bitmap for a slot  */
 635static void kvm_slot_init_dirty_bitmap(KVMSlot *mem)
 636{
 637    if (!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) || mem->dirty_bmap) {
 638        return;
 639    }
 640
 641    /*
 642     * XXX bad kernel interface alert
 643     * For dirty bitmap, kernel allocates array of size aligned to
 644     * bits-per-long.  But in the case where the kernel is 64-bit and
 645     * the userspace is 32-bit, userspace can't align to the same
 646     * bits-per-long, since sizeof(long) differs between kernel
 647     * and user space.  This way, userspace will provide a buffer which
 648     * may be 4 bytes less than the kernel will use, resulting in
 649     * userspace memory corruption (which is not detectable by valgrind
 650     * either, in most cases).
 651     * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
 652     * the hope that sizeof(long) won't become >8 any time soon.
 653     *
 654     * Note: the granule of kvm dirty log is qemu_real_host_page_size.
 655     * And mem->memory_size is aligned to it (otherwise this mem can't
 656     * be registered to KVM).
 657     */
 658    hwaddr bitmap_size = ALIGN(mem->memory_size / qemu_real_host_page_size,
 659                                        /*HOST_LONG_BITS*/ 64) / 8;
 660    mem->dirty_bmap = g_malloc0(bitmap_size);
 661    mem->dirty_bmap_size = bitmap_size;
 662}
 663
 664/*
 665 * Sync dirty bitmap from kernel to KVMSlot.dirty_bmap, return true if
 666 * succeeded, false otherwise
 667 */
 668static bool kvm_slot_get_dirty_log(KVMState *s, KVMSlot *slot)
 669{
 670    struct kvm_dirty_log d = {};
 671    int ret;
 672
 673    d.dirty_bitmap = slot->dirty_bmap;
 674    d.slot = slot->slot | (slot->as_id << 16);
 675    ret = kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d);
 676
 677    if (ret == -ENOENT) {
 678        /* kernel does not have dirty bitmap in this slot */
 679        ret = 0;
 680    }
 681    if (ret) {
 682        error_report_once("%s: KVM_GET_DIRTY_LOG failed with %d",
 683                          __func__, ret);
 684    }
 685    return ret == 0;
 686}
 687
 688/* Should be called with all slots_lock held for the address spaces. */
 689static void kvm_dirty_ring_mark_page(KVMState *s, uint32_t as_id,
 690                                     uint32_t slot_id, uint64_t offset)
 691{
 692    KVMMemoryListener *kml;
 693    KVMSlot *mem;
 694
 695    if (as_id >= s->nr_as) {
 696        return;
 697    }
 698
 699    kml = s->as[as_id].ml;
 700    mem = &kml->slots[slot_id];
 701
 702    if (!mem->memory_size || offset >=
 703        (mem->memory_size / qemu_real_host_page_size)) {
 704        return;
 705    }
 706
 707    set_bit(offset, mem->dirty_bmap);
 708}
 709
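    /*
     * Dirty GFN ring entry state: the kernel marks an entry with
     * KVM_DIRTY_GFN_F_DIRTY when the page is dirtied; after harvesting it,
     * userspace flips the entry to KVM_DIRTY_GFN_F_RESET so that a later
     * KVM_RESET_DIRTY_RINGS can re-protect the page.
     */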
 710static bool dirty_gfn_is_dirtied(struct kvm_dirty_gfn *gfn)
 711{
 712    return gfn->flags == KVM_DIRTY_GFN_F_DIRTY;
 713}
 714
 715static void dirty_gfn_set_collected(struct kvm_dirty_gfn *gfn)
 716{
 717    gfn->flags = KVM_DIRTY_GFN_F_RESET;
 718}
 719
 720/*
 721 * Should be called with all slots_lock held for the address spaces.  It returns
 722 * the number of dirty pages collected from this vcpu's dirty ring.
 723 */
 724static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
 725{
 726    struct kvm_dirty_gfn *dirty_gfns = cpu->kvm_dirty_gfns, *cur;
 727    uint32_t ring_size = s->kvm_dirty_ring_size;
 728    uint32_t count = 0, fetch = cpu->kvm_fetch_index;
 729
 730    assert(dirty_gfns && ring_size);
 731    trace_kvm_dirty_ring_reap_vcpu(cpu->cpu_index);
 732
 733    while (true) {
 734        cur = &dirty_gfns[fetch % ring_size];
 735        if (!dirty_gfn_is_dirtied(cur)) {
 736            break;
 737        }
 738        kvm_dirty_ring_mark_page(s, cur->slot >> 16, cur->slot & 0xffff,
 739                                 cur->offset);
 740        dirty_gfn_set_collected(cur);
 741        trace_kvm_dirty_ring_page(cpu->cpu_index, fetch, cur->offset);
 742        fetch++;
 743        count++;
 744    }
 745    cpu->kvm_fetch_index = fetch;
 746
 747    return count;
 748}
 749
 750/* Must be called with slots_lock held */
 751static uint64_t kvm_dirty_ring_reap_locked(KVMState *s)
 752{
 753    int ret;
 754    CPUState *cpu;
 755    uint64_t total = 0;
 756    int64_t stamp;
 757
 758    stamp = get_clock();
 759
 760    CPU_FOREACH(cpu) {
 761        total += kvm_dirty_ring_reap_one(s, cpu);
 762    }
 763
 764    if (total) {
 765        ret = kvm_vm_ioctl(s, KVM_RESET_DIRTY_RINGS);
 766        assert(ret == total);
 767    }
 768
 769    stamp = get_clock() - stamp;
 770
 771    if (total) {
 772        trace_kvm_dirty_ring_reap(total, stamp / 1000);
 773    }
 774
 775    return total;
 776}
 777
 778/*
 779 * Currently, for simplicity, we must hold the BQL before calling this.  We can
 780 * consider dropping the BQL once we are confident about all the race conditions.
 781 */
 782static uint64_t kvm_dirty_ring_reap(KVMState *s)
 783{
 784    uint64_t total;
 785
 786    /*
 787     * We need to lock all kvm slots for all address spaces here,
 788     * because:
 789     *
 790     * (1) We need to mark dirty for dirty bitmaps in multiple slots
 791     *     and for tons of pages, so it's better to take the lock here
 792     *     once rather than once per page.  And more importantly,
 793     *
 794     * (2) We must _NOT_ publish dirty bits to the other threads
 795     *     (e.g., the migration thread) via the kvm memory slot dirty
 796     *     bitmaps before correctly re-protecting those dirtied pages.
 797     *     Otherwise we run the risk of data corruption if the page
 798     *     data is read in the other thread before we do the
 799     *     reset below.
 800     */
 801    kvm_slots_lock();
 802    total = kvm_dirty_ring_reap_locked(s);
 803    kvm_slots_unlock();
 804
 805    return total;
 806}
 807
 808static void do_kvm_cpu_synchronize_kick(CPUState *cpu, run_on_cpu_data arg)
 809{
 810    /* No need to do anything */
 811}
 812
 813/*
 814 * Kick all vcpus out in a synchronized way.  When this returns, we
 815 * guarantee that every vcpu has been kicked and at least returned to
 816 * userspace once.
 817 */
 818static void kvm_cpu_synchronize_kick_all(void)
 819{
 820    CPUState *cpu;
 821
 822    CPU_FOREACH(cpu) {
 823        run_on_cpu(cpu, do_kvm_cpu_synchronize_kick, RUN_ON_CPU_NULL);
 824    }
 825}
 826
 827/*
 828 * Flush all the existing dirty pages to the KVM slot buffers.  When
 829 * this call returns, we guarantee that all the touched dirty pages
 830 * before calling this function have been put into the per-kvmslot
 831 * dirty bitmap.
 832 *
 833 * This function must be called with BQL held.
 834 */
 835static void kvm_dirty_ring_flush(void)
 836{
 837    trace_kvm_dirty_ring_flush(0);
 838    /*
 839     * The function needs to be serialized.  Since this function
 840     * should always be called with the BQL held, serialization is guaranteed.
 841     * However, let's be sure of it.
 842     */
 843    assert(qemu_mutex_iothread_locked());
 844    /*
 845     * First make sure to flush the hardware buffers by kicking all
 846     * vcpus out in a synchronous way.
 847     */
 848    kvm_cpu_synchronize_kick_all();
 849    kvm_dirty_ring_reap(kvm_state);
 850    trace_kvm_dirty_ring_flush(1);
 851}
 852
 853/**
 854 * kvm_physical_sync_dirty_bitmap - Sync dirty bitmap from kernel space
 855 *
 856 * This function first tries to fetch the dirty bitmap from the kernel,
 857 * and then updates QEMU's dirty bitmap.
 858 *
 859 * NOTE: the caller must hold kml->slots_lock.
 860 *
 861 * @kml: the KVM memory listener object
 862 * @section: the memory section to sync the dirty bitmap with
 863 */
 864static void kvm_physical_sync_dirty_bitmap(KVMMemoryListener *kml,
 865                                           MemoryRegionSection *section)
 866{
 867    KVMState *s = kvm_state;
 868    KVMSlot *mem;
 869    hwaddr start_addr, size;
 870    hwaddr slot_size;
 871
 872    size = kvm_align_section(section, &start_addr);
 873    while (size) {
 874        slot_size = MIN(kvm_max_slot_size, size);
 875        mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
 876        if (!mem) {
 877            /* We don't have a slot if we want to trap every access. */
 878            return;
 879        }
 880        if (kvm_slot_get_dirty_log(s, mem)) {
 881            kvm_slot_sync_dirty_pages(mem);
 882        }
 883        start_addr += slot_size;
 884        size -= slot_size;
 885    }
 886}
 887
 888/* Alignment requirement for KVM_CLEAR_DIRTY_LOG - 64 pages */
 889#define KVM_CLEAR_LOG_SHIFT  6
 890#define KVM_CLEAR_LOG_ALIGN  (qemu_real_host_page_size << KVM_CLEAR_LOG_SHIFT)
 891#define KVM_CLEAR_LOG_MASK   (-KVM_CLEAR_LOG_ALIGN)
 892
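    /*
     * Clear the dirty log of one memslot for the byte range [start, start + size)
     * within the slot via KVM_CLEAR_DIRTY_LOG, widening the range as needed to
     * satisfy the kernel's 64-page alignment requirement.
     */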
 893static int kvm_log_clear_one_slot(KVMSlot *mem, int as_id, uint64_t start,
 894                                  uint64_t size)
 895{
 896    KVMState *s = kvm_state;
 897    uint64_t end, bmap_start, start_delta, bmap_npages;
 898    struct kvm_clear_dirty_log d;
 899    unsigned long *bmap_clear = NULL, psize = qemu_real_host_page_size;
 900    int ret;
 901
 902    /*
 903     * We need to extend either the start or the size or both to
 904     * satisfy the KVM interface requirement.  First, align the start
 905     * down to a 64-host-page boundary
 906     */
 907    bmap_start = start & KVM_CLEAR_LOG_MASK;
 908    start_delta = start - bmap_start;
 909    bmap_start /= psize;
 910
 911    /*
 912     * The kernel interface restricts the size too, requiring that either:
 913     *
 914     * (1) the size is 64 host pages aligned (just like the start), or
 915     * (2) the size fills up until the end of the KVM memslot.
 916     */
 917    bmap_npages = DIV_ROUND_UP(size + start_delta, KVM_CLEAR_LOG_ALIGN)
 918        << KVM_CLEAR_LOG_SHIFT;
 919    end = mem->memory_size / psize;
 920    if (bmap_npages > end - bmap_start) {
 921        bmap_npages = end - bmap_start;
 922    }
 923    start_delta /= psize;
 924
 925    /*
 926     * Prepare the bitmap to clear dirty bits.  Here we must guarantee
 927     * that we won't clear any unknown dirty bits otherwise we might
 928     * accidentally clear some set bits which are not yet synced from
 929     * the kernel into QEMU's bitmap, then we'll lose track of the
 930     * guest modifications upon those pages (which can directly lead
 931     * to guest data loss or panic after migration).
 932     *
 933     * Layout of the KVMSlot.dirty_bmap:
 934     *
 935     *                   |<-------- bmap_npages -----------..>|
 936     *                                                     [1]
 937     *                     start_delta         size
 938     *  |----------------|-------------|------------------|------------|
 939     *  ^                ^             ^                               ^
 940     *  |                |             |                               |
 941     * start          bmap_start     (start)                         end
 942     * of memslot                                             of memslot
 943     *
 944     * [1] bmap_npages can be aligned to either 64 pages or the end of slot
 945     */
 946
 947    assert(bmap_start % BITS_PER_LONG == 0);
 948    /* We should never do log_clear before log_sync */
 949    assert(mem->dirty_bmap);
 950    if (start_delta || bmap_npages - size / psize) {
 951        /* Slow path - we need to manipulate a temp bitmap */
 952        bmap_clear = bitmap_new(bmap_npages);
 953        bitmap_copy_with_src_offset(bmap_clear, mem->dirty_bmap,
 954                                    bmap_start, start_delta + size / psize);
 955        /*
 956         * We need to clear the extra bits at the start because they were
 957         * not requested by the caller; we extended the range only to
 958         * satisfy the 64-page alignment
 959         */
 960        bitmap_clear(bmap_clear, 0, start_delta);
 961        d.dirty_bitmap = bmap_clear;
 962    } else {
 963        /*
 964         * Fast path - both start and size align well with BITS_PER_LONG
 965         * (or the end of memory slot)
 966         */
 967        d.dirty_bitmap = mem->dirty_bmap + BIT_WORD(bmap_start);
 968    }
 969
 970    d.first_page = bmap_start;
 971    /* It should never overflow.  If it happens, say something */
 972    assert(bmap_npages <= UINT32_MAX);
 973    d.num_pages = bmap_npages;
 974    d.slot = mem->slot | (as_id << 16);
 975
 976    ret = kvm_vm_ioctl(s, KVM_CLEAR_DIRTY_LOG, &d);
 977    if (ret < 0 && ret != -ENOENT) {
 978        error_report("%s: KVM_CLEAR_DIRTY_LOG failed, slot=%d, "
 979                     "start=0x%"PRIx64", size=0x%"PRIx32", errno=%d",
 980                     __func__, d.slot, (uint64_t)d.first_page,
 981                     (uint32_t)d.num_pages, ret);
 982    } else {
 983        ret = 0;
 984        trace_kvm_clear_dirty_log(d.slot, d.first_page, d.num_pages);
 985    }
 986
 987    /*
 988     * After we have updated the remote dirty bitmap, we also update the
 989     * cached bitmap for the memslot, so that if another user clears the
 990     * same region we know not to clear it again on the remote side,
 991     * since that would lose data as well.
 992     */
 993    bitmap_clear(mem->dirty_bmap, bmap_start + start_delta,
 994                 size / psize);
 995    /* This handles the NULL case well */
 996    g_free(bmap_clear);
 997    return ret;
 998}
 999
1000
1001/**
1002 * kvm_physical_log_clear - Clear the kernel's dirty bitmap for range
1003 *
1004 * NOTE: this will be a no-op if we haven't enabled manual dirty log
1005 * protection in the host kernel because in that case this operation
1006 * will be done within log_sync().
1007 *
1008 * @kml:     the kvm memory listener
1009 * @section: the memory range to clear dirty bitmap
1010 */
1011static int kvm_physical_log_clear(KVMMemoryListener *kml,
1012                                  MemoryRegionSection *section)
1013{
1014    KVMState *s = kvm_state;
1015    uint64_t start, size, offset, count;
1016    KVMSlot *mem;
1017    int ret = 0, i;
1018
1019    if (!s->manual_dirty_log_protect) {
1020        /* No need to do explicit clear */
1021        return ret;
1022    }
1023
1024    start = section->offset_within_address_space;
1025    size = int128_get64(section->size);
1026
1027    if (!size) {
1028        /* Nothing more we can do... */
1029        return ret;
1030    }
1031
1032    kvm_slots_lock();
1033
1034    for (i = 0; i < s->nr_slots; i++) {
1035        mem = &kml->slots[i];
1036        /* Discard slots that are empty or do not overlap the section */
1037        if (!mem->memory_size ||
1038            mem->start_addr > start + size - 1 ||
1039            start > mem->start_addr + mem->memory_size - 1) {
1040            continue;
1041        }
1042
1043        if (start >= mem->start_addr) {
1044            /* The slot starts before section or is aligned to it.  */
1045            offset = start - mem->start_addr;
1046            count = MIN(mem->memory_size - offset, size);
1047        } else {
1048            /* The slot starts after section.  */
1049            offset = 0;
1050            count = MIN(mem->memory_size, size - (mem->start_addr - start));
1051        }
1052        ret = kvm_log_clear_one_slot(mem, kml->as_id, offset, count);
1053        if (ret < 0) {
1054            break;
1055        }
1056    }
1057
1058    kvm_slots_unlock();
1059
1060    return ret;
1061}
1062
1063static void kvm_coalesce_mmio_region(MemoryListener *listener,
1064                                     MemoryRegionSection *section,
1065                                     hwaddr start, hwaddr size)
1066{
1067    KVMState *s = kvm_state;
1068
1069    if (s->coalesced_mmio) {
1070        struct kvm_coalesced_mmio_zone zone;
1071
1072        zone.addr = start;
1073        zone.size = size;
1074        zone.pad = 0;
1075
1076        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
1077    }
1078}
1079
1080static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
1081                                       MemoryRegionSection *section,
1082                                       hwaddr start, hwaddr size)
1083{
1084    KVMState *s = kvm_state;
1085
1086    if (s->coalesced_mmio) {
1087        struct kvm_coalesced_mmio_zone zone;
1088
1089        zone.addr = start;
1090        zone.size = size;
1091        zone.pad = 0;
1092
1093        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
1094    }
1095}
1096
1097static void kvm_coalesce_pio_add(MemoryListener *listener,
1098                                MemoryRegionSection *section,
1099                                hwaddr start, hwaddr size)
1100{
1101    KVMState *s = kvm_state;
1102
1103    if (s->coalesced_pio) {
1104        struct kvm_coalesced_mmio_zone zone;
1105
1106        zone.addr = start;
1107        zone.size = size;
1108        zone.pio = 1;
1109
1110        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
1111    }
1112}
1113
1114static void kvm_coalesce_pio_del(MemoryListener *listener,
1115                                MemoryRegionSection *section,
1116                                hwaddr start, hwaddr size)
1117{
1118    KVMState *s = kvm_state;
1119
1120    if (s->coalesced_pio) {
1121        struct kvm_coalesced_mmio_zone zone;
1122
1123        zone.addr = start;
1124        zone.size = size;
1125        zone.pio = 1;
1126
1127        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
1128     }
1129}
1130
1131static MemoryListener kvm_coalesced_pio_listener = {
1132    .coalesced_io_add = kvm_coalesce_pio_add,
1133    .coalesced_io_del = kvm_coalesce_pio_del,
1134};
1135
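    /* Return the value of a KVM capability, or 0 if it is not supported */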
1136int kvm_check_extension(KVMState *s, unsigned int extension)
1137{
1138    int ret;
1139
1140    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
1141    if (ret < 0) {
1142        ret = 0;
1143    }
1144
1145    return ret;
1146}
1147
1148int kvm_vm_check_extension(KVMState *s, unsigned int extension)
1149{
1150    int ret;
1151
1152    ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, extension);
1153    if (ret < 0) {
1154        /* VM wide version not implemented, use global one instead */
1155        ret = kvm_check_extension(s, extension);
1156    }
1157
1158    return ret;
1159}
1160
1161typedef struct HWPoisonPage {
1162    ram_addr_t ram_addr;
1163    QLIST_ENTRY(HWPoisonPage) list;
1164} HWPoisonPage;
1165
1166static QLIST_HEAD(, HWPoisonPage) hwpoison_page_list =
1167    QLIST_HEAD_INITIALIZER(hwpoison_page_list);
1168
1169static void kvm_unpoison_all(void *param)
1170{
1171    HWPoisonPage *page, *next_page;
1172
1173    QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
1174        QLIST_REMOVE(page, list);
1175        qemu_ram_remap(page->ram_addr, TARGET_PAGE_SIZE);
1176        g_free(page);
1177    }
1178}
1179
1180void kvm_hwpoison_page_add(ram_addr_t ram_addr)
1181{
1182    HWPoisonPage *page;
1183
1184    QLIST_FOREACH(page, &hwpoison_page_list, list) {
1185        if (page->ram_addr == ram_addr) {
1186            return;
1187        }
1188    }
1189    page = g_new(HWPoisonPage, 1);
1190    page->ram_addr = ram_addr;
1191    QLIST_INSERT_HEAD(&hwpoison_page_list, page, list);
1192}
1193
1194static uint32_t adjust_ioeventfd_endianness(uint32_t val, uint32_t size)
1195{
1196#if defined(HOST_WORDS_BIGENDIAN) != defined(TARGET_WORDS_BIGENDIAN)
1197    /* The kernel expects ioeventfd values in HOST_WORDS_BIGENDIAN
1198     * endianness, but the memory core hands them in target endianness.
1199     * For example, PPC is always treated as big-endian even if running
1200     * on KVM and on PPC64LE.  Correct here.
1201     */
1202    switch (size) {
1203    case 2:
1204        val = bswap16(val);
1205        break;
1206    case 4:
1207        val = bswap32(val);
1208        break;
1209    }
1210#endif
1211    return val;
1212}
1213
1214static int kvm_set_ioeventfd_mmio(int fd, hwaddr addr, uint32_t val,
1215                                  bool assign, uint32_t size, bool datamatch)
1216{
1217    int ret;
1218    struct kvm_ioeventfd iofd = {
1219        .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
1220        .addr = addr,
1221        .len = size,
1222        .flags = 0,
1223        .fd = fd,
1224    };
1225
1226    trace_kvm_set_ioeventfd_mmio(fd, (uint64_t)addr, val, assign, size,
1227                                 datamatch);
1228    if (!kvm_enabled()) {
1229        return -ENOSYS;
1230    }
1231
1232    if (datamatch) {
1233        iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
1234    }
1235    if (!assign) {
1236        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
1237    }
1238
1239    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);
1240
1241    if (ret < 0) {
1242        return -errno;
1243    }
1244
1245    return 0;
1246}
1247
1248static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
1249                                 bool assign, uint32_t size, bool datamatch)
1250{
1251    struct kvm_ioeventfd kick = {
1252        .datamatch = datamatch ? adjust_ioeventfd_endianness(val, size) : 0,
1253        .addr = addr,
1254        .flags = KVM_IOEVENTFD_FLAG_PIO,
1255        .len = size,
1256        .fd = fd,
1257    };
1258    int r;
1259    trace_kvm_set_ioeventfd_pio(fd, addr, val, assign, size, datamatch);
1260    if (!kvm_enabled()) {
1261        return -ENOSYS;
1262    }
1263    if (datamatch) {
1264        kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
1265    }
1266    if (!assign) {
1267        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
1268    }
1269    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
1270    if (r < 0) {
1271        return r;
1272    }
1273    return 0;
1274}
1275
1276
1277static int kvm_check_many_ioeventfds(void)
1278{
1279    /* Userspace can use ioeventfd for io notification.  This requires a host
1280     * that supports eventfd(2) and an I/O thread; since eventfd does not
1281     * support SIGIO it cannot interrupt the vcpu.
1282     *
1283     * Older kernels have a 6 device limit on the KVM io bus.  Probe for it so
1284     * we can avoid creating too many ioeventfds.
1285     */
1286#if defined(CONFIG_EVENTFD)
1287    int ioeventfds[7];
1288    int i, ret = 0;
1289    for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) {
1290        ioeventfds[i] = eventfd(0, EFD_CLOEXEC);
1291        if (ioeventfds[i] < 0) {
1292            break;
1293        }
1294        ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true);
1295        if (ret < 0) {
1296            close(ioeventfds[i]);
1297            break;
1298        }
1299    }
1300
1301    /* Decide whether many devices are supported or not */
1302    ret = i == ARRAY_SIZE(ioeventfds);
1303
1304    while (i-- > 0) {
1305        kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true);
1306        close(ioeventfds[i]);
1307    }
1308    return ret;
1309#else
1310    return 0;
1311#endif
1312}
1313
1314static const KVMCapabilityInfo *
1315kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
1316{
1317    while (list->name) {
1318        if (!kvm_check_extension(s, list->value)) {
1319            return list;
1320        }
1321        list++;
1322    }
1323    return NULL;
1324}
1325
1326void kvm_set_max_memslot_size(hwaddr max_slot_size)
1327{
1328    g_assert(
1329        ROUND_UP(max_slot_size, qemu_real_host_page_size) == max_slot_size
1330    );
1331    kvm_max_slot_size = max_slot_size;
1332}
1333
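    /*
     * Register (add == true) or unregister (add == false) the KVM memslots
     * backing a memory section, splitting it into chunks of at most
     * kvm_max_slot_size.  When removing a dirty-logged slot, its dirty bitmap
     * is synced first on a best-effort basis.
     */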
1334static void kvm_set_phys_mem(KVMMemoryListener *kml,
1335                             MemoryRegionSection *section, bool add)
1336{
1337    KVMSlot *mem;
1338    int err;
1339    MemoryRegion *mr = section->mr;
1340    bool writeable = !mr->readonly && !mr->rom_device;
1341    hwaddr start_addr, size, slot_size, mr_offset;
1342    ram_addr_t ram_start_offset;
1343    void *ram;
1344
1345    if (!memory_region_is_ram(mr)) {
1346        if (writeable || !kvm_readonly_mem_allowed) {
1347            return;
1348        } else if (!mr->romd_mode) {
1349            /* If the memory device is not in romd_mode, then we actually want
1350             * to remove the kvm memory slot so all accesses will trap. */
1351            add = false;
1352        }
1353    }
1354
1355    size = kvm_align_section(section, &start_addr);
1356    if (!size) {
1357        return;
1358    }
1359
1360    /* The offset of the kvmslot within the memory region */
1361    mr_offset = section->offset_within_region + start_addr -
1362        section->offset_within_address_space;
1363
1364    /* use aligned delta to align the ram address and offset */
1365    ram = memory_region_get_ram_ptr(mr) + mr_offset;
1366    ram_start_offset = memory_region_get_ram_addr(mr) + mr_offset;
1367
1368    kvm_slots_lock();
1369
1370    if (!add) {
1371        do {
1372            slot_size = MIN(kvm_max_slot_size, size);
1373            mem = kvm_lookup_matching_slot(kml, start_addr, slot_size);
1374            if (!mem) {
1375                goto out;
1376            }
1377            if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1378                /*
1379                 * NOTE: Be aware that here we're only making a best effort
1380                 * to sync dirty bits.  Whether we're using the dirty log or
1381                 * the dirty ring, we ignore two facts:
1382                 *
1383                 * (1) dirty bits can reside in hardware buffers (PML)
1384                 *
1385                 * (2) after we collected dirty bits here, pages can be dirtied
1386                 * again before we do the final KVM_SET_USER_MEMORY_REGION to
1387                 * remove the slot.
1388                 *
1389                 * Not easy.  Let's cross our fingers until it's fixed.
1390                 */
1391                if (kvm_state->kvm_dirty_ring_size) {
1392                    kvm_dirty_ring_reap_locked(kvm_state);
1393                } else {
1394                    kvm_slot_get_dirty_log(kvm_state, mem);
1395                }
1396                kvm_slot_sync_dirty_pages(mem);
1397            }
1398
1399            /* unregister the slot */
1400            g_free(mem->dirty_bmap);
1401            mem->dirty_bmap = NULL;
1402            mem->memory_size = 0;
1403            mem->flags = 0;
1404            err = kvm_set_user_memory_region(kml, mem, false);
1405            if (err) {
1406                fprintf(stderr, "%s: error unregistering slot: %s\n",
1407                        __func__, strerror(-err));
1408                abort();
1409            }
1410            start_addr += slot_size;
1411            size -= slot_size;
1412        } while (size);
1413        goto out;
1414    }
1415
1416    /* register the new slot */
1417    do {
1418        slot_size = MIN(kvm_max_slot_size, size);
1419        mem = kvm_alloc_slot(kml);
1420        mem->as_id = kml->as_id;
1421        mem->memory_size = slot_size;
1422        mem->start_addr = start_addr;
1423        mem->ram_start_offset = ram_start_offset;
1424        mem->ram = ram;
1425        mem->flags = kvm_mem_flags(mr);
1426        kvm_slot_init_dirty_bitmap(mem);
1427        err = kvm_set_user_memory_region(kml, mem, true);
1428        if (err) {
1429            fprintf(stderr, "%s: error registering slot: %s\n", __func__,
1430                    strerror(-err));
1431            abort();
1432        }
1433        start_addr += slot_size;
1434        ram_start_offset += slot_size;
1435        ram += slot_size;
1436        size -= slot_size;
1437    } while (size);
1438
1439out:
1440    kvm_slots_unlock();
1441}
1442
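    /*
     * Reaper thread main loop: wake up roughly once per second, take the BQL
     * and reap all per-vcpu dirty rings into the memslot dirty bitmaps.
     */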
1443static void *kvm_dirty_ring_reaper_thread(void *data)
1444{
1445    KVMState *s = data;
1446    struct KVMDirtyRingReaper *r = &s->reaper;
1447
1448    rcu_register_thread();
1449
1450    trace_kvm_dirty_ring_reaper("init");
1451
1452    while (true) {
1453        r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT;
1454        trace_kvm_dirty_ring_reaper("wait");
1455        /*
1456         * TODO: provide a smarter timeout rather than a constant?
1457         */
1458        sleep(1);
1459
1460        trace_kvm_dirty_ring_reaper("wakeup");
1461        r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
1462
1463        qemu_mutex_lock_iothread();
1464        kvm_dirty_ring_reap(s);
1465        qemu_mutex_unlock_iothread();
1466
1467        r->reaper_iteration++;
1468    }
1469
1470    trace_kvm_dirty_ring_reaper("exit");
1471
1472    rcu_unregister_thread();
1473
1474    return NULL;
1475}
1476
1477static int kvm_dirty_ring_reaper_init(KVMState *s)
1478{
1479    struct KVMDirtyRingReaper *r = &s->reaper;
1480
1481    qemu_thread_create(&r->reaper_thr, "kvm-reaper",
1482                       kvm_dirty_ring_reaper_thread,
1483                       s, QEMU_THREAD_JOINABLE);
1484
1485    return 0;
1486}
1487
1488static void kvm_region_add(MemoryListener *listener,
1489                           MemoryRegionSection *section)
1490{
1491    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1492
1493    memory_region_ref(section->mr);
1494    kvm_set_phys_mem(kml, section, true);
1495}
1496
1497static void kvm_region_del(MemoryListener *listener,
1498                           MemoryRegionSection *section)
1499{
1500    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1501
1502    kvm_set_phys_mem(kml, section, false);
1503    memory_region_unref(section->mr);
1504}
1505
1506static void kvm_log_sync(MemoryListener *listener,
1507                         MemoryRegionSection *section)
1508{
1509    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1510
1511    kvm_slots_lock();
1512    kvm_physical_sync_dirty_bitmap(kml, section);
1513    kvm_slots_unlock();
1514}
1515
1516static void kvm_log_sync_global(MemoryListener *l)
1517{
1518    KVMMemoryListener *kml = container_of(l, KVMMemoryListener, listener);
1519    KVMState *s = kvm_state;
1520    KVMSlot *mem;
1521    int i;
1522
1523    /* Flush all kernel dirty addresses into KVMSlot dirty bitmap */
1524    kvm_dirty_ring_flush();
1525
1526    /*
1527     * TODO: make this faster when nr_slots is big while there are
1528     * only a few used slots (small VMs).
1529     */
1530    kvm_slots_lock();
1531    for (i = 0; i < s->nr_slots; i++) {
1532        mem = &kml->slots[i];
1533        if (mem->memory_size && mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
1534            kvm_slot_sync_dirty_pages(mem);
1535            /*
1536             * This is not needed by KVM_GET_DIRTY_LOG because the
1537             * ioctl will unconditionally overwrite the whole region.
1538             * However kvm dirty ring has no such side effect.
1539             */
1540            kvm_slot_reset_dirty_pages(mem);
1541        }
1542    }
1543    kvm_slots_unlock();
1544}
1545
1546static void kvm_log_clear(MemoryListener *listener,
1547                          MemoryRegionSection *section)
1548{
1549    KVMMemoryListener *kml = container_of(listener, KVMMemoryListener, listener);
1550    int r;
1551
1552    r = kvm_physical_log_clear(kml, section);
1553    if (r < 0) {
1554        error_report_once("%s: kvm log clear failed: mr=%s "
1555                          "offset=%"HWADDR_PRIx" size=%"PRIx64, __func__,
1556                          section->mr->name, section->offset_within_region,
1557                          int128_get64(section->size));
1558        abort();
1559    }
1560}
1561
1562static void kvm_mem_ioeventfd_add(MemoryListener *listener,
1563                                  MemoryRegionSection *section,
1564                                  bool match_data, uint64_t data,
1565                                  EventNotifier *e)
1566{
1567    int fd = event_notifier_get_fd(e);
1568    int r;
1569
1570    r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
1571                               data, true, int128_get64(section->size),
1572                               match_data);
1573    if (r < 0) {
1574        fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
1575                __func__, strerror(-r), -r);
1576        abort();
1577    }
1578}
1579
1580static void kvm_mem_ioeventfd_del(MemoryListener *listener,
1581                                  MemoryRegionSection *section,
1582                                  bool match_data, uint64_t data,
1583                                  EventNotifier *e)
1584{
1585    int fd = event_notifier_get_fd(e);
1586    int r;
1587
1588    r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
1589                               data, false, int128_get64(section->size),
1590                               match_data);
1591    if (r < 0) {
1592        fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
1593                __func__, strerror(-r), -r);
1594        abort();
1595    }
1596}
1597
1598static void kvm_io_ioeventfd_add(MemoryListener *listener,
1599                                 MemoryRegionSection *section,
1600                                 bool match_data, uint64_t data,
1601                                 EventNotifier *e)
1602{
1603    int fd = event_notifier_get_fd(e);
1604    int r;
1605
1606    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
1607                              data, true, int128_get64(section->size),
1608                              match_data);
1609    if (r < 0) {
1610        fprintf(stderr, "%s: error adding ioeventfd: %s (%d)\n",
1611                __func__, strerror(-r), -r);
1612        abort();
1613    }
1614}
1615
1616static void kvm_io_ioeventfd_del(MemoryListener *listener,
1617                                 MemoryRegionSection *section,
1618                                 bool match_data, uint64_t data,
1619                                 EventNotifier *e)
1620
1621{
1622    int fd = event_notifier_get_fd(e);
1623    int r;
1624
1625    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
1626                              data, false, int128_get64(section->size),
1627                              match_data);
1628    if (r < 0) {
1629        fprintf(stderr, "%s: error deleting ioeventfd: %s (%d)\n",
1630                __func__, strerror(-r), -r);
1631        abort();
1632    }
1633}
1634
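    /*
     * Initialize the slot array of @kml and register it as a memory listener
     * for @as.  With the dirty ring enabled the listener uses log_sync_global,
     * otherwise the per-section log_sync/log_clear callbacks.
     */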
1635void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
1636                                  AddressSpace *as, int as_id)
1637{
1638    int i;
1639
1640    kml->slots = g_malloc0(s->nr_slots * sizeof(KVMSlot));
1641    kml->as_id = as_id;
1642
1643    for (i = 0; i < s->nr_slots; i++) {
1644        kml->slots[i].slot = i;
1645    }
1646
1647    kml->listener.region_add = kvm_region_add;
1648    kml->listener.region_del = kvm_region_del;
1649    kml->listener.log_start = kvm_log_start;
1650    kml->listener.log_stop = kvm_log_stop;
1651    kml->listener.priority = 10;
1652
1653    if (s->kvm_dirty_ring_size) {
1654        kml->listener.log_sync_global = kvm_log_sync_global;
1655    } else {
1656        kml->listener.log_sync = kvm_log_sync;
1657        kml->listener.log_clear = kvm_log_clear;
1658    }
1659
1660    memory_listener_register(&kml->listener, as);
1661
1662    for (i = 0; i < s->nr_as; ++i) {
1663        if (!s->as[i].as) {
1664            s->as[i].as = as;
1665            s->as[i].ml = kml;
1666            break;
1667        }
1668    }
1669}
1670
1671static MemoryListener kvm_io_listener = {
1672    .eventfd_add = kvm_io_ioeventfd_add,
1673    .eventfd_del = kvm_io_ioeventfd_del,
1674    .priority = 10,
1675};
1676
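    /*
     * Set the level of an in-kernel irqchip IRQ line.  Returns 1 when the plain
     * KVM_IRQ_LINE ioctl is in use, otherwise the status reported by the kernel.
     */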
1677int kvm_set_irq(KVMState *s, int irq, int level)
1678{
1679    struct kvm_irq_level event;
1680    int ret;
1681
1682    assert(kvm_async_interrupts_enabled());
1683
1684    event.level = level;
1685    event.irq = irq;
1686    ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
1687    if (ret < 0) {
1688        perror("kvm_set_irq");
1689        abort();
1690    }
1691
1692    return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
1693}
1694
1695#ifdef KVM_CAP_IRQ_ROUTING
1696typedef struct KVMMSIRoute {
1697    struct kvm_irq_routing_entry kroute;
1698    QTAILQ_ENTRY(KVMMSIRoute) entry;
1699} KVMMSIRoute;
1700
1701static void set_gsi(KVMState *s, unsigned int gsi)
1702{
1703    set_bit(gsi, s->used_gsi_bitmap);
1704}
1705
1706static void clear_gsi(KVMState *s, unsigned int gsi)
1707{
1708    clear_bit(gsi, s->used_gsi_bitmap);
1709}
1710
1711void kvm_init_irq_routing(KVMState *s)
1712{
1713    int gsi_count, i;
1714
1715    gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
1716    if (gsi_count > 0) {
1717        /* Round up so we can search ints using ffs */
1718        s->used_gsi_bitmap = bitmap_new(gsi_count);
1719        s->gsi_count = gsi_count;
1720    }
1721
1722    s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
1723    s->nr_allocated_irq_routes = 0;
1724
1725    if (!kvm_direct_msi_allowed) {
1726        for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
1727            QTAILQ_INIT(&s->msi_hashtab[i]);
1728        }
1729    }
1730
1731    kvm_arch_init_irq_routing(s);
1732}
1733
1734void kvm_irqchip_commit_routes(KVMState *s)
1735{
1736    int ret;
1737
1738    if (kvm_gsi_direct_mapping()) {
1739        return;
1740    }
1741
1742    if (!kvm_gsi_routing_enabled()) {
1743        return;
1744    }
1745
1746    s->irq_routes->flags = 0;
1747    trace_kvm_irqchip_commit_routes();
1748    ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
1749    assert(ret == 0);
1750}
1751
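    /*
     * Append @entry to the cached IRQ routing table, growing the table as
     * needed, and mark its GSI as used.  The new routing only takes effect
     * after kvm_irqchip_commit_routes().
     */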
1752static void kvm_add_routing_entry(KVMState *s,
1753                                  struct kvm_irq_routing_entry *entry)
1754{
1755    struct kvm_irq_routing_entry *new;
1756    int n, size;
1757
1758    if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
1759        n = s->nr_allocated_irq_routes * 2;
1760        if (n < 64) {
1761            n = 64;
1762        }
1763        size = sizeof(struct kvm_irq_routing);
1764        size += n * sizeof(*new);
1765        s->irq_routes = g_realloc(s->irq_routes, size);
1766        s->nr_allocated_irq_routes = n;
1767    }
1768    n = s->irq_routes->nr++;
1769    new = &s->irq_routes->entries[n];
1770
1771    *new = *entry;
1772
1773    set_gsi(s, entry->gsi);
1774}
1775
1776static int kvm_update_routing_entry(KVMState *s,
1777                                    struct kvm_irq_routing_entry *new_entry)
1778{
1779    struct kvm_irq_routing_entry *entry;
1780    int n;
1781
1782    for (n = 0; n < s->irq_routes->nr; n++) {
1783        entry = &s->irq_routes->entries[n];
1784        if (entry->gsi != new_entry->gsi) {
1785            continue;
1786        }
1787
1788        if (!memcmp(entry, new_entry, sizeof(*entry))) {
1789            return 0;
1790        }
1791
1792        *entry = *new_entry;
1793
1794        return 0;
1795    }
1796
1797    return -ESRCH;
1798}
1799
1800void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
1801{
1802    struct kvm_irq_routing_entry e = {};
1803
1804    assert(pin < s->gsi_count);
1805
1806    e.gsi = irq;
1807    e.type = KVM_IRQ_ROUTING_IRQCHIP;
1808    e.flags = 0;
1809    e.u.irqchip.irqchip = irqchip;
1810    e.u.irqchip.pin = pin;
1811    kvm_add_routing_entry(s, &e);
1812}
1813
1814void kvm_irqchip_release_virq(KVMState *s, int virq)
1815{
1816    struct kvm_irq_routing_entry *e;
1817    int i;
1818
1819    if (kvm_gsi_direct_mapping()) {
1820        return;
1821    }
1822
1823    for (i = 0; i < s->irq_routes->nr; i++) {
1824        e = &s->irq_routes->entries[i];
1825        if (e->gsi == virq) {
1826            s->irq_routes->nr--;
1827            *e = s->irq_routes->entries[s->irq_routes->nr];
1828        }
1829    }
1830    clear_gsi(s, virq);
1831    kvm_arch_release_virq_post(virq);
1832    trace_kvm_irqchip_release_virq(virq);
1833}
1834
1835void kvm_irqchip_add_change_notifier(Notifier *n)
1836{
1837    notifier_list_add(&kvm_irqchip_change_notifiers, n);
1838}
1839
1840void kvm_irqchip_remove_change_notifier(Notifier *n)
1841{
1842    notifier_remove(n);
1843}
1844
1845void kvm_irqchip_change_notify(void)
1846{
1847    notifier_list_notify(&kvm_irqchip_change_notifiers, NULL);
1848}
1849
1850static unsigned int kvm_hash_msi(uint32_t data)
1851{
1852    /* This is optimized for IA32 MSI layout. However, no other arch shall
1853     * repeat the mistake of not providing a direct MSI injection API. */
1854    return data & 0xff;
1855}
1856
1857static void kvm_flush_dynamic_msi_routes(KVMState *s)
1858{
1859    KVMMSIRoute *route, *next;
1860    unsigned int hash;
1861
1862    for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
1863        QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
1864            kvm_irqchip_release_virq(s, route->kroute.gsi);
1865            QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
1866            g_free(route);
1867        }
1868    }
1869}
1870
1871static int kvm_irqchip_get_virq(KVMState *s)
1872{
1873    int next_virq;
1874
1875    /*
1876     * PIC and IOAPIC share the first 16 GSI numbers, so there are more
1877     * available GSI numbers than IRQ routing entries.  Allocating a GSI
1878     * number can therefore succeed even though no new routing entry can be added.
1879     * When this happens, flush dynamic MSI entries to free up routing entries.
1880     */
1881    if (!kvm_direct_msi_allowed && s->irq_routes->nr == s->gsi_count) {
1882        kvm_flush_dynamic_msi_routes(s);
1883    }
1884
1885    /* Return the lowest unused GSI in the bitmap */
1886    next_virq = find_first_zero_bit(s->used_gsi_bitmap, s->gsi_count);
1887    if (next_virq >= s->gsi_count) {
1888        return -ENOSPC;
1889    } else {
1890        return next_virq;
1891    }
1892}
1893
1894static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg)
1895{
1896    unsigned int hash = kvm_hash_msi(msg.data);
1897    KVMMSIRoute *route;
1898
1899    QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) {
1900        if (route->kroute.u.msi.address_lo == (uint32_t)msg.address &&
1901            route->kroute.u.msi.address_hi == (msg.address >> 32) &&
1902            route->kroute.u.msi.data == le32_to_cpu(msg.data)) {
1903            return route;
1904        }
1905    }
1906    return NULL;
1907}
1908
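    /*
     * Deliver an MSI through the in-kernel irqchip.  When KVM_CAP_SIGNAL_MSI
     * is available the message is injected directly with KVM_SIGNAL_MSI;
     * otherwise a dynamic MSI route is looked up (or allocated and cached in
     * msi_hashtab) and the corresponding GSI is pulsed with kvm_set_irq().
     */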
1909int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
1910{
1911    struct kvm_msi msi;
1912    KVMMSIRoute *route;
1913
1914    if (kvm_direct_msi_allowed) {
1915        msi.address_lo = (uint32_t)msg.address;
1916        msi.address_hi = msg.address >> 32;
1917        msi.data = le32_to_cpu(msg.data);
1918        msi.flags = 0;
1919        memset(msi.pad, 0, sizeof(msi.pad));
1920
1921        return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
1922    }
1923
1924    route = kvm_lookup_msi_route(s, msg);
1925    if (!route) {
1926        int virq;
1927
1928        virq = kvm_irqchip_get_virq(s);
1929        if (virq < 0) {
1930            return virq;
1931        }
1932
1933        route = g_malloc0(sizeof(KVMMSIRoute));
1934        route->kroute.gsi = virq;
1935        route->kroute.type = KVM_IRQ_ROUTING_MSI;
1936        route->kroute.flags = 0;
1937        route->kroute.u.msi.address_lo = (uint32_t)msg.address;
1938        route->kroute.u.msi.address_hi = msg.address >> 32;
1939        route->kroute.u.msi.data = le32_to_cpu(msg.data);
1940
1941        kvm_add_routing_entry(s, &route->kroute);
1942        kvm_irqchip_commit_routes(s);
1943
1944        QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
1945                           entry);
1946    }
1947
1948    assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);
1949
1950    return kvm_set_irq(s, route->kroute.gsi, 1);
1951}
1952
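    /*
     * Allocate a GSI and install an MSI routing entry for the given device
     * vector: build the route from the device's MSI message, apply the
     * arch-specific fixup, then add and commit the routing table.  Returns
     * the virq on success or a negative errno.
     *
     * Illustrative caller pattern (not taken from this file) for wiring a
     * vector to an eventfd:
     *
     *     int virq = kvm_irqchip_add_msi_route(kvm_state, vector, pdev);
     *     if (virq >= 0) {
     *         kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &notifier,
     *                                            NULL, virq);
     *     }
     */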
1953int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
1954{
1955    struct kvm_irq_routing_entry kroute = {};
1956    int virq;
1957    MSIMessage msg = {0, 0};
1958
1959    if (pci_available && dev) {
1960        msg = pci_get_msi_message(dev, vector);
1961    }
1962
1963    if (kvm_gsi_direct_mapping()) {
1964        return kvm_arch_msi_data_to_gsi(msg.data);
1965    }
1966
1967    if (!kvm_gsi_routing_enabled()) {
1968        return -ENOSYS;
1969    }
1970
1971    virq = kvm_irqchip_get_virq(s);
1972    if (virq < 0) {
1973        return virq;
1974    }
1975
1976    kroute.gsi = virq;
1977    kroute.type = KVM_IRQ_ROUTING_MSI;
1978    kroute.flags = 0;
1979    kroute.u.msi.address_lo = (uint32_t)msg.address;
1980    kroute.u.msi.address_hi = msg.address >> 32;
1981    kroute.u.msi.data = le32_to_cpu(msg.data);
1982    if (pci_available && kvm_msi_devid_required()) {
1983        kroute.flags = KVM_MSI_VALID_DEVID;
1984        kroute.u.msi.devid = pci_requester_id(dev);
1985    }
1986    if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
1987        kvm_irqchip_release_virq(s, virq);
1988        return -EINVAL;
1989    }
1990
1991    trace_kvm_irqchip_add_msi_route(dev ? dev->name : (char *)"N/A",
1992                                    vector, virq);
1993
1994    kvm_add_routing_entry(s, &kroute);
1995    kvm_arch_add_msi_route_post(&kroute, vector, dev);
1996    kvm_irqchip_commit_routes(s);
1997
1998    return virq;
1999}
2000
2001int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
2002                                 PCIDevice *dev)
2003{
2004    struct kvm_irq_routing_entry kroute = {};
2005
2006    if (kvm_gsi_direct_mapping()) {
2007        return 0;
2008    }
2009
2010    if (!kvm_irqchip_in_kernel()) {
2011        return -ENOSYS;
2012    }
2013
2014    kroute.gsi = virq;
2015    kroute.type = KVM_IRQ_ROUTING_MSI;
2016    kroute.flags = 0;
2017    kroute.u.msi.address_lo = (uint32_t)msg.address;
2018    kroute.u.msi.address_hi = msg.address >> 32;
2019    kroute.u.msi.data = le32_to_cpu(msg.data);
2020    if (pci_available && kvm_msi_devid_required()) {
2021        kroute.flags = KVM_MSI_VALID_DEVID;
2022        kroute.u.msi.devid = pci_requester_id(dev);
2023    }
2024    if (kvm_arch_fixup_msi_route(&kroute, msg.address, msg.data, dev)) {
2025        return -EINVAL;
2026    }
2027
2028    trace_kvm_irqchip_update_msi_route(virq);
2029
2030    return kvm_update_routing_entry(s, &kroute);
2031}
2032
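    /*
     * Bind (assign == true) or unbind an eventfd to a GSI with the KVM_IRQFD
     * ioctl so the kernel can inject the interrupt without a userspace exit.
     * For level-triggered interrupts a resample notifier may be supplied:
     * with a full in-kernel irqchip it is passed to KVM as a resamplefd,
     * while with a split irqchip the EOI is seen in userspace, so the fd is
     * tracked locally and kicked from the EOI handler instead.
     */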
2033static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
2034                                    EventNotifier *resample, int virq,
2035                                    bool assign)
2036{
2037    int fd = event_notifier_get_fd(event);
2038    int rfd = resample ? event_notifier_get_fd(resample) : -1;
2039
2040    struct kvm_irqfd irqfd = {
2041        .fd = fd,
2042        .gsi = virq,
2043        .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
2044    };
2045
2046    if (rfd != -1) {
2047        assert(assign);
2048        if (kvm_irqchip_is_split()) {
2049            /*
2050             * When the slow irqchip (e.g. IOAPIC) is in the
2051             * userspace, KVM kernel resamplefd will not work because
2052             * the EOI of the interrupt will be delivered to userspace
2053             * instead, so the KVM kernel resamplefd kick will be
2054             * skipped.  The userspace here mimics what the kernel
2055             * provides with resamplefd, remember the resamplefd and
2056             * kick it when we receive EOI of this IRQ.
2057             *
2058             * This is hackery because IOAPIC is mostly bypassed
2059             * (except EOI broadcasts) when irqfd is used.  However
2060             * this can bring much performance back for split irqchip
2061             * with INTx IRQs (for VFIO, this gives 93% perf of the
2062             * full fast path, which is a 46% perf boost compared to
2063             * the INTx slow path).
2064             */
2065            kvm_resample_fd_insert(virq, resample);
2066        } else {
2067            irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
2068            irqfd.resamplefd = rfd;
2069        }
2070    } else if (!assign) {
2071        if (kvm_irqchip_is_split()) {
2072            kvm_resample_fd_remove(virq);
2073        }
2074    }
2075
2076    if (!kvm_irqfds_enabled()) {
2077        return -ENOSYS;
2078    }
2079
2080    return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
2081}
2082
2083int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
2084{
2085    struct kvm_irq_routing_entry kroute = {};
2086    int virq;
2087
2088    if (!kvm_gsi_routing_enabled()) {
2089        return -ENOSYS;
2090    }
2091
2092    virq = kvm_irqchip_get_virq(s);
2093    if (virq < 0) {
2094        return virq;
2095    }
2096
2097    kroute.gsi = virq;
2098    kroute.type = KVM_IRQ_ROUTING_S390_ADAPTER;
2099    kroute.flags = 0;
2100    kroute.u.adapter.summary_addr = adapter->summary_addr;
2101    kroute.u.adapter.ind_addr = adapter->ind_addr;
2102    kroute.u.adapter.summary_offset = adapter->summary_offset;
2103    kroute.u.adapter.ind_offset = adapter->ind_offset;
2104    kroute.u.adapter.adapter_id = adapter->adapter_id;
2105
2106    kvm_add_routing_entry(s, &kroute);
2107
2108    return virq;
2109}
2110
2111int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
2112{
2113    struct kvm_irq_routing_entry kroute = {};
2114    int virq;
2115
2116    if (!kvm_gsi_routing_enabled()) {
2117        return -ENOSYS;
2118    }
2119    if (!kvm_check_extension(s, KVM_CAP_HYPERV_SYNIC)) {
2120        return -ENOSYS;
2121    }
2122    virq = kvm_irqchip_get_virq(s);
2123    if (virq < 0) {
2124        return virq;
2125    }
2126
2127    kroute.gsi = virq;
2128    kroute.type = KVM_IRQ_ROUTING_HV_SINT;
2129    kroute.flags = 0;
2130    kroute.u.hv_sint.vcpu = vcpu;
2131    kroute.u.hv_sint.sint = sint;
2132
2133    kvm_add_routing_entry(s, &kroute);
2134    kvm_irqchip_commit_routes(s);
2135
2136    return virq;
2137}
2138
2139#else /* !KVM_CAP_IRQ_ROUTING */
2140
2141void kvm_init_irq_routing(KVMState *s)
2142{
2143}
2144
2145void kvm_irqchip_release_virq(KVMState *s, int virq)
2146{
2147}
2148
2149int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
2150{
2151    abort();
2152}
2153
2154int kvm_irqchip_add_msi_route(KVMState *s, int vector, PCIDevice *dev)
2155{
2156    return -ENOSYS;
2157}
2158
2159int kvm_irqchip_add_adapter_route(KVMState *s, AdapterInfo *adapter)
2160{
2161    return -ENOSYS;
2162}
2163
2164int kvm_irqchip_add_hv_sint_route(KVMState *s, uint32_t vcpu, uint32_t sint)
2165{
2166    return -ENOSYS;
2167}
2168
2169static int kvm_irqchip_assign_irqfd(KVMState *s, EventNotifier *event,
2170                                    EventNotifier *resample, int virq,
2171                                    bool assign)
2172{
2173    abort();
2174}
2175
2176int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
                                     PCIDevice *dev)
2177{
2178    return -ENOSYS;
2179}
2180#endif /* !KVM_CAP_IRQ_ROUTING */
2181
2182int kvm_irqchip_add_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
2183                                       EventNotifier *rn, int virq)
2184{
2185    return kvm_irqchip_assign_irqfd(s, n, rn, virq, true);
2186}
2187
2188int kvm_irqchip_remove_irqfd_notifier_gsi(KVMState *s, EventNotifier *n,
2189                                          int virq)
2190{
2191    return kvm_irqchip_assign_irqfd(s, n, NULL, virq, false);
2192}
2193
2194int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
2195                                   EventNotifier *rn, qemu_irq irq)
2196{
2197    gpointer key, gsi;
2198    gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
2199
2200    if (!found) {
2201        return -ENXIO;
2202    }
2203    return kvm_irqchip_add_irqfd_notifier_gsi(s, n, rn, GPOINTER_TO_INT(gsi));
2204}
2205
2206int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n,
2207                                      qemu_irq irq)
2208{
2209    gpointer key, gsi;
2210    gboolean found = g_hash_table_lookup_extended(s->gsimap, irq, &key, &gsi);
2211
2212    if (!found) {
2213        return -ENXIO;
2214    }
2215    return kvm_irqchip_remove_irqfd_notifier_gsi(s, n, GPOINTER_TO_INT(gsi));
2216}
2217
2218void kvm_irqchip_set_qemuirq_gsi(KVMState *s, qemu_irq irq, int gsi)
2219{
2220    g_hash_table_insert(s->gsimap, irq, GINT_TO_POINTER(gsi));
2221}
2222
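    /*
     * Create the in-kernel irqchip if the host supports it: let the
     * arch-specific hook try first (e.g. for the split irqchip), otherwise
     * fall back to KVM_CREATE_IRQCHIP.  On success, in-kernel interrupt
     * delivery and halt handling are enabled, GSI routing is initialized and
     * the qemu_irq -> GSI hash table is created.
     */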
2223static void kvm_irqchip_create(KVMState *s)
2224{
2225    int ret;
2226
2227    assert(s->kernel_irqchip_split != ON_OFF_AUTO_AUTO);
2228    if (kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
2229        ;
2230    } else if (kvm_check_extension(s, KVM_CAP_S390_IRQCHIP)) {
2231        ret = kvm_vm_enable_cap(s, KVM_CAP_S390_IRQCHIP, 0);
2232        if (ret < 0) {
2233            fprintf(stderr, "Enable kernel irqchip failed: %s\n", strerror(-ret));
2234            exit(1);
2235        }
2236    } else {
2237        return;
2238    }
2239
2240    /* First probe and see if there's an arch-specific hook to create the
2241     * in-kernel irqchip for us */
2242    ret = kvm_arch_irqchip_create(s);
2243    if (ret == 0) {
2244        if (s->kernel_irqchip_split == ON_OFF_AUTO_ON) {
2245            fprintf(stderr, "Split IRQ chip mode not supported.\n");
2246            exit(1);
2247        } else {
2248            ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
2249        }
2250    }
2251    if (ret < 0) {
2252        fprintf(stderr, "Create kernel irqchip failed: %s\n", strerror(-ret));
2253        exit(1);
2254    }
2255
2256    kvm_kernel_irqchip = true;
2257    /* If we have an in-kernel IRQ chip then we must have asynchronous
2258     * interrupt delivery (though the reverse is not necessarily true)
2259     */
2260    kvm_async_interrupts_allowed = true;
2261    kvm_halt_in_kernel_allowed = true;
2262
2263    kvm_init_irq_routing(s);
2264
2265    s->gsimap = g_hash_table_new(g_direct_hash, g_direct_equal);
2266}
2267
2268/* Find number of supported CPUs using the recommended
2269 * procedure from the kernel API documentation to cope with
2270 * older kernels that may be missing capabilities.
2271 */
2272static int kvm_recommended_vcpus(KVMState *s)
2273{
2274    int ret = kvm_vm_check_extension(s, KVM_CAP_NR_VCPUS);
2275    return (ret) ? ret : 4;
2276}
2277
2278static int kvm_max_vcpus(KVMState *s)
2279{
2280    int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
2281    return (ret) ? ret : kvm_recommended_vcpus(s);
2282}
2283
2284static int kvm_max_vcpu_id(KVMState *s)
2285{
2286    int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPU_ID);
2287    return (ret) ? ret : kvm_max_vcpus(s);
2288}
2289
2290bool kvm_vcpu_id_is_valid(int vcpu_id)
2291{
2292    KVMState *s = KVM_STATE(current_accel());
2293    return vcpu_id >= 0 && vcpu_id < kvm_max_vcpu_id(s);
2294}
2295
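    /*
     * Accelerator bring-up: open /dev/kvm, check the API version, create the
     * VM (retrying KVM_CREATE_VM on EINTR), validate the requested vcpu
     * counts against the soft and hard limits, probe required and optional
     * capabilities, set up either the dirty ring or the dirty-log bitmap
     * path, call kvm_arch_init() and finally register the memory listeners.
     */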
2296static int kvm_init(MachineState *ms)
2297{
2298    MachineClass *mc = MACHINE_GET_CLASS(ms);
2299    static const char upgrade_note[] =
2300        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
2301        "(see http://sourceforge.net/projects/kvm).\n";
2302    struct {
2303        const char *name;
2304        int num;
2305    } num_cpus[] = {
2306        { "SMP",          ms->smp.cpus },
2307        { "hotpluggable", ms->smp.max_cpus },
2308        { NULL, }
2309    }, *nc = num_cpus;
2310    int soft_vcpus_limit, hard_vcpus_limit;
2311    KVMState *s;
2312    const KVMCapabilityInfo *missing_cap;
2313    int ret;
2314    int type = 0;
2315    uint64_t dirty_log_manual_caps;
2316
2317    qemu_mutex_init(&kml_slots_lock);
2318
2319    s = KVM_STATE(ms->accelerator);
2320
2321    /*
2322     * On systems where the kernel can support different base page
2323     * sizes, host page size may be different from TARGET_PAGE_SIZE,
2324     * even with KVM.  TARGET_PAGE_SIZE is assumed to be the minimum
2325     * page size for the system though.
2326     */
2327    assert(TARGET_PAGE_SIZE <= qemu_real_host_page_size);
2328
2329    s->sigmask_len = 8;
2330
2331#ifdef KVM_CAP_SET_GUEST_DEBUG
2332    QTAILQ_INIT(&s->kvm_sw_breakpoints);
2333#endif
2334    QLIST_INIT(&s->kvm_parked_vcpus);
2335    s->fd = qemu_open_old("/dev/kvm", O_RDWR);
2336    if (s->fd == -1) {
2337        fprintf(stderr, "Could not access KVM kernel module: %m\n");
2338        ret = -errno;
2339        goto err;
2340    }
2341
2342    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
2343    if (ret < KVM_API_VERSION) {
2344        if (ret >= 0) {
2345            ret = -EINVAL;
2346        }
2347        fprintf(stderr, "kvm version too old\n");
2348        goto err;
2349    }
2350
2351    if (ret > KVM_API_VERSION) {
2352        ret = -EINVAL;
2353        fprintf(stderr, "kvm version not supported\n");
2354        goto err;
2355    }
2356
2357    kvm_immediate_exit = kvm_check_extension(s, KVM_CAP_IMMEDIATE_EXIT);
2358    s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
2359
2360    /* If unspecified, use the default value */
2361    if (!s->nr_slots) {
2362        s->nr_slots = 32;
2363    }
2364
2365    s->nr_as = kvm_check_extension(s, KVM_CAP_MULTI_ADDRESS_SPACE);
2366    if (s->nr_as <= 1) {
2367        s->nr_as = 1;
2368    }
2369    s->as = g_new0(struct KVMAs, s->nr_as);
2370
2371    if (object_property_find(OBJECT(current_machine), "kvm-type")) {
2372        g_autofree char *kvm_type = object_property_get_str(OBJECT(current_machine),
2373                                                            "kvm-type",
2374                                                            &error_abort);
2375        type = mc->kvm_type(ms, kvm_type);
2376    } else if (mc->kvm_type) {
2377        type = mc->kvm_type(ms, NULL);
2378    }
2379
2380    do {
2381        ret = kvm_ioctl(s, KVM_CREATE_VM, type);
2382    } while (ret == -EINTR);
2383
2384    if (ret < 0) {
2385        fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret,
2386                strerror(-ret));
2387
2388#ifdef TARGET_S390X
2389        if (ret == -EINVAL) {
2390            fprintf(stderr,
2391                    "Host kernel setup problem detected. Please verify:\n");
2392            fprintf(stderr, "- for kernels supporting the switch_amode or"
2393                    " user_mode parameters, whether\n");
2394            fprintf(stderr,
2395                    "  user space is running in primary address space\n");
2396            fprintf(stderr,
2397                    "- for kernels supporting the vm.allocate_pgste sysctl, "
2398                    "whether it is enabled\n");
2399        }
2400#elif defined(TARGET_PPC)
2401        if (ret == -EINVAL) {
2402            fprintf(stderr,
2403                    "PPC KVM module is not loaded. Try modprobe kvm_%s.\n",
2404                    (type == 2) ? "pr" : "hv");
2405        }
2406#endif
2407        goto err;
2408    }
2409
2410    s->vmfd = ret;
2411
2412    /* check the vcpu limits */
2413    soft_vcpus_limit = kvm_recommended_vcpus(s);
2414    hard_vcpus_limit = kvm_max_vcpus(s);
2415
2416    while (nc->name) {
2417        if (nc->num > soft_vcpus_limit) {
2418            warn_report("Number of %s cpus requested (%d) exceeds "
2419                        "the recommended cpus supported by KVM (%d)",
2420                        nc->name, nc->num, soft_vcpus_limit);
2421
2422            if (nc->num > hard_vcpus_limit) {
2423                fprintf(stderr, "Number of %s cpus requested (%d) exceeds "
2424                        "the maximum cpus supported by KVM (%d)\n",
2425                        nc->name, nc->num, hard_vcpus_limit);
2426                exit(1);
2427            }
2428        }
2429        nc++;
2430    }
2431
2432    missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
2433    if (!missing_cap) {
2434        missing_cap =
2435            kvm_check_extension_list(s, kvm_arch_required_capabilities);
2436    }
2437    if (missing_cap) {
2438        ret = -EINVAL;
2439        fprintf(stderr, "kvm does not support %s\n%s",
2440                missing_cap->name, upgrade_note);
2441        goto err;
2442    }
2443
2444    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
2445    s->coalesced_pio = s->coalesced_mmio &&
2446                       kvm_check_extension(s, KVM_CAP_COALESCED_PIO);
2447
2448    /*
2449     * Enable KVM dirty ring if supported, otherwise fall back to
2450     * dirty logging mode
2451     */
2452    if (s->kvm_dirty_ring_size > 0) {
2453        uint64_t ring_bytes;
2454
2455        ring_bytes = s->kvm_dirty_ring_size * sizeof(struct kvm_dirty_gfn);
2456
2457        /* Read the max supported ring size, in bytes */
2458        ret = kvm_vm_check_extension(s, KVM_CAP_DIRTY_LOG_RING);
2459        if (ret > 0) {
2460            if (ring_bytes > ret) {
2461                error_report("KVM dirty ring size %" PRIu32 " too big "
2462                             "(maximum is %ld).  Please use a smaller value.",
2463                             s->kvm_dirty_ring_size,
2464                             (long)ret / sizeof(struct kvm_dirty_gfn));
2465                ret = -EINVAL;
2466                goto err;
2467            }
2468
2469            ret = kvm_vm_enable_cap(s, KVM_CAP_DIRTY_LOG_RING, 0, ring_bytes);
2470            if (ret) {
2471                error_report("Enabling of KVM dirty ring failed: %s. "
2472                             "Suggested mininum value is 1024.", strerror(-ret));
2473                goto err;
2474            }
2475
2476            s->kvm_dirty_ring_bytes = ring_bytes;
2477        } else {
2478            warn_report("KVM dirty ring not available, using bitmap method");
2479            s->kvm_dirty_ring_size = 0;
2480        }
2481    }
2482
2483    /*
2484     * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 is not needed when dirty ring is
2485     * enabled.  More importantly, KVM_DIRTY_LOG_INITIALLY_SET will assume no
2486     * page is wr-protected initially, which is against how the kvm dirty ring is
2487     * used - the dirty ring requires all pages to be wr-protected at the very
2488     * beginning.  Enabling this feature for dirty ring causes data corruption.
2489     *
2490     * TODO: Without KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and kvm clear dirty log,
2491     * we may expect a higher stall time when starting the migration.  In the
2492     * future we can enable KVM_CLEAR_DIRTY_LOG to work with dirty ring too:
2493     * instead of clearing dirty bit, it can be a way to explicitly wr-protect
2494     * guest pages.
2495     */
2496    if (!s->kvm_dirty_ring_size) {
2497        dirty_log_manual_caps =
2498            kvm_check_extension(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2);
2499        dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE |
2500                                  KVM_DIRTY_LOG_INITIALLY_SET);
2501        s->manual_dirty_log_protect = dirty_log_manual_caps;
2502        if (dirty_log_manual_caps) {
2503            ret = kvm_vm_enable_cap(s, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, 0,
2504                                    dirty_log_manual_caps);
2505            if (ret) {
2506                warn_report("Failed to enable capability %"PRIu64" of "
2507                            "KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2. "
2508                            "Falling back to the legacy mode.",
2509                            dirty_log_manual_caps);
2510                s->manual_dirty_log_protect = 0;
2511            }
2512        }
2513    }
2514
2515#ifdef KVM_CAP_VCPU_EVENTS
2516    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
2517#endif
2518
2519    s->robust_singlestep =
2520        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
2521
2522#ifdef KVM_CAP_DEBUGREGS
2523    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
2524#endif
2525
2526    s->max_nested_state_len = kvm_check_extension(s, KVM_CAP_NESTED_STATE);
2527
2528#ifdef KVM_CAP_IRQ_ROUTING
2529    kvm_direct_msi_allowed = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
2530#endif
2531
2532    s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3);
2533
2534    s->irq_set_ioctl = KVM_IRQ_LINE;
2535    if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
2536        s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
2537    }
2538
2539    kvm_readonly_mem_allowed =
2540        (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
2541
2542    kvm_eventfds_allowed =
2543        (kvm_check_extension(s, KVM_CAP_IOEVENTFD) > 0);
2544
2545    kvm_irqfds_allowed =
2546        (kvm_check_extension(s, KVM_CAP_IRQFD) > 0);
2547
2548    kvm_resamplefds_allowed =
2549        (kvm_check_extension(s, KVM_CAP_IRQFD_RESAMPLE) > 0);
2550
2551    kvm_vm_attributes_allowed =
2552        (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);
2553
2554    kvm_ioeventfd_any_length_allowed =
2555        (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0);
2556
2557    kvm_state = s;
2558
2559    ret = kvm_arch_init(ms, s);
2560    if (ret < 0) {
2561        goto err;
2562    }
2563
2564    if (s->kernel_irqchip_split == ON_OFF_AUTO_AUTO) {
2565        s->kernel_irqchip_split = mc->default_kernel_irqchip_split ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
2566    }
2567
2568    qemu_register_reset(kvm_unpoison_all, NULL);
2569
2570    if (s->kernel_irqchip_allowed) {
2571        kvm_irqchip_create(s);
2572    }
2573
2574    if (kvm_eventfds_allowed) {
2575        s->memory_listener.listener.eventfd_add = kvm_mem_ioeventfd_add;
2576        s->memory_listener.listener.eventfd_del = kvm_mem_ioeventfd_del;
2577    }
2578    s->memory_listener.listener.coalesced_io_add = kvm_coalesce_mmio_region;
2579    s->memory_listener.listener.coalesced_io_del = kvm_uncoalesce_mmio_region;
2580
2581    kvm_memory_listener_register(s, &s->memory_listener,
2582                                 &address_space_memory, 0);
2583    if (kvm_eventfds_allowed) {
2584        memory_listener_register(&kvm_io_listener,
2585                                 &address_space_io);
2586    }
2587    memory_listener_register(&kvm_coalesced_pio_listener,
2588                             &address_space_io);
2589
2590    s->many_ioeventfds = kvm_check_many_ioeventfds();
2591
2592    s->sync_mmu = !!kvm_vm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
2593    if (!s->sync_mmu) {
2594        ret = ram_block_discard_disable(true);
2595        assert(!ret);
2596    }
2597
2598    if (s->kvm_dirty_ring_size) {
2599        ret = kvm_dirty_ring_reaper_init(s);
2600        if (ret) {
2601            goto err;
2602        }
2603    }
2604
2605    return 0;
2606
2607err:
2608    assert(ret < 0);
2609    if (s->vmfd >= 0) {
2610        close(s->vmfd);
2611    }
2612    if (s->fd != -1) {
2613        close(s->fd);
2614    }
2615    g_free(s->memory_listener.slots);
2616
2617    return ret;
2618}
2619
2620void kvm_set_sigmask_len(KVMState *s, unsigned int sigmask_len)
2621{
2622    s->sigmask_len = sigmask_len;
2623}
2624
2625static void kvm_handle_io(uint16_t port, MemTxAttrs attrs, void *data, int direction,
2626                          int size, uint32_t count)
2627{
2628    int i;
2629    uint8_t *ptr = data;
2630
2631    for (i = 0; i < count; i++) {
2632        address_space_rw(&address_space_io, port, attrs,
2633                         ptr, size,
2634                         direction == KVM_EXIT_IO_OUT);
2635        ptr += size;
2636    }
2637}
2638
2639static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
2640{
2641    fprintf(stderr, "KVM internal error. Suberror: %d\n",
2642            run->internal.suberror);
2643
2644    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
2645        int i;
2646
2647        for (i = 0; i < run->internal.ndata; ++i) {
2648            fprintf(stderr, "extra data[%d]: 0x%016"PRIx64"\n",
2649                    i, (uint64_t)run->internal.data[i]);
2650        }
2651    }
2652    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
2653        fprintf(stderr, "emulation failure\n");
2654        if (!kvm_arch_stop_on_emulation_error(cpu)) {
2655            cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2656            return EXCP_INTERRUPT;
2657        }
2658    }
2659    /* FIXME: Should trigger a qmp message to let management know
2660     * something went wrong.
2661     */
2662    return -1;
2663}
2664
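    /*
     * Drain the coalesced MMIO/PIO ring shared with the kernel, replaying
     * every buffered access into the I/O or memory address space.  The
     * in-progress flag guards against re-entrancy, since a replayed write
     * may itself trigger another flush.
     */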
2665void kvm_flush_coalesced_mmio_buffer(void)
2666{
2667    KVMState *s = kvm_state;
2668
2669    if (s->coalesced_flush_in_progress) {
2670        return;
2671    }
2672
2673    s->coalesced_flush_in_progress = true;
2674
2675    if (s->coalesced_mmio_ring) {
2676        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
2677        while (ring->first != ring->last) {
2678            struct kvm_coalesced_mmio *ent;
2679
2680            ent = &ring->coalesced_mmio[ring->first];
2681
2682            if (ent->pio == 1) {
2683                address_space_write(&address_space_io, ent->phys_addr,
2684                                    MEMTXATTRS_UNSPECIFIED, ent->data,
2685                                    ent->len);
2686            } else {
2687                cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
2688            }
2689            smp_wmb();
2690            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
2691        }
2692    }
2693
2694    s->coalesced_flush_in_progress = false;
2695}
2696
2697bool kvm_cpu_check_are_resettable(void)
2698{
2699    return kvm_arch_cpu_check_are_resettable();
2700}
2701
2702static void do_kvm_cpu_synchronize_state(CPUState *cpu, run_on_cpu_data arg)
2703{
2704    if (!cpu->vcpu_dirty) {
2705        kvm_arch_get_registers(cpu);
2706        cpu->vcpu_dirty = true;
2707    }
2708}
2709
2710void kvm_cpu_synchronize_state(CPUState *cpu)
2711{
2712    if (!cpu->vcpu_dirty) {
2713        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, RUN_ON_CPU_NULL);
2714    }
2715}
2716
2717static void do_kvm_cpu_synchronize_post_reset(CPUState *cpu, run_on_cpu_data arg)
2718{
2719    kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
2720    cpu->vcpu_dirty = false;
2721}
2722
2723void kvm_cpu_synchronize_post_reset(CPUState *cpu)
2724{
2725    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_reset, RUN_ON_CPU_NULL);
2726}
2727
2728static void do_kvm_cpu_synchronize_post_init(CPUState *cpu, run_on_cpu_data arg)
2729{
2730    kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
2731    cpu->vcpu_dirty = false;
2732}
2733
2734void kvm_cpu_synchronize_post_init(CPUState *cpu)
2735{
2736    run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
2737}
2738
2739static void do_kvm_cpu_synchronize_pre_loadvm(CPUState *cpu, run_on_cpu_data arg)
2740{
2741    cpu->vcpu_dirty = true;
2742}
2743
2744void kvm_cpu_synchronize_pre_loadvm(CPUState *cpu)
2745{
2746    run_on_cpu(cpu, do_kvm_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
2747}
2748
2749#ifdef KVM_HAVE_MCE_INJECTION
2750static __thread void *pending_sigbus_addr;
2751static __thread int pending_sigbus_code;
2752static __thread bool have_sigbus_pending;
2753#endif
2754
2755static void kvm_cpu_kick(CPUState *cpu)
2756{
2757    qatomic_set(&cpu->kvm_run->immediate_exit, 1);
2758}
2759
2760static void kvm_cpu_kick_self(void)
2761{
2762    if (kvm_immediate_exit) {
2763        kvm_cpu_kick(current_cpu);
2764    } else {
2765        qemu_cpu_kick_self();
2766    }
2767}
2768
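    /*
     * Consume a pending vcpu kick: with KVM_CAP_IMMEDIATE_EXIT it is enough
     * to clear kvm_run->immediate_exit; otherwise drain any queued SIG_IPI
     * with sigtimedwait() (the kick signal stays blocked outside of KVM_RUN,
     * so it remains pending until consumed here).
     */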
2769static void kvm_eat_signals(CPUState *cpu)
2770{
2771    struct timespec ts = { 0, 0 };
2772    siginfo_t siginfo;
2773    sigset_t waitset;
2774    sigset_t chkset;
2775    int r;
2776
2777    if (kvm_immediate_exit) {
2778        qatomic_set(&cpu->kvm_run->immediate_exit, 0);
2779        /* Write kvm_run->immediate_exit before the cpu->exit_request
2780         * write in kvm_cpu_exec.
2781         */
2782        smp_wmb();
2783        return;
2784    }
2785
2786    sigemptyset(&waitset);
2787    sigaddset(&waitset, SIG_IPI);
2788
2789    do {
2790        r = sigtimedwait(&waitset, &siginfo, &ts);
2791        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
2792            perror("sigtimedwait");
2793            exit(1);
2794        }
2795
2796        r = sigpending(&chkset);
2797        if (r == -1) {
2798            perror("sigpending");
2799            exit(1);
2800        }
2801    } while (sigismember(&chkset, SIG_IPI));
2802}
2803
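    /*
     * Per-vcpu execution loop: flush dirty register state to the kernel, run
     * the guest with KVM_RUN outside the BQL and dispatch the exit reason
     * (PIO, MMIO, system events, dirty-ring-full, internal errors, ...).
     * Returns EXCP_INTERRUPT/EXCP_HLT to the caller's loop, or a negative
     * value on a fatal error, in which case the VM is stopped.
     */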
2804int kvm_cpu_exec(CPUState *cpu)
2805{
2806    struct kvm_run *run = cpu->kvm_run;
2807    int ret, run_ret;
2808
2809    DPRINTF("kvm_cpu_exec()\n");
2810
2811    if (kvm_arch_process_async_events(cpu)) {
2812        qatomic_set(&cpu->exit_request, 0);
2813        return EXCP_HLT;
2814    }
2815
2816    qemu_mutex_unlock_iothread();
2817    cpu_exec_start(cpu);
2818
2819    do {
2820        MemTxAttrs attrs;
2821
2822        if (cpu->vcpu_dirty) {
2823            kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
2824            cpu->vcpu_dirty = false;
2825        }
2826
2827        kvm_arch_pre_run(cpu, run);
2828        if (qatomic_read(&cpu->exit_request)) {
2829            DPRINTF("interrupt exit requested\n");
2830            /*
2831             * KVM requires us to reenter the kernel after IO exits to complete
2832             * instruction emulation. This self-signal will ensure that we
2833             * leave ASAP again.
2834             */
2835            kvm_cpu_kick_self();
2836        }
2837
2838        /* Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
2839         * Matching barrier in kvm_eat_signals.
2840         */
2841        smp_rmb();
2842
2843        run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
2844
2845        attrs = kvm_arch_post_run(cpu, run);
2846
2847#ifdef KVM_HAVE_MCE_INJECTION
2848        if (unlikely(have_sigbus_pending)) {
2849            qemu_mutex_lock_iothread();
2850            kvm_arch_on_sigbus_vcpu(cpu, pending_sigbus_code,
2851                                    pending_sigbus_addr);
2852            have_sigbus_pending = false;
2853            qemu_mutex_unlock_iothread();
2854        }
2855#endif
2856
2857        if (run_ret < 0) {
2858            if (run_ret == -EINTR || run_ret == -EAGAIN) {
2859                DPRINTF("io window exit\n");
2860                kvm_eat_signals(cpu);
2861                ret = EXCP_INTERRUPT;
2862                break;
2863            }
2864            fprintf(stderr, "error: kvm run failed %s\n",
2865                    strerror(-run_ret));
2866#ifdef TARGET_PPC
2867            if (run_ret == -EBUSY) {
2868                fprintf(stderr,
2869                        "This is probably because your SMT is enabled.\n"
2870                        "VCPU can only run on primary threads with all "
2871                        "secondary threads offline.\n");
2872            }
2873#endif
2874            ret = -1;
2875            break;
2876        }
2877
2878        trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
2879        switch (run->exit_reason) {
2880        case KVM_EXIT_IO:
2881            DPRINTF("handle_io\n");
2882            /* Called outside BQL */
2883            kvm_handle_io(run->io.port, attrs,
2884                          (uint8_t *)run + run->io.data_offset,
2885                          run->io.direction,
2886                          run->io.size,
2887                          run->io.count);
2888            ret = 0;
2889            break;
2890        case KVM_EXIT_MMIO:
2891            DPRINTF("handle_mmio\n");
2892            /* Called outside BQL */
2893            address_space_rw(&address_space_memory,
2894                             run->mmio.phys_addr, attrs,
2895                             run->mmio.data,
2896                             run->mmio.len,
2897                             run->mmio.is_write);
2898            ret = 0;
2899            break;
2900        case KVM_EXIT_IRQ_WINDOW_OPEN:
2901            DPRINTF("irq_window_open\n");
2902            ret = EXCP_INTERRUPT;
2903            break;
2904        case KVM_EXIT_SHUTDOWN:
2905            DPRINTF("shutdown\n");
2906            qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
2907            ret = EXCP_INTERRUPT;
2908            break;
2909        case KVM_EXIT_UNKNOWN:
2910            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
2911                    (uint64_t)run->hw.hardware_exit_reason);
2912            ret = -1;
2913            break;
2914        case KVM_EXIT_INTERNAL_ERROR:
2915            ret = kvm_handle_internal_error(cpu, run);
2916            break;
2917        case KVM_EXIT_DIRTY_RING_FULL:
2918            /*
2919             * We shouldn't continue while the dirty ring of this vcpu is
2920             * still full; reap the rings so KVM_RESET_DIRTY_RINGS makes room.
2921             */
2922            trace_kvm_dirty_ring_full(cpu->cpu_index);
2923            qemu_mutex_lock_iothread();
2924            kvm_dirty_ring_reap(kvm_state);
2925            qemu_mutex_unlock_iothread();
2926            ret = 0;
2927            break;
2928        case KVM_EXIT_SYSTEM_EVENT:
2929            switch (run->system_event.type) {
2930            case KVM_SYSTEM_EVENT_SHUTDOWN:
2931                qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
2932                ret = EXCP_INTERRUPT;
2933                break;
2934            case KVM_SYSTEM_EVENT_RESET:
2935                qemu_system_reset_request(SHUTDOWN_CAUSE_GUEST_RESET);
2936                ret = EXCP_INTERRUPT;
2937                break;
2938            case KVM_SYSTEM_EVENT_CRASH:
2939                kvm_cpu_synchronize_state(cpu);
2940                qemu_mutex_lock_iothread();
2941                qemu_system_guest_panicked(cpu_get_crash_info(cpu));
2942                qemu_mutex_unlock_iothread();
2943                ret = 0;
2944                break;
2945            default:
2946                DPRINTF("kvm_arch_handle_exit\n");
2947                ret = kvm_arch_handle_exit(cpu, run);
2948                break;
2949            }
2950            break;
2951        default:
2952            DPRINTF("kvm_arch_handle_exit\n");
2953            ret = kvm_arch_handle_exit(cpu, run);
2954            break;
2955        }
2956    } while (ret == 0);
2957
2958    cpu_exec_end(cpu);
2959    qemu_mutex_lock_iothread();
2960
2961    if (ret < 0) {
2962        cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
2963        vm_stop(RUN_STATE_INTERNAL_ERROR);
2964    }
2965
2966    qatomic_set(&cpu->exit_request, 0);
2967    return ret;
2968}
2969
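    /*
     * The kvm_*_ioctl() helpers below wrap ioctl(2) on the /dev/kvm, VM,
     * vcpu and device file descriptors respectively.  Each takes one
     * optional argument (fetched as a void *), emits a tracepoint and
     * converts the -1/errno convention into a negative errno return value,
     * e.g.:
     *
     *     ret = kvm_vm_ioctl(s, KVM_CHECK_EXTENSION, KVM_CAP_SYNC_MMU);
     */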
2970int kvm_ioctl(KVMState *s, int type, ...)
2971{
2972    int ret;
2973    void *arg;
2974    va_list ap;
2975
2976    va_start(ap, type);
2977    arg = va_arg(ap, void *);
2978    va_end(ap);
2979
2980    trace_kvm_ioctl(type, arg);
2981    ret = ioctl(s->fd, type, arg);
2982    if (ret == -1) {
2983        ret = -errno;
2984    }
2985    return ret;
2986}
2987
2988int kvm_vm_ioctl(KVMState *s, int type, ...)
2989{
2990    int ret;
2991    void *arg;
2992    va_list ap;
2993
2994    va_start(ap, type);
2995    arg = va_arg(ap, void *);
2996    va_end(ap);
2997
2998    trace_kvm_vm_ioctl(type, arg);
2999    ret = ioctl(s->vmfd, type, arg);
3000    if (ret == -1) {
3001        ret = -errno;
3002    }
3003    return ret;
3004}
3005
3006int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
3007{
3008    int ret;
3009    void *arg;
3010    va_list ap;
3011
3012    va_start(ap, type);
3013    arg = va_arg(ap, void *);
3014    va_end(ap);
3015
3016    trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
3017    ret = ioctl(cpu->kvm_fd, type, arg);
3018    if (ret == -1) {
3019        ret = -errno;
3020    }
3021    return ret;
3022}
3023
3024int kvm_device_ioctl(int fd, int type, ...)
3025{
3026    int ret;
3027    void *arg;
3028    va_list ap;
3029
3030    va_start(ap, type);
3031    arg = va_arg(ap, void *);
3032    va_end(ap);
3033
3034    trace_kvm_device_ioctl(fd, type, arg);
3035    ret = ioctl(fd, type, arg);
3036    if (ret == -1) {
3037        ret = -errno;
3038    }
3039    return ret;
3040}
3041
3042int kvm_vm_check_attr(KVMState *s, uint32_t group, uint64_t attr)
3043{
3044    int ret;
3045    struct kvm_device_attr attribute = {
3046        .group = group,
3047        .attr = attr,
3048    };
3049
3050    if (!kvm_vm_attributes_allowed) {
3051        return 0;
3052    }
3053
3054    ret = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &attribute);
3055    /* kvm returns 0 on success for HAS_DEVICE_ATTR */
3056    return ret ? 0 : 1;
3057}
3058
3059int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
3060{
3061    struct kvm_device_attr attribute = {
3062        .group = group,
3063        .attr = attr,
3064        .flags = 0,
3065    };
3066
3067    return kvm_device_ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute) ? 0 : 1;
3068}
3069
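    /*
     * Read or write one device attribute through KVM_GET/SET_DEVICE_ATTR on
     * a device fd obtained from kvm_create_device(), reporting failures via
     * the Error object.  'val' points at the attribute payload in both
     * directions.
     */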
3070int kvm_device_access(int fd, int group, uint64_t attr,
3071                      void *val, bool write, Error **errp)
3072{
3073    struct kvm_device_attr kvmattr;
3074    int err;
3075
3076    kvmattr.flags = 0;
3077    kvmattr.group = group;
3078    kvmattr.attr = attr;
3079    kvmattr.addr = (uintptr_t)val;
3080
3081    err = kvm_device_ioctl(fd,
3082                           write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
3083                           &kvmattr);
3084    if (err < 0) {
3085        error_setg_errno(errp, -err,
3086                         "KVM_%s_DEVICE_ATTR failed: Group %d "
3087                         "attr 0x%016" PRIx64,
3088                         write ? "SET" : "GET", group, attr);
3089    }
3090    return err;
3091}
3092
3093bool kvm_has_sync_mmu(void)
3094{
3095    return kvm_state->sync_mmu;
3096}
3097
3098int kvm_has_vcpu_events(void)
3099{
3100    return kvm_state->vcpu_events;
3101}
3102
3103int kvm_has_robust_singlestep(void)
3104{
3105    return kvm_state->robust_singlestep;
3106}
3107
3108int kvm_has_debugregs(void)
3109{
3110    return kvm_state->debugregs;
3111}
3112
3113int kvm_max_nested_state_length(void)
3114{
3115    return kvm_state->max_nested_state_len;
3116}
3117
3118int kvm_has_many_ioeventfds(void)
3119{
3120    if (!kvm_enabled()) {
3121        return 0;
3122    }
3123    return kvm_state->many_ioeventfds;
3124}
3125
3126int kvm_has_gsi_routing(void)
3127{
3128#ifdef KVM_CAP_IRQ_ROUTING
3129    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
3130#else
3131    return false;
3132#endif
3133}
3134
3135int kvm_has_intx_set_mask(void)
3136{
3137    return kvm_state->intx_set_mask;
3138}
3139
3140bool kvm_arm_supports_user_irq(void)
3141{
3142    return kvm_check_extension(kvm_state, KVM_CAP_ARM_USER_IRQ);
3143}
3144
3145#ifdef KVM_CAP_SET_GUEST_DEBUG
3146struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
3147                                                 target_ulong pc)
3148{
3149    struct kvm_sw_breakpoint *bp;
3150
3151    QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
3152        if (bp->pc == pc) {
3153            return bp;
3154        }
3155    }
3156    return NULL;
3157}
3158
3159int kvm_sw_breakpoints_active(CPUState *cpu)
3160{
3161    return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
3162}
3163
3164struct kvm_set_guest_debug_data {
3165    struct kvm_guest_debug dbg;
3166    int err;
3167};
3168
3169static void kvm_invoke_set_guest_debug(CPUState *cpu, run_on_cpu_data data)
3170{
3171    struct kvm_set_guest_debug_data *dbg_data =
3172        (struct kvm_set_guest_debug_data *) data.host_ptr;
3173
3174    dbg_data->err = kvm_vcpu_ioctl(cpu, KVM_SET_GUEST_DEBUG,
3175                                   &dbg_data->dbg);
3176}
3177
3178int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
3179{
3180    struct kvm_set_guest_debug_data data;
3181
3182    data.dbg.control = reinject_trap;
3183
3184    if (cpu->singlestep_enabled) {
3185        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
3186    }
3187    kvm_arch_update_guest_debug(cpu, &data.dbg);
3188
3189    run_on_cpu(cpu, kvm_invoke_set_guest_debug,
3190               RUN_ON_CPU_HOST_PTR(&data));
3191    return data.err;
3192}
3193
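    /*
     * GDB breakpoint plumbing: software breakpoints are reference-counted in
     * the per-VM kvm_sw_breakpoints list and patched into guest memory by
     * the arch hooks, while hardware breakpoints are handed to the arch code
     * directly; in both cases the guest-debug state of every vcpu is then
     * refreshed with kvm_update_guest_debug().
     */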
3194int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
3195                          target_ulong len, int type)
3196{
3197    struct kvm_sw_breakpoint *bp;
3198    int err;
3199
3200    if (type == GDB_BREAKPOINT_SW) {
3201        bp = kvm_find_sw_breakpoint(cpu, addr);
3202        if (bp) {
3203            bp->use_count++;
3204            return 0;
3205        }
3206
3207        bp = g_malloc(sizeof(struct kvm_sw_breakpoint));
3208        bp->pc = addr;
3209        bp->use_count = 1;
3210        err = kvm_arch_insert_sw_breakpoint(cpu, bp);
3211        if (err) {
3212            g_free(bp);
3213            return err;
3214        }
3215
3216        QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
3217    } else {
3218        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
3219        if (err) {
3220            return err;
3221        }
3222    }
3223
3224    CPU_FOREACH(cpu) {
3225        err = kvm_update_guest_debug(cpu, 0);
3226        if (err) {
3227            return err;
3228        }
3229    }
3230    return 0;
3231}
3232
3233int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
3234                          target_ulong len, int type)
3235{
3236    struct kvm_sw_breakpoint *bp;
3237    int err;
3238
3239    if (type == GDB_BREAKPOINT_SW) {
3240        bp = kvm_find_sw_breakpoint(cpu, addr);
3241        if (!bp) {
3242            return -ENOENT;
3243        }
3244
3245        if (bp->use_count > 1) {
3246            bp->use_count--;
3247            return 0;
3248        }
3249
3250        err = kvm_arch_remove_sw_breakpoint(cpu, bp);
3251        if (err) {
3252            return err;
3253        }
3254
3255        QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
3256        g_free(bp);
3257    } else {
3258        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
3259        if (err) {
3260            return err;
3261        }
3262    }
3263
3264    CPU_FOREACH(cpu) {
3265        err = kvm_update_guest_debug(cpu, 0);
3266        if (err) {
3267            return err;
3268        }
3269    }
3270    return 0;
3271}
3272
3273void kvm_remove_all_breakpoints(CPUState *cpu)
3274{
3275    struct kvm_sw_breakpoint *bp, *next;
3276    KVMState *s = cpu->kvm_state;
3277    CPUState *tmpcpu;
3278
3279    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
3280        if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
3281            /* Try harder to find a CPU that currently sees the breakpoint. */
3282            CPU_FOREACH(tmpcpu) {
3283                if (kvm_arch_remove_sw_breakpoint(tmpcpu, bp) == 0) {
3284                    break;
3285                }
3286            }
3287        }
3288        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
3289        g_free(bp);
3290    }
3291    kvm_arch_remove_all_hw_breakpoints();
3292
3293    CPU_FOREACH(cpu) {
3294        kvm_update_guest_debug(cpu, 0);
3295    }
3296}
3297
3298#else /* !KVM_CAP_SET_GUEST_DEBUG */
3299
3300int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
3301{
3302    return -EINVAL;
3303}
3304
3305int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
3306                          target_ulong len, int type)
3307{
3308    return -EINVAL;
3309}
3310
3311int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
3312                          target_ulong len, int type)
3313{
3314    return -EINVAL;
3315}
3316
3317void kvm_remove_all_breakpoints(CPUState *cpu)
3318{
3319}
3320#endif /* !KVM_CAP_SET_GUEST_DEBUG */
3321
3322static int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
3323{
3324    KVMState *s = kvm_state;
3325    struct kvm_signal_mask *sigmask;
3326    int r;
3327
3328    sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));
3329
3330    sigmask->len = s->sigmask_len;
3331    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
3332    r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
3333    g_free(sigmask);
3334
3335    return r;
3336}
3337
3338static void kvm_ipi_signal(int sig)
3339{
3340    if (current_cpu) {
3341        assert(kvm_immediate_exit);
3342        kvm_cpu_kick(current_cpu);
3343    }
3344}
3345
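    /*
     * Per-vcpu signal setup: install the SIG_IPI handler and unblock the
     * kick signal, either in the thread's own mask when immediate_exit is
     * available, or only for the duration of KVM_RUN via KVM_SET_SIGNAL_MASK
     * on older kernels.
     */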
3346void kvm_init_cpu_signals(CPUState *cpu)
3347{
3348    int r;
3349    sigset_t set;
3350    struct sigaction sigact;
3351
3352    memset(&sigact, 0, sizeof(sigact));
3353    sigact.sa_handler = kvm_ipi_signal;
3354    sigaction(SIG_IPI, &sigact, NULL);
3355
3356    pthread_sigmask(SIG_BLOCK, NULL, &set);
3357#if defined KVM_HAVE_MCE_INJECTION
3358    sigdelset(&set, SIGBUS);
3359    pthread_sigmask(SIG_SETMASK, &set, NULL);
3360#endif
3361    sigdelset(&set, SIG_IPI);
3362    if (kvm_immediate_exit) {
3363        r = pthread_sigmask(SIG_SETMASK, &set, NULL);
3364    } else {
3365        r = kvm_set_signal_mask(cpu, &set);
3366    }
3367    if (r) {
3368        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
3369        exit(1);
3370    }
3371}
3372
3373/* Called asynchronously in VCPU thread.  */
3374int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
3375{
3376#ifdef KVM_HAVE_MCE_INJECTION
3377    if (have_sigbus_pending) {
3378        return 1;
3379    }
3380    have_sigbus_pending = true;
3381    pending_sigbus_addr = addr;
3382    pending_sigbus_code = code;
3383    qatomic_set(&cpu->exit_request, 1);
3384    return 0;
3385#else
3386    return 1;
3387#endif
3388}
3389
3390/* Called synchronously (via signalfd) in main thread.  */
3391int kvm_on_sigbus(int code, void *addr)
3392{
3393#ifdef KVM_HAVE_MCE_INJECTION
3394    /* An action-required MCE kills the process if SIGBUS is blocked.  Because
3395     * SIGBUS is blocked in the I/O thread, where we handle MCE via signalfd,
3396     * we can only get action-optional MCEs here.
3397     */
3398    assert(code != BUS_MCEERR_AR);
3399    kvm_arch_on_sigbus_vcpu(first_cpu, code, addr);
3400    return 0;
3401#else
3402    return 1;
3403#endif
3404}
3405
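    /*
     * Create an in-kernel device of the given type with KVM_CREATE_DEVICE.
     * With test == true only the KVM_CREATE_DEVICE_TEST probe is performed
     * and 0 is returned on success; otherwise the new device's fd is
     * returned.
     */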
3406int kvm_create_device(KVMState *s, uint64_t type, bool test)
3407{
3408    int ret;
3409    struct kvm_create_device create_dev;
3410
3411    create_dev.type = type;
3412    create_dev.fd = -1;
3413    create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;
3414
3415    if (!kvm_check_extension(s, KVM_CAP_DEVICE_CTRL)) {
3416        return -ENOTSUP;
3417    }
3418
3419    ret = kvm_vm_ioctl(s, KVM_CREATE_DEVICE, &create_dev);
3420    if (ret) {
3421        return ret;
3422    }
3423
3424    return test ? 0 : create_dev.fd;
3425}
3426
3427bool kvm_device_supported(int vmfd, uint64_t type)
3428{
3429    struct kvm_create_device create_dev = {
3430        .type = type,
3431        .fd = -1,
3432        .flags = KVM_CREATE_DEVICE_TEST,
3433    };
3434
3435    if (ioctl(vmfd, KVM_CHECK_EXTENSION, KVM_CAP_DEVICE_CTRL) <= 0) {
3436        return false;
3437    }
3438
3439    return (ioctl(vmfd, KVM_CREATE_DEVICE, &create_dev) >= 0);
3440}
3441
3442int kvm_set_one_reg(CPUState *cs, uint64_t id, void *source)
3443{
3444    struct kvm_one_reg reg;
3445    int r;
3446
3447    reg.id = id;
3448    reg.addr = (uintptr_t) source;
3449    r = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
3450    if (r) {
3451        trace_kvm_failed_reg_set(id, strerror(-r));
3452    }
3453    return r;
3454}
3455
3456int kvm_get_one_reg(CPUState *cs, uint64_t id, void *target)
3457{
3458    struct kvm_one_reg reg;
3459    int r;
3460
3461    reg.id = id;
3462    reg.addr = (uintptr_t) target;
3463    r = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
3464    if (r) {
3465        trace_kvm_failed_reg_get(id, strerror(-r));
3466    }
3467    return r;
3468}
3469
3470static bool kvm_accel_has_memory(MachineState *ms, AddressSpace *as,
3471                                 hwaddr start_addr, hwaddr size)
3472{
3473    KVMState *kvm = KVM_STATE(ms->accelerator);
3474    int i;
3475
3476    for (i = 0; i < kvm->nr_as; ++i) {
3477        if (kvm->as[i].as == as && kvm->as[i].ml) {
3478            size = MIN(kvm_max_slot_size, size);
3479            return NULL != kvm_lookup_matching_slot(kvm->as[i].ml,
3480                                                    start_addr, size);
3481        }
3482    }
3483
3484    return false;
3485}
3486
3487static void kvm_get_kvm_shadow_mem(Object *obj, Visitor *v,
3488                                   const char *name, void *opaque,
3489                                   Error **errp)
3490{
3491    KVMState *s = KVM_STATE(obj);
3492    int64_t value = s->kvm_shadow_mem;
3493
3494    visit_type_int(v, name, &value, errp);
3495}
3496
3497static void kvm_set_kvm_shadow_mem(Object *obj, Visitor *v,
3498                                   const char *name, void *opaque,
3499                                   Error **errp)
3500{
3501    KVMState *s = KVM_STATE(obj);
3502    int64_t value;
3503
3504    if (s->fd != -1) {
3505        error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3506        return;
3507    }
3508
3509    if (!visit_type_int(v, name, &value, errp)) {
3510        return;
3511    }
3512
3513    s->kvm_shadow_mem = value;
3514}
3515
3516static void kvm_set_kernel_irqchip(Object *obj, Visitor *v,
3517                                   const char *name, void *opaque,
3518                                   Error **errp)
3519{
3520    KVMState *s = KVM_STATE(obj);
3521    OnOffSplit mode;
3522
3523    if (s->fd != -1) {
3524        error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3525        return;
3526    }
3527
3528    if (!visit_type_OnOffSplit(v, name, &mode, errp)) {
3529        return;
3530    }
3531    switch (mode) {
3532    case ON_OFF_SPLIT_ON:
3533        s->kernel_irqchip_allowed = true;
3534        s->kernel_irqchip_required = true;
3535        s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
3536        break;
3537    case ON_OFF_SPLIT_OFF:
3538        s->kernel_irqchip_allowed = false;
3539        s->kernel_irqchip_required = false;
3540        s->kernel_irqchip_split = ON_OFF_AUTO_OFF;
3541        break;
3542    case ON_OFF_SPLIT_SPLIT:
3543        s->kernel_irqchip_allowed = true;
3544        s->kernel_irqchip_required = true;
3545        s->kernel_irqchip_split = ON_OFF_AUTO_ON;
3546        break;
3547    default:
3548        /* The value was checked in visit_type_OnOffSplit() above. If
3549         * we get here, then something is wrong in QEMU.
3550         */
3551        abort();
3552    }
3553}
3554
3555bool kvm_kernel_irqchip_allowed(void)
3556{
3557    return kvm_state->kernel_irqchip_allowed;
3558}
3559
3560bool kvm_kernel_irqchip_required(void)
3561{
3562    return kvm_state->kernel_irqchip_required;
3563}
3564
3565bool kvm_kernel_irqchip_split(void)
3566{
3567    return kvm_state->kernel_irqchip_split == ON_OFF_AUTO_ON;
3568}
3569
3570static void kvm_get_dirty_ring_size(Object *obj, Visitor *v,
3571                                    const char *name, void *opaque,
3572                                    Error **errp)
3573{
3574    KVMState *s = KVM_STATE(obj);
3575    uint32_t value = s->kvm_dirty_ring_size;
3576
3577    visit_type_uint32(v, name, &value, errp);
3578}
3579
3580static void kvm_set_dirty_ring_size(Object *obj, Visitor *v,
3581                                    const char *name, void *opaque,
3582                                    Error **errp)
3583{
3584    KVMState *s = KVM_STATE(obj);
3585    Error *error = NULL;
3586    uint32_t value;
3587
3588    if (s->fd != -1) {
3589        error_setg(errp, "Cannot set properties after the accelerator has been initialized");
3590        return;
3591    }
3592
3593    visit_type_uint32(v, name, &value, &error);
3594    if (error) {
3595        error_propagate(errp, error);
3596        return;
3597    }
3598    if (value & (value - 1)) {
3599        error_setg(errp, "dirty-ring-size must be a power of two.");
3600        return;
3601    }
3602
3603    s->kvm_dirty_ring_size = value;
3604}
3605
3606static void kvm_accel_instance_init(Object *obj)
3607{
3608    KVMState *s = KVM_STATE(obj);
3609
3610    s->fd = -1;
3611    s->vmfd = -1;
3612    s->kvm_shadow_mem = -1;
3613    s->kernel_irqchip_allowed = true;
3614    s->kernel_irqchip_split = ON_OFF_AUTO_AUTO;
3615    /* KVM dirty ring is disabled by default */
3616    s->kvm_dirty_ring_size = 0;
3617}
3618
3619static void kvm_accel_class_init(ObjectClass *oc, void *data)
3620{
3621    AccelClass *ac = ACCEL_CLASS(oc);
3622    ac->name = "KVM";
3623    ac->init_machine = kvm_init;
3624    ac->has_memory = kvm_accel_has_memory;
3625    ac->allowed = &kvm_allowed;
3626
3627    object_class_property_add(oc, "kernel-irqchip", "on|off|split",
3628        NULL, kvm_set_kernel_irqchip,
3629        NULL, NULL);
3630    object_class_property_set_description(oc, "kernel-irqchip",
3631        "Configure KVM in-kernel irqchip");
3632
3633    object_class_property_add(oc, "kvm-shadow-mem", "int",
3634        kvm_get_kvm_shadow_mem, kvm_set_kvm_shadow_mem,
3635        NULL, NULL);
3636    object_class_property_set_description(oc, "kvm-shadow-mem",
3637        "KVM shadow MMU size");
3638
3639    object_class_property_add(oc, "dirty-ring-size", "uint32",
3640        kvm_get_dirty_ring_size, kvm_set_dirty_ring_size,
3641        NULL, NULL);
3642    object_class_property_set_description(oc, "dirty-ring-size",
3643        "Size of KVM dirty page ring buffer (default: 0, i.e. use bitmap)");
3644}
3645
3646static const TypeInfo kvm_accel_type = {
3647    .name = TYPE_KVM_ACCEL,
3648    .parent = TYPE_ACCEL,
3649    .instance_init = kvm_accel_instance_init,
3650    .class_init = kvm_accel_class_init,
3651    .instance_size = sizeof(KVMState),
3652};
3653
3654static void kvm_type_init(void)
3655{
3656    type_register_static(&kvm_accel_type);
3657}
3658
3659type_init(kvm_type_init);
3660