qemu/kvm-all.c
   1/*
   2 * QEMU KVM support
   3 *
   4 * Copyright IBM, Corp. 2008
   5 *           Red Hat, Inc. 2008
   6 *
   7 * Authors:
   8 *  Anthony Liguori   <aliguori@us.ibm.com>
   9 *  Glauber Costa     <gcosta@redhat.com>
  10 *
  11 * This work is licensed under the terms of the GNU GPL, version 2 or later.
  12 * See the COPYING file in the top-level directory.
  13 *
  14 */
  15
  16#include <sys/types.h>
  17#include <sys/ioctl.h>
  18#include <sys/mman.h>
  19#include <stdarg.h>
  20
  21#include <linux/kvm.h>
  22
  23#include "qemu-common.h"
  24#include "qemu/atomic.h"
  25#include "qemu/option.h"
  26#include "qemu/config-file.h"
  27#include "sysemu/sysemu.h"
  28#include "hw/hw.h"
  29#include "hw/pci/msi.h"
  30#include "exec/gdbstub.h"
  31#include "sysemu/kvm.h"
  32#include "qemu/bswap.h"
  33#include "exec/memory.h"
  34#include "exec/address-spaces.h"
  35#include "qemu/event_notifier.h"
  36#include "trace.h"
  37
  38/* This check must be after config-host.h is included */
  39#ifdef CONFIG_EVENTFD
  40#include <sys/eventfd.h>
  41#endif
  42
  43#ifdef CONFIG_VALGRIND_H
  44#include <valgrind/memcheck.h>
  45#endif
  46
  47/* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */
  48#define PAGE_SIZE TARGET_PAGE_SIZE
  49
  50//#define DEBUG_KVM
  51
  52#ifdef DEBUG_KVM
  53#define DPRINTF(fmt, ...) \
  54    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
  55#else
  56#define DPRINTF(fmt, ...) \
  57    do { } while (0)
  58#endif
  59
  60#define KVM_MSI_HASHTAB_SIZE    256
  61
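     /*
      * Userspace bookkeeping for one KVM memory slot; the fields mirror what
      * kvm_set_user_memory_region() copies into struct
      * kvm_userspace_memory_region before handing it to the kernel.
      */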
  62typedef struct KVMSlot
  63{
  64    hwaddr start_addr;
  65    ram_addr_t memory_size;
  66    void *ram;
  67    int slot;
  68    int flags;
  69} KVMSlot;
  70
  71typedef struct kvm_dirty_log KVMDirtyLog;
  72
  73struct KVMState
  74{
  75    KVMSlot slots[32];
  76    int fd;
  77    int vmfd;
  78    int coalesced_mmio;
  79    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
  80    bool coalesced_flush_in_progress;
  81    int broken_set_mem_region;
  82    int migration_log;
  83    int vcpu_events;
  84    int robust_singlestep;
  85    int debugregs;
  86#ifdef KVM_CAP_SET_GUEST_DEBUG
  87    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
  88#endif
  89    int pit_state2;
  90    int xsave, xcrs;
  91    int many_ioeventfds;
  92    int intx_set_mask;
  93    /* The man page (and posix) say ioctl numbers are signed int, but
  94     * they're not.  Linux, glibc and *BSD all treat ioctl numbers as
  95     * unsigned, and treating them as signed here can break things */
  96    unsigned irq_set_ioctl;
  97#ifdef KVM_CAP_IRQ_ROUTING
  98    struct kvm_irq_routing *irq_routes;
  99    int nr_allocated_irq_routes;
 100    uint32_t *used_gsi_bitmap;
 101    unsigned int gsi_count;
 102    QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
 103    bool direct_msi;
 104#endif
 105};
 106
 107KVMState *kvm_state;
 108bool kvm_kernel_irqchip;
 109bool kvm_async_interrupts_allowed;
 110bool kvm_halt_in_kernel_allowed;
 111bool kvm_irqfds_allowed;
 112bool kvm_msi_via_irqfd_allowed;
 113bool kvm_gsi_routing_allowed;
 114bool kvm_gsi_direct_mapping;
 115bool kvm_allowed;
 116bool kvm_readonly_mem_allowed;
 117
 118static const KVMCapabilityInfo kvm_required_capabilites[] = {
 119    KVM_CAP_INFO(USER_MEMORY),
 120    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
 121    KVM_CAP_LAST_INFO
 122};
 123
 124static KVMSlot *kvm_alloc_slot(KVMState *s)
 125{
 126    int i;
 127
 128    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
 129        if (s->slots[i].memory_size == 0) {
 130            return &s->slots[i];
 131        }
 132    }
 133
 134    fprintf(stderr, "%s: no free slot available\n", __func__);
 135    abort();
 136}
 137
 138static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
 139                                         hwaddr start_addr,
 140                                         hwaddr end_addr)
 141{
 142    int i;
 143
 144    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
 145        KVMSlot *mem = &s->slots[i];
 146
 147        if (start_addr == mem->start_addr &&
 148            end_addr == mem->start_addr + mem->memory_size) {
 149            return mem;
 150        }
 151    }
 152
 153    return NULL;
 154}
 155
 156/*
 157 * Find overlapping slot with lowest start address
 158 */
 159static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
 160                                            hwaddr start_addr,
 161                                            hwaddr end_addr)
 162{
 163    KVMSlot *found = NULL;
 164    int i;
 165
 166    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
 167        KVMSlot *mem = &s->slots[i];
 168
 169        if (mem->memory_size == 0 ||
 170            (found && found->start_addr < mem->start_addr)) {
 171            continue;
 172        }
 173
 174        if (end_addr > mem->start_addr &&
 175            start_addr < mem->start_addr + mem->memory_size) {
 176            found = mem;
 177        }
 178    }
 179
 180    return found;
 181}
 182
 183int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
 184                                       hwaddr *phys_addr)
 185{
 186    int i;
 187
 188    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
 189        KVMSlot *mem = &s->slots[i];
 190
 191        if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
 192            *phys_addr = mem->start_addr + (ram - mem->ram);
 193            return 1;
 194        }
 195    }
 196
 197    return 0;
 198}
 199
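     /*
      * Translate a KVMSlot into struct kvm_userspace_memory_region and hand it
      * to the kernel via KVM_SET_USER_MEMORY_REGION.  When dirty logging has
      * been enabled globally for migration, the log flag is forced on here.
      */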
 200static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
 201{
 202    struct kvm_userspace_memory_region mem;
 203
 204    mem.slot = slot->slot;
 205    mem.guest_phys_addr = slot->start_addr;
 206    mem.userspace_addr = (unsigned long)slot->ram;
 207    mem.flags = slot->flags;
 208    if (s->migration_log) {
 209        mem.flags |= KVM_MEM_LOG_DIRTY_PAGES;
 210    }
 211
 212    if (slot->memory_size && mem.flags & KVM_MEM_READONLY) {
 213        /* Set the slot size to 0 before setting the slot to the desired
 214         * value. This is needed based on KVM commit 75d61fbc. */
 215        mem.memory_size = 0;
 216        kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
 217    }
 218    mem.memory_size = slot->memory_size;
 219    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
 220}
 221
 222static void kvm_reset_vcpu(void *opaque)
 223{
 224    CPUState *cpu = opaque;
 225
 226    kvm_arch_reset_vcpu(cpu);
 227}
 228
 229int kvm_init_vcpu(CPUState *cpu)
 230{
 231    KVMState *s = kvm_state;
 232    long mmap_size;
 233    int ret;
 234
 235    DPRINTF("kvm_init_vcpu\n");
 236
 237    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)kvm_arch_vcpu_id(cpu));
 238    if (ret < 0) {
 239        DPRINTF("kvm_create_vcpu failed\n");
 240        goto err;
 241    }
 242
 243    cpu->kvm_fd = ret;
 244    cpu->kvm_state = s;
 245    cpu->kvm_vcpu_dirty = true;
 246
 247    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
 248    if (mmap_size < 0) {
 249        ret = mmap_size;
 250        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
 251        goto err;
 252    }
 253
 254    cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
 255                        cpu->kvm_fd, 0);
 256    if (cpu->kvm_run == MAP_FAILED) {
 257        ret = -errno;
 258        DPRINTF("mmap'ing vcpu state failed\n");
 259        goto err;
 260    }
 261
 262    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
 263        s->coalesced_mmio_ring =
 264            (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
 265    }
 266
 267    ret = kvm_arch_init_vcpu(cpu);
 268    if (ret == 0) {
 269        qemu_register_reset(kvm_reset_vcpu, cpu);
 270        kvm_arch_reset_vcpu(cpu);
 271    }
 272err:
 273    return ret;
 274}
 275
 276/*
 277 * dirty pages logging control
 278 */
 279
 280static int kvm_mem_flags(KVMState *s, bool log_dirty, bool readonly)
 281{
 282    int flags = 0;
 283    flags = log_dirty ? KVM_MEM_LOG_DIRTY_PAGES : 0;
 284    if (readonly && kvm_readonly_mem_allowed) {
 285        flags |= KVM_MEM_READONLY;
 286    }
 287    return flags;
 288}
 289
 290static int kvm_slot_dirty_pages_log_change(KVMSlot *mem, bool log_dirty)
 291{
 292    KVMState *s = kvm_state;
 293    int flags, mask = KVM_MEM_LOG_DIRTY_PAGES;
 294    int old_flags;
 295
 296    old_flags = mem->flags;
 297
 298    flags = (mem->flags & ~mask) | kvm_mem_flags(s, log_dirty, false);
 299    mem->flags = flags;
 300
 301    /* If nothing changed effectively, no need to issue ioctl */
 302    if (s->migration_log) {
 303        flags |= KVM_MEM_LOG_DIRTY_PAGES;
 304    }
 305
 306    if (flags == old_flags) {
 307        return 0;
 308    }
 309
 310    return kvm_set_user_memory_region(s, mem);
 311}
 312
 313static int kvm_dirty_pages_log_change(hwaddr phys_addr,
 314                                      ram_addr_t size, bool log_dirty)
 315{
 316    KVMState *s = kvm_state;
 317    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
 318
 319    if (mem == NULL)  {
 320        fprintf(stderr, "BUG: %s: invalid parameters " TARGET_FMT_plx "-"
 321                TARGET_FMT_plx "\n", __func__, phys_addr,
 322                (hwaddr)(phys_addr + size - 1));
 323        return -EINVAL;
 324    }
 325    return kvm_slot_dirty_pages_log_change(mem, log_dirty);
 326}
 327
 328static void kvm_log_start(MemoryListener *listener,
 329                          MemoryRegionSection *section)
 330{
 331    int r;
 332
 333    r = kvm_dirty_pages_log_change(section->offset_within_address_space,
 334                                   int128_get64(section->size), true);
 335    if (r < 0) {
 336        abort();
 337    }
 338}
 339
 340static void kvm_log_stop(MemoryListener *listener,
 341                          MemoryRegionSection *section)
 342{
 343    int r;
 344
 345    r = kvm_dirty_pages_log_change(section->offset_within_address_space,
 346                                   int128_get64(section->size), false);
 347    if (r < 0) {
 348        abort();
 349    }
 350}
 351
 352static int kvm_set_migration_log(int enable)
 353{
 354    KVMState *s = kvm_state;
 355    KVMSlot *mem;
 356    int i, err;
 357
 358    s->migration_log = enable;
 359
 360    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
 361        mem = &s->slots[i];
 362
 363        if (!mem->memory_size) {
 364            continue;
 365        }
 366        if (!!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) == enable) {
 367            continue;
 368        }
 369        err = kvm_set_user_memory_region(s, mem);
 370        if (err) {
 371            return err;
 372        }
 373    }
 374    return 0;
 375}
 376
 377/* get kvm's dirty pages bitmap and update qemu's */
 378static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section,
 379                                         unsigned long *bitmap)
 380{
 381    unsigned int i, j;
 382    unsigned long page_number, c;
 383    hwaddr addr, addr1;
 384    unsigned int pages = int128_get64(section->size) / getpagesize();
 385    unsigned int len = (pages + HOST_LONG_BITS - 1) / HOST_LONG_BITS;
 386    unsigned long hpratio = getpagesize() / TARGET_PAGE_SIZE;
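         /* The code below assumes the kernel-provided bitmap has one bit per
          * host page; hpratio is how many TARGET_PAGE_SIZE pages each host
          * page spans, so a single set bit dirties hpratio target pages. */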
 387
 388    /*
 389     * bitmap-traveling is faster than memory-traveling (for addr...)
 390     * especially when most of the memory is not dirty.
 391     */
 392    for (i = 0; i < len; i++) {
 393        if (bitmap[i] != 0) {
 394            c = leul_to_cpu(bitmap[i]);
 395            do {
 396                j = ffsl(c) - 1;
 397                c &= ~(1ul << j);
 398                page_number = (i * HOST_LONG_BITS + j) * hpratio;
 399                addr1 = page_number * TARGET_PAGE_SIZE;
 400                addr = section->offset_within_region + addr1;
 401                memory_region_set_dirty(section->mr, addr,
 402                                        TARGET_PAGE_SIZE * hpratio);
 403            } while (c != 0);
 404        }
 405    }
 406    return 0;
 407}
 408
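     /* Round x up to the next multiple of y; y must be a power of two.
      * E.g. ALIGN(5, 8) == 8 and ALIGN(16, 8) == 16. */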
 409#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
 410
  411/**
  412 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
  413 * This function updates qemu's dirty bitmap using
  414 * memory_region_set_dirty(): pages the kernel reports as dirty are
  415 * marked dirty in qemu's bitmap as well.
  416 *
  417 * @section: memory region section whose logged range is synchronized;
  418 *           its offset and size give the start and end of the region.
  419 */
 420static int kvm_physical_sync_dirty_bitmap(MemoryRegionSection *section)
 421{
 422    KVMState *s = kvm_state;
 423    unsigned long size, allocated_size = 0;
 424    KVMDirtyLog d;
 425    KVMSlot *mem;
 426    int ret = 0;
 427    hwaddr start_addr = section->offset_within_address_space;
 428    hwaddr end_addr = start_addr + int128_get64(section->size);
 429
 430    d.dirty_bitmap = NULL;
 431    while (start_addr < end_addr) {
 432        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
 433        if (mem == NULL) {
 434            break;
 435        }
 436
  437        /* XXX bad kernel interface alert
  438         * For the dirty bitmap, the kernel allocates an array of a size
  439         * aligned to bits-per-long.  But when the kernel is 64-bit and
  440         * userspace is 32-bit, userspace cannot align to the same
  441         * bits-per-long, since sizeof(long) differs between kernel and
  442         * user space.  As a result, userspace may provide a buffer that
  443         * is 4 bytes smaller than the one the kernel will use, leading to
  444         * userspace memory corruption (which is not detectable by
  445         * valgrind either, in most cases).
  446         * So for now, let's align to 64 instead of HOST_LONG_BITS here,
  447         * in the hope that sizeof(long) won't become >8 any time soon.
  448         */
 449        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
 450                     /*HOST_LONG_BITS*/ 64) / 8;
 451        if (!d.dirty_bitmap) {
 452            d.dirty_bitmap = g_malloc(size);
 453        } else if (size > allocated_size) {
 454            d.dirty_bitmap = g_realloc(d.dirty_bitmap, size);
 455        }
 456        allocated_size = size;
 457        memset(d.dirty_bitmap, 0, allocated_size);
 458
 459        d.slot = mem->slot;
 460
  461        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) < 0) {
 462            DPRINTF("ioctl failed %d\n", errno);
 463            ret = -1;
 464            break;
 465        }
 466
 467        kvm_get_dirty_pages_log_range(section, d.dirty_bitmap);
 468        start_addr = mem->start_addr + mem->memory_size;
 469    }
 470    g_free(d.dirty_bitmap);
 471
 472    return ret;
 473}
 474
 475static void kvm_coalesce_mmio_region(MemoryListener *listener,
  476                                     MemoryRegionSection *section,
 477                                     hwaddr start, hwaddr size)
 478{
 479    KVMState *s = kvm_state;
 480
 481    if (s->coalesced_mmio) {
 482        struct kvm_coalesced_mmio_zone zone;
 483
 484        zone.addr = start;
 485        zone.size = size;
 486        zone.pad = 0;
 487
 488        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
 489    }
 490}
 491
 492static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
  493                                       MemoryRegionSection *section,
 494                                       hwaddr start, hwaddr size)
 495{
 496    KVMState *s = kvm_state;
 497
 498    if (s->coalesced_mmio) {
 499        struct kvm_coalesced_mmio_zone zone;
 500
 501        zone.addr = start;
 502        zone.size = size;
 503        zone.pad = 0;
 504
 505        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
 506    }
 507}
 508
 509int kvm_check_extension(KVMState *s, unsigned int extension)
 510{
 511    int ret;
 512
 513    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
 514    if (ret < 0) {
 515        ret = 0;
 516    }
 517
 518    return ret;
 519}
 520
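     /*
      * Register (assign=true) or deregister an eventfd that the kernel signals
      * when the guest writes to the given MMIO address, so the write is
      * handled without a heavyweight exit to userspace.  A hypothetical use
      * (the names below are illustrative, not taken from this file):
      *
      *     int fd = eventfd(0, EFD_CLOEXEC);
      *     kvm_set_ioeventfd_mmio(fd, doorbell_addr, 0, true, 4, false);
      */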
 521static int kvm_set_ioeventfd_mmio(int fd, uint32_t addr, uint32_t val,
 522                                  bool assign, uint32_t size, bool datamatch)
 523{
 524    int ret;
 525    struct kvm_ioeventfd iofd;
 526
 527    iofd.datamatch = datamatch ? val : 0;
 528    iofd.addr = addr;
 529    iofd.len = size;
 530    iofd.flags = 0;
 531    iofd.fd = fd;
 532
 533    if (!kvm_enabled()) {
 534        return -ENOSYS;
 535    }
 536
 537    if (datamatch) {
 538        iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
 539    }
 540    if (!assign) {
 541        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
 542    }
 543
 544    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);
 545
 546    if (ret < 0) {
 547        return -errno;
 548    }
 549
 550    return 0;
 551}
 552
 553static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
 554                                 bool assign, uint32_t size, bool datamatch)
 555{
 556    struct kvm_ioeventfd kick = {
 557        .datamatch = datamatch ? val : 0,
 558        .addr = addr,
 559        .flags = KVM_IOEVENTFD_FLAG_PIO,
 560        .len = size,
 561        .fd = fd,
 562    };
 563    int r;
 564    if (!kvm_enabled()) {
 565        return -ENOSYS;
 566    }
 567    if (datamatch) {
 568        kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
 569    }
 570    if (!assign) {
 571        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
 572    }
 573    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
 574    if (r < 0) {
 575        return r;
 576    }
 577    return 0;
 578}
 579
 580
 581static int kvm_check_many_ioeventfds(void)
 582{
 583    /* Userspace can use ioeventfd for io notification.  This requires a host
 584     * that supports eventfd(2) and an I/O thread; since eventfd does not
 585     * support SIGIO it cannot interrupt the vcpu.
 586     *
 587     * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
 588     * can avoid creating too many ioeventfds.
 589     */
 590#if defined(CONFIG_EVENTFD)
 591    int ioeventfds[7];
 592    int i, ret = 0;
 593    for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) {
 594        ioeventfds[i] = eventfd(0, EFD_CLOEXEC);
 595        if (ioeventfds[i] < 0) {
 596            break;
 597        }
 598        ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true);
 599        if (ret < 0) {
 600            close(ioeventfds[i]);
 601            break;
 602        }
 603    }
 604
 605    /* Decide whether many devices are supported or not */
 606    ret = i == ARRAY_SIZE(ioeventfds);
 607
 608    while (i-- > 0) {
 609        kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true);
 610        close(ioeventfds[i]);
 611    }
 612    return ret;
 613#else
 614    return 0;
 615#endif
 616}
 617
 618static const KVMCapabilityInfo *
 619kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
 620{
 621    while (list->name) {
 622        if (!kvm_check_extension(s, list->value)) {
 623            return list;
 624        }
 625        list++;
 626    }
 627    return NULL;
 628}
 629
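     /*
      * Register or unregister the RAM backing a MemoryRegionSection with KVM.
      * Overlapping slots cannot simply be resized in place: any existing
      * overlap is synced (if it was dirty-logging), unregistered, and then
      * re-added as separate prefix/suffix slots around the new range.
      */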
 630static void kvm_set_phys_mem(MemoryRegionSection *section, bool add)
 631{
 632    KVMState *s = kvm_state;
 633    KVMSlot *mem, old;
 634    int err;
 635    MemoryRegion *mr = section->mr;
 636    bool log_dirty = memory_region_is_logging(mr);
 637    bool writeable = !mr->readonly && !mr->rom_device;
 638    bool readonly_flag = mr->readonly || memory_region_is_romd(mr);
 639    hwaddr start_addr = section->offset_within_address_space;
 640    ram_addr_t size = int128_get64(section->size);
 641    void *ram = NULL;
 642    unsigned delta;
 643
 644    /* kvm works in page size chunks, but the function may be called
 645       with sub-page size and unaligned start address. */
  646    delta = TARGET_PAGE_ALIGN(start_addr) - start_addr;
 647    if (delta > size) {
 648        return;
 649    }
 650    start_addr += delta;
 651    size -= delta;
 652    size &= TARGET_PAGE_MASK;
 653    if (!size || (start_addr & ~TARGET_PAGE_MASK)) {
 654        return;
 655    }
 656
 657    if (!memory_region_is_ram(mr)) {
 658        if (writeable || !kvm_readonly_mem_allowed) {
 659            return;
 660        } else if (!mr->romd_mode) {
 661            /* If the memory device is not in romd_mode, then we actually want
 662             * to remove the kvm memory slot so all accesses will trap. */
 663            add = false;
 664        }
 665    }
 666
 667    ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + delta;
 668
 669    while (1) {
 670        mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
 671        if (!mem) {
 672            break;
 673        }
 674
 675        if (add && start_addr >= mem->start_addr &&
 676            (start_addr + size <= mem->start_addr + mem->memory_size) &&
 677            (ram - start_addr == mem->ram - mem->start_addr)) {
 678            /* The new slot fits into the existing one and comes with
 679             * identical parameters - update flags and done. */
 680            kvm_slot_dirty_pages_log_change(mem, log_dirty);
 681            return;
 682        }
 683
 684        old = *mem;
 685
 686        if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
 687            kvm_physical_sync_dirty_bitmap(section);
 688        }
 689
 690        /* unregister the overlapping slot */
 691        mem->memory_size = 0;
 692        err = kvm_set_user_memory_region(s, mem);
 693        if (err) {
 694            fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
 695                    __func__, strerror(-err));
 696            abort();
 697        }
 698
 699        /* Workaround for older KVM versions: we can't join slots, even not by
 700         * unregistering the previous ones and then registering the larger
 701         * slot. We have to maintain the existing fragmentation. Sigh.
 702         *
 703         * This workaround assumes that the new slot starts at the same
 704         * address as the first existing one. If not or if some overlapping
 705         * slot comes around later, we will fail (not seen in practice so far)
 706         * - and actually require a recent KVM version. */
 707        if (s->broken_set_mem_region &&
 708            old.start_addr == start_addr && old.memory_size < size && add) {
 709            mem = kvm_alloc_slot(s);
 710            mem->memory_size = old.memory_size;
 711            mem->start_addr = old.start_addr;
 712            mem->ram = old.ram;
 713            mem->flags = kvm_mem_flags(s, log_dirty, readonly_flag);
 714
 715            err = kvm_set_user_memory_region(s, mem);
 716            if (err) {
 717                fprintf(stderr, "%s: error updating slot: %s\n", __func__,
 718                        strerror(-err));
 719                abort();
 720            }
 721
 722            start_addr += old.memory_size;
 723            ram += old.memory_size;
 724            size -= old.memory_size;
 725            continue;
 726        }
 727
 728        /* register prefix slot */
 729        if (old.start_addr < start_addr) {
 730            mem = kvm_alloc_slot(s);
 731            mem->memory_size = start_addr - old.start_addr;
 732            mem->start_addr = old.start_addr;
 733            mem->ram = old.ram;
  734            mem->flags = kvm_mem_flags(s, log_dirty, readonly_flag);
 735
 736            err = kvm_set_user_memory_region(s, mem);
 737            if (err) {
 738                fprintf(stderr, "%s: error registering prefix slot: %s\n",
 739                        __func__, strerror(-err));
 740#ifdef TARGET_PPC
 741                fprintf(stderr, "%s: This is probably because your kernel's " \
 742                                "PAGE_SIZE is too big. Please try to use 4k " \
 743                                "PAGE_SIZE!\n", __func__);
 744#endif
 745                abort();
 746            }
 747        }
 748
 749        /* register suffix slot */
 750        if (old.start_addr + old.memory_size > start_addr + size) {
 751            ram_addr_t size_delta;
 752
 753            mem = kvm_alloc_slot(s);
 754            mem->start_addr = start_addr + size;
 755            size_delta = mem->start_addr - old.start_addr;
 756            mem->memory_size = old.memory_size - size_delta;
 757            mem->ram = old.ram + size_delta;
 758            mem->flags = kvm_mem_flags(s, log_dirty, readonly_flag);
 759
 760            err = kvm_set_user_memory_region(s, mem);
 761            if (err) {
 762                fprintf(stderr, "%s: error registering suffix slot: %s\n",
 763                        __func__, strerror(-err));
 764                abort();
 765            }
 766        }
 767    }
 768
 769    /* in case the KVM bug workaround already "consumed" the new slot */
 770    if (!size) {
 771        return;
 772    }
 773    if (!add) {
 774        return;
 775    }
 776    mem = kvm_alloc_slot(s);
 777    mem->memory_size = size;
 778    mem->start_addr = start_addr;
 779    mem->ram = ram;
 780    mem->flags = kvm_mem_flags(s, log_dirty, readonly_flag);
 781
 782    err = kvm_set_user_memory_region(s, mem);
 783    if (err) {
 784        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
 785                strerror(-err));
 786        abort();
 787    }
 788}
 789
 790static void kvm_region_add(MemoryListener *listener,
 791                           MemoryRegionSection *section)
 792{
 793    memory_region_ref(section->mr);
 794    kvm_set_phys_mem(section, true);
 795}
 796
 797static void kvm_region_del(MemoryListener *listener,
 798                           MemoryRegionSection *section)
 799{
 800    kvm_set_phys_mem(section, false);
 801    memory_region_unref(section->mr);
 802}
 803
 804static void kvm_log_sync(MemoryListener *listener,
 805                         MemoryRegionSection *section)
 806{
 807    int r;
 808
 809    r = kvm_physical_sync_dirty_bitmap(section);
 810    if (r < 0) {
 811        abort();
 812    }
 813}
 814
 815static void kvm_log_global_start(struct MemoryListener *listener)
 816{
 817    int r;
 818
 819    r = kvm_set_migration_log(1);
 820    assert(r >= 0);
 821}
 822
 823static void kvm_log_global_stop(struct MemoryListener *listener)
 824{
 825    int r;
 826
 827    r = kvm_set_migration_log(0);
 828    assert(r >= 0);
 829}
 830
 831static void kvm_mem_ioeventfd_add(MemoryListener *listener,
 832                                  MemoryRegionSection *section,
 833                                  bool match_data, uint64_t data,
 834                                  EventNotifier *e)
 835{
 836    int fd = event_notifier_get_fd(e);
 837    int r;
 838
 839    r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
 840                               data, true, int128_get64(section->size),
 841                               match_data);
 842    if (r < 0) {
 843        fprintf(stderr, "%s: error adding ioeventfd: %s\n",
 844                __func__, strerror(-r));
 845        abort();
 846    }
 847}
 848
 849static void kvm_mem_ioeventfd_del(MemoryListener *listener,
 850                                  MemoryRegionSection *section,
 851                                  bool match_data, uint64_t data,
 852                                  EventNotifier *e)
 853{
 854    int fd = event_notifier_get_fd(e);
 855    int r;
 856
 857    r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
 858                               data, false, int128_get64(section->size),
 859                               match_data);
 860    if (r < 0) {
 861        abort();
 862    }
 863}
 864
 865static void kvm_io_ioeventfd_add(MemoryListener *listener,
 866                                 MemoryRegionSection *section,
 867                                 bool match_data, uint64_t data,
 868                                 EventNotifier *e)
 869{
 870    int fd = event_notifier_get_fd(e);
 871    int r;
 872
 873    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
 874                              data, true, int128_get64(section->size),
 875                              match_data);
 876    if (r < 0) {
 877        fprintf(stderr, "%s: error adding ioeventfd: %s\n",
 878                __func__, strerror(-r));
 879        abort();
 880    }
 881}
 882
 883static void kvm_io_ioeventfd_del(MemoryListener *listener,
 884                                 MemoryRegionSection *section,
 885                                 bool match_data, uint64_t data,
 886                                 EventNotifier *e)
 887
 888{
 889    int fd = event_notifier_get_fd(e);
 890    int r;
 891
 892    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
 893                              data, false, int128_get64(section->size),
 894                              match_data);
 895    if (r < 0) {
 896        abort();
 897    }
 898}
 899
 900static MemoryListener kvm_memory_listener = {
 901    .region_add = kvm_region_add,
 902    .region_del = kvm_region_del,
 903    .log_start = kvm_log_start,
 904    .log_stop = kvm_log_stop,
 905    .log_sync = kvm_log_sync,
 906    .log_global_start = kvm_log_global_start,
 907    .log_global_stop = kvm_log_global_stop,
 908    .eventfd_add = kvm_mem_ioeventfd_add,
 909    .eventfd_del = kvm_mem_ioeventfd_del,
 910    .coalesced_mmio_add = kvm_coalesce_mmio_region,
 911    .coalesced_mmio_del = kvm_uncoalesce_mmio_region,
 912    .priority = 10,
 913};
 914
 915static MemoryListener kvm_io_listener = {
 916    .eventfd_add = kvm_io_ioeventfd_add,
 917    .eventfd_del = kvm_io_ioeventfd_del,
 918    .priority = 10,
 919};
 920
 921static void kvm_handle_interrupt(CPUState *cpu, int mask)
 922{
 923    cpu->interrupt_request |= mask;
 924
 925    if (!qemu_cpu_is_self(cpu)) {
 926        qemu_cpu_kick(cpu);
 927    }
 928}
 929
 930int kvm_set_irq(KVMState *s, int irq, int level)
 931{
 932    struct kvm_irq_level event;
 933    int ret;
 934
 935    assert(kvm_async_interrupts_enabled());
 936
 937    event.level = level;
 938    event.irq = irq;
 939    ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
 940    if (ret < 0) {
 941        perror("kvm_set_irq");
 942        abort();
 943    }
 944
 945    return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
 946}
 947
 948#ifdef KVM_CAP_IRQ_ROUTING
 949typedef struct KVMMSIRoute {
 950    struct kvm_irq_routing_entry kroute;
 951    QTAILQ_ENTRY(KVMMSIRoute) entry;
 952} KVMMSIRoute;
 953
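     /*
      * used_gsi_bitmap tracks which GSI numbers already carry a routing entry:
      * one bit per GSI, packed into 32-bit words so kvm_irqchip_get_virq() can
      * scan for a free one with ffs().
      */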
 954static void set_gsi(KVMState *s, unsigned int gsi)
 955{
 956    s->used_gsi_bitmap[gsi / 32] |= 1U << (gsi % 32);
 957}
 958
 959static void clear_gsi(KVMState *s, unsigned int gsi)
 960{
 961    s->used_gsi_bitmap[gsi / 32] &= ~(1U << (gsi % 32));
 962}
 963
 964void kvm_init_irq_routing(KVMState *s)
 965{
 966    int gsi_count, i;
 967
 968    gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING) - 1;
 969    if (gsi_count > 0) {
 970        unsigned int gsi_bits, i;
 971
 972        /* Round up so we can search ints using ffs */
 973        gsi_bits = ALIGN(gsi_count, 32);
 974        s->used_gsi_bitmap = g_malloc0(gsi_bits / 8);
 975        s->gsi_count = gsi_count;
 976
 977        /* Mark any over-allocated bits as already in use */
 978        for (i = gsi_count; i < gsi_bits; i++) {
 979            set_gsi(s, i);
 980        }
 981    }
 982
 983    s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
 984    s->nr_allocated_irq_routes = 0;
 985
 986    if (!s->direct_msi) {
 987        for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
 988            QTAILQ_INIT(&s->msi_hashtab[i]);
 989        }
 990    }
 991
 992    kvm_arch_init_irq_routing(s);
 993}
 994
 995void kvm_irqchip_commit_routes(KVMState *s)
 996{
 997    int ret;
 998
 999    s->irq_routes->flags = 0;
1000    ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
1001    assert(ret == 0);
1002}
1003
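     /*
      * Append an entry to the kvm_irq_routing table (a flexible array),
      * doubling the allocation - with a minimum of 64 entries - whenever it
      * is full.  Committing the table to the kernel is left to the caller via
      * kvm_irqchip_commit_routes().
      */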
1004static void kvm_add_routing_entry(KVMState *s,
1005                                  struct kvm_irq_routing_entry *entry)
1006{
1007    struct kvm_irq_routing_entry *new;
1008    int n, size;
1009
1010    if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
1011        n = s->nr_allocated_irq_routes * 2;
1012        if (n < 64) {
1013            n = 64;
1014        }
1015        size = sizeof(struct kvm_irq_routing);
1016        size += n * sizeof(*new);
1017        s->irq_routes = g_realloc(s->irq_routes, size);
1018        s->nr_allocated_irq_routes = n;
1019    }
1020    n = s->irq_routes->nr++;
1021    new = &s->irq_routes->entries[n];
1022
1023    *new = *entry;
1024
1025    set_gsi(s, entry->gsi);
1026}
1027
1028static int kvm_update_routing_entry(KVMState *s,
1029                                    struct kvm_irq_routing_entry *new_entry)
1030{
1031    struct kvm_irq_routing_entry *entry;
1032    int n;
1033
1034    for (n = 0; n < s->irq_routes->nr; n++) {
1035        entry = &s->irq_routes->entries[n];
1036        if (entry->gsi != new_entry->gsi) {
1037            continue;
1038        }
1039
 1040        if (!memcmp(entry, new_entry, sizeof *entry)) {
1041            return 0;
1042        }
1043
1044        *entry = *new_entry;
1045
1046        kvm_irqchip_commit_routes(s);
1047
1048        return 0;
1049    }
1050
1051    return -ESRCH;
1052}
1053
1054void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
1055{
1056    struct kvm_irq_routing_entry e = {};
1057
1058    assert(pin < s->gsi_count);
1059
1060    e.gsi = irq;
1061    e.type = KVM_IRQ_ROUTING_IRQCHIP;
1062    e.flags = 0;
1063    e.u.irqchip.irqchip = irqchip;
1064    e.u.irqchip.pin = pin;
1065    kvm_add_routing_entry(s, &e);
1066}
1067
1068void kvm_irqchip_release_virq(KVMState *s, int virq)
1069{
1070    struct kvm_irq_routing_entry *e;
1071    int i;
1072
1073    if (kvm_gsi_direct_mapping()) {
1074        return;
1075    }
1076
1077    for (i = 0; i < s->irq_routes->nr; i++) {
1078        e = &s->irq_routes->entries[i];
1079        if (e->gsi == virq) {
1080            s->irq_routes->nr--;
1081            *e = s->irq_routes->entries[s->irq_routes->nr];
1082        }
1083    }
1084    clear_gsi(s, virq);
1085}
1086
1087static unsigned int kvm_hash_msi(uint32_t data)
1088{
1089    /* This is optimized for IA32 MSI layout. However, no other arch shall
1090     * repeat the mistake of not providing a direct MSI injection API. */
1091    return data & 0xff;
1092}
1093
1094static void kvm_flush_dynamic_msi_routes(KVMState *s)
1095{
1096    KVMMSIRoute *route, *next;
1097    unsigned int hash;
1098
1099    for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
1100        QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
1101            kvm_irqchip_release_virq(s, route->kroute.gsi);
1102            QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
1103            g_free(route);
1104        }
1105    }
1106}
1107
1108static int kvm_irqchip_get_virq(KVMState *s)
1109{
1110    uint32_t *word = s->used_gsi_bitmap;
1111    int max_words = ALIGN(s->gsi_count, 32) / 32;
1112    int i, bit;
1113    bool retry = true;
1114
1115again:
1116    /* Return the lowest unused GSI in the bitmap */
1117    for (i = 0; i < max_words; i++) {
1118        bit = ffs(~word[i]);
1119        if (!bit) {
1120            continue;
1121        }
1122
1123        return bit - 1 + i * 32;
1124    }
1125    if (!s->direct_msi && retry) {
1126        retry = false;
1127        kvm_flush_dynamic_msi_routes(s);
1128        goto again;
1129    }
1130    return -ENOSPC;
1131
1132}
1133
1134static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg)
1135{
1136    unsigned int hash = kvm_hash_msi(msg.data);
1137    KVMMSIRoute *route;
1138
1139    QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) {
1140        if (route->kroute.u.msi.address_lo == (uint32_t)msg.address &&
1141            route->kroute.u.msi.address_hi == (msg.address >> 32) &&
1142            route->kroute.u.msi.data == le32_to_cpu(msg.data)) {
1143            return route;
1144        }
1145    }
1146    return NULL;
1147}
1148
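     /*
      * Inject an MSI.  With KVM_CAP_SIGNAL_MSI (direct_msi) the message is
      * delivered directly through KVM_SIGNAL_MSI; otherwise a routing entry is
      * allocated, cached in msi_hashtab, and the MSI is raised as an ordinary
      * GSI via kvm_set_irq().
      */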
1149int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
1150{
1151    struct kvm_msi msi;
1152    KVMMSIRoute *route;
1153
1154    if (s->direct_msi) {
1155        msi.address_lo = (uint32_t)msg.address;
1156        msi.address_hi = msg.address >> 32;
1157        msi.data = le32_to_cpu(msg.data);
1158        msi.flags = 0;
1159        memset(msi.pad, 0, sizeof(msi.pad));
1160
1161        return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
1162    }
1163
1164    route = kvm_lookup_msi_route(s, msg);
1165    if (!route) {
1166        int virq;
1167
1168        virq = kvm_irqchip_get_virq(s);
1169        if (virq < 0) {
1170            return virq;
1171        }
1172
1173        route = g_malloc0(sizeof(KVMMSIRoute));
1174        route->kroute.gsi = virq;
1175        route->kroute.type = KVM_IRQ_ROUTING_MSI;
1176        route->kroute.flags = 0;
1177        route->kroute.u.msi.address_lo = (uint32_t)msg.address;
1178        route->kroute.u.msi.address_hi = msg.address >> 32;
1179        route->kroute.u.msi.data = le32_to_cpu(msg.data);
1180
1181        kvm_add_routing_entry(s, &route->kroute);
1182        kvm_irqchip_commit_routes(s);
1183
1184        QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
1185                           entry);
1186    }
1187
1188    assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);
1189
1190    return kvm_set_irq(s, route->kroute.gsi, 1);
1191}
1192
1193int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg)
1194{
1195    struct kvm_irq_routing_entry kroute = {};
1196    int virq;
1197
1198    if (kvm_gsi_direct_mapping()) {
1199        return msg.data & 0xffff;
1200    }
1201
1202    if (!kvm_gsi_routing_enabled()) {
1203        return -ENOSYS;
1204    }
1205
1206    virq = kvm_irqchip_get_virq(s);
1207    if (virq < 0) {
1208        return virq;
1209    }
1210
1211    kroute.gsi = virq;
1212    kroute.type = KVM_IRQ_ROUTING_MSI;
1213    kroute.flags = 0;
1214    kroute.u.msi.address_lo = (uint32_t)msg.address;
1215    kroute.u.msi.address_hi = msg.address >> 32;
1216    kroute.u.msi.data = le32_to_cpu(msg.data);
1217
1218    kvm_add_routing_entry(s, &kroute);
1219    kvm_irqchip_commit_routes(s);
1220
1221    return virq;
1222}
1223
1224int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
1225{
1226    struct kvm_irq_routing_entry kroute = {};
1227
1228    if (kvm_gsi_direct_mapping()) {
1229        return 0;
1230    }
1231
1232    if (!kvm_irqchip_in_kernel()) {
1233        return -ENOSYS;
1234    }
1235
1236    kroute.gsi = virq;
1237    kroute.type = KVM_IRQ_ROUTING_MSI;
1238    kroute.flags = 0;
1239    kroute.u.msi.address_lo = (uint32_t)msg.address;
1240    kroute.u.msi.address_hi = msg.address >> 32;
1241    kroute.u.msi.data = le32_to_cpu(msg.data);
1242
1243    return kvm_update_routing_entry(s, &kroute);
1244}
1245
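     /*
      * Attach or detach an irqfd: the kernel raises the given GSI whenever the
      * eventfd fires, bypassing userspace.  If a resample fd is supplied, the
      * interrupt is treated as level-triggered and rfd is notified on guest
      * EOI so the source can reassert it if necessary.
      */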
1246static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int rfd, int virq,
1247                                    bool assign)
1248{
1249    struct kvm_irqfd irqfd = {
1250        .fd = fd,
1251        .gsi = virq,
1252        .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
1253    };
1254
1255    if (rfd != -1) {
1256        irqfd.flags |= KVM_IRQFD_FLAG_RESAMPLE;
1257        irqfd.resamplefd = rfd;
1258    }
1259
1260    if (!kvm_irqfds_enabled()) {
1261        return -ENOSYS;
1262    }
1263
1264    return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
1265}
1266
1267#else /* !KVM_CAP_IRQ_ROUTING */
1268
1269void kvm_init_irq_routing(KVMState *s)
1270{
1271}
1272
1273void kvm_irqchip_release_virq(KVMState *s, int virq)
1274{
1275}
1276
1277int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
1278{
1279    abort();
1280}
1281
1282int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg)
1283{
1284    return -ENOSYS;
1285}
1286
1287static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign)
1288{
1289    abort();
1290}
1291
1292int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
1293{
1294    return -ENOSYS;
1295}
1296#endif /* !KVM_CAP_IRQ_ROUTING */
1297
1298int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n,
1299                                   EventNotifier *rn, int virq)
1300{
1301    return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n),
1302           rn ? event_notifier_get_fd(rn) : -1, virq, true);
1303}
1304
1305int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, int virq)
1306{
1307    return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), -1, virq,
1308           false);
1309}
1310
1311static int kvm_irqchip_create(KVMState *s)
1312{
1313    int ret;
1314
1315    if (!qemu_opt_get_bool(qemu_get_machine_opts(), "kernel_irqchip", true) ||
1316        !kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
1317        return 0;
1318    }
1319
1320    ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
1321    if (ret < 0) {
1322        fprintf(stderr, "Create kernel irqchip failed\n");
1323        return ret;
1324    }
1325
1326    kvm_kernel_irqchip = true;
1327    /* If we have an in-kernel IRQ chip then we must have asynchronous
1328     * interrupt delivery (though the reverse is not necessarily true)
1329     */
1330    kvm_async_interrupts_allowed = true;
1331    kvm_halt_in_kernel_allowed = true;
1332
1333    kvm_init_irq_routing(s);
1334
1335    return 0;
1336}
1337
1338/* Find number of supported CPUs using the recommended
1339 * procedure from the kernel API documentation to cope with
1340 * older kernels that may be missing capabilities.
1341 */
1342static int kvm_recommended_vcpus(KVMState *s)
1343{
1344    int ret = kvm_check_extension(s, KVM_CAP_NR_VCPUS);
1345    return (ret) ? ret : 4;
1346}
1347
1348static int kvm_max_vcpus(KVMState *s)
1349{
1350    int ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
1351    return (ret) ? ret : kvm_recommended_vcpus(s);
1352}
1353
1354int kvm_init(void)
1355{
1356    static const char upgrade_note[] =
1357        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
1358        "(see http://sourceforge.net/projects/kvm).\n";
1359    struct {
1360        const char *name;
1361        int num;
1362    } num_cpus[] = {
1363        { "SMP",          smp_cpus },
1364        { "hotpluggable", max_cpus },
1365        { NULL, }
1366    }, *nc = num_cpus;
1367    int soft_vcpus_limit, hard_vcpus_limit;
1368    KVMState *s;
1369    const KVMCapabilityInfo *missing_cap;
1370    int ret;
1371    int i;
1372
1373    s = g_malloc0(sizeof(KVMState));
1374
1375    /*
1376     * On systems where the kernel can support different base page
1377     * sizes, host page size may be different from TARGET_PAGE_SIZE,
1378     * even with KVM.  TARGET_PAGE_SIZE is assumed to be the minimum
1379     * page size for the system though.
1380     */
1381    assert(TARGET_PAGE_SIZE <= getpagesize());
1382
1383#ifdef KVM_CAP_SET_GUEST_DEBUG
1384    QTAILQ_INIT(&s->kvm_sw_breakpoints);
1385#endif
1386    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
1387        s->slots[i].slot = i;
1388    }
1389    s->vmfd = -1;
1390    s->fd = qemu_open("/dev/kvm", O_RDWR);
1391    if (s->fd == -1) {
1392        fprintf(stderr, "Could not access KVM kernel module: %m\n");
1393        ret = -errno;
1394        goto err;
1395    }
1396
1397    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
1398    if (ret < KVM_API_VERSION) {
1399        if (ret > 0) {
1400            ret = -EINVAL;
1401        }
1402        fprintf(stderr, "kvm version too old\n");
1403        goto err;
1404    }
1405
1406    if (ret > KVM_API_VERSION) {
1407        ret = -EINVAL;
1408        fprintf(stderr, "kvm version not supported\n");
1409        goto err;
1410    }
1411
1412    /* check the vcpu limits */
1413    soft_vcpus_limit = kvm_recommended_vcpus(s);
1414    hard_vcpus_limit = kvm_max_vcpus(s);
1415
1416    while (nc->name) {
1417        if (nc->num > soft_vcpus_limit) {
1418            fprintf(stderr,
1419                    "Warning: Number of %s cpus requested (%d) exceeds "
1420                    "the recommended cpus supported by KVM (%d)\n",
1421                    nc->name, nc->num, soft_vcpus_limit);
1422
1423            if (nc->num > hard_vcpus_limit) {
1424                ret = -EINVAL;
1425                fprintf(stderr, "Number of %s cpus requested (%d) exceeds "
1426                        "the maximum cpus supported by KVM (%d)\n",
1427                        nc->name, nc->num, hard_vcpus_limit);
1428                goto err;
1429            }
1430        }
1431        nc++;
1432    }
1433
1434    do {
1435        ret = kvm_ioctl(s, KVM_CREATE_VM, 0);
1436    } while (ret == -EINTR);
1437
1438    if (ret < 0) {
1439        fprintf(stderr, "ioctl(KVM_CREATE_VM) failed: %d %s\n", -ret,
1440                strerror(-ret));
1441
1442#ifdef TARGET_S390X
1443        fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
1444                        "your host kernel command line\n");
1445#endif
1446        goto err;
1447    }
1448
1449    s->vmfd = ret;
1450    missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
1451    if (!missing_cap) {
1452        missing_cap =
1453            kvm_check_extension_list(s, kvm_arch_required_capabilities);
1454    }
1455    if (missing_cap) {
1456        ret = -EINVAL;
1457        fprintf(stderr, "kvm does not support %s\n%s",
1458                missing_cap->name, upgrade_note);
1459        goto err;
1460    }
1461
1462    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
1463
1464    s->broken_set_mem_region = 1;
1465    ret = kvm_check_extension(s, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
1466    if (ret > 0) {
1467        s->broken_set_mem_region = 0;
1468    }
1469
1470#ifdef KVM_CAP_VCPU_EVENTS
1471    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
1472#endif
1473
1474    s->robust_singlestep =
1475        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
1476
1477#ifdef KVM_CAP_DEBUGREGS
1478    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
1479#endif
1480
1481#ifdef KVM_CAP_XSAVE
1482    s->xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
1483#endif
1484
1485#ifdef KVM_CAP_XCRS
1486    s->xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
1487#endif
1488
1489#ifdef KVM_CAP_PIT_STATE2
1490    s->pit_state2 = kvm_check_extension(s, KVM_CAP_PIT_STATE2);
1491#endif
1492
1493#ifdef KVM_CAP_IRQ_ROUTING
1494    s->direct_msi = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
1495#endif
1496
1497    s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3);
1498
1499    s->irq_set_ioctl = KVM_IRQ_LINE;
1500    if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
1501        s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
1502    }
1503
1504#ifdef KVM_CAP_READONLY_MEM
1505    kvm_readonly_mem_allowed =
1506        (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
1507#endif
1508
1509    ret = kvm_arch_init(s);
1510    if (ret < 0) {
1511        goto err;
1512    }
1513
1514    ret = kvm_irqchip_create(s);
1515    if (ret < 0) {
1516        goto err;
1517    }
1518
1519    kvm_state = s;
1520    memory_listener_register(&kvm_memory_listener, &address_space_memory);
1521    memory_listener_register(&kvm_io_listener, &address_space_io);
1522
1523    s->many_ioeventfds = kvm_check_many_ioeventfds();
1524
1525    cpu_interrupt_handler = kvm_handle_interrupt;
1526
1527    return 0;
1528
1529err:
1530    if (s->vmfd >= 0) {
1531        close(s->vmfd);
1532    }
1533    if (s->fd != -1) {
1534        close(s->fd);
1535    }
1536    g_free(s);
1537
1538    return ret;
1539}
1540
1541static void kvm_handle_io(uint16_t port, void *data, int direction, int size,
1542                          uint32_t count)
1543{
1544    int i;
1545    uint8_t *ptr = data;
1546
1547    for (i = 0; i < count; i++) {
1548        address_space_rw(&address_space_io, port, ptr, size,
1549                         direction == KVM_EXIT_IO_OUT);
1550        ptr += size;
1551    }
1552}
1553
1554static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
1555{
1556    fprintf(stderr, "KVM internal error.");
1557    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
1558        int i;
1559
1560        fprintf(stderr, " Suberror: %d\n", run->internal.suberror);
1561        for (i = 0; i < run->internal.ndata; ++i) {
1562            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
1563                    i, (uint64_t)run->internal.data[i]);
1564        }
1565    } else {
1566        fprintf(stderr, "\n");
1567    }
1568    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
1569        fprintf(stderr, "emulation failure\n");
1570        if (!kvm_arch_stop_on_emulation_error(cpu)) {
1571            cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
1572            return EXCP_INTERRUPT;
1573        }
1574    }
1575    /* FIXME: Should trigger a qmp message to let management know
1576     * something went wrong.
1577     */
1578    return -1;
1579}
1580
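     /*
      * Drain the coalesced-MMIO ring shared with the kernel, replaying each
      * buffered write through the memory API.  The in-progress flag guards
      * against re-entry, since the replayed writes can themselves trigger
      * another flush.
      */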
1581void kvm_flush_coalesced_mmio_buffer(void)
1582{
1583    KVMState *s = kvm_state;
1584
1585    if (s->coalesced_flush_in_progress) {
1586        return;
1587    }
1588
1589    s->coalesced_flush_in_progress = true;
1590
1591    if (s->coalesced_mmio_ring) {
1592        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
1593        while (ring->first != ring->last) {
1594            struct kvm_coalesced_mmio *ent;
1595
1596            ent = &ring->coalesced_mmio[ring->first];
1597
1598            cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
1599            smp_wmb();
1600            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
1601        }
1602    }
1603
1604    s->coalesced_flush_in_progress = false;
1605}
1606
1607static void do_kvm_cpu_synchronize_state(void *arg)
1608{
1609    CPUState *cpu = arg;
1610
1611    if (!cpu->kvm_vcpu_dirty) {
1612        kvm_arch_get_registers(cpu);
1613        cpu->kvm_vcpu_dirty = true;
1614    }
1615}
1616
1617void kvm_cpu_synchronize_state(CPUState *cpu)
1618{
1619    if (!cpu->kvm_vcpu_dirty) {
1620        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, cpu);
1621    }
1622}
1623
1624void kvm_cpu_synchronize_post_reset(CPUState *cpu)
1625{
1626    kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
1627    cpu->kvm_vcpu_dirty = false;
1628}
1629
1630void kvm_cpu_synchronize_post_init(CPUState *cpu)
1631{
1632    kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
1633    cpu->kvm_vcpu_dirty = false;
1634}
1635
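     /*
      * Per-vCPU run loop: push dirty register state into the kernel, enter
      * KVM_RUN with the iothread lock dropped, then dispatch on
      * run->exit_reason until an exit must be handled outside the loop
      * (an EXCP_* code or an error).
      */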
1636int kvm_cpu_exec(CPUState *cpu)
1637{
1638    struct kvm_run *run = cpu->kvm_run;
1639    int ret, run_ret;
1640
1641    DPRINTF("kvm_cpu_exec()\n");
1642
1643    if (kvm_arch_process_async_events(cpu)) {
1644        cpu->exit_request = 0;
1645        return EXCP_HLT;
1646    }
1647
1648    do {
1649        if (cpu->kvm_vcpu_dirty) {
1650            kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
1651            cpu->kvm_vcpu_dirty = false;
1652        }
1653
1654        kvm_arch_pre_run(cpu, run);
1655        if (cpu->exit_request) {
1656            DPRINTF("interrupt exit requested\n");
1657            /*
1658             * KVM requires us to reenter the kernel after IO exits to complete
1659             * instruction emulation. This self-signal will ensure that we
1660             * leave ASAP again.
1661             */
1662            qemu_cpu_kick_self();
1663        }
1664        qemu_mutex_unlock_iothread();
1665
1666        run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);
1667
1668        qemu_mutex_lock_iothread();
1669        kvm_arch_post_run(cpu, run);
1670
1671        if (run_ret < 0) {
1672            if (run_ret == -EINTR || run_ret == -EAGAIN) {
1673                DPRINTF("io window exit\n");
1674                ret = EXCP_INTERRUPT;
1675                break;
1676            }
1677            fprintf(stderr, "error: kvm run failed %s\n",
1678                    strerror(-run_ret));
1679            abort();
1680        }
1681
1682        trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
1683        switch (run->exit_reason) {
1684        case KVM_EXIT_IO:
1685            DPRINTF("handle_io\n");
1686            kvm_handle_io(run->io.port,
1687                          (uint8_t *)run + run->io.data_offset,
1688                          run->io.direction,
1689                          run->io.size,
1690                          run->io.count);
1691            ret = 0;
1692            break;
1693        case KVM_EXIT_MMIO:
1694            DPRINTF("handle_mmio\n");
1695            cpu_physical_memory_rw(run->mmio.phys_addr,
1696                                   run->mmio.data,
1697                                   run->mmio.len,
1698                                   run->mmio.is_write);
1699            ret = 0;
1700            break;
1701        case KVM_EXIT_IRQ_WINDOW_OPEN:
1702            DPRINTF("irq_window_open\n");
1703            ret = EXCP_INTERRUPT;
1704            break;
1705        case KVM_EXIT_SHUTDOWN:
1706            DPRINTF("shutdown\n");
1707            qemu_system_reset_request();
1708            ret = EXCP_INTERRUPT;
1709            break;
1710        case KVM_EXIT_UNKNOWN:
1711            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
1712                    (uint64_t)run->hw.hardware_exit_reason);
1713            ret = -1;
1714            break;
1715        case KVM_EXIT_INTERNAL_ERROR:
1716            ret = kvm_handle_internal_error(cpu, run);
1717            break;
1718        default:
1719            DPRINTF("kvm_arch_handle_exit\n");
1720            ret = kvm_arch_handle_exit(cpu, run);
1721            break;
1722        }
1723    } while (ret == 0);
1724
1725    if (ret < 0) {
1726        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
1727        vm_stop(RUN_STATE_INTERNAL_ERROR);
1728    }
1729
1730    cpu->exit_request = 0;
1731    return ret;
1732}
1733
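     /*
      * Thin ioctl wrappers for the /dev/kvm, VM and vCPU file descriptors
      * follow.  Each returns the ioctl result on success and -errno on
      * failure, which is what the error handling throughout this file
      * relies on.
      */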
1734int kvm_ioctl(KVMState *s, int type, ...)
1735{
1736    int ret;
1737    void *arg;
1738    va_list ap;
1739
1740    va_start(ap, type);
1741    arg = va_arg(ap, void *);
1742    va_end(ap);
1743
1744    trace_kvm_ioctl(type, arg);
1745    ret = ioctl(s->fd, type, arg);
1746    if (ret == -1) {
1747        ret = -errno;
1748    }
1749    return ret;
1750}
1751
1752int kvm_vm_ioctl(KVMState *s, int type, ...)
1753{
1754    int ret;
1755    void *arg;
1756    va_list ap;
1757
1758    va_start(ap, type);
1759    arg = va_arg(ap, void *);
1760    va_end(ap);
1761
1762    trace_kvm_vm_ioctl(type, arg);
1763    ret = ioctl(s->vmfd, type, arg);
1764    if (ret == -1) {
1765        ret = -errno;
1766    }
1767    return ret;
1768}
1769
1770int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
1771{
1772    int ret;
1773    void *arg;
1774    va_list ap;
1775
1776    va_start(ap, type);
1777    arg = va_arg(ap, void *);
1778    va_end(ap);
1779
1780    trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
1781    ret = ioctl(cpu->kvm_fd, type, arg);
1782    if (ret == -1) {
1783        ret = -errno;
1784    }
1785    return ret;
1786}
1787
1788int kvm_has_sync_mmu(void)
1789{
1790    return kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
1791}
1792
1793int kvm_has_vcpu_events(void)
1794{
1795    return kvm_state->vcpu_events;
1796}
1797
1798int kvm_has_robust_singlestep(void)
1799{
1800    return kvm_state->robust_singlestep;
1801}
1802
1803int kvm_has_debugregs(void)
1804{
1805    return kvm_state->debugregs;
1806}
1807
1808int kvm_has_xsave(void)
1809{
1810    return kvm_state->xsave;
1811}
1812
1813int kvm_has_xcrs(void)
1814{
1815    return kvm_state->xcrs;
1816}
1817
1818int kvm_has_pit_state2(void)
1819{
1820    return kvm_state->pit_state2;
1821}
1822
1823int kvm_has_many_ioeventfds(void)
1824{
1825    if (!kvm_enabled()) {
1826        return 0;
1827    }
1828    return kvm_state->many_ioeventfds;
1829}
1830
1831int kvm_has_gsi_routing(void)
1832{
1833#ifdef KVM_CAP_IRQ_ROUTING
1834    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
1835#else
1836    return false;
1837#endif
1838}
1839
1840int kvm_has_intx_set_mask(void)
1841{
1842    return kvm_state->intx_set_mask;
1843}
1844
1845void kvm_setup_guest_memory(void *start, size_t size)
1846{
1847#ifdef CONFIG_VALGRIND_H
1848    VALGRIND_MAKE_MEM_DEFINED(start, size);
1849#endif
1850    if (!kvm_has_sync_mmu()) {
1851        int ret = qemu_madvise(start, size, QEMU_MADV_DONTFORK);
1852
1853        if (ret) {
1854            perror("qemu_madvise");
1855            fprintf(stderr,
1856                    "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
1857            exit(1);
1858        }
1859    }
1860}
1861
1862#ifdef KVM_CAP_SET_GUEST_DEBUG
1863struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
1864                                                 target_ulong pc)
1865{
1866    struct kvm_sw_breakpoint *bp;
1867
1868    QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
1869        if (bp->pc == pc) {
1870            return bp;
1871        }
1872    }
1873    return NULL;
1874}
1875
1876int kvm_sw_breakpoints_active(CPUState *cpu)
1877{
1878    return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
1879}
1880
1881struct kvm_set_guest_debug_data {
1882    struct kvm_guest_debug dbg;
1883    CPUState *cpu;
1884    int err;
1885};
1886
1887static void kvm_invoke_set_guest_debug(void *data)
1888{
1889    struct kvm_set_guest_debug_data *dbg_data = data;
1890
1891    dbg_data->err = kvm_vcpu_ioctl(dbg_data->cpu, KVM_SET_GUEST_DEBUG,
1892                                   &dbg_data->dbg);
1893}
1894
1895int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
1896{
1897    struct kvm_set_guest_debug_data data;
1898
1899    data.dbg.control = reinject_trap;
1900
1901    if (cpu->singlestep_enabled) {
1902        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
1903    }
1904    kvm_arch_update_guest_debug(cpu, &data.dbg);
1905    data.cpu = cpu;
1906
1907    run_on_cpu(cpu, kvm_invoke_set_guest_debug, &data);
1908    return data.err;
1909}
1910
1911int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
1912                          target_ulong len, int type)
1913{
1914    struct kvm_sw_breakpoint *bp;
1915    int err;
1916
1917    if (type == GDB_BREAKPOINT_SW) {
1918        bp = kvm_find_sw_breakpoint(cpu, addr);
1919        if (bp) {
1920            bp->use_count++;
1921            return 0;
1922        }
1923
1924        bp = g_malloc(sizeof(struct kvm_sw_breakpoint));
1925        if (!bp) {
1926            return -ENOMEM;
1927        }
1928
1929        bp->pc = addr;
1930        bp->use_count = 1;
1931        err = kvm_arch_insert_sw_breakpoint(cpu, bp);
1932        if (err) {
1933            g_free(bp);
1934            return err;
1935        }
1936
1937        QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
1938    } else {
1939        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
1940        if (err) {
1941            return err;
1942        }
1943    }
1944
1945    CPU_FOREACH(cpu) {
1946        err = kvm_update_guest_debug(cpu, 0);
1947        if (err) {
1948            return err;
1949        }
1950    }
1951    return 0;
1952}
1953
1954int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
1955                          target_ulong len, int type)
1956{
1957    struct kvm_sw_breakpoint *bp;
1958    int err;
1959
1960    if (type == GDB_BREAKPOINT_SW) {
1961        bp = kvm_find_sw_breakpoint(cpu, addr);
1962        if (!bp) {
1963            return -ENOENT;
1964        }
1965
1966        if (bp->use_count > 1) {
1967            bp->use_count--;
1968            return 0;
1969        }
1970
1971        err = kvm_arch_remove_sw_breakpoint(cpu, bp);
1972        if (err) {
1973            return err;
1974        }
1975
1976        QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
1977        g_free(bp);
1978    } else {
1979        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
1980        if (err) {
1981            return err;
1982        }
1983    }
1984
1985    CPU_FOREACH(cpu) {
1986        err = kvm_update_guest_debug(cpu, 0);
1987        if (err) {
1988            return err;
1989        }
1990    }
1991    return 0;
1992}
1993
1994void kvm_remove_all_breakpoints(CPUState *cpu)
1995{
1996    struct kvm_sw_breakpoint *bp, *next;
1997    KVMState *s = cpu->kvm_state;
1998
1999    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
2000        if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
2001            /* Try harder to find a CPU that currently sees the breakpoint. */
2002            CPU_FOREACH(cpu) {
2003                if (kvm_arch_remove_sw_breakpoint(cpu, bp) == 0) {
2004                    break;
2005                }
2006            }
2007        }
2008        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
2009        g_free(bp);
2010    }
2011    kvm_arch_remove_all_hw_breakpoints();
2012
2013    CPU_FOREACH(cpu) {
2014        kvm_update_guest_debug(cpu, 0);
2015    }
2016}
2017
2018#else /* !KVM_CAP_SET_GUEST_DEBUG */
2019
2020int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
2021{
2022    return -EINVAL;
2023}
2024
2025int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
2026                          target_ulong len, int type)
2027{
2028    return -EINVAL;
2029}
2030
2031int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
2032                          target_ulong len, int type)
2033{
2034    return -EINVAL;
2035}
2036
2037void kvm_remove_all_breakpoints(CPUState *cpu)
2038{
2039}
2040#endif /* !KVM_CAP_SET_GUEST_DEBUG */
2041
2042int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
2043{
2044    struct kvm_signal_mask *sigmask;
2045    int r;
2046
2047    if (!sigset) {
2048        return kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, NULL);
2049    }
2050
2051    sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));
2052
2053    sigmask->len = 8;
2054    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
2055    r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
2056    g_free(sigmask);
2057
2058    return r;
2059}
2060int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
2061{
2062    return kvm_arch_on_sigbus_vcpu(cpu, code, addr);
2063}
2064
2065int kvm_on_sigbus(int code, void *addr)
2066{
2067    return kvm_arch_on_sigbus(code, addr);
2068}
2069