qemu/kvm-all.c
   1/*
   2 * QEMU KVM support
   3 *
   4 * Copyright IBM, Corp. 2008
   5 *           Red Hat, Inc. 2008
   6 *
   7 * Authors:
   8 *  Anthony Liguori   <aliguori@us.ibm.com>
   9 *  Glauber Costa     <gcosta@redhat.com>
  10 *
  11 * This work is licensed under the terms of the GNU GPL, version 2 or later.
  12 * See the COPYING file in the top-level directory.
  13 *
  14 */
  15
  16#include <sys/types.h>
  17#include <sys/ioctl.h>
  18#include <sys/mman.h>
  19#include <stdarg.h>
  20
  21#include <linux/kvm.h>
  22
  23#include "qemu-common.h"
  24#include "qemu-barrier.h"
  25#include "qemu-option.h"
  26#include "qemu-config.h"
  27#include "sysemu.h"
  28#include "hw/hw.h"
  29#include "hw/msi.h"
  30#include "gdbstub.h"
  31#include "kvm.h"
  32#include "bswap.h"
  33#include "memory.h"
  34#include "exec-memory.h"
  35#include "event_notifier.h"
  36
  37/* This check must be after config-host.h is included */
  38#ifdef CONFIG_EVENTFD
  39#include <sys/eventfd.h>
  40#endif
  41
  42#ifdef CONFIG_VALGRIND_H
  43#include <valgrind/memcheck.h>
  44#endif
  45
  46/* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */
  47#define PAGE_SIZE TARGET_PAGE_SIZE
  48
  49//#define DEBUG_KVM
  50
  51#ifdef DEBUG_KVM
  52#define DPRINTF(fmt, ...) \
  53    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
  54#else
  55#define DPRINTF(fmt, ...) \
  56    do { } while (0)
  57#endif
  58
  59#define KVM_MSI_HASHTAB_SIZE    256
  60
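/*
 * A KVMSlot mirrors one kvm_userspace_memory_region: the guest physical
 * start address, the size of the region, the host virtual address backing
 * it, the kernel slot index, and the KVM_MEM_* flags currently set.  A
 * memory_size of 0 marks the slot as free.
 */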
  61typedef struct KVMSlot
  62{
  63    hwaddr start_addr;
  64    ram_addr_t memory_size;
  65    void *ram;
  66    int slot;
  67    int flags;
  68} KVMSlot;
  69
  70typedef struct kvm_dirty_log KVMDirtyLog;
  71
  72struct KVMState
  73{
  74    KVMSlot slots[32];
  75    int fd;
  76    int vmfd;
  77    int coalesced_mmio;
  78    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
  79    bool coalesced_flush_in_progress;
  80    int broken_set_mem_region;
  81    int migration_log;
  82    int vcpu_events;
  83    int robust_singlestep;
  84    int debugregs;
  85#ifdef KVM_CAP_SET_GUEST_DEBUG
  86    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
  87#endif
  88    int pit_state2;
  89    int xsave, xcrs;
  90    int many_ioeventfds;
  91    int intx_set_mask;
   92    /* The man page (and POSIX) says ioctl numbers are signed int, but
   93     * they're not: Linux, glibc and *BSD all treat ioctl numbers as
   94     * unsigned, and treating them as signed here can break things. */
  95    unsigned irq_set_ioctl;
  96#ifdef KVM_CAP_IRQ_ROUTING
  97    struct kvm_irq_routing *irq_routes;
  98    int nr_allocated_irq_routes;
  99    uint32_t *used_gsi_bitmap;
 100    unsigned int gsi_count;
 101    QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
 102    bool direct_msi;
 103#endif
 104};
 105
 106KVMState *kvm_state;
 107bool kvm_kernel_irqchip;
 108bool kvm_async_interrupts_allowed;
 109bool kvm_irqfds_allowed;
 110bool kvm_msi_via_irqfd_allowed;
 111bool kvm_gsi_routing_allowed;
 112
  113static const KVMCapabilityInfo kvm_required_capabilities[] = {
 114    KVM_CAP_INFO(USER_MEMORY),
 115    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
 116    KVM_CAP_LAST_INFO
 117};
 118
 119static KVMSlot *kvm_alloc_slot(KVMState *s)
 120{
 121    int i;
 122
 123    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
 124        if (s->slots[i].memory_size == 0) {
 125            return &s->slots[i];
 126        }
 127    }
 128
 129    fprintf(stderr, "%s: no free slot available\n", __func__);
 130    abort();
 131}
 132
 133static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
 134                                         hwaddr start_addr,
 135                                         hwaddr end_addr)
 136{
 137    int i;
 138
 139    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
 140        KVMSlot *mem = &s->slots[i];
 141
 142        if (start_addr == mem->start_addr &&
 143            end_addr == mem->start_addr + mem->memory_size) {
 144            return mem;
 145        }
 146    }
 147
 148    return NULL;
 149}
 150
 151/*
 152 * Find overlapping slot with lowest start address
 153 */
 154static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
 155                                            hwaddr start_addr,
 156                                            hwaddr end_addr)
 157{
 158    KVMSlot *found = NULL;
 159    int i;
 160
 161    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
 162        KVMSlot *mem = &s->slots[i];
 163
 164        if (mem->memory_size == 0 ||
 165            (found && found->start_addr < mem->start_addr)) {
 166            continue;
 167        }
 168
 169        if (end_addr > mem->start_addr &&
 170            start_addr < mem->start_addr + mem->memory_size) {
 171            found = mem;
 172        }
 173    }
 174
 175    return found;
 176}
 177
 178int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
 179                                       hwaddr *phys_addr)
 180{
 181    int i;
 182
 183    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
 184        KVMSlot *mem = &s->slots[i];
 185
 186        if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
 187            *phys_addr = mem->start_addr + (ram - mem->ram);
 188            return 1;
 189        }
 190    }
 191
 192    return 0;
 193}
 194
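/*
 * Push one slot to the kernel via KVM_SET_USER_MEMORY_REGION.  Dirty page
 * logging is forced on while migration logging is globally enabled, even if
 * the slot itself does not have KVM_MEM_LOG_DIRTY_PAGES set.
 */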
 195static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
 196{
 197    struct kvm_userspace_memory_region mem;
 198
 199    mem.slot = slot->slot;
 200    mem.guest_phys_addr = slot->start_addr;
 201    mem.memory_size = slot->memory_size;
 202    mem.userspace_addr = (unsigned long)slot->ram;
 203    mem.flags = slot->flags;
 204    if (s->migration_log) {
 205        mem.flags |= KVM_MEM_LOG_DIRTY_PAGES;
 206    }
 207    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
 208}
 209
 210static void kvm_reset_vcpu(void *opaque)
 211{
 212    CPUArchState *env = opaque;
 213
 214    kvm_arch_reset_vcpu(env);
 215}
 216
 217int kvm_init_vcpu(CPUArchState *env)
 218{
 219    KVMState *s = kvm_state;
 220    long mmap_size;
 221    int ret;
 222
 223    DPRINTF("kvm_init_vcpu\n");
 224
 225    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);
 226    if (ret < 0) {
 227        DPRINTF("kvm_create_vcpu failed\n");
 228        goto err;
 229    }
 230
 231    env->kvm_fd = ret;
 232    env->kvm_state = s;
 233    env->kvm_vcpu_dirty = 1;
 234
 235    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
 236    if (mmap_size < 0) {
 237        ret = mmap_size;
 238        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
 239        goto err;
 240    }
 241
 242    env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
 243                        env->kvm_fd, 0);
 244    if (env->kvm_run == MAP_FAILED) {
 245        ret = -errno;
 246        DPRINTF("mmap'ing vcpu state failed\n");
 247        goto err;
 248    }
 249
 250    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
 251        s->coalesced_mmio_ring =
 252            (void *)env->kvm_run + s->coalesced_mmio * PAGE_SIZE;
 253    }
 254
 255    ret = kvm_arch_init_vcpu(env);
 256    if (ret == 0) {
 257        qemu_register_reset(kvm_reset_vcpu, env);
 258        kvm_arch_reset_vcpu(env);
 259    }
 260err:
 261    return ret;
 262}
 263
 264/*
 265 * dirty pages logging control
 266 */
 267
 268static int kvm_mem_flags(KVMState *s, bool log_dirty)
 269{
 270    return log_dirty ? KVM_MEM_LOG_DIRTY_PAGES : 0;
 271}
 272
 273static int kvm_slot_dirty_pages_log_change(KVMSlot *mem, bool log_dirty)
 274{
 275    KVMState *s = kvm_state;
 276    int flags, mask = KVM_MEM_LOG_DIRTY_PAGES;
 277    int old_flags;
 278
 279    old_flags = mem->flags;
 280
 281    flags = (mem->flags & ~mask) | kvm_mem_flags(s, log_dirty);
 282    mem->flags = flags;
 283
  284    /* If nothing changed effectively, there is no need to issue the ioctl. */
 285    if (s->migration_log) {
 286        flags |= KVM_MEM_LOG_DIRTY_PAGES;
 287    }
 288
 289    if (flags == old_flags) {
 290        return 0;
 291    }
 292
 293    return kvm_set_user_memory_region(s, mem);
 294}
 295
 296static int kvm_dirty_pages_log_change(hwaddr phys_addr,
 297                                      ram_addr_t size, bool log_dirty)
 298{
 299    KVMState *s = kvm_state;
 300    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
 301
 302    if (mem == NULL)  {
 303        fprintf(stderr, "BUG: %s: invalid parameters " TARGET_FMT_plx "-"
 304                TARGET_FMT_plx "\n", __func__, phys_addr,
 305                (hwaddr)(phys_addr + size - 1));
 306        return -EINVAL;
 307    }
 308    return kvm_slot_dirty_pages_log_change(mem, log_dirty);
 309}
 310
 311static void kvm_log_start(MemoryListener *listener,
 312                          MemoryRegionSection *section)
 313{
 314    int r;
 315
 316    r = kvm_dirty_pages_log_change(section->offset_within_address_space,
 317                                   section->size, true);
 318    if (r < 0) {
 319        abort();
 320    }
 321}
 322
 323static void kvm_log_stop(MemoryListener *listener,
 324                          MemoryRegionSection *section)
 325{
 326    int r;
 327
 328    r = kvm_dirty_pages_log_change(section->offset_within_address_space,
 329                                   section->size, false);
 330    if (r < 0) {
 331        abort();
 332    }
 333}
 334
 335static int kvm_set_migration_log(int enable)
 336{
 337    KVMState *s = kvm_state;
 338    KVMSlot *mem;
 339    int i, err;
 340
 341    s->migration_log = enable;
 342
 343    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
 344        mem = &s->slots[i];
 345
 346        if (!mem->memory_size) {
 347            continue;
 348        }
 349        if (!!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) == enable) {
 350            continue;
 351        }
 352        err = kvm_set_user_memory_region(s, mem);
 353        if (err) {
 354            return err;
 355        }
 356    }
 357    return 0;
 358}
 359
 360/* get kvm's dirty pages bitmap and update qemu's */
 361static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section,
 362                                         unsigned long *bitmap)
 363{
 364    unsigned int i, j;
 365    unsigned long page_number, c;
 366    hwaddr addr, addr1;
 367    unsigned int len = ((section->size / getpagesize()) + HOST_LONG_BITS - 1) / HOST_LONG_BITS;
 368    unsigned long hpratio = getpagesize() / TARGET_PAGE_SIZE;
 369
  370    /*
  371     * Traversing the bitmap is faster than traversing the memory page by
  372     * page (for each addr), especially when most of the memory is not dirty.
  373     */
 374    for (i = 0; i < len; i++) {
 375        if (bitmap[i] != 0) {
 376            c = leul_to_cpu(bitmap[i]);
 377            do {
 378                j = ffsl(c) - 1;
 379                c &= ~(1ul << j);
 380                page_number = (i * HOST_LONG_BITS + j) * hpratio;
 381                addr1 = page_number * TARGET_PAGE_SIZE;
 382                addr = section->offset_within_region + addr1;
 383                memory_region_set_dirty(section->mr, addr,
 384                                        TARGET_PAGE_SIZE * hpratio);
 385            } while (c != 0);
 386        }
 387    }
 388    return 0;
 389}
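/*
 * Worked example for the bitmap arithmetic above, assuming 4 KiB host and
 * target pages (so hpratio == 1) on a 64-bit host (HOST_LONG_BITS == 64):
 * if bitmap[2] has bit 5 set, then page_number = 2 * 64 + 5 = 133 and
 * addr1 = 133 * 4096 = 0x85000, so the 4 KiB page at offset
 * section->offset_within_region + 0x85000 is marked dirty.
 */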
 390
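/* ALIGN rounds x up to the next multiple of y, where y must be a power of
 * two; e.g. ALIGN(5, 4) == 8 and ALIGN(8, 4) == 8. */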
 391#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
 392
  393/**
  394 * kvm_physical_sync_dirty_bitmap - Grab the dirty bitmap from kernel space
  395 * This function fetches the per-slot dirty log from KVM and marks every
  396 * page the kernel reports as dirty in QEMU's dirty bitmap, using
  397 * memory_region_set_dirty().
  398 *
  399 * @section: memory region section covering the logged region to sync.
  400 * Returns 0 on success, -1 if the KVM_GET_DIRTY_LOG ioctl fails.
  401 */
 402static int kvm_physical_sync_dirty_bitmap(MemoryRegionSection *section)
 403{
 404    KVMState *s = kvm_state;
 405    unsigned long size, allocated_size = 0;
 406    KVMDirtyLog d;
 407    KVMSlot *mem;
 408    int ret = 0;
 409    hwaddr start_addr = section->offset_within_address_space;
 410    hwaddr end_addr = start_addr + section->size;
 411
 412    d.dirty_bitmap = NULL;
 413    while (start_addr < end_addr) {
 414        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
 415        if (mem == NULL) {
 416            break;
 417        }
 418
  419        /* XXX bad kernel interface alert
  420         * For the dirty bitmap, the kernel allocates an array whose size is
  421         * aligned to its bits-per-long.  But when the kernel is 64-bit and
  422         * userspace is 32-bit, userspace cannot align to the same
  423         * bits-per-long, since sizeof(long) differs between kernel and
  424         * user space.  Userspace would then provide a buffer that may be
  425         * 4 bytes smaller than what the kernel will use, resulting in
  426         * userspace memory corruption (which is not detectable by valgrind
  427         * either, in most cases).
  428         * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
  429         * the hope that sizeof(long) won't become > 8 any time soon.
  430         */
 431        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
 432                     /*HOST_LONG_BITS*/ 64) / 8;
 433        if (!d.dirty_bitmap) {
 434            d.dirty_bitmap = g_malloc(size);
 435        } else if (size > allocated_size) {
 436            d.dirty_bitmap = g_realloc(d.dirty_bitmap, size);
 437        }
 438        allocated_size = size;
 439        memset(d.dirty_bitmap, 0, allocated_size);
 440
 441        d.slot = mem->slot;
 442
 443        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
 444            DPRINTF("ioctl failed %d\n", errno);
 445            ret = -1;
 446            break;
 447        }
 448
 449        kvm_get_dirty_pages_log_range(section, d.dirty_bitmap);
 450        start_addr = mem->start_addr + mem->memory_size;
 451    }
 452    g_free(d.dirty_bitmap);
 453
 454    return ret;
 455}
 456
 457static void kvm_coalesce_mmio_region(MemoryListener *listener,
  458                                     MemoryRegionSection *section,
 459                                     hwaddr start, hwaddr size)
 460{
 461    KVMState *s = kvm_state;
 462
 463    if (s->coalesced_mmio) {
 464        struct kvm_coalesced_mmio_zone zone;
 465
 466        zone.addr = start;
 467        zone.size = size;
 468        zone.pad = 0;
 469
 470        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
 471    }
 472}
 473
 474static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
  475                                       MemoryRegionSection *section,
 476                                       hwaddr start, hwaddr size)
 477{
 478    KVMState *s = kvm_state;
 479
 480    if (s->coalesced_mmio) {
 481        struct kvm_coalesced_mmio_zone zone;
 482
 483        zone.addr = start;
 484        zone.size = size;
 485        zone.pad = 0;
 486
 487        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
 488    }
 489}
 490
 491int kvm_check_extension(KVMState *s, unsigned int extension)
 492{
 493    int ret;
 494
 495    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
 496    if (ret < 0) {
 497        ret = 0;
 498    }
 499
 500    return ret;
 501}
 502
 503static int kvm_check_many_ioeventfds(void)
 504{
  505    /* Userspace can use ioeventfd for I/O notification.  This requires a host
  506     * that supports eventfd(2) and an I/O thread; since eventfd does not
  507     * support SIGIO it cannot interrupt the vcpu.
  508     *
  509     * Older kernels have a limit of 6 devices on the KVM I/O bus.  Probe for
  510     * that limit here so we can avoid creating too many ioeventfds.
  511     */
 512#if defined(CONFIG_EVENTFD)
 513    int ioeventfds[7];
 514    int i, ret = 0;
 515    for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) {
 516        ioeventfds[i] = eventfd(0, EFD_CLOEXEC);
 517        if (ioeventfds[i] < 0) {
 518            break;
 519        }
 520        ret = kvm_set_ioeventfd_pio_word(ioeventfds[i], 0, i, true);
 521        if (ret < 0) {
 522            close(ioeventfds[i]);
 523            break;
 524        }
 525    }
 526
 527    /* Decide whether many devices are supported or not */
 528    ret = i == ARRAY_SIZE(ioeventfds);
 529
 530    while (i-- > 0) {
 531        kvm_set_ioeventfd_pio_word(ioeventfds[i], 0, i, false);
 532        close(ioeventfds[i]);
 533    }
 534    return ret;
 535#else
 536    return 0;
 537#endif
 538}
 539
 540static const KVMCapabilityInfo *
 541kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
 542{
 543    while (list->name) {
 544        if (!kvm_check_extension(s, list->value)) {
 545            return list;
 546        }
 547        list++;
 548    }
 549    return NULL;
 550}
 551
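/*
 * Map a MemoryRegionSection onto KVM memory slots.  The range is first
 * trimmed to target page boundaries and ignored unless it is RAM.  If the
 * new range fits entirely inside an existing slot with identical
 * parameters, only the dirty-logging flag is updated.  Otherwise every
 * overlapping slot is synced (if it was logging), unregistered, and its
 * non-overlapping prefix and suffix fragments are re-registered; finally,
 * when adding, whatever remains of the new range becomes a fresh slot.
 */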
 552static void kvm_set_phys_mem(MemoryRegionSection *section, bool add)
 553{
 554    KVMState *s = kvm_state;
 555    KVMSlot *mem, old;
 556    int err;
 557    MemoryRegion *mr = section->mr;
 558    bool log_dirty = memory_region_is_logging(mr);
 559    hwaddr start_addr = section->offset_within_address_space;
 560    ram_addr_t size = section->size;
 561    void *ram = NULL;
 562    unsigned delta;
 563
  564    /* KVM works in page-size chunks, but this function may be called
  565       with a sub-page size and an unaligned start address. */
 566    delta = TARGET_PAGE_ALIGN(size) - size;
 567    if (delta > size) {
 568        return;
 569    }
 570    start_addr += delta;
 571    size -= delta;
 572    size &= TARGET_PAGE_MASK;
 573    if (!size || (start_addr & ~TARGET_PAGE_MASK)) {
 574        return;
 575    }
 576
 577    if (!memory_region_is_ram(mr)) {
 578        return;
 579    }
 580
 581    ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + delta;
 582
 583    while (1) {
 584        mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
 585        if (!mem) {
 586            break;
 587        }
 588
 589        if (add && start_addr >= mem->start_addr &&
 590            (start_addr + size <= mem->start_addr + mem->memory_size) &&
 591            (ram - start_addr == mem->ram - mem->start_addr)) {
 592            /* The new slot fits into the existing one and comes with
 593             * identical parameters - update flags and done. */
 594            kvm_slot_dirty_pages_log_change(mem, log_dirty);
 595            return;
 596        }
 597
 598        old = *mem;
 599
 600        if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
 601            kvm_physical_sync_dirty_bitmap(section);
 602        }
 603
 604        /* unregister the overlapping slot */
 605        mem->memory_size = 0;
 606        err = kvm_set_user_memory_region(s, mem);
 607        if (err) {
 608            fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
 609                    __func__, strerror(-err));
 610            abort();
 611        }
 612
  613        /* Workaround for older KVM versions: we can't join slots, not even by
  614         * unregistering the previous ones and then registering the larger
  615         * slot. We have to maintain the existing fragmentation. Sigh.
  616         *
  617         * This workaround assumes that the new slot starts at the same
  618         * address as the first existing one. If not, or if some overlapping
  619         * slot comes around later, we will fail (not seen in practice so far)
  620         * and will then actually require a recent KVM version. */
 621        if (s->broken_set_mem_region &&
 622            old.start_addr == start_addr && old.memory_size < size && add) {
 623            mem = kvm_alloc_slot(s);
 624            mem->memory_size = old.memory_size;
 625            mem->start_addr = old.start_addr;
 626            mem->ram = old.ram;
 627            mem->flags = kvm_mem_flags(s, log_dirty);
 628
 629            err = kvm_set_user_memory_region(s, mem);
 630            if (err) {
 631                fprintf(stderr, "%s: error updating slot: %s\n", __func__,
 632                        strerror(-err));
 633                abort();
 634            }
 635
 636            start_addr += old.memory_size;
 637            ram += old.memory_size;
 638            size -= old.memory_size;
 639            continue;
 640        }
 641
 642        /* register prefix slot */
 643        if (old.start_addr < start_addr) {
 644            mem = kvm_alloc_slot(s);
 645            mem->memory_size = start_addr - old.start_addr;
 646            mem->start_addr = old.start_addr;
 647            mem->ram = old.ram;
 648            mem->flags =  kvm_mem_flags(s, log_dirty);
 649
 650            err = kvm_set_user_memory_region(s, mem);
 651            if (err) {
 652                fprintf(stderr, "%s: error registering prefix slot: %s\n",
 653                        __func__, strerror(-err));
 654#ifdef TARGET_PPC
 655                fprintf(stderr, "%s: This is probably because your kernel's " \
 656                                "PAGE_SIZE is too big. Please try to use 4k " \
 657                                "PAGE_SIZE!\n", __func__);
 658#endif
 659                abort();
 660            }
 661        }
 662
 663        /* register suffix slot */
 664        if (old.start_addr + old.memory_size > start_addr + size) {
 665            ram_addr_t size_delta;
 666
 667            mem = kvm_alloc_slot(s);
 668            mem->start_addr = start_addr + size;
 669            size_delta = mem->start_addr - old.start_addr;
 670            mem->memory_size = old.memory_size - size_delta;
 671            mem->ram = old.ram + size_delta;
 672            mem->flags = kvm_mem_flags(s, log_dirty);
 673
 674            err = kvm_set_user_memory_region(s, mem);
 675            if (err) {
 676                fprintf(stderr, "%s: error registering suffix slot: %s\n",
 677                        __func__, strerror(-err));
 678                abort();
 679            }
 680        }
 681    }
 682
 683    /* in case the KVM bug workaround already "consumed" the new slot */
 684    if (!size) {
 685        return;
 686    }
 687    if (!add) {
 688        return;
 689    }
 690    mem = kvm_alloc_slot(s);
 691    mem->memory_size = size;
 692    mem->start_addr = start_addr;
 693    mem->ram = ram;
 694    mem->flags = kvm_mem_flags(s, log_dirty);
 695
 696    err = kvm_set_user_memory_region(s, mem);
 697    if (err) {
 698        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
 699                strerror(-err));
 700        abort();
 701    }
 702}
 703
 704static void kvm_region_add(MemoryListener *listener,
 705                           MemoryRegionSection *section)
 706{
 707    kvm_set_phys_mem(section, true);
 708}
 709
 710static void kvm_region_del(MemoryListener *listener,
 711                           MemoryRegionSection *section)
 712{
 713    kvm_set_phys_mem(section, false);
 714}
 715
 716static void kvm_log_sync(MemoryListener *listener,
 717                         MemoryRegionSection *section)
 718{
 719    int r;
 720
 721    r = kvm_physical_sync_dirty_bitmap(section);
 722    if (r < 0) {
 723        abort();
 724    }
 725}
 726
 727static void kvm_log_global_start(struct MemoryListener *listener)
 728{
 729    int r;
 730
 731    r = kvm_set_migration_log(1);
 732    assert(r >= 0);
 733}
 734
 735static void kvm_log_global_stop(struct MemoryListener *listener)
 736{
 737    int r;
 738
 739    r = kvm_set_migration_log(0);
 740    assert(r >= 0);
 741}
 742
 743static void kvm_mem_ioeventfd_add(MemoryListener *listener,
 744                                  MemoryRegionSection *section,
 745                                  bool match_data, uint64_t data,
 746                                  EventNotifier *e)
 747{
 748    int fd = event_notifier_get_fd(e);
 749    int r;
 750
 751    assert(match_data && section->size <= 8);
 752
 753    r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
 754                               data, true, section->size);
 755    if (r < 0) {
 756        abort();
 757    }
 758}
 759
 760static void kvm_mem_ioeventfd_del(MemoryListener *listener,
 761                                  MemoryRegionSection *section,
 762                                  bool match_data, uint64_t data,
 763                                  EventNotifier *e)
 764{
 765    int fd = event_notifier_get_fd(e);
 766    int r;
 767
 768    r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
 769                               data, false, section->size);
 770    if (r < 0) {
 771        abort();
 772    }
 773}
 774
 775static void kvm_io_ioeventfd_add(MemoryListener *listener,
 776                                 MemoryRegionSection *section,
 777                                 bool match_data, uint64_t data,
 778                                 EventNotifier *e)
 779{
 780    int fd = event_notifier_get_fd(e);
 781    int r;
 782
 783    assert(match_data && section->size == 2);
 784
 785    r = kvm_set_ioeventfd_pio_word(fd, section->offset_within_address_space,
 786                                   data, true);
 787    if (r < 0) {
 788        abort();
 789    }
 790}
 791
 792static void kvm_io_ioeventfd_del(MemoryListener *listener,
 793                                 MemoryRegionSection *section,
 794                                 bool match_data, uint64_t data,
 795                                 EventNotifier *e)
 796
 797{
 798    int fd = event_notifier_get_fd(e);
 799    int r;
 800
 801    r = kvm_set_ioeventfd_pio_word(fd, section->offset_within_address_space,
 802                                   data, false);
 803    if (r < 0) {
 804        abort();
 805    }
 806}
 807
 808static MemoryListener kvm_memory_listener = {
 809    .region_add = kvm_region_add,
 810    .region_del = kvm_region_del,
 811    .log_start = kvm_log_start,
 812    .log_stop = kvm_log_stop,
 813    .log_sync = kvm_log_sync,
 814    .log_global_start = kvm_log_global_start,
 815    .log_global_stop = kvm_log_global_stop,
 816    .eventfd_add = kvm_mem_ioeventfd_add,
 817    .eventfd_del = kvm_mem_ioeventfd_del,
 818    .coalesced_mmio_add = kvm_coalesce_mmio_region,
 819    .coalesced_mmio_del = kvm_uncoalesce_mmio_region,
 820    .priority = 10,
 821};
 822
 823static MemoryListener kvm_io_listener = {
 824    .eventfd_add = kvm_io_ioeventfd_add,
 825    .eventfd_del = kvm_io_ioeventfd_del,
 826    .priority = 10,
 827};
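/*
 * Both listeners are registered in kvm_init(): kvm_memory_listener against
 * address_space_memory and kvm_io_listener against address_space_io, so
 * that RAM mapping changes, dirty-log state and ioeventfd registrations are
 * mirrored into the kernel.
 */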
 828
 829static void kvm_handle_interrupt(CPUArchState *env, int mask)
 830{
 831    CPUState *cpu = ENV_GET_CPU(env);
 832
 833    env->interrupt_request |= mask;
 834
 835    if (!qemu_cpu_is_self(cpu)) {
 836        qemu_cpu_kick(cpu);
 837    }
 838}
 839
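/*
 * Assert or deassert an in-kernel interrupt line.  Uses KVM_IRQ_LINE_STATUS
 * when available so the return value reflects whether the interrupt was
 * actually delivered; with plain KVM_IRQ_LINE the result is always 1.
 */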
 840int kvm_set_irq(KVMState *s, int irq, int level)
 841{
 842    struct kvm_irq_level event;
 843    int ret;
 844
 845    assert(kvm_async_interrupts_enabled());
 846
 847    event.level = level;
 848    event.irq = irq;
 849    ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
 850    if (ret < 0) {
 851        perror("kvm_set_irq");
 852        abort();
 853    }
 854
 855    return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
 856}
 857
 858#ifdef KVM_CAP_IRQ_ROUTING
 859typedef struct KVMMSIRoute {
 860    struct kvm_irq_routing_entry kroute;
 861    QTAILQ_ENTRY(KVMMSIRoute) entry;
 862} KVMMSIRoute;
 863
 864static void set_gsi(KVMState *s, unsigned int gsi)
 865{
 866    s->used_gsi_bitmap[gsi / 32] |= 1U << (gsi % 32);
 867}
 868
 869static void clear_gsi(KVMState *s, unsigned int gsi)
 870{
 871    s->used_gsi_bitmap[gsi / 32] &= ~(1U << (gsi % 32));
 872}
 873
 874static void kvm_init_irq_routing(KVMState *s)
 875{
 876    int gsi_count, i;
 877
 878    gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING);
 879    if (gsi_count > 0) {
 880        unsigned int gsi_bits, i;
 881
 882        /* Round up so we can search ints using ffs */
 883        gsi_bits = ALIGN(gsi_count, 32);
 884        s->used_gsi_bitmap = g_malloc0(gsi_bits / 8);
 885        s->gsi_count = gsi_count;
 886
 887        /* Mark any over-allocated bits as already in use */
 888        for (i = gsi_count; i < gsi_bits; i++) {
 889            set_gsi(s, i);
 890        }
 891    }
 892
 893    s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
 894    s->nr_allocated_irq_routes = 0;
 895
 896    if (!s->direct_msi) {
 897        for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
 898            QTAILQ_INIT(&s->msi_hashtab[i]);
 899        }
 900    }
 901
 902    kvm_arch_init_irq_routing(s);
 903}
 904
 905static void kvm_irqchip_commit_routes(KVMState *s)
 906{
 907    int ret;
 908
 909    s->irq_routes->flags = 0;
 910    ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
 911    assert(ret == 0);
 912}
 913
 914static void kvm_add_routing_entry(KVMState *s,
 915                                  struct kvm_irq_routing_entry *entry)
 916{
 917    struct kvm_irq_routing_entry *new;
 918    int n, size;
 919
 920    if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
 921        n = s->nr_allocated_irq_routes * 2;
 922        if (n < 64) {
 923            n = 64;
 924        }
 925        size = sizeof(struct kvm_irq_routing);
 926        size += n * sizeof(*new);
 927        s->irq_routes = g_realloc(s->irq_routes, size);
 928        s->nr_allocated_irq_routes = n;
 929    }
 930    n = s->irq_routes->nr++;
 931    new = &s->irq_routes->entries[n];
 932    memset(new, 0, sizeof(*new));
 933    new->gsi = entry->gsi;
 934    new->type = entry->type;
 935    new->flags = entry->flags;
 936    new->u = entry->u;
 937
 938    set_gsi(s, entry->gsi);
 939
 940    kvm_irqchip_commit_routes(s);
 941}
 942
 943static int kvm_update_routing_entry(KVMState *s,
 944                                    struct kvm_irq_routing_entry *new_entry)
 945{
 946    struct kvm_irq_routing_entry *entry;
 947    int n;
 948
 949    for (n = 0; n < s->irq_routes->nr; n++) {
 950        entry = &s->irq_routes->entries[n];
 951        if (entry->gsi != new_entry->gsi) {
 952            continue;
 953        }
 954
 955        entry->type = new_entry->type;
 956        entry->flags = new_entry->flags;
 957        entry->u = new_entry->u;
 958
 959        kvm_irqchip_commit_routes(s);
 960
 961        return 0;
 962    }
 963
 964    return -ESRCH;
 965}
 966
 967void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
 968{
 969    struct kvm_irq_routing_entry e;
 970
 971    assert(pin < s->gsi_count);
 972
 973    e.gsi = irq;
 974    e.type = KVM_IRQ_ROUTING_IRQCHIP;
 975    e.flags = 0;
 976    e.u.irqchip.irqchip = irqchip;
 977    e.u.irqchip.pin = pin;
 978    kvm_add_routing_entry(s, &e);
 979}
 980
 981void kvm_irqchip_release_virq(KVMState *s, int virq)
 982{
 983    struct kvm_irq_routing_entry *e;
 984    int i;
 985
 986    for (i = 0; i < s->irq_routes->nr; i++) {
 987        e = &s->irq_routes->entries[i];
 988        if (e->gsi == virq) {
 989            s->irq_routes->nr--;
 990            *e = s->irq_routes->entries[s->irq_routes->nr];
 991        }
 992    }
 993    clear_gsi(s, virq);
 994
 995    kvm_irqchip_commit_routes(s);
 996}
 997
 998static unsigned int kvm_hash_msi(uint32_t data)
 999{
1000    /* This is optimized for IA32 MSI layout. However, no other arch shall
1001     * repeat the mistake of not providing a direct MSI injection API. */
1002    return data & 0xff;
1003}
1004
1005static void kvm_flush_dynamic_msi_routes(KVMState *s)
1006{
1007    KVMMSIRoute *route, *next;
1008    unsigned int hash;
1009
1010    for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
1011        QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
1012            kvm_irqchip_release_virq(s, route->kroute.gsi);
1013            QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
1014            g_free(route);
1015        }
1016    }
1017}
1018
1019static int kvm_irqchip_get_virq(KVMState *s)
1020{
1021    uint32_t *word = s->used_gsi_bitmap;
1022    int max_words = ALIGN(s->gsi_count, 32) / 32;
1023    int i, bit;
1024    bool retry = true;
1025
1026again:
1027    /* Return the lowest unused GSI in the bitmap */
1028    for (i = 0; i < max_words; i++) {
1029        bit = ffs(~word[i]);
1030        if (!bit) {
1031            continue;
1032        }
1033
1034        return bit - 1 + i * 32;
1035    }
1036    if (!s->direct_msi && retry) {
1037        retry = false;
1038        kvm_flush_dynamic_msi_routes(s);
1039        goto again;
1040    }
1041    return -ENOSPC;
1042
1043}
1044
1045static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg)
1046{
1047    unsigned int hash = kvm_hash_msi(msg.data);
1048    KVMMSIRoute *route;
1049
1050    QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) {
1051        if (route->kroute.u.msi.address_lo == (uint32_t)msg.address &&
1052            route->kroute.u.msi.address_hi == (msg.address >> 32) &&
1053            route->kroute.u.msi.data == msg.data) {
1054            return route;
1055        }
1056    }
1057    return NULL;
1058}
1059
1060int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
1061{
1062    struct kvm_msi msi;
1063    KVMMSIRoute *route;
1064
1065    if (s->direct_msi) {
1066        msi.address_lo = (uint32_t)msg.address;
1067        msi.address_hi = msg.address >> 32;
1068        msi.data = msg.data;
1069        msi.flags = 0;
1070        memset(msi.pad, 0, sizeof(msi.pad));
1071
1072        return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
1073    }
1074
1075    route = kvm_lookup_msi_route(s, msg);
1076    if (!route) {
1077        int virq;
1078
1079        virq = kvm_irqchip_get_virq(s);
1080        if (virq < 0) {
1081            return virq;
1082        }
1083
1084        route = g_malloc(sizeof(KVMMSIRoute));
1085        route->kroute.gsi = virq;
1086        route->kroute.type = KVM_IRQ_ROUTING_MSI;
1087        route->kroute.flags = 0;
1088        route->kroute.u.msi.address_lo = (uint32_t)msg.address;
1089        route->kroute.u.msi.address_hi = msg.address >> 32;
1090        route->kroute.u.msi.data = msg.data;
1091
1092        kvm_add_routing_entry(s, &route->kroute);
1093
1094        QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
1095                           entry);
1096    }
1097
1098    assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);
1099
1100    return kvm_set_irq(s, route->kroute.gsi, 1);
1101}
1102
1103int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg)
1104{
1105    struct kvm_irq_routing_entry kroute;
1106    int virq;
1107
1108    if (!kvm_gsi_routing_enabled()) {
1109        return -ENOSYS;
1110    }
1111
1112    virq = kvm_irqchip_get_virq(s);
1113    if (virq < 0) {
1114        return virq;
1115    }
1116
1117    kroute.gsi = virq;
1118    kroute.type = KVM_IRQ_ROUTING_MSI;
1119    kroute.flags = 0;
1120    kroute.u.msi.address_lo = (uint32_t)msg.address;
1121    kroute.u.msi.address_hi = msg.address >> 32;
1122    kroute.u.msi.data = msg.data;
1123
1124    kvm_add_routing_entry(s, &kroute);
1125
1126    return virq;
1127}
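/*
 * Sketch of how callers are expected to combine the routing and irqfd
 * helpers (illustrative only; "msg" and "notifier" are hypothetical
 * caller-side variables):
 *
 *   int virq = kvm_irqchip_add_msi_route(kvm_state, msg);
 *   if (virq >= 0) {
 *       kvm_irqchip_add_irqfd_notifier(kvm_state, &notifier, virq);
 *   }
 *   ...
 *   kvm_irqchip_remove_irqfd_notifier(kvm_state, &notifier, virq);
 *   kvm_irqchip_release_virq(kvm_state, virq);
 */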
1128
1129int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
1130{
1131    struct kvm_irq_routing_entry kroute;
1132
1133    if (!kvm_irqchip_in_kernel()) {
1134        return -ENOSYS;
1135    }
1136
1137    kroute.gsi = virq;
1138    kroute.type = KVM_IRQ_ROUTING_MSI;
1139    kroute.flags = 0;
1140    kroute.u.msi.address_lo = (uint32_t)msg.address;
1141    kroute.u.msi.address_hi = msg.address >> 32;
1142    kroute.u.msi.data = msg.data;
1143
1144    return kvm_update_routing_entry(s, &kroute);
1145}
1146
1147static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign)
1148{
1149    struct kvm_irqfd irqfd = {
1150        .fd = fd,
1151        .gsi = virq,
1152        .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
1153    };
1154
1155    if (!kvm_irqfds_enabled()) {
1156        return -ENOSYS;
1157    }
1158
1159    return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
1160}
1161
1162#else /* !KVM_CAP_IRQ_ROUTING */
1163
1164static void kvm_init_irq_routing(KVMState *s)
1165{
1166}
1167
1168void kvm_irqchip_release_virq(KVMState *s, int virq)
1169{
1170}
1171
1172int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
1173{
1174    abort();
1175}
1176
1177int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg)
1178{
1179    return -ENOSYS;
1180}
1181
1182static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign)
1183{
1184    abort();
1185}
1186#endif /* !KVM_CAP_IRQ_ROUTING */
1187
1188int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, int virq)
1189{
1190    return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), virq, true);
1191}
1192
1193int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, int virq)
1194{
1195    return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), virq, false);
1196}
1197
1198static int kvm_irqchip_create(KVMState *s)
1199{
1200    QemuOptsList *list = qemu_find_opts("machine");
1201    int ret;
1202
1203    if (QTAILQ_EMPTY(&list->head) ||
1204        !qemu_opt_get_bool(QTAILQ_FIRST(&list->head),
1205                           "kernel_irqchip", true) ||
1206        !kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
1207        return 0;
1208    }
1209
1210    ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
1211    if (ret < 0) {
1212        fprintf(stderr, "Create kernel irqchip failed\n");
1213        return ret;
1214    }
1215
1216    kvm_kernel_irqchip = true;
1217    /* If we have an in-kernel IRQ chip then we must have asynchronous
1218     * interrupt delivery (though the reverse is not necessarily true)
1219     */
1220    kvm_async_interrupts_allowed = true;
1221
1222    kvm_init_irq_routing(s);
1223
1224    return 0;
1225}
1226
1227static int kvm_max_vcpus(KVMState *s)
1228{
1229    int ret;
1230
1231    /* Find number of supported CPUs using the recommended
1232     * procedure from the kernel API documentation to cope with
1233     * older kernels that may be missing capabilities.
1234     */
1235    ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
1236    if (ret) {
1237        return ret;
1238    }
1239    ret = kvm_check_extension(s, KVM_CAP_NR_VCPUS);
1240    if (ret) {
1241        return ret;
1242    }
1243
1244    return 4;
1245}
1246
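/*
 * Global KVM initialisation: open /dev/kvm, verify KVM_API_VERSION, create
 * the VM file descriptor, check required and arch-specific capabilities,
 * probe optional extensions, run kvm_arch_init(), optionally create the
 * in-kernel irqchip, and finally hook the memory listeners and the
 * interrupt handler into the rest of QEMU.
 */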
1247int kvm_init(void)
1248{
1249    static const char upgrade_note[] =
1250        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
1251        "(see http://sourceforge.net/projects/kvm).\n";
1252    KVMState *s;
1253    const KVMCapabilityInfo *missing_cap;
1254    int ret;
1255    int i;
1256    int max_vcpus;
1257
1258    s = g_malloc0(sizeof(KVMState));
1259
1260    /*
1261     * On systems where the kernel can support different base page
1262     * sizes, host page size may be different from TARGET_PAGE_SIZE,
1263     * even with KVM.  TARGET_PAGE_SIZE is assumed to be the minimum
1264     * page size for the system though.
1265     */
1266    assert(TARGET_PAGE_SIZE <= getpagesize());
1267
1268#ifdef KVM_CAP_SET_GUEST_DEBUG
1269    QTAILQ_INIT(&s->kvm_sw_breakpoints);
1270#endif
1271    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
1272        s->slots[i].slot = i;
1273    }
1274    s->vmfd = -1;
1275    s->fd = qemu_open("/dev/kvm", O_RDWR);
1276    if (s->fd == -1) {
1277        fprintf(stderr, "Could not access KVM kernel module: %m\n");
1278        ret = -errno;
1279        goto err;
1280    }
1281
1282    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
1283    if (ret < KVM_API_VERSION) {
1284        if (ret > 0) {
1285            ret = -EINVAL;
1286        }
1287        fprintf(stderr, "kvm version too old\n");
1288        goto err;
1289    }
1290
1291    if (ret > KVM_API_VERSION) {
1292        ret = -EINVAL;
1293        fprintf(stderr, "kvm version not supported\n");
1294        goto err;
1295    }
1296
1297    max_vcpus = kvm_max_vcpus(s);
1298    if (smp_cpus > max_vcpus) {
1299        ret = -EINVAL;
1300        fprintf(stderr, "Number of SMP cpus requested (%d) exceeds max cpus "
1301                "supported by KVM (%d)\n", smp_cpus, max_vcpus);
1302        goto err;
1303    }
1304
1305    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
1306    if (s->vmfd < 0) {
1307#ifdef TARGET_S390X
1308        fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
1309                        "your host kernel command line\n");
1310#endif
1311        ret = s->vmfd;
1312        goto err;
1313    }
1314
 1315    missing_cap = kvm_check_extension_list(s, kvm_required_capabilities);
1316    if (!missing_cap) {
1317        missing_cap =
1318            kvm_check_extension_list(s, kvm_arch_required_capabilities);
1319    }
1320    if (missing_cap) {
1321        ret = -EINVAL;
1322        fprintf(stderr, "kvm does not support %s\n%s",
1323                missing_cap->name, upgrade_note);
1324        goto err;
1325    }
1326
1327    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
1328
1329    s->broken_set_mem_region = 1;
1330    ret = kvm_check_extension(s, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
1331    if (ret > 0) {
1332        s->broken_set_mem_region = 0;
1333    }
1334
1335#ifdef KVM_CAP_VCPU_EVENTS
1336    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
1337#endif
1338
1339    s->robust_singlestep =
1340        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
1341
1342#ifdef KVM_CAP_DEBUGREGS
1343    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
1344#endif
1345
1346#ifdef KVM_CAP_XSAVE
1347    s->xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
1348#endif
1349
1350#ifdef KVM_CAP_XCRS
1351    s->xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
1352#endif
1353
1354#ifdef KVM_CAP_PIT_STATE2
1355    s->pit_state2 = kvm_check_extension(s, KVM_CAP_PIT_STATE2);
1356#endif
1357
1358#ifdef KVM_CAP_IRQ_ROUTING
1359    s->direct_msi = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
1360#endif
1361
1362    s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3);
1363
1364    s->irq_set_ioctl = KVM_IRQ_LINE;
1365    if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
1366        s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
1367    }
1368
1369    ret = kvm_arch_init(s);
1370    if (ret < 0) {
1371        goto err;
1372    }
1373
1374    ret = kvm_irqchip_create(s);
1375    if (ret < 0) {
1376        goto err;
1377    }
1378
1379    kvm_state = s;
1380    memory_listener_register(&kvm_memory_listener, &address_space_memory);
1381    memory_listener_register(&kvm_io_listener, &address_space_io);
1382
1383    s->many_ioeventfds = kvm_check_many_ioeventfds();
1384
1385    cpu_interrupt_handler = kvm_handle_interrupt;
1386
1387    return 0;
1388
1389err:
1390    if (s->vmfd >= 0) {
1391        close(s->vmfd);
1392    }
1393    if (s->fd != -1) {
1394        close(s->fd);
1395    }
1396    g_free(s);
1397
1398    return ret;
1399}
1400
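/*
 * Replay a batch of programmed I/O accesses recorded by the kernel in the
 * kvm_run I/O buffer, performing each 1-, 2- or 4-byte access against the
 * emulated I/O ports and copying the data to or from the shared buffer.
 */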
1401static void kvm_handle_io(uint16_t port, void *data, int direction, int size,
1402                          uint32_t count)
1403{
1404    int i;
1405    uint8_t *ptr = data;
1406
1407    for (i = 0; i < count; i++) {
1408        if (direction == KVM_EXIT_IO_IN) {
1409            switch (size) {
1410            case 1:
1411                stb_p(ptr, cpu_inb(port));
1412                break;
1413            case 2:
1414                stw_p(ptr, cpu_inw(port));
1415                break;
1416            case 4:
1417                stl_p(ptr, cpu_inl(port));
1418                break;
1419            }
1420        } else {
1421            switch (size) {
1422            case 1:
1423                cpu_outb(port, ldub_p(ptr));
1424                break;
1425            case 2:
1426                cpu_outw(port, lduw_p(ptr));
1427                break;
1428            case 4:
1429                cpu_outl(port, ldl_p(ptr));
1430                break;
1431            }
1432        }
1433
1434        ptr += size;
1435    }
1436}
1437
1438static int kvm_handle_internal_error(CPUArchState *env, struct kvm_run *run)
1439{
1440    fprintf(stderr, "KVM internal error.");
1441    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
1442        int i;
1443
1444        fprintf(stderr, " Suberror: %d\n", run->internal.suberror);
1445        for (i = 0; i < run->internal.ndata; ++i) {
1446            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
1447                    i, (uint64_t)run->internal.data[i]);
1448        }
1449    } else {
1450        fprintf(stderr, "\n");
1451    }
1452    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
1453        fprintf(stderr, "emulation failure\n");
1454        if (!kvm_arch_stop_on_emulation_error(env)) {
1455            cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
1456            return EXCP_INTERRUPT;
1457        }
1458    }
1459    /* FIXME: Should trigger a qmp message to let management know
1460     * something went wrong.
1461     */
1462    return -1;
1463}
1464
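/*
 * Drain the coalesced MMIO ring shared with the kernel, replaying each
 * buffered write through cpu_physical_memory_write().  The in-progress
 * flag guards against re-entrant flushes triggered while the buffered
 * writes are replayed.
 */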
1465void kvm_flush_coalesced_mmio_buffer(void)
1466{
1467    KVMState *s = kvm_state;
1468
1469    if (s->coalesced_flush_in_progress) {
1470        return;
1471    }
1472
1473    s->coalesced_flush_in_progress = true;
1474
1475    if (s->coalesced_mmio_ring) {
1476        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
1477        while (ring->first != ring->last) {
1478            struct kvm_coalesced_mmio *ent;
1479
1480            ent = &ring->coalesced_mmio[ring->first];
1481
1482            cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
1483            smp_wmb();
1484            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
1485        }
1486    }
1487
1488    s->coalesced_flush_in_progress = false;
1489}
1490
1491static void do_kvm_cpu_synchronize_state(void *_env)
1492{
1493    CPUArchState *env = _env;
1494
1495    if (!env->kvm_vcpu_dirty) {
1496        kvm_arch_get_registers(env);
1497        env->kvm_vcpu_dirty = 1;
1498    }
1499}
1500
1501void kvm_cpu_synchronize_state(CPUArchState *env)
1502{
1503    CPUState *cpu = ENV_GET_CPU(env);
1504
1505    if (!env->kvm_vcpu_dirty) {
1506        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, env);
1507    }
1508}
1509
1510void kvm_cpu_synchronize_post_reset(CPUArchState *env)
1511{
1512    kvm_arch_put_registers(env, KVM_PUT_RESET_STATE);
1513    env->kvm_vcpu_dirty = 0;
1514}
1515
1516void kvm_cpu_synchronize_post_init(CPUArchState *env)
1517{
1518    kvm_arch_put_registers(env, KVM_PUT_FULL_STATE);
1519    env->kvm_vcpu_dirty = 0;
1520}
1521
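/*
 * Run one vcpu until it exits to userspace for a reason QEMU must handle
 * itself.  Dirty register state is pushed to the kernel first, KVM_RUN is
 * issued with the iothread lock dropped, and the exit reason is then
 * dispatched (PIO, MMIO, shutdown, internal error, or an arch-specific
 * handler).  The loop re-enters the kernel while the handlers return 0 and
 * otherwise hands an EXCP_* code back to the caller; on fatal errors the
 * CPU state is dumped and the VM is stopped.
 */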
1522int kvm_cpu_exec(CPUArchState *env)
1523{
1524    struct kvm_run *run = env->kvm_run;
1525    int ret, run_ret;
1526
1527    DPRINTF("kvm_cpu_exec()\n");
1528
1529    if (kvm_arch_process_async_events(env)) {
1530        env->exit_request = 0;
1531        return EXCP_HLT;
1532    }
1533
1534    do {
1535        if (env->kvm_vcpu_dirty) {
1536            kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
1537            env->kvm_vcpu_dirty = 0;
1538        }
1539
1540        kvm_arch_pre_run(env, run);
1541        if (env->exit_request) {
1542            DPRINTF("interrupt exit requested\n");
1543            /*
1544             * KVM requires us to reenter the kernel after IO exits to complete
1545             * instruction emulation. This self-signal will ensure that we
1546             * leave ASAP again.
1547             */
1548            qemu_cpu_kick_self();
1549        }
1550        qemu_mutex_unlock_iothread();
1551
1552        run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
1553
1554        qemu_mutex_lock_iothread();
1555        kvm_arch_post_run(env, run);
1556
1557        if (run_ret < 0) {
1558            if (run_ret == -EINTR || run_ret == -EAGAIN) {
1559                DPRINTF("io window exit\n");
1560                ret = EXCP_INTERRUPT;
1561                break;
1562            }
1563            fprintf(stderr, "error: kvm run failed %s\n",
1564                    strerror(-run_ret));
1565            abort();
1566        }
1567
1568        switch (run->exit_reason) {
1569        case KVM_EXIT_IO:
1570            DPRINTF("handle_io\n");
1571            kvm_handle_io(run->io.port,
1572                          (uint8_t *)run + run->io.data_offset,
1573                          run->io.direction,
1574                          run->io.size,
1575                          run->io.count);
1576            ret = 0;
1577            break;
1578        case KVM_EXIT_MMIO:
1579            DPRINTF("handle_mmio\n");
1580            cpu_physical_memory_rw(run->mmio.phys_addr,
1581                                   run->mmio.data,
1582                                   run->mmio.len,
1583                                   run->mmio.is_write);
1584            ret = 0;
1585            break;
1586        case KVM_EXIT_IRQ_WINDOW_OPEN:
1587            DPRINTF("irq_window_open\n");
1588            ret = EXCP_INTERRUPT;
1589            break;
1590        case KVM_EXIT_SHUTDOWN:
1591            DPRINTF("shutdown\n");
1592            qemu_system_reset_request();
1593            ret = EXCP_INTERRUPT;
1594            break;
1595        case KVM_EXIT_UNKNOWN:
1596            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
1597                    (uint64_t)run->hw.hardware_exit_reason);
1598            ret = -1;
1599            break;
1600        case KVM_EXIT_INTERNAL_ERROR:
1601            ret = kvm_handle_internal_error(env, run);
1602            break;
1603        default:
1604            DPRINTF("kvm_arch_handle_exit\n");
1605            ret = kvm_arch_handle_exit(env, run);
1606            break;
1607        }
1608    } while (ret == 0);
1609
1610    if (ret < 0) {
1611        cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
1612        vm_stop(RUN_STATE_INTERNAL_ERROR);
1613    }
1614
1615    env->exit_request = 0;
1616    return ret;
1617}
1618
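/*
 * Thin wrappers around ioctl(2) for the three KVM file descriptor levels:
 * the /dev/kvm system fd, the per-VM fd and the per-vcpu fd.  On failure
 * they return -errno so callers can propagate Linux error codes directly.
 */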
1619int kvm_ioctl(KVMState *s, int type, ...)
1620{
1621    int ret;
1622    void *arg;
1623    va_list ap;
1624
1625    va_start(ap, type);
1626    arg = va_arg(ap, void *);
1627    va_end(ap);
1628
1629    ret = ioctl(s->fd, type, arg);
1630    if (ret == -1) {
1631        ret = -errno;
1632    }
1633    return ret;
1634}
1635
1636int kvm_vm_ioctl(KVMState *s, int type, ...)
1637{
1638    int ret;
1639    void *arg;
1640    va_list ap;
1641
1642    va_start(ap, type);
1643    arg = va_arg(ap, void *);
1644    va_end(ap);
1645
1646    ret = ioctl(s->vmfd, type, arg);
1647    if (ret == -1) {
1648        ret = -errno;
1649    }
1650    return ret;
1651}
1652
1653int kvm_vcpu_ioctl(CPUArchState *env, int type, ...)
1654{
1655    int ret;
1656    void *arg;
1657    va_list ap;
1658
1659    va_start(ap, type);
1660    arg = va_arg(ap, void *);
1661    va_end(ap);
1662
1663    ret = ioctl(env->kvm_fd, type, arg);
1664    if (ret == -1) {
1665        ret = -errno;
1666    }
1667    return ret;
1668}
1669
1670int kvm_has_sync_mmu(void)
1671{
1672    return kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
1673}
1674
1675int kvm_has_vcpu_events(void)
1676{
1677    return kvm_state->vcpu_events;
1678}
1679
1680int kvm_has_robust_singlestep(void)
1681{
1682    return kvm_state->robust_singlestep;
1683}
1684
1685int kvm_has_debugregs(void)
1686{
1687    return kvm_state->debugregs;
1688}
1689
1690int kvm_has_xsave(void)
1691{
1692    return kvm_state->xsave;
1693}
1694
1695int kvm_has_xcrs(void)
1696{
1697    return kvm_state->xcrs;
1698}
1699
1700int kvm_has_pit_state2(void)
1701{
1702    return kvm_state->pit_state2;
1703}
1704
1705int kvm_has_many_ioeventfds(void)
1706{
1707    if (!kvm_enabled()) {
1708        return 0;
1709    }
1710    return kvm_state->many_ioeventfds;
1711}
1712
1713int kvm_has_gsi_routing(void)
1714{
1715#ifdef KVM_CAP_IRQ_ROUTING
1716    return kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
1717#else
1718    return false;
1719#endif
1720}
1721
1722int kvm_has_intx_set_mask(void)
1723{
1724    return kvm_state->intx_set_mask;
1725}
1726
1727void *kvm_vmalloc(ram_addr_t size)
1728{
1729#ifdef TARGET_S390X
1730    void *mem;
1731
1732    mem = kvm_arch_vmalloc(size);
1733    if (mem) {
1734        return mem;
1735    }
1736#endif
1737    return qemu_vmalloc(size);
1738}
1739
1740void kvm_setup_guest_memory(void *start, size_t size)
1741{
1742#ifdef CONFIG_VALGRIND_H
1743    VALGRIND_MAKE_MEM_DEFINED(start, size);
1744#endif
1745    if (!kvm_has_sync_mmu()) {
1746        int ret = qemu_madvise(start, size, QEMU_MADV_DONTFORK);
1747
1748        if (ret) {
1749            perror("qemu_madvise");
1750            fprintf(stderr,
1751                    "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
1752            exit(1);
1753        }
1754    }
1755}
1756
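/*
 * Guest debugging support: software breakpoints are tracked in the per-VM
 * kvm_sw_breakpoints list and, together with hardware breakpoints and
 * single-stepping, are pushed to each vcpu through the KVM_SET_GUEST_DEBUG
 * ioctl by kvm_update_guest_debug().
 */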
1757#ifdef KVM_CAP_SET_GUEST_DEBUG
1758struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUArchState *env,
1759                                                 target_ulong pc)
1760{
1761    struct kvm_sw_breakpoint *bp;
1762
1763    QTAILQ_FOREACH(bp, &env->kvm_state->kvm_sw_breakpoints, entry) {
1764        if (bp->pc == pc) {
1765            return bp;
1766        }
1767    }
1768    return NULL;
1769}
1770
1771int kvm_sw_breakpoints_active(CPUArchState *env)
1772{
1773    return !QTAILQ_EMPTY(&env->kvm_state->kvm_sw_breakpoints);
1774}
1775
1776struct kvm_set_guest_debug_data {
1777    struct kvm_guest_debug dbg;
1778    CPUArchState *env;
1779    int err;
1780};
1781
1782static void kvm_invoke_set_guest_debug(void *data)
1783{
1784    struct kvm_set_guest_debug_data *dbg_data = data;
1785    CPUArchState *env = dbg_data->env;
1786
1787    dbg_data->err = kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, &dbg_data->dbg);
1788}
1789
1790int kvm_update_guest_debug(CPUArchState *env, unsigned long reinject_trap)
1791{
1792    CPUState *cpu = ENV_GET_CPU(env);
1793    struct kvm_set_guest_debug_data data;
1794
1795    data.dbg.control = reinject_trap;
1796
1797    if (env->singlestep_enabled) {
1798        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
1799    }
1800    kvm_arch_update_guest_debug(env, &data.dbg);
1801    data.env = env;
1802
1803    run_on_cpu(cpu, kvm_invoke_set_guest_debug, &data);
1804    return data.err;
1805}
1806
1807int kvm_insert_breakpoint(CPUArchState *current_env, target_ulong addr,
1808                          target_ulong len, int type)
1809{
1810    struct kvm_sw_breakpoint *bp;
1811    CPUArchState *env;
1812    int err;
1813
1814    if (type == GDB_BREAKPOINT_SW) {
1815        bp = kvm_find_sw_breakpoint(current_env, addr);
1816        if (bp) {
1817            bp->use_count++;
1818            return 0;
1819        }
1820
1821        bp = g_malloc(sizeof(struct kvm_sw_breakpoint));
1822        if (!bp) {
1823            return -ENOMEM;
1824        }
1825
1826        bp->pc = addr;
1827        bp->use_count = 1;
1828        err = kvm_arch_insert_sw_breakpoint(current_env, bp);
1829        if (err) {
1830            g_free(bp);
1831            return err;
1832        }
1833
1834        QTAILQ_INSERT_HEAD(&current_env->kvm_state->kvm_sw_breakpoints,
1835                          bp, entry);
1836    } else {
1837        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
1838        if (err) {
1839            return err;
1840        }
1841    }
1842
1843    for (env = first_cpu; env != NULL; env = env->next_cpu) {
1844        err = kvm_update_guest_debug(env, 0);
1845        if (err) {
1846            return err;
1847        }
1848    }
1849    return 0;
1850}
1851
1852int kvm_remove_breakpoint(CPUArchState *current_env, target_ulong addr,
1853                          target_ulong len, int type)
1854{
1855    struct kvm_sw_breakpoint *bp;
1856    CPUArchState *env;
1857    int err;
1858
1859    if (type == GDB_BREAKPOINT_SW) {
1860        bp = kvm_find_sw_breakpoint(current_env, addr);
1861        if (!bp) {
1862            return -ENOENT;
1863        }
1864
1865        if (bp->use_count > 1) {
1866            bp->use_count--;
1867            return 0;
1868        }
1869
1870        err = kvm_arch_remove_sw_breakpoint(current_env, bp);
1871        if (err) {
1872            return err;
1873        }
1874
1875        QTAILQ_REMOVE(&current_env->kvm_state->kvm_sw_breakpoints, bp, entry);
1876        g_free(bp);
1877    } else {
1878        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
1879        if (err) {
1880            return err;
1881        }
1882    }
1883
1884    for (env = first_cpu; env != NULL; env = env->next_cpu) {
1885        err = kvm_update_guest_debug(env, 0);
1886        if (err) {
1887            return err;
1888        }
1889    }
1890    return 0;
1891}
1892
1893void kvm_remove_all_breakpoints(CPUArchState *current_env)
1894{
1895    struct kvm_sw_breakpoint *bp, *next;
1896    KVMState *s = current_env->kvm_state;
1897    CPUArchState *env;
1898
1899    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
1900        if (kvm_arch_remove_sw_breakpoint(current_env, bp) != 0) {
1901            /* Try harder to find a CPU that currently sees the breakpoint. */
1902            for (env = first_cpu; env != NULL; env = env->next_cpu) {
1903                if (kvm_arch_remove_sw_breakpoint(env, bp) == 0) {
1904                    break;
1905                }
1906            }
1907        }
1908        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
1909        g_free(bp);
1910    }
1911    kvm_arch_remove_all_hw_breakpoints();
1912
1913    for (env = first_cpu; env != NULL; env = env->next_cpu) {
1914        kvm_update_guest_debug(env, 0);
1915    }
1916}
1917
1918#else /* !KVM_CAP_SET_GUEST_DEBUG */
1919
1920int kvm_update_guest_debug(CPUArchState *env, unsigned long reinject_trap)
1921{
1922    return -EINVAL;
1923}
1924
1925int kvm_insert_breakpoint(CPUArchState *current_env, target_ulong addr,
1926                          target_ulong len, int type)
1927{
1928    return -EINVAL;
1929}
1930
1931int kvm_remove_breakpoint(CPUArchState *current_env, target_ulong addr,
1932                          target_ulong len, int type)
1933{
1934    return -EINVAL;
1935}
1936
1937void kvm_remove_all_breakpoints(CPUArchState *current_env)
1938{
1939}
1940#endif /* !KVM_CAP_SET_GUEST_DEBUG */
1941
1942int kvm_set_signal_mask(CPUArchState *env, const sigset_t *sigset)
1943{
1944    struct kvm_signal_mask *sigmask;
1945    int r;
1946
1947    if (!sigset) {
1948        return kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, NULL);
1949    }
1950
1951    sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));
1952
1953    sigmask->len = 8;
1954    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
1955    r = kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, sigmask);
1956    g_free(sigmask);
1957
1958    return r;
1959}
1960
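/*
 * Register (or deregister) an eventfd that the kernel signals whenever the
 * guest writes the given value to the given MMIO address, using the
 * KVM_IOEVENTFD ioctl with KVM_IOEVENTFD_FLAG_DATAMATCH.  The PIO variant
 * below does the same for a 2-byte port write.
 */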
1961int kvm_set_ioeventfd_mmio(int fd, uint32_t addr, uint32_t val, bool assign,
1962                           uint32_t size)
1963{
1964    int ret;
1965    struct kvm_ioeventfd iofd;
1966
1967    iofd.datamatch = val;
1968    iofd.addr = addr;
1969    iofd.len = size;
1970    iofd.flags = KVM_IOEVENTFD_FLAG_DATAMATCH;
1971    iofd.fd = fd;
1972
1973    if (!kvm_enabled()) {
1974        return -ENOSYS;
1975    }
1976
1977    if (!assign) {
1978        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
1979    }
1980
1981    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);
1982
1983    if (ret < 0) {
1984        return -errno;
1985    }
1986
1987    return 0;
1988}
1989
1990int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign)
1991{
1992    struct kvm_ioeventfd kick = {
1993        .datamatch = val,
1994        .addr = addr,
1995        .len = 2,
1996        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
1997        .fd = fd,
1998    };
1999    int r;
2000    if (!kvm_enabled()) {
2001        return -ENOSYS;
2002    }
2003    if (!assign) {
2004        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
2005    }
2006    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
2007    if (r < 0) {
2008        return r;
2009    }
2010    return 0;
2011}
2012
2013int kvm_on_sigbus_vcpu(CPUArchState *env, int code, void *addr)
2014{
2015    return kvm_arch_on_sigbus_vcpu(env, code, addr);
2016}
2017
2018int kvm_on_sigbus(int code, void *addr)
2019{
2020    return kvm_arch_on_sigbus(code, addr);
2021}
2022