qemu/kvm-all.c
<<
>>
Prefs
   1/*
   2 * QEMU KVM support
   3 *
   4 * Copyright IBM, Corp. 2008
   5 *           Red Hat, Inc. 2008
   6 *
   7 * Authors:
   8 *  Anthony Liguori   <aliguori@us.ibm.com>
   9 *  Glauber Costa     <gcosta@redhat.com>
  10 *
  11 * This work is licensed under the terms of the GNU GPL, version 2 or later.
  12 * See the COPYING file in the top-level directory.
  13 *
  14 */
  15
  16#include <sys/types.h>
  17#include <sys/ioctl.h>
  18#include <sys/mman.h>
  19#include <stdarg.h>
  20
  21#include <linux/kvm.h>
  22
  23#include "qemu-common.h"
  24#include "qemu/atomic.h"
  25#include "qemu/option.h"
  26#include "qemu/config-file.h"
  27#include "sysemu/sysemu.h"
  28#include "hw/hw.h"
  29#include "hw/pci/msi.h"
  30#include "exec/gdbstub.h"
  31#include "sysemu/kvm.h"
  32#include "qemu/bswap.h"
  33#include "exec/memory.h"
  34#include "exec/address-spaces.h"
  35#include "qemu/event_notifier.h"
  36#include "trace.h"
  37
  38/* This check must be after config-host.h is included */
  39#ifdef CONFIG_EVENTFD
  40#include <sys/eventfd.h>
  41#endif
  42
  43#ifdef CONFIG_VALGRIND_H
  44#include <valgrind/memcheck.h>
  45#endif
  46
  47/* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */
  48#define PAGE_SIZE TARGET_PAGE_SIZE
  49
  50//#define DEBUG_KVM
  51
  52#ifdef DEBUG_KVM
  53#define DPRINTF(fmt, ...) \
  54    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
  55#else
  56#define DPRINTF(fmt, ...) \
  57    do { } while (0)
  58#endif
  59
  60#define KVM_MSI_HASHTAB_SIZE    256
  61
  62typedef struct KVMSlot
  63{
  64    hwaddr start_addr;
  65    ram_addr_t memory_size;
  66    void *ram;
  67    int slot;
  68    int flags;
  69} KVMSlot;
  70
  71typedef struct kvm_dirty_log KVMDirtyLog;
  72
  73struct KVMState
  74{
  75    KVMSlot slots[32];
  76    int fd;
  77    int vmfd;
  78    int coalesced_mmio;
  79    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
  80    bool coalesced_flush_in_progress;
  81    int broken_set_mem_region;
  82    int migration_log;
  83    int vcpu_events;
  84    int robust_singlestep;
  85    int debugregs;
  86#ifdef KVM_CAP_SET_GUEST_DEBUG
  87    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
  88#endif
  89    int pit_state2;
  90    int xsave, xcrs;
  91    int many_ioeventfds;
  92    int intx_set_mask;
  93    /* The man page (and posix) say ioctl numbers are signed int, but
  94     * they're not.  Linux, glibc and *BSD all treat ioctl numbers as
  95     * unsigned, and treating them as signed here can break things */
  96    unsigned irq_set_ioctl;
  97#ifdef KVM_CAP_IRQ_ROUTING
  98    struct kvm_irq_routing *irq_routes;
  99    int nr_allocated_irq_routes;
 100    uint32_t *used_gsi_bitmap;
 101    unsigned int gsi_count;
 102    QTAILQ_HEAD(msi_hashtab, KVMMSIRoute) msi_hashtab[KVM_MSI_HASHTAB_SIZE];
 103    bool direct_msi;
 104#endif
 105};
 106
 107KVMState *kvm_state;
 108bool kvm_kernel_irqchip;
 109bool kvm_async_interrupts_allowed;
 110bool kvm_halt_in_kernel_allowed;
 111bool kvm_irqfds_allowed;
 112bool kvm_msi_via_irqfd_allowed;
 113bool kvm_gsi_routing_allowed;
 114bool kvm_allowed;
 115bool kvm_readonly_mem_allowed;
 116
 117static const KVMCapabilityInfo kvm_required_capabilites[] = {
 118    KVM_CAP_INFO(USER_MEMORY),
 119    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
 120    KVM_CAP_LAST_INFO
 121};
 122
 123static KVMSlot *kvm_alloc_slot(KVMState *s)
 124{
 125    int i;
 126
 127    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
 128        if (s->slots[i].memory_size == 0) {
 129            return &s->slots[i];
 130        }
 131    }
 132
 133    fprintf(stderr, "%s: no free slot available\n", __func__);
 134    abort();
 135}
 136
 137static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
 138                                         hwaddr start_addr,
 139                                         hwaddr end_addr)
 140{
 141    int i;
 142
 143    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
 144        KVMSlot *mem = &s->slots[i];
 145
 146        if (start_addr == mem->start_addr &&
 147            end_addr == mem->start_addr + mem->memory_size) {
 148            return mem;
 149        }
 150    }
 151
 152    return NULL;
 153}
 154
 155/*
 156 * Find overlapping slot with lowest start address
 157 */
 158static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
 159                                            hwaddr start_addr,
 160                                            hwaddr end_addr)
 161{
 162    KVMSlot *found = NULL;
 163    int i;
 164
 165    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
 166        KVMSlot *mem = &s->slots[i];
 167
 168        if (mem->memory_size == 0 ||
 169            (found && found->start_addr < mem->start_addr)) {
 170            continue;
 171        }
 172
 173        if (end_addr > mem->start_addr &&
 174            start_addr < mem->start_addr + mem->memory_size) {
 175            found = mem;
 176        }
 177    }
 178
 179    return found;
 180}
 181
 182int kvm_physical_memory_addr_from_host(KVMState *s, void *ram,
 183                                       hwaddr *phys_addr)
 184{
 185    int i;
 186
 187    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
 188        KVMSlot *mem = &s->slots[i];
 189
 190        if (ram >= mem->ram && ram < mem->ram + mem->memory_size) {
 191            *phys_addr = mem->start_addr + (ram - mem->ram);
 192            return 1;
 193        }
 194    }
 195
 196    return 0;
 197}
 198
 199static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
 200{
 201    struct kvm_userspace_memory_region mem;
 202
 203    mem.slot = slot->slot;
 204    mem.guest_phys_addr = slot->start_addr;
 205    mem.userspace_addr = (unsigned long)slot->ram;
 206    mem.flags = slot->flags;
 207    if (s->migration_log) {
 208        mem.flags |= KVM_MEM_LOG_DIRTY_PAGES;
 209    }
 210
 211    if (slot->memory_size && mem.flags & KVM_MEM_READONLY) {
 212        /* Set the slot size to 0 before setting the slot to the desired
 213         * value. This is needed based on KVM commit 75d61fbc. */
 214        mem.memory_size = 0;
 215        kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
 216    }
 217    mem.memory_size = slot->memory_size;
 218    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
 219}
 220
 221static void kvm_reset_vcpu(void *opaque)
 222{
 223    CPUState *cpu = opaque;
 224
 225    kvm_arch_reset_vcpu(cpu);
 226}
 227
 228int kvm_init_vcpu(CPUState *cpu)
 229{
 230    KVMState *s = kvm_state;
 231    long mmap_size;
 232    int ret;
 233
 234    DPRINTF("kvm_init_vcpu\n");
 235
 236    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, (void *)kvm_arch_vcpu_id(cpu));
 237    if (ret < 0) {
 238        DPRINTF("kvm_create_vcpu failed\n");
 239        goto err;
 240    }
 241
 242    cpu->kvm_fd = ret;
 243    cpu->kvm_state = s;
 244    cpu->kvm_vcpu_dirty = true;
 245
 246    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
 247    if (mmap_size < 0) {
 248        ret = mmap_size;
 249        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
 250        goto err;
 251    }
 252
 253    cpu->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
 254                        cpu->kvm_fd, 0);
 255    if (cpu->kvm_run == MAP_FAILED) {
 256        ret = -errno;
 257        DPRINTF("mmap'ing vcpu state failed\n");
 258        goto err;
 259    }
 260
 261    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
 262        s->coalesced_mmio_ring =
 263            (void *)cpu->kvm_run + s->coalesced_mmio * PAGE_SIZE;
 264    }
 265
 266    ret = kvm_arch_init_vcpu(cpu);
 267    if (ret == 0) {
 268        qemu_register_reset(kvm_reset_vcpu, cpu);
 269        kvm_arch_reset_vcpu(cpu);
 270    }
 271err:
 272    return ret;
 273}
 274
 275/*
 276 * dirty pages logging control
 277 */
 278
 279static int kvm_mem_flags(KVMState *s, bool log_dirty, bool readonly)
 280{
 281    int flags = 0;
 282    flags = log_dirty ? KVM_MEM_LOG_DIRTY_PAGES : 0;
 283    if (readonly && kvm_readonly_mem_allowed) {
 284        flags |= KVM_MEM_READONLY;
 285    }
 286    return flags;
 287}
 288
 289static int kvm_slot_dirty_pages_log_change(KVMSlot *mem, bool log_dirty)
 290{
 291    KVMState *s = kvm_state;
 292    int flags, mask = KVM_MEM_LOG_DIRTY_PAGES;
 293    int old_flags;
 294
 295    old_flags = mem->flags;
 296
 297    flags = (mem->flags & ~mask) | kvm_mem_flags(s, log_dirty, false);
 298    mem->flags = flags;
 299
 300    /* If nothing changed effectively, no need to issue ioctl */
 301    if (s->migration_log) {
 302        flags |= KVM_MEM_LOG_DIRTY_PAGES;
 303    }
 304
 305    if (flags == old_flags) {
 306        return 0;
 307    }
 308
 309    return kvm_set_user_memory_region(s, mem);
 310}
 311
 312static int kvm_dirty_pages_log_change(hwaddr phys_addr,
 313                                      ram_addr_t size, bool log_dirty)
 314{
 315    KVMState *s = kvm_state;
 316    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);
 317
 318    if (mem == NULL)  {
 319        fprintf(stderr, "BUG: %s: invalid parameters " TARGET_FMT_plx "-"
 320                TARGET_FMT_plx "\n", __func__, phys_addr,
 321                (hwaddr)(phys_addr + size - 1));
 322        return -EINVAL;
 323    }
 324    return kvm_slot_dirty_pages_log_change(mem, log_dirty);
 325}
 326
 327static void kvm_log_start(MemoryListener *listener,
 328                          MemoryRegionSection *section)
 329{
 330    int r;
 331
 332    r = kvm_dirty_pages_log_change(section->offset_within_address_space,
 333                                   int128_get64(section->size), true);
 334    if (r < 0) {
 335        abort();
 336    }
 337}
 338
 339static void kvm_log_stop(MemoryListener *listener,
 340                          MemoryRegionSection *section)
 341{
 342    int r;
 343
 344    r = kvm_dirty_pages_log_change(section->offset_within_address_space,
 345                                   int128_get64(section->size), false);
 346    if (r < 0) {
 347        abort();
 348    }
 349}
 350
 351static int kvm_set_migration_log(int enable)
 352{
 353    KVMState *s = kvm_state;
 354    KVMSlot *mem;
 355    int i, err;
 356
 357    s->migration_log = enable;
 358
 359    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
 360        mem = &s->slots[i];
 361
 362        if (!mem->memory_size) {
 363            continue;
 364        }
 365        if (!!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) == enable) {
 366            continue;
 367        }
 368        err = kvm_set_user_memory_region(s, mem);
 369        if (err) {
 370            return err;
 371        }
 372    }
 373    return 0;
 374}
 375
 376/* get kvm's dirty pages bitmap and update qemu's */
 377static int kvm_get_dirty_pages_log_range(MemoryRegionSection *section,
 378                                         unsigned long *bitmap)
 379{
 380    unsigned int i, j;
 381    unsigned long page_number, c;
 382    hwaddr addr, addr1;
 383    unsigned int pages = int128_get64(section->size) / getpagesize();
 384    unsigned int len = (pages + HOST_LONG_BITS - 1) / HOST_LONG_BITS;
 385    unsigned long hpratio = getpagesize() / TARGET_PAGE_SIZE;
 386
 387    /*
 388     * bitmap-traveling is faster than memory-traveling (for addr...)
 389     * especially when most of the memory is not dirty.
 390     */
 391    for (i = 0; i < len; i++) {
 392        if (bitmap[i] != 0) {
 393            c = leul_to_cpu(bitmap[i]);
 394            do {
 395                j = ffsl(c) - 1;
 396                c &= ~(1ul << j);
 397                page_number = (i * HOST_LONG_BITS + j) * hpratio;
 398                addr1 = page_number * TARGET_PAGE_SIZE;
 399                addr = section->offset_within_region + addr1;
 400                memory_region_set_dirty(section->mr, addr,
 401                                        TARGET_PAGE_SIZE * hpratio);
 402            } while (c != 0);
 403        }
 404    }
 405    return 0;
 406}
 407
 408#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))
 409
 410/**
 411 * kvm_physical_sync_dirty_bitmap - Grab dirty bitmap from kernel space
 412 * This function updates qemu's dirty bitmap using
 413 * memory_region_set_dirty().  This means all bits are set
 414 * to dirty.
 415 *
 416 * @start_add: start of logged region.
 417 * @end_addr: end of logged region.
 418 */
 419static int kvm_physical_sync_dirty_bitmap(MemoryRegionSection *section)
 420{
 421    KVMState *s = kvm_state;
 422    unsigned long size, allocated_size = 0;
 423    KVMDirtyLog d;
 424    KVMSlot *mem;
 425    int ret = 0;
 426    hwaddr start_addr = section->offset_within_address_space;
 427    hwaddr end_addr = start_addr + int128_get64(section->size);
 428
 429    d.dirty_bitmap = NULL;
 430    while (start_addr < end_addr) {
 431        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
 432        if (mem == NULL) {
 433            break;
 434        }
 435
 436        /* XXX bad kernel interface alert
 437         * For dirty bitmap, kernel allocates array of size aligned to
 438         * bits-per-long.  But for case when the kernel is 64bits and
 439         * the userspace is 32bits, userspace can't align to the same
 440         * bits-per-long, since sizeof(long) is different between kernel
 441         * and user space.  This way, userspace will provide buffer which
 442         * may be 4 bytes less than the kernel will use, resulting in
 443         * userspace memory corruption (which is not detectable by valgrind
 444         * too, in most cases).
 445         * So for now, let's align to 64 instead of HOST_LONG_BITS here, in
 446         * a hope that sizeof(long) wont become >8 any time soon.
 447         */
 448        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
 449                     /*HOST_LONG_BITS*/ 64) / 8;
 450        if (!d.dirty_bitmap) {
 451            d.dirty_bitmap = g_malloc(size);
 452        } else if (size > allocated_size) {
 453            d.dirty_bitmap = g_realloc(d.dirty_bitmap, size);
 454        }
 455        allocated_size = size;
 456        memset(d.dirty_bitmap, 0, allocated_size);
 457
 458        d.slot = mem->slot;
 459
 460        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
 461            DPRINTF("ioctl failed %d\n", errno);
 462            ret = -1;
 463            break;
 464        }
 465
 466        kvm_get_dirty_pages_log_range(section, d.dirty_bitmap);
 467        start_addr = mem->start_addr + mem->memory_size;
 468    }
 469    g_free(d.dirty_bitmap);
 470
 471    return ret;
 472}
 473
 474static void kvm_coalesce_mmio_region(MemoryListener *listener,
 475                                     MemoryRegionSection *secion,
 476                                     hwaddr start, hwaddr size)
 477{
 478    KVMState *s = kvm_state;
 479
 480    if (s->coalesced_mmio) {
 481        struct kvm_coalesced_mmio_zone zone;
 482
 483        zone.addr = start;
 484        zone.size = size;
 485        zone.pad = 0;
 486
 487        (void)kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
 488    }
 489}
 490
 491static void kvm_uncoalesce_mmio_region(MemoryListener *listener,
 492                                       MemoryRegionSection *secion,
 493                                       hwaddr start, hwaddr size)
 494{
 495    KVMState *s = kvm_state;
 496
 497    if (s->coalesced_mmio) {
 498        struct kvm_coalesced_mmio_zone zone;
 499
 500        zone.addr = start;
 501        zone.size = size;
 502        zone.pad = 0;
 503
 504        (void)kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
 505    }
 506}
 507
 508int kvm_check_extension(KVMState *s, unsigned int extension)
 509{
 510    int ret;
 511
 512    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
 513    if (ret < 0) {
 514        ret = 0;
 515    }
 516
 517    return ret;
 518}
 519
 520static int kvm_set_ioeventfd_mmio(int fd, uint32_t addr, uint32_t val,
 521                                  bool assign, uint32_t size, bool datamatch)
 522{
 523    int ret;
 524    struct kvm_ioeventfd iofd;
 525
 526    iofd.datamatch = datamatch ? val : 0;
 527    iofd.addr = addr;
 528    iofd.len = size;
 529    iofd.flags = 0;
 530    iofd.fd = fd;
 531
 532    if (!kvm_enabled()) {
 533        return -ENOSYS;
 534    }
 535
 536    if (datamatch) {
 537        iofd.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
 538    }
 539    if (!assign) {
 540        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
 541    }
 542
 543    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);
 544
 545    if (ret < 0) {
 546        return -errno;
 547    }
 548
 549    return 0;
 550}
 551
 552static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
 553                                 bool assign, uint32_t size, bool datamatch)
 554{
 555    struct kvm_ioeventfd kick = {
 556        .datamatch = datamatch ? val : 0,
 557        .addr = addr,
 558        .flags = KVM_IOEVENTFD_FLAG_PIO,
 559        .len = size,
 560        .fd = fd,
 561    };
 562    int r;
 563    if (!kvm_enabled()) {
 564        return -ENOSYS;
 565    }
 566    if (datamatch) {
 567        kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
 568    }
 569    if (!assign) {
 570        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
 571    }
 572    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
 573    if (r < 0) {
 574        return r;
 575    }
 576    return 0;
 577}
 578
 579
 580static int kvm_check_many_ioeventfds(void)
 581{
 582    /* Userspace can use ioeventfd for io notification.  This requires a host
 583     * that supports eventfd(2) and an I/O thread; since eventfd does not
 584     * support SIGIO it cannot interrupt the vcpu.
 585     *
 586     * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
 587     * can avoid creating too many ioeventfds.
 588     */
 589#if defined(CONFIG_EVENTFD)
 590    int ioeventfds[7];
 591    int i, ret = 0;
 592    for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) {
 593        ioeventfds[i] = eventfd(0, EFD_CLOEXEC);
 594        if (ioeventfds[i] < 0) {
 595            break;
 596        }
 597        ret = kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, true, 2, true);
 598        if (ret < 0) {
 599            close(ioeventfds[i]);
 600            break;
 601        }
 602    }
 603
 604    /* Decide whether many devices are supported or not */
 605    ret = i == ARRAY_SIZE(ioeventfds);
 606
 607    while (i-- > 0) {
 608        kvm_set_ioeventfd_pio(ioeventfds[i], 0, i, false, 2, true);
 609        close(ioeventfds[i]);
 610    }
 611    return ret;
 612#else
 613    return 0;
 614#endif
 615}
 616
 617static const KVMCapabilityInfo *
 618kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
 619{
 620    while (list->name) {
 621        if (!kvm_check_extension(s, list->value)) {
 622            return list;
 623        }
 624        list++;
 625    }
 626    return NULL;
 627}
 628
 629static void kvm_set_phys_mem(MemoryRegionSection *section, bool add)
 630{
 631    KVMState *s = kvm_state;
 632    KVMSlot *mem, old;
 633    int err;
 634    MemoryRegion *mr = section->mr;
 635    bool log_dirty = memory_region_is_logging(mr);
 636    bool writeable = !mr->readonly && !mr->rom_device;
 637    bool readonly_flag = mr->readonly || memory_region_is_romd(mr);
 638    hwaddr start_addr = section->offset_within_address_space;
 639    ram_addr_t size = int128_get64(section->size);
 640    void *ram = NULL;
 641    unsigned delta;
 642
 643    /* kvm works in page size chunks, but the function may be called
 644       with sub-page size and unaligned start address. */
 645    delta = TARGET_PAGE_ALIGN(size) - size;
 646    if (delta > size) {
 647        return;
 648    }
 649    start_addr += delta;
 650    size -= delta;
 651    size &= TARGET_PAGE_MASK;
 652    if (!size || (start_addr & ~TARGET_PAGE_MASK)) {
 653        return;
 654    }
 655
 656    if (!memory_region_is_ram(mr)) {
 657        if (writeable || !kvm_readonly_mem_allowed) {
 658            return;
 659        } else if (!mr->romd_mode) {
 660            /* If the memory device is not in romd_mode, then we actually want
 661             * to remove the kvm memory slot so all accesses will trap. */
 662            add = false;
 663        }
 664    }
 665
 666    ram = memory_region_get_ram_ptr(mr) + section->offset_within_region + delta;
 667
 668    while (1) {
 669        mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
 670        if (!mem) {
 671            break;
 672        }
 673
 674        if (add && start_addr >= mem->start_addr &&
 675            (start_addr + size <= mem->start_addr + mem->memory_size) &&
 676            (ram - start_addr == mem->ram - mem->start_addr)) {
 677            /* The new slot fits into the existing one and comes with
 678             * identical parameters - update flags and done. */
 679            kvm_slot_dirty_pages_log_change(mem, log_dirty);
 680            return;
 681        }
 682
 683        old = *mem;
 684
 685        if (mem->flags & KVM_MEM_LOG_DIRTY_PAGES) {
 686            kvm_physical_sync_dirty_bitmap(section);
 687        }
 688
 689        /* unregister the overlapping slot */
 690        mem->memory_size = 0;
 691        err = kvm_set_user_memory_region(s, mem);
 692        if (err) {
 693            fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
 694                    __func__, strerror(-err));
 695            abort();
 696        }
 697
 698        /* Workaround for older KVM versions: we can't join slots, even not by
 699         * unregistering the previous ones and then registering the larger
 700         * slot. We have to maintain the existing fragmentation. Sigh.
 701         *
 702         * This workaround assumes that the new slot starts at the same
 703         * address as the first existing one. If not or if some overlapping
 704         * slot comes around later, we will fail (not seen in practice so far)
 705         * - and actually require a recent KVM version. */
 706        if (s->broken_set_mem_region &&
 707            old.start_addr == start_addr && old.memory_size < size && add) {
 708            mem = kvm_alloc_slot(s);
 709            mem->memory_size = old.memory_size;
 710            mem->start_addr = old.start_addr;
 711            mem->ram = old.ram;
 712            mem->flags = kvm_mem_flags(s, log_dirty, readonly_flag);
 713
 714            err = kvm_set_user_memory_region(s, mem);
 715            if (err) {
 716                fprintf(stderr, "%s: error updating slot: %s\n", __func__,
 717                        strerror(-err));
 718                abort();
 719            }
 720
 721            start_addr += old.memory_size;
 722            ram += old.memory_size;
 723            size -= old.memory_size;
 724            continue;
 725        }
 726
 727        /* register prefix slot */
 728        if (old.start_addr < start_addr) {
 729            mem = kvm_alloc_slot(s);
 730            mem->memory_size = start_addr - old.start_addr;
 731            mem->start_addr = old.start_addr;
 732            mem->ram = old.ram;
 733            mem->flags =  kvm_mem_flags(s, log_dirty, readonly_flag);
 734
 735            err = kvm_set_user_memory_region(s, mem);
 736            if (err) {
 737                fprintf(stderr, "%s: error registering prefix slot: %s\n",
 738                        __func__, strerror(-err));
 739#ifdef TARGET_PPC
 740                fprintf(stderr, "%s: This is probably because your kernel's " \
 741                                "PAGE_SIZE is too big. Please try to use 4k " \
 742                                "PAGE_SIZE!\n", __func__);
 743#endif
 744                abort();
 745            }
 746        }
 747
 748        /* register suffix slot */
 749        if (old.start_addr + old.memory_size > start_addr + size) {
 750            ram_addr_t size_delta;
 751
 752            mem = kvm_alloc_slot(s);
 753            mem->start_addr = start_addr + size;
 754            size_delta = mem->start_addr - old.start_addr;
 755            mem->memory_size = old.memory_size - size_delta;
 756            mem->ram = old.ram + size_delta;
 757            mem->flags = kvm_mem_flags(s, log_dirty, readonly_flag);
 758
 759            err = kvm_set_user_memory_region(s, mem);
 760            if (err) {
 761                fprintf(stderr, "%s: error registering suffix slot: %s\n",
 762                        __func__, strerror(-err));
 763                abort();
 764            }
 765        }
 766    }
 767
 768    /* in case the KVM bug workaround already "consumed" the new slot */
 769    if (!size) {
 770        return;
 771    }
 772    if (!add) {
 773        return;
 774    }
 775    mem = kvm_alloc_slot(s);
 776    mem->memory_size = size;
 777    mem->start_addr = start_addr;
 778    mem->ram = ram;
 779    mem->flags = kvm_mem_flags(s, log_dirty, readonly_flag);
 780
 781    err = kvm_set_user_memory_region(s, mem);
 782    if (err) {
 783        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
 784                strerror(-err));
 785        abort();
 786    }
 787}
 788
 789static void kvm_region_add(MemoryListener *listener,
 790                           MemoryRegionSection *section)
 791{
 792    memory_region_ref(section->mr);
 793    kvm_set_phys_mem(section, true);
 794}
 795
 796static void kvm_region_del(MemoryListener *listener,
 797                           MemoryRegionSection *section)
 798{
 799    kvm_set_phys_mem(section, false);
 800    memory_region_unref(section->mr);
 801}
 802
 803static void kvm_log_sync(MemoryListener *listener,
 804                         MemoryRegionSection *section)
 805{
 806    int r;
 807
 808    r = kvm_physical_sync_dirty_bitmap(section);
 809    if (r < 0) {
 810        abort();
 811    }
 812}
 813
 814static void kvm_log_global_start(struct MemoryListener *listener)
 815{
 816    int r;
 817
 818    r = kvm_set_migration_log(1);
 819    assert(r >= 0);
 820}
 821
 822static void kvm_log_global_stop(struct MemoryListener *listener)
 823{
 824    int r;
 825
 826    r = kvm_set_migration_log(0);
 827    assert(r >= 0);
 828}
 829
 830static void kvm_mem_ioeventfd_add(MemoryListener *listener,
 831                                  MemoryRegionSection *section,
 832                                  bool match_data, uint64_t data,
 833                                  EventNotifier *e)
 834{
 835    int fd = event_notifier_get_fd(e);
 836    int r;
 837
 838    r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
 839                               data, true, int128_get64(section->size),
 840                               match_data);
 841    if (r < 0) {
 842        fprintf(stderr, "%s: error adding ioeventfd: %s\n",
 843                __func__, strerror(-r));
 844        abort();
 845    }
 846}
 847
 848static void kvm_mem_ioeventfd_del(MemoryListener *listener,
 849                                  MemoryRegionSection *section,
 850                                  bool match_data, uint64_t data,
 851                                  EventNotifier *e)
 852{
 853    int fd = event_notifier_get_fd(e);
 854    int r;
 855
 856    r = kvm_set_ioeventfd_mmio(fd, section->offset_within_address_space,
 857                               data, false, int128_get64(section->size),
 858                               match_data);
 859    if (r < 0) {
 860        abort();
 861    }
 862}
 863
 864static void kvm_io_ioeventfd_add(MemoryListener *listener,
 865                                 MemoryRegionSection *section,
 866                                 bool match_data, uint64_t data,
 867                                 EventNotifier *e)
 868{
 869    int fd = event_notifier_get_fd(e);
 870    int r;
 871
 872    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
 873                              data, true, int128_get64(section->size),
 874                              match_data);
 875    if (r < 0) {
 876        fprintf(stderr, "%s: error adding ioeventfd: %s\n",
 877                __func__, strerror(-r));
 878        abort();
 879    }
 880}
 881
 882static void kvm_io_ioeventfd_del(MemoryListener *listener,
 883                                 MemoryRegionSection *section,
 884                                 bool match_data, uint64_t data,
 885                                 EventNotifier *e)
 886
 887{
 888    int fd = event_notifier_get_fd(e);
 889    int r;
 890
 891    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
 892                              data, false, int128_get64(section->size),
 893                              match_data);
 894    if (r < 0) {
 895        abort();
 896    }
 897}
 898
 899static MemoryListener kvm_memory_listener = {
 900    .region_add = kvm_region_add,
 901    .region_del = kvm_region_del,
 902    .log_start = kvm_log_start,
 903    .log_stop = kvm_log_stop,
 904    .log_sync = kvm_log_sync,
 905    .log_global_start = kvm_log_global_start,
 906    .log_global_stop = kvm_log_global_stop,
 907    .eventfd_add = kvm_mem_ioeventfd_add,
 908    .eventfd_del = kvm_mem_ioeventfd_del,
 909    .coalesced_mmio_add = kvm_coalesce_mmio_region,
 910    .coalesced_mmio_del = kvm_uncoalesce_mmio_region,
 911    .priority = 10,
 912};
 913
 914static MemoryListener kvm_io_listener = {
 915    .eventfd_add = kvm_io_ioeventfd_add,
 916    .eventfd_del = kvm_io_ioeventfd_del,
 917    .priority = 10,
 918};
 919
 920static void kvm_handle_interrupt(CPUState *cpu, int mask)
 921{
 922    cpu->interrupt_request |= mask;
 923
 924    if (!qemu_cpu_is_self(cpu)) {
 925        qemu_cpu_kick(cpu);
 926    }
 927}
 928
 929int kvm_set_irq(KVMState *s, int irq, int level)
 930{
 931    struct kvm_irq_level event;
 932    int ret;
 933
 934    assert(kvm_async_interrupts_enabled());
 935
 936    event.level = level;
 937    event.irq = irq;
 938    ret = kvm_vm_ioctl(s, s->irq_set_ioctl, &event);
 939    if (ret < 0) {
 940        perror("kvm_set_irq");
 941        abort();
 942    }
 943
 944    return (s->irq_set_ioctl == KVM_IRQ_LINE) ? 1 : event.status;
 945}
 946
 947#ifdef KVM_CAP_IRQ_ROUTING
 948typedef struct KVMMSIRoute {
 949    struct kvm_irq_routing_entry kroute;
 950    QTAILQ_ENTRY(KVMMSIRoute) entry;
 951} KVMMSIRoute;
 952
 953static void set_gsi(KVMState *s, unsigned int gsi)
 954{
 955    s->used_gsi_bitmap[gsi / 32] |= 1U << (gsi % 32);
 956}
 957
 958static void clear_gsi(KVMState *s, unsigned int gsi)
 959{
 960    s->used_gsi_bitmap[gsi / 32] &= ~(1U << (gsi % 32));
 961}
 962
 963void kvm_init_irq_routing(KVMState *s)
 964{
 965    int gsi_count, i;
 966
 967    gsi_count = kvm_check_extension(s, KVM_CAP_IRQ_ROUTING);
 968    if (gsi_count > 0) {
 969        unsigned int gsi_bits, i;
 970
 971        /* Round up so we can search ints using ffs */
 972        gsi_bits = ALIGN(gsi_count, 32);
 973        s->used_gsi_bitmap = g_malloc0(gsi_bits / 8);
 974        s->gsi_count = gsi_count;
 975
 976        /* Mark any over-allocated bits as already in use */
 977        for (i = gsi_count; i < gsi_bits; i++) {
 978            set_gsi(s, i);
 979        }
 980    }
 981
 982    s->irq_routes = g_malloc0(sizeof(*s->irq_routes));
 983    s->nr_allocated_irq_routes = 0;
 984
 985    if (!s->direct_msi) {
 986        for (i = 0; i < KVM_MSI_HASHTAB_SIZE; i++) {
 987            QTAILQ_INIT(&s->msi_hashtab[i]);
 988        }
 989    }
 990
 991    kvm_arch_init_irq_routing(s);
 992}
 993
 994void kvm_irqchip_commit_routes(KVMState *s)
 995{
 996    int ret;
 997
 998    s->irq_routes->flags = 0;
 999    ret = kvm_vm_ioctl(s, KVM_SET_GSI_ROUTING, s->irq_routes);
1000    assert(ret == 0);
1001}
1002
1003static void kvm_add_routing_entry(KVMState *s,
1004                                  struct kvm_irq_routing_entry *entry)
1005{
1006    struct kvm_irq_routing_entry *new;
1007    int n, size;
1008
1009    if (s->irq_routes->nr == s->nr_allocated_irq_routes) {
1010        n = s->nr_allocated_irq_routes * 2;
1011        if (n < 64) {
1012            n = 64;
1013        }
1014        size = sizeof(struct kvm_irq_routing);
1015        size += n * sizeof(*new);
1016        s->irq_routes = g_realloc(s->irq_routes, size);
1017        s->nr_allocated_irq_routes = n;
1018    }
1019    n = s->irq_routes->nr++;
1020    new = &s->irq_routes->entries[n];
1021
1022    *new = *entry;
1023
1024    set_gsi(s, entry->gsi);
1025}
1026
1027static int kvm_update_routing_entry(KVMState *s,
1028                                    struct kvm_irq_routing_entry *new_entry)
1029{
1030    struct kvm_irq_routing_entry *entry;
1031    int n;
1032
1033    for (n = 0; n < s->irq_routes->nr; n++) {
1034        entry = &s->irq_routes->entries[n];
1035        if (entry->gsi != new_entry->gsi) {
1036            continue;
1037        }
1038
1039        if(!memcmp(entry, new_entry, sizeof *entry)) {
1040            return 0;
1041        }
1042
1043        *entry = *new_entry;
1044
1045        kvm_irqchip_commit_routes(s);
1046
1047        return 0;
1048    }
1049
1050    return -ESRCH;
1051}
1052
1053void kvm_irqchip_add_irq_route(KVMState *s, int irq, int irqchip, int pin)
1054{
1055    struct kvm_irq_routing_entry e = {};
1056
1057    assert(pin < s->gsi_count);
1058
1059    e.gsi = irq;
1060    e.type = KVM_IRQ_ROUTING_IRQCHIP;
1061    e.flags = 0;
1062    e.u.irqchip.irqchip = irqchip;
1063    e.u.irqchip.pin = pin;
1064    kvm_add_routing_entry(s, &e);
1065}
1066
1067void kvm_irqchip_release_virq(KVMState *s, int virq)
1068{
1069    struct kvm_irq_routing_entry *e;
1070    int i;
1071
1072    for (i = 0; i < s->irq_routes->nr; i++) {
1073        e = &s->irq_routes->entries[i];
1074        if (e->gsi == virq) {
1075            s->irq_routes->nr--;
1076            *e = s->irq_routes->entries[s->irq_routes->nr];
1077        }
1078    }
1079    clear_gsi(s, virq);
1080}
1081
/*
 * Hash an MSI data word into a bucket index: the low 8 bits of @data.
 *
 * This is optimized for IA32 MSI layout. However, no other arch shall
 * repeat the mistake of not providing a direct MSI injection API.
 */
static unsigned int kvm_hash_msi(uint32_t data)
{
    const uint32_t low_byte_mask = 0xff;

    return data & low_byte_mask;
}
1088
1089static void kvm_flush_dynamic_msi_routes(KVMState *s)
1090{
1091    KVMMSIRoute *route, *next;
1092    unsigned int hash;
1093
1094    for (hash = 0; hash < KVM_MSI_HASHTAB_SIZE; hash++) {
1095        QTAILQ_FOREACH_SAFE(route, &s->msi_hashtab[hash], entry, next) {
1096            kvm_irqchip_release_virq(s, route->kroute.gsi);
1097            QTAILQ_REMOVE(&s->msi_hashtab[hash], route, entry);
1098            g_free(route);
1099        }
1100    }
1101}
1102
1103static int kvm_irqchip_get_virq(KVMState *s)
1104{
1105    uint32_t *word = s->used_gsi_bitmap;
1106    int max_words = ALIGN(s->gsi_count, 32) / 32;
1107    int i, bit;
1108    bool retry = true;
1109
1110again:
1111    /* Return the lowest unused GSI in the bitmap */
1112    for (i = 0; i < max_words; i++) {
1113        bit = ffs(~word[i]);
1114        if (!bit) {
1115            continue;
1116        }
1117
1118        return bit - 1 + i * 32;
1119    }
1120    if (!s->direct_msi && retry) {
1121        retry = false;
1122        kvm_flush_dynamic_msi_routes(s);
1123        goto again;
1124    }
1125    return -ENOSPC;
1126
1127}
1128
1129static KVMMSIRoute *kvm_lookup_msi_route(KVMState *s, MSIMessage msg)
1130{
1131    unsigned int hash = kvm_hash_msi(msg.data);
1132    KVMMSIRoute *route;
1133
1134    QTAILQ_FOREACH(route, &s->msi_hashtab[hash], entry) {
1135        if (route->kroute.u.msi.address_lo == (uint32_t)msg.address &&
1136            route->kroute.u.msi.address_hi == (msg.address >> 32) &&
1137            route->kroute.u.msi.data == le32_to_cpu(msg.data)) {
1138            return route;
1139        }
1140    }
1141    return NULL;
1142}
1143
1144int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
1145{
1146    struct kvm_msi msi;
1147    KVMMSIRoute *route;
1148
1149    if (s->direct_msi) {
1150        msi.address_lo = (uint32_t)msg.address;
1151        msi.address_hi = msg.address >> 32;
1152        msi.data = le32_to_cpu(msg.data);
1153        msi.flags = 0;
1154        memset(msi.pad, 0, sizeof(msi.pad));
1155
1156        return kvm_vm_ioctl(s, KVM_SIGNAL_MSI, &msi);
1157    }
1158
1159    route = kvm_lookup_msi_route(s, msg);
1160    if (!route) {
1161        int virq;
1162
1163        virq = kvm_irqchip_get_virq(s);
1164        if (virq < 0) {
1165            return virq;
1166        }
1167
1168        route = g_malloc0(sizeof(KVMMSIRoute));
1169        route->kroute.gsi = virq;
1170        route->kroute.type = KVM_IRQ_ROUTING_MSI;
1171        route->kroute.flags = 0;
1172        route->kroute.u.msi.address_lo = (uint32_t)msg.address;
1173        route->kroute.u.msi.address_hi = msg.address >> 32;
1174        route->kroute.u.msi.data = le32_to_cpu(msg.data);
1175
1176        kvm_add_routing_entry(s, &route->kroute);
1177        kvm_irqchip_commit_routes(s);
1178
1179        QTAILQ_INSERT_TAIL(&s->msi_hashtab[kvm_hash_msi(msg.data)], route,
1180                           entry);
1181    }
1182
1183    assert(route->kroute.type == KVM_IRQ_ROUTING_MSI);
1184
1185    return kvm_set_irq(s, route->kroute.gsi, 1);
1186}
1187
1188int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg)
1189{
1190    struct kvm_irq_routing_entry kroute = {};
1191    int virq;
1192
1193    if (!kvm_gsi_routing_enabled()) {
1194        return -ENOSYS;
1195    }
1196
1197    virq = kvm_irqchip_get_virq(s);
1198    if (virq < 0) {
1199        return virq;
1200    }
1201
1202    kroute.gsi = virq;
1203    kroute.type = KVM_IRQ_ROUTING_MSI;
1204    kroute.flags = 0;
1205    kroute.u.msi.address_lo = (uint32_t)msg.address;
1206    kroute.u.msi.address_hi = msg.address >> 32;
1207    kroute.u.msi.data = le32_to_cpu(msg.data);
1208
1209    kvm_add_routing_entry(s, &kroute);
1210    kvm_irqchip_commit_routes(s);
1211
1212    return virq;
1213}
1214
1215int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
1216{
1217    struct kvm_irq_routing_entry kroute = {};
1218
1219    if (!kvm_irqchip_in_kernel()) {
1220        return -ENOSYS;
1221    }
1222
1223    kroute.gsi = virq;
1224    kroute.type = KVM_IRQ_ROUTING_MSI;
1225    kroute.flags = 0;
1226    kroute.u.msi.address_lo = (uint32_t)msg.address;
1227    kroute.u.msi.address_hi = msg.address >> 32;
1228    kroute.u.msi.data = le32_to_cpu(msg.data);
1229
1230    return kvm_update_routing_entry(s, &kroute);
1231}
1232
1233static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign)
1234{
1235    struct kvm_irqfd irqfd = {
1236        .fd = fd,
1237        .gsi = virq,
1238        .flags = assign ? 0 : KVM_IRQFD_FLAG_DEASSIGN,
1239    };
1240
1241    if (!kvm_irqfds_enabled()) {
1242        return -ENOSYS;
1243    }
1244
1245    return kvm_vm_ioctl(s, KVM_IRQFD, &irqfd);
1246}
1247
1248#else /* !KVM_CAP_IRQ_ROUTING */
1249
/* Variant built when KVM_CAP_IRQ_ROUTING is not defined: nothing to set up. */
void kvm_init_irq_routing(KVMState *s)
{
}
1253
/* Variant built when KVM_CAP_IRQ_ROUTING is not defined: no-op. */
void kvm_irqchip_release_virq(KVMState *s, int virq)
{
}
1257
/* Variant built when KVM_CAP_IRQ_ROUTING is not defined: calling it aborts
 * the process. */
int kvm_irqchip_send_msi(KVMState *s, MSIMessage msg)
{
    abort();
}
1262
/* Variant built when KVM_CAP_IRQ_ROUTING is not defined: always fails with
 * -ENOSYS. */
int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg)
{
    return -ENOSYS;
}
1267
/* Variant built when KVM_CAP_IRQ_ROUTING is not defined: calling it aborts
 * the process. */
static int kvm_irqchip_assign_irqfd(KVMState *s, int fd, int virq, bool assign)
{
    abort();
}
1272
/* Variant built when KVM_CAP_IRQ_ROUTING is not defined: always fails with
 * -ENOSYS. */
int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg)
{
    return -ENOSYS;
}
1277#endif /* !KVM_CAP_IRQ_ROUTING */
1278
1279int kvm_irqchip_add_irqfd_notifier(KVMState *s, EventNotifier *n, int virq)
1280{
1281    return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), virq, true);
1282}
1283
1284int kvm_irqchip_remove_irqfd_notifier(KVMState *s, EventNotifier *n, int virq)
1285{
1286    return kvm_irqchip_assign_irqfd(s, event_notifier_get_fd(n), virq, false);
1287}
1288
1289static int kvm_irqchip_create(KVMState *s)
1290{
1291    int ret;
1292
1293    if (!qemu_opt_get_bool(qemu_get_machine_opts(), "kernel_irqchip", true) ||
1294        !kvm_check_extension(s, KVM_CAP_IRQCHIP)) {
1295        return 0;
1296    }
1297
1298    ret = kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP);
1299    if (ret < 0) {
1300        fprintf(stderr, "Create kernel irqchip failed\n");
1301        return ret;
1302    }
1303
1304    kvm_kernel_irqchip = true;
1305    /* If we have an in-kernel IRQ chip then we must have asynchronous
1306     * interrupt delivery (though the reverse is not necessarily true)
1307     */
1308    kvm_async_interrupts_allowed = true;
1309    kvm_halt_in_kernel_allowed = true;
1310
1311    kvm_init_irq_routing(s);
1312
1313    return 0;
1314}
1315
1316static int kvm_max_vcpus(KVMState *s)
1317{
1318    int ret;
1319
1320    /* Find number of supported CPUs using the recommended
1321     * procedure from the kernel API documentation to cope with
1322     * older kernels that may be missing capabilities.
1323     */
1324    ret = kvm_check_extension(s, KVM_CAP_MAX_VCPUS);
1325    if (ret) {
1326        return ret;
1327    }
1328    ret = kvm_check_extension(s, KVM_CAP_NR_VCPUS);
1329    if (ret) {
1330        return ret;
1331    }
1332
1333    return 4;
1334}
1335
1336int kvm_init(void)
1337{
1338    static const char upgrade_note[] =
1339        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
1340        "(see http://sourceforge.net/projects/kvm).\n";
1341    KVMState *s;
1342    const KVMCapabilityInfo *missing_cap;
1343    int ret;
1344    int i;
1345    int max_vcpus;
1346
1347    s = g_malloc0(sizeof(KVMState));
1348
1349    /*
1350     * On systems where the kernel can support different base page
1351     * sizes, host page size may be different from TARGET_PAGE_SIZE,
1352     * even with KVM.  TARGET_PAGE_SIZE is assumed to be the minimum
1353     * page size for the system though.
1354     */
1355    assert(TARGET_PAGE_SIZE <= getpagesize());
1356
1357#ifdef KVM_CAP_SET_GUEST_DEBUG
1358    QTAILQ_INIT(&s->kvm_sw_breakpoints);
1359#endif
1360    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
1361        s->slots[i].slot = i;
1362    }
1363    s->vmfd = -1;
1364    s->fd = qemu_open("/dev/kvm", O_RDWR);
1365    if (s->fd == -1) {
1366        fprintf(stderr, "Could not access KVM kernel module: %m\n");
1367        ret = -errno;
1368        goto err;
1369    }
1370
1371    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
1372    if (ret < KVM_API_VERSION) {
1373        if (ret > 0) {
1374            ret = -EINVAL;
1375        }
1376        fprintf(stderr, "kvm version too old\n");
1377        goto err;
1378    }
1379
1380    if (ret > KVM_API_VERSION) {
1381        ret = -EINVAL;
1382        fprintf(stderr, "kvm version not supported\n");
1383        goto err;
1384    }
1385
1386    max_vcpus = kvm_max_vcpus(s);
1387    if (smp_cpus > max_vcpus) {
1388        ret = -EINVAL;
1389        fprintf(stderr, "Number of SMP cpus requested (%d) exceeds max cpus "
1390                "supported by KVM (%d)\n", smp_cpus, max_vcpus);
1391        goto err;
1392    }
1393
1394    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
1395    if (s->vmfd < 0) {
1396#ifdef TARGET_S390X
1397        fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
1398                        "your host kernel command line\n");
1399#endif
1400        ret = s->vmfd;
1401        goto err;
1402    }
1403
1404    missing_cap = kvm_check_extension_list(s, kvm_required_capabilites);
1405    if (!missing_cap) {
1406        missing_cap =
1407            kvm_check_extension_list(s, kvm_arch_required_capabilities);
1408    }
1409    if (missing_cap) {
1410        ret = -EINVAL;
1411        fprintf(stderr, "kvm does not support %s\n%s",
1412                missing_cap->name, upgrade_note);
1413        goto err;
1414    }
1415
1416    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);
1417
1418    s->broken_set_mem_region = 1;
1419    ret = kvm_check_extension(s, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
1420    if (ret > 0) {
1421        s->broken_set_mem_region = 0;
1422    }
1423
1424#ifdef KVM_CAP_VCPU_EVENTS
1425    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
1426#endif
1427
1428    s->robust_singlestep =
1429        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);
1430
1431#ifdef KVM_CAP_DEBUGREGS
1432    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
1433#endif
1434
1435#ifdef KVM_CAP_XSAVE
1436    s->xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
1437#endif
1438
1439#ifdef KVM_CAP_XCRS
1440    s->xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
1441#endif
1442
1443#ifdef KVM_CAP_PIT_STATE2
1444    s->pit_state2 = kvm_check_extension(s, KVM_CAP_PIT_STATE2);
1445#endif
1446
1447#ifdef KVM_CAP_IRQ_ROUTING
1448    s->direct_msi = (kvm_check_extension(s, KVM_CAP_SIGNAL_MSI) > 0);
1449#endif
1450
1451    s->intx_set_mask = kvm_check_extension(s, KVM_CAP_PCI_2_3);
1452
1453    s->irq_set_ioctl = KVM_IRQ_LINE;
1454    if (kvm_check_extension(s, KVM_CAP_IRQ_INJECT_STATUS)) {
1455        s->irq_set_ioctl = KVM_IRQ_LINE_STATUS;
1456    }
1457
1458#ifdef KVM_CAP_READONLY_MEM
1459    kvm_readonly_mem_allowed =
1460        (kvm_check_extension(s, KVM_CAP_READONLY_MEM) > 0);
1461#endif
1462
1463    ret = kvm_arch_init(s);
1464    if (ret < 0) {
1465        goto err;
1466    }
1467
1468    ret = kvm_irqchip_create(s);
1469    if (ret < 0) {
1470        goto err;
1471    }
1472
1473    kvm_state = s;
1474    memory_listener_register(&kvm_memory_listener, &address_space_memory);
1475    memory_listener_register(&kvm_io_listener, &address_space_io);
1476
1477    s->many_ioeventfds = kvm_check_many_ioeventfds();
1478
1479    cpu_interrupt_handler = kvm_handle_interrupt;
1480
1481    return 0;
1482
1483err:
1484    if (s->vmfd >= 0) {
1485        close(s->vmfd);
1486    }
1487    if (s->fd != -1) {
1488        close(s->fd);
1489    }
1490    g_free(s);
1491
1492    return ret;
1493}
1494
1495static void kvm_handle_io(uint16_t port, void *data, int direction, int size,
1496                          uint32_t count)
1497{
1498    int i;
1499    uint8_t *ptr = data;
1500
1501    for (i = 0; i < count; i++) {
1502        if (direction == KVM_EXIT_IO_IN) {
1503            switch (size) {
1504            case 1:
1505                stb_p(ptr, cpu_inb(port));
1506                break;
1507            case 2:
1508                stw_p(ptr, cpu_inw(port));
1509                break;
1510            case 4:
1511                stl_p(ptr, cpu_inl(port));
1512                break;
1513            }
1514        } else {
1515            switch (size) {
1516            case 1:
1517                cpu_outb(port, ldub_p(ptr));
1518                break;
1519            case 2:
1520                cpu_outw(port, lduw_p(ptr));
1521                break;
1522            case 4:
1523                cpu_outl(port, ldl_p(ptr));
1524                break;
1525            }
1526        }
1527
1528        ptr += size;
1529    }
1530}
1531
1532static int kvm_handle_internal_error(CPUState *cpu, struct kvm_run *run)
1533{
1534    fprintf(stderr, "KVM internal error.");
1535    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
1536        int i;
1537
1538        fprintf(stderr, " Suberror: %d\n", run->internal.suberror);
1539        for (i = 0; i < run->internal.ndata; ++i) {
1540            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
1541                    i, (uint64_t)run->internal.data[i]);
1542        }
1543    } else {
1544        fprintf(stderr, "\n");
1545    }
1546    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
1547        fprintf(stderr, "emulation failure\n");
1548        if (!kvm_arch_stop_on_emulation_error(cpu)) {
1549            cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
1550            return EXCP_INTERRUPT;
1551        }
1552    }
1553    /* FIXME: Should trigger a qmp message to let management know
1554     * something went wrong.
1555     */
1556    return -1;
1557}
1558
/*
 * Drain the coalesced MMIO ring of the global KVM state: every pending
 * entry is replayed as a physical memory write and then consumed by
 * advancing ring->first.
 *
 * The coalesced_flush_in_progress flag makes a nested call (made while a
 * flush is already running) return immediately.
 */
void kvm_flush_coalesced_mmio_buffer(void)
{
    KVMState *s = kvm_state;

    /* Already flushing: do not recurse */
    if (s->coalesced_flush_in_progress) {
        return;
    }

    s->coalesced_flush_in_progress = true;

    if (s->coalesced_mmio_ring) {
        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
        /* Entries between first and last are pending */
        while (ring->first != ring->last) {
            struct kvm_coalesced_mmio *ent;

            ent = &ring->coalesced_mmio[ring->first];

            cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
            /* Write barrier before the slot is released by moving 'first'
             * (presumably so the producer never sees the slot as free
             * before the replay is done -- confirm against the kernel
             * side) */
            smp_wmb();
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
        }
    }

    s->coalesced_flush_in_progress = false;
}
1584
1585static void do_kvm_cpu_synchronize_state(void *arg)
1586{
1587    CPUState *cpu = arg;
1588
1589    if (!cpu->kvm_vcpu_dirty) {
1590        kvm_arch_get_registers(cpu);
1591        cpu->kvm_vcpu_dirty = true;
1592    }
1593}
1594
1595void kvm_cpu_synchronize_state(CPUState *cpu)
1596{
1597    if (!cpu->kvm_vcpu_dirty) {
1598        run_on_cpu(cpu, do_kvm_cpu_synchronize_state, cpu);
1599    }
1600}
1601
/*
 * Write the registers of @cpu out with KVM_PUT_RESET_STATE and clear the
 * vCPU's dirty flag.
 */
void kvm_cpu_synchronize_post_reset(CPUState *cpu)
{
    kvm_arch_put_registers(cpu, KVM_PUT_RESET_STATE);
    cpu->kvm_vcpu_dirty = false;
}
1607
/*
 * Write the registers of @cpu out with KVM_PUT_FULL_STATE and clear the
 * vCPU's dirty flag.
 */
void kvm_cpu_synchronize_post_init(CPUState *cpu)
{
    kvm_arch_put_registers(cpu, KVM_PUT_FULL_STATE);
    cpu->kvm_vcpu_dirty = false;
}
1613
/*
 * Run @cpu inside KVM until an exit needs attention outside this loop.
 *
 * Each iteration flushes dirty register state to the kernel, issues KVM_RUN
 * with the iothread mutex dropped, and dispatches on the exit reason. The
 * loop continues as long as the handler for the exit returns 0.
 *
 * Returns EXCP_HLT when kvm_arch_process_async_events() reports non-zero
 * before entering the loop, otherwise the first non-zero handler result
 * (e.g. EXCP_INTERRUPT). On a negative result the CPU state is dumped and
 * the VM is stopped with RUN_STATE_INTERNAL_ERROR. A KVM_RUN failure other
 * than -EINTR/-EAGAIN aborts the process.
 */
int kvm_cpu_exec(CPUState *cpu)
{
    struct kvm_run *run = cpu->kvm_run;
    int ret, run_ret;

    DPRINTF("kvm_cpu_exec()\n");

    if (kvm_arch_process_async_events(cpu)) {
        cpu->exit_request = 0;
        return EXCP_HLT;
    }

    do {
        /* Push locally modified registers to the kernel before running */
        if (cpu->kvm_vcpu_dirty) {
            kvm_arch_put_registers(cpu, KVM_PUT_RUNTIME_STATE);
            cpu->kvm_vcpu_dirty = false;
        }

        kvm_arch_pre_run(cpu, run);
        if (cpu->exit_request) {
            DPRINTF("interrupt exit requested\n");
            /*
             * KVM requires us to reenter the kernel after IO exits to complete
             * instruction emulation. This self-signal will ensure that we
             * leave ASAP again.
             */
            qemu_cpu_kick_self();
        }
        /* The iothread mutex is released for the duration of KVM_RUN */
        qemu_mutex_unlock_iothread();

        run_ret = kvm_vcpu_ioctl(cpu, KVM_RUN, 0);

        qemu_mutex_lock_iothread();
        kvm_arch_post_run(cpu, run);

        if (run_ret < 0) {
            /* Interrupted or asked to retry: hand control back to caller */
            if (run_ret == -EINTR || run_ret == -EAGAIN) {
                DPRINTF("io window exit\n");
                ret = EXCP_INTERRUPT;
                break;
            }
            fprintf(stderr, "error: kvm run failed %s\n",
                    strerror(-run_ret));
            abort();
        }

        trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            /* The I/O data lives inside the kvm_run mapping at data_offset */
            kvm_handle_io(run->io.port,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 0;
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            cpu_physical_memory_rw(run->mmio.phys_addr,
                                   run->mmio.data,
                                   run->mmio.len,
                                   run->mmio.is_write);
            ret = 0;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request();
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            ret = -1;
            break;
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(cpu, run);
            break;
        default:
            /* Everything else is left to the architecture code */
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(cpu, run);
            break;
        }
    } while (ret == 0);

    /* Negative result: unrecoverable, dump state and stop the VM */
    if (ret < 0) {
        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_CODE);
        vm_stop(RUN_STATE_INTERNAL_ERROR);
    }

    cpu->exit_request = 0;
    return ret;
}
1711
1712int kvm_ioctl(KVMState *s, int type, ...)
1713{
1714    int ret;
1715    void *arg;
1716    va_list ap;
1717
1718    va_start(ap, type);
1719    arg = va_arg(ap, void *);
1720    va_end(ap);
1721
1722    trace_kvm_ioctl(type, arg);
1723    ret = ioctl(s->fd, type, arg);
1724    if (ret == -1) {
1725        ret = -errno;
1726    }
1727    return ret;
1728}
1729
1730int kvm_vm_ioctl(KVMState *s, int type, ...)
1731{
1732    int ret;
1733    void *arg;
1734    va_list ap;
1735
1736    va_start(ap, type);
1737    arg = va_arg(ap, void *);
1738    va_end(ap);
1739
1740    trace_kvm_vm_ioctl(type, arg);
1741    ret = ioctl(s->vmfd, type, arg);
1742    if (ret == -1) {
1743        ret = -errno;
1744    }
1745    return ret;
1746}
1747
1748int kvm_vcpu_ioctl(CPUState *cpu, int type, ...)
1749{
1750    int ret;
1751    void *arg;
1752    va_list ap;
1753
1754    va_start(ap, type);
1755    arg = va_arg(ap, void *);
1756    va_end(ap);
1757
1758    trace_kvm_vcpu_ioctl(cpu->cpu_index, type, arg);
1759    ret = ioctl(cpu->kvm_fd, type, arg);
1760    if (ret == -1) {
1761        ret = -errno;
1762    }
1763    return ret;
1764}
1765
1766int kvm_has_sync_mmu(void)
1767{
1768    return kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
1769}
1770
1771int kvm_has_vcpu_events(void)
1772{
1773    return kvm_state->vcpu_events;
1774}
1775
1776int kvm_has_robust_singlestep(void)
1777{
1778    return kvm_state->robust_singlestep;
1779}
1780
1781int kvm_has_debugregs(void)
1782{
1783    return kvm_state->debugregs;
1784}
1785
1786int kvm_has_xsave(void)
1787{
1788    return kvm_state->xsave;
1789}
1790
1791int kvm_has_xcrs(void)
1792{
1793    return kvm_state->xcrs;
1794}
1795
1796int kvm_has_pit_state2(void)
1797{
1798    return kvm_state->pit_state2;
1799}
1800
1801int kvm_has_many_ioeventfds(void)
1802{
1803    if (!kvm_enabled()) {
1804        return 0;
1805    }
1806    return kvm_state->many_ioeventfds;
1807}
1808
/*
 * Return the result of querying KVM_CAP_IRQ_ROUTING on the global KVM
 * state, or false when the build does not define that capability.
 */
int kvm_has_gsi_routing(void)
{
    int supported = false;

#ifdef KVM_CAP_IRQ_ROUTING
    supported = kvm_check_extension(kvm_state, KVM_CAP_IRQ_ROUTING);
#endif
    return supported;
}
1817
1818int kvm_has_intx_set_mask(void)
1819{
1820    return kvm_state->intx_set_mask;
1821}
1822
1823void *kvm_ram_alloc(ram_addr_t size)
1824{
1825#ifdef TARGET_S390X
1826    void *mem;
1827
1828    mem = kvm_arch_ram_alloc(size);
1829    if (mem) {
1830        return mem;
1831    }
1832#endif
1833    return qemu_anon_ram_alloc(size);
1834}
1835
1836void kvm_setup_guest_memory(void *start, size_t size)
1837{
1838#ifdef CONFIG_VALGRIND_H
1839    VALGRIND_MAKE_MEM_DEFINED(start, size);
1840#endif
1841    if (!kvm_has_sync_mmu()) {
1842        int ret = qemu_madvise(start, size, QEMU_MADV_DONTFORK);
1843
1844        if (ret) {
1845            perror("qemu_madvise");
1846            fprintf(stderr,
1847                    "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
1848            exit(1);
1849        }
1850    }
1851}
1852
1853#ifdef KVM_CAP_SET_GUEST_DEBUG
1854struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *cpu,
1855                                                 target_ulong pc)
1856{
1857    struct kvm_sw_breakpoint *bp;
1858
1859    QTAILQ_FOREACH(bp, &cpu->kvm_state->kvm_sw_breakpoints, entry) {
1860        if (bp->pc == pc) {
1861            return bp;
1862        }
1863    }
1864    return NULL;
1865}
1866
1867int kvm_sw_breakpoints_active(CPUState *cpu)
1868{
1869    return !QTAILQ_EMPTY(&cpu->kvm_state->kvm_sw_breakpoints);
1870}
1871
/* Argument/result bundle handed to kvm_invoke_set_guest_debug() through
 * run_on_cpu(). */
struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg;     /* request passed to KVM_SET_GUEST_DEBUG */
    CPUState *cpu;                  /* vCPU the ioctl is issued on */
    int err;                        /* ioctl result, filled by the callback */
};
1877
1878static void kvm_invoke_set_guest_debug(void *data)
1879{
1880    struct kvm_set_guest_debug_data *dbg_data = data;
1881
1882    dbg_data->err = kvm_vcpu_ioctl(dbg_data->cpu, KVM_SET_GUEST_DEBUG,
1883                                   &dbg_data->dbg);
1884}
1885
1886int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
1887{
1888    struct kvm_set_guest_debug_data data;
1889
1890    data.dbg.control = reinject_trap;
1891
1892    if (cpu->singlestep_enabled) {
1893        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
1894    }
1895    kvm_arch_update_guest_debug(cpu, &data.dbg);
1896    data.cpu = cpu;
1897
1898    run_on_cpu(cpu, kvm_invoke_set_guest_debug, &data);
1899    return data.err;
1900}
1901
1902int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
1903                          target_ulong len, int type)
1904{
1905    struct kvm_sw_breakpoint *bp;
1906    int err;
1907
1908    if (type == GDB_BREAKPOINT_SW) {
1909        bp = kvm_find_sw_breakpoint(cpu, addr);
1910        if (bp) {
1911            bp->use_count++;
1912            return 0;
1913        }
1914
1915        bp = g_malloc(sizeof(struct kvm_sw_breakpoint));
1916        if (!bp) {
1917            return -ENOMEM;
1918        }
1919
1920        bp->pc = addr;
1921        bp->use_count = 1;
1922        err = kvm_arch_insert_sw_breakpoint(cpu, bp);
1923        if (err) {
1924            g_free(bp);
1925            return err;
1926        }
1927
1928        QTAILQ_INSERT_HEAD(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
1929    } else {
1930        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
1931        if (err) {
1932            return err;
1933        }
1934    }
1935
1936    for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) {
1937        err = kvm_update_guest_debug(cpu, 0);
1938        if (err) {
1939            return err;
1940        }
1941    }
1942    return 0;
1943}
1944
1945int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
1946                          target_ulong len, int type)
1947{
1948    struct kvm_sw_breakpoint *bp;
1949    int err;
1950
1951    if (type == GDB_BREAKPOINT_SW) {
1952        bp = kvm_find_sw_breakpoint(cpu, addr);
1953        if (!bp) {
1954            return -ENOENT;
1955        }
1956
1957        if (bp->use_count > 1) {
1958            bp->use_count--;
1959            return 0;
1960        }
1961
1962        err = kvm_arch_remove_sw_breakpoint(cpu, bp);
1963        if (err) {
1964            return err;
1965        }
1966
1967        QTAILQ_REMOVE(&cpu->kvm_state->kvm_sw_breakpoints, bp, entry);
1968        g_free(bp);
1969    } else {
1970        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
1971        if (err) {
1972            return err;
1973        }
1974    }
1975
1976    for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) {
1977        err = kvm_update_guest_debug(cpu, 0);
1978        if (err) {
1979            return err;
1980        }
1981    }
1982    return 0;
1983}
1984
1985void kvm_remove_all_breakpoints(CPUState *cpu)
1986{
1987    struct kvm_sw_breakpoint *bp, *next;
1988    KVMState *s = cpu->kvm_state;
1989
1990    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
1991        if (kvm_arch_remove_sw_breakpoint(cpu, bp) != 0) {
1992            /* Try harder to find a CPU that currently sees the breakpoint. */
1993            for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) {
1994                if (kvm_arch_remove_sw_breakpoint(cpu, bp) == 0) {
1995                    break;
1996                }
1997            }
1998        }
1999        QTAILQ_REMOVE(&s->kvm_sw_breakpoints, bp, entry);
2000        g_free(bp);
2001    }
2002    kvm_arch_remove_all_hw_breakpoints();
2003
2004    for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) {
2005        kvm_update_guest_debug(cpu, 0);
2006    }
2007}
2008
2009#else /* !KVM_CAP_SET_GUEST_DEBUG */
2010
/* Variant built when KVM_CAP_SET_GUEST_DEBUG is not defined: always fails
 * with -EINVAL. */
int kvm_update_guest_debug(CPUState *cpu, unsigned long reinject_trap)
{
    return -EINVAL;
}
2015
/* Variant built when KVM_CAP_SET_GUEST_DEBUG is not defined: always fails
 * with -EINVAL. */
int kvm_insert_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}
2021
/* Variant built when KVM_CAP_SET_GUEST_DEBUG is not defined: always fails
 * with -EINVAL. */
int kvm_remove_breakpoint(CPUState *cpu, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}
2027
/* Variant built when KVM_CAP_SET_GUEST_DEBUG is not defined: no-op. */
void kvm_remove_all_breakpoints(CPUState *cpu)
{
}
2031#endif /* !KVM_CAP_SET_GUEST_DEBUG */
2032
2033int kvm_set_signal_mask(CPUState *cpu, const sigset_t *sigset)
2034{
2035    struct kvm_signal_mask *sigmask;
2036    int r;
2037
2038    if (!sigset) {
2039        return kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, NULL);
2040    }
2041
2042    sigmask = g_malloc(sizeof(*sigmask) + sizeof(*sigset));
2043
2044    sigmask->len = 8;
2045    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
2046    r = kvm_vcpu_ioctl(cpu, KVM_SET_SIGNAL_MASK, sigmask);
2047    g_free(sigmask);
2048
2049    return r;
2050}
2051int kvm_on_sigbus_vcpu(CPUState *cpu, int code, void *addr)
2052{
2053    return kvm_arch_on_sigbus_vcpu(cpu, code, addr);
2054}
2055
/* Forward a SIGBUS to the architecture handler and return its result. */
int kvm_on_sigbus(int code, void *addr)
{
    int rc = kvm_arch_on_sigbus(code, addr);

    return rc;
}
2060