qemu/kvm-all.c
/*
 * QEMU KVM support
 *
 * Copyright IBM, Corp. 2008
 *           Red Hat, Inc. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *  Glauber Costa     <gcosta@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <stdarg.h>

#include <linux/kvm.h>

#include "qemu-common.h"
#include "qemu-barrier.h"
#include "sysemu.h"
#include "hw/hw.h"
#include "gdbstub.h"
#include "kvm.h"
#include "bswap.h"

/* This check must be after config-host.h is included */
#ifdef CONFIG_EVENTFD
#include <sys/eventfd.h>
#endif

/* KVM uses PAGE_SIZE in its definition of COALESCED_MMIO_MAX */
#define PAGE_SIZE TARGET_PAGE_SIZE

//#define DEBUG_KVM

#ifdef DEBUG_KVM
#define DPRINTF(fmt, ...) \
    do { fprintf(stderr, fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct KVMSlot
{
    target_phys_addr_t start_addr;
    ram_addr_t memory_size;
    ram_addr_t phys_offset;
    int slot;
    int flags;
} KVMSlot;

typedef struct kvm_dirty_log KVMDirtyLog;

struct KVMState
{
    KVMSlot slots[32];
    int fd;
    int vmfd;
    int coalesced_mmio;
    struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
    int broken_set_mem_region;
    int migration_log;
    int vcpu_events;
    int robust_singlestep;
    int debugregs;
#ifdef KVM_CAP_SET_GUEST_DEBUG
    struct kvm_sw_breakpoint_head kvm_sw_breakpoints;
#endif
    int irqchip_in_kernel;
    int pit_in_kernel;
    int xsave, xcrs;
    int many_ioeventfds;
};

KVMState *kvm_state;

static const KVMCapabilityInfo kvm_required_capabilities[] = {
    KVM_CAP_INFO(USER_MEMORY),
    KVM_CAP_INFO(DESTROY_MEMORY_REGION_WORKS),
    KVM_CAP_LAST_INFO
};

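/* Return the first slot table entry not currently in use
 * (memory_size == 0). The table is fixed-size, so exhausting it is fatal. */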
static KVMSlot *kvm_alloc_slot(KVMState *s)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        if (s->slots[i].memory_size == 0) {
            return &s->slots[i];
        }
    }

    fprintf(stderr, "%s: no free slot available\n", __func__);
    abort();
}

static KVMSlot *kvm_lookup_matching_slot(KVMState *s,
                                         target_phys_addr_t start_addr,
                                         target_phys_addr_t end_addr)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (start_addr == mem->start_addr &&
            end_addr == mem->start_addr + mem->memory_size) {
            return mem;
        }
    }

    return NULL;
}

/*
 * Find overlapping slot with lowest start address
 */
static KVMSlot *kvm_lookup_overlapping_slot(KVMState *s,
                                            target_phys_addr_t start_addr,
                                            target_phys_addr_t end_addr)
{
    KVMSlot *found = NULL;
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (mem->memory_size == 0 ||
            (found && found->start_addr < mem->start_addr)) {
            continue;
        }

        if (end_addr > mem->start_addr &&
            start_addr < mem->start_addr + mem->memory_size) {
            found = mem;
        }
    }

    return found;
}

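/* Reverse-map a ram_addr_t to the guest physical address it is mapped at.
 * Returns 1 and fills *phys_addr on success, 0 if no slot covers ram_addr. */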
int kvm_physical_memory_addr_from_ram(KVMState *s, ram_addr_t ram_addr,
                                      target_phys_addr_t *phys_addr)
{
    int i;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        KVMSlot *mem = &s->slots[i];

        if (ram_addr >= mem->phys_offset &&
            ram_addr < mem->phys_offset + mem->memory_size) {
            *phys_addr = mem->start_addr + (ram_addr - mem->phys_offset);
            return 1;
        }
    }

    return 0;
}

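/* Push one slot table entry into the kernel via KVM_SET_USER_MEMORY_REGION,
 * forcing dirty logging on while the migration log is active. */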
static int kvm_set_user_memory_region(KVMState *s, KVMSlot *slot)
{
    struct kvm_userspace_memory_region mem;

    mem.slot = slot->slot;
    mem.guest_phys_addr = slot->start_addr;
    mem.memory_size = slot->memory_size;
    mem.userspace_addr = (unsigned long)qemu_safe_ram_ptr(slot->phys_offset);
    mem.flags = slot->flags;
    if (s->migration_log) {
        mem.flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }
    return kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem);
}

static void kvm_reset_vcpu(void *opaque)
{
    CPUState *env = opaque;

    kvm_arch_reset_vcpu(env);
}

int kvm_irqchip_in_kernel(void)
{
    return kvm_state->irqchip_in_kernel;
}

int kvm_pit_in_kernel(void)
{
    return kvm_state->pit_in_kernel;
}

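/* Create the in-kernel vcpu and mmap its shared kvm_run area; the required
 * mapping size comes from KVM_GET_VCPU_MMAP_SIZE. If coalesced MMIO is
 * supported, the KVM_CAP_COALESCED_MMIO return value is the page offset of
 * the ring inside that same mapping. */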
int kvm_init_vcpu(CPUState *env)
{
    KVMState *s = kvm_state;
    long mmap_size;
    int ret;

    DPRINTF("kvm_init_vcpu\n");

    ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);
    if (ret < 0) {
        DPRINTF("kvm_create_vcpu failed\n");
        goto err;
    }

    env->kvm_fd = ret;
    env->kvm_state = s;
    env->kvm_vcpu_dirty = 1;

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
    if (mmap_size < 0) {
        ret = mmap_size;
        DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");
        goto err;
    }

    env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                        env->kvm_fd, 0);
    if (env->kvm_run == MAP_FAILED) {
        ret = -errno;
        DPRINTF("mmap'ing vcpu state failed\n");
        goto err;
    }

    if (s->coalesced_mmio && !s->coalesced_mmio_ring) {
        s->coalesced_mmio_ring =
            (void *)env->kvm_run + s->coalesced_mmio * PAGE_SIZE;
    }

    ret = kvm_arch_init_vcpu(env);
    if (ret == 0) {
        qemu_register_reset(kvm_reset_vcpu, env);
        kvm_arch_reset_vcpu(env);
    }
err:
    return ret;
}

/*
 * dirty pages logging control
 */

static int kvm_mem_flags(KVMState *s, bool log_dirty)
{
    return log_dirty ? KVM_MEM_LOG_DIRTY_PAGES : 0;
}

static int kvm_slot_dirty_pages_log_change(KVMSlot *mem, bool log_dirty)
{
    KVMState *s = kvm_state;
    int flags, mask = KVM_MEM_LOG_DIRTY_PAGES;
    int old_flags;

    old_flags = mem->flags;

    flags = (mem->flags & ~mask) | kvm_mem_flags(s, log_dirty);
    mem->flags = flags;

    /* If nothing changed effectively, no need to issue ioctl */
    if (s->migration_log) {
        flags |= KVM_MEM_LOG_DIRTY_PAGES;
    }

    if (flags == old_flags) {
        return 0;
    }

    return kvm_set_user_memory_region(s, mem);
}

static int kvm_dirty_pages_log_change(target_phys_addr_t phys_addr,
                                      ram_addr_t size, bool log_dirty)
{
    KVMState *s = kvm_state;
    KVMSlot *mem = kvm_lookup_matching_slot(s, phys_addr, phys_addr + size);

    if (mem == NULL) {
        fprintf(stderr, "BUG: %s: invalid parameters " TARGET_FMT_plx "-"
                TARGET_FMT_plx "\n", __func__, phys_addr,
                (target_phys_addr_t)(phys_addr + size - 1));
        return -EINVAL;
    }
    return kvm_slot_dirty_pages_log_change(mem, log_dirty);
}

static int kvm_log_start(CPUPhysMemoryClient *client,
                         target_phys_addr_t phys_addr, ram_addr_t size)
{
    return kvm_dirty_pages_log_change(phys_addr, size, true);
}

static int kvm_log_stop(CPUPhysMemoryClient *client,
                        target_phys_addr_t phys_addr, ram_addr_t size)
{
    return kvm_dirty_pages_log_change(phys_addr, size, false);
}

static int kvm_set_migration_log(int enable)
{
    KVMState *s = kvm_state;
    KVMSlot *mem;
    int i, err;

    s->migration_log = enable;

    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        mem = &s->slots[i];

        if (!mem->memory_size) {
            continue;
        }
        if (!!(mem->flags & KVM_MEM_LOG_DIRTY_PAGES) == enable) {
            continue;
        }
        err = kvm_set_user_memory_region(s, mem);
        if (err) {
            return err;
        }
    }
    return 0;
}

/* get kvm's dirty pages bitmap and update qemu's */
static int kvm_get_dirty_pages_log_range(unsigned long start_addr,
                                         unsigned long *bitmap,
                                         unsigned long offset,
                                         unsigned long mem_size)
{
    unsigned int i, j;
    unsigned long page_number, addr, addr1, c;
    ram_addr_t ram_addr;
    unsigned int len = ((mem_size / TARGET_PAGE_SIZE) + HOST_LONG_BITS - 1) /
        HOST_LONG_BITS;

    /*
     * Walking the bitmap is faster than walking memory page by page,
     * especially when most of the memory is not dirty.
     */
    for (i = 0; i < len; i++) {
        if (bitmap[i] != 0) {
            c = leul_to_cpu(bitmap[i]);
            do {
                j = ffsl(c) - 1;
                c &= ~(1ul << j);
                page_number = i * HOST_LONG_BITS + j;
                addr1 = page_number * TARGET_PAGE_SIZE;
                addr = offset + addr1;
                ram_addr = cpu_get_physical_page_desc(addr);
                cpu_physical_memory_set_dirty(ram_addr);
            } while (c != 0);
        }
    }
    return 0;
}

#define ALIGN(x, y)  (((x)+(y)-1) & ~((y)-1))

/**
 * kvm_physical_sync_dirty_bitmap - Grab the dirty bitmap from kernel space
 * and update QEMU's copy via cpu_physical_memory_set_dirty(): every page
 * the kernel reports as dirty is marked dirty in QEMU as well.
 *
 * @start_addr: start of logged region.
 * @end_addr: end of logged region.
 */
static int kvm_physical_sync_dirty_bitmap(target_phys_addr_t start_addr,
                                          target_phys_addr_t end_addr)
{
    KVMState *s = kvm_state;
    unsigned long size, allocated_size = 0;
    KVMDirtyLog d;
    KVMSlot *mem;
    int ret = 0;

    d.dirty_bitmap = NULL;
    while (start_addr < end_addr) {
        mem = kvm_lookup_overlapping_slot(s, start_addr, end_addr);
        if (mem == NULL) {
            break;
        }

        /* XXX bad kernel interface alert
         * For the dirty bitmap, the kernel allocates an array of size
         * aligned to bits-per-long.  But when the kernel is 64 bit and
         * userspace is 32 bit, userspace can't align to the same
         * bits-per-long, since sizeof(long) differs between kernel and
         * user space.  Userspace may therefore provide a buffer that is
         * 4 bytes smaller than what the kernel will use, resulting in
         * userspace memory corruption (which is not detectable by
         * valgrind either, in most cases).
         * So for now, let's align to 64 instead of HOST_LONG_BITS here,
         * in the hope that sizeof(long) won't become >8 any time soon.
         */
        size = ALIGN(((mem->memory_size) >> TARGET_PAGE_BITS),
                     /*HOST_LONG_BITS*/ 64) / 8;
        if (!d.dirty_bitmap) {
            d.dirty_bitmap = qemu_malloc(size);
        } else if (size > allocated_size) {
            d.dirty_bitmap = qemu_realloc(d.dirty_bitmap, size);
        }
        allocated_size = size;
        memset(d.dirty_bitmap, 0, allocated_size);

        d.slot = mem->slot;

        if (kvm_vm_ioctl(s, KVM_GET_DIRTY_LOG, &d) == -1) {
            DPRINTF("ioctl failed %d\n", errno);
            ret = -1;
            break;
        }

        kvm_get_dirty_pages_log_range(mem->start_addr, d.dirty_bitmap,
                                      mem->start_addr, mem->memory_size);
        start_addr = mem->start_addr + mem->memory_size;
    }
    qemu_free(d.dirty_bitmap);

    return ret;
}

int kvm_coalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;

        ret = kvm_vm_ioctl(s, KVM_REGISTER_COALESCED_MMIO, &zone);
    }

    return ret;
}

int kvm_uncoalesce_mmio_region(target_phys_addr_t start, ram_addr_t size)
{
    int ret = -ENOSYS;
    KVMState *s = kvm_state;

    if (s->coalesced_mmio) {
        struct kvm_coalesced_mmio_zone zone;

        zone.addr = start;
        zone.size = size;

        ret = kvm_vm_ioctl(s, KVM_UNREGISTER_COALESCED_MMIO, &zone);
    }

    return ret;
}

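/* Query a KVM capability. Returns 0 if the extension is unsupported,
 * otherwise the positive value reported by KVM_CHECK_EXTENSION. */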
int kvm_check_extension(KVMState *s, unsigned int extension)
{
    int ret;

    ret = kvm_ioctl(s, KVM_CHECK_EXTENSION, extension);
    if (ret < 0) {
        ret = 0;
    }

    return ret;
}

static int kvm_check_many_ioeventfds(void)
{
    /* Userspace can use ioeventfd for io notification.  This requires a host
     * that supports eventfd(2) and an I/O thread; since eventfd does not
     * support SIGIO it cannot interrupt the vcpu.
     *
     * Older kernels have a 6 device limit on the KVM io bus.  Find out so we
     * can avoid creating too many ioeventfds.
     */
#if defined(CONFIG_EVENTFD) && defined(CONFIG_IOTHREAD)
    int ioeventfds[7];
    int i, ret = 0;
    for (i = 0; i < ARRAY_SIZE(ioeventfds); i++) {
        ioeventfds[i] = eventfd(0, EFD_CLOEXEC);
        if (ioeventfds[i] < 0) {
            break;
        }
        ret = kvm_set_ioeventfd_pio_word(ioeventfds[i], 0, i, true);
        if (ret < 0) {
            close(ioeventfds[i]);
            break;
        }
    }

    /* Decide whether many devices are supported or not */
    ret = i == ARRAY_SIZE(ioeventfds);

    while (i-- > 0) {
        kvm_set_ioeventfd_pio_word(ioeventfds[i], 0, i, false);
        close(ioeventfds[i]);
    }
    return ret;
#else
    return 0;
#endif
}

static const KVMCapabilityInfo *
kvm_check_extension_list(KVMState *s, const KVMCapabilityInfo *list)
{
    while (list->name) {
        if (!kvm_check_extension(s, list->value)) {
            return list;
        }
        list++;
    }
    return NULL;
}

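/* Bring the kernel's slot table in sync with a change in QEMU's physical
 * memory map: overlapping slots are unregistered first, and any prefix or
 * suffix of a partially replaced slot is re-registered as a slot of its
 * own. */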
static void kvm_set_phys_mem(target_phys_addr_t start_addr, ram_addr_t size,
                             ram_addr_t phys_offset, bool log_dirty)
{
    KVMState *s = kvm_state;
    ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
    KVMSlot *mem, old;
    int err;

    /* kvm works in page size chunks, but the function may be called
       with sub-page size and unaligned start address. */
    size = TARGET_PAGE_ALIGN(size);
    start_addr = TARGET_PAGE_ALIGN(start_addr);

    /* KVM does not support read-only slots */
    phys_offset &= ~IO_MEM_ROM;

    while (1) {
        mem = kvm_lookup_overlapping_slot(s, start_addr, start_addr + size);
        if (!mem) {
            break;
        }

        if (flags < IO_MEM_UNASSIGNED && start_addr >= mem->start_addr &&
            (start_addr + size <= mem->start_addr + mem->memory_size) &&
            (phys_offset - start_addr == mem->phys_offset - mem->start_addr)) {
            /* The new slot fits into the existing one and comes with
             * identical parameters - update flags and done. */
            kvm_slot_dirty_pages_log_change(mem, log_dirty);
            return;
        }

        old = *mem;

        /* unregister the overlapping slot */
        mem->memory_size = 0;
        err = kvm_set_user_memory_region(s, mem);
        if (err) {
            fprintf(stderr, "%s: error unregistering overlapping slot: %s\n",
                    __func__, strerror(-err));
            abort();
        }

        /* Workaround for older KVM versions: we can't join slots, not even
         * by unregistering the previous ones and then registering the larger
         * slot. We have to maintain the existing fragmentation. Sigh.
         *
         * This workaround assumes that the new slot starts at the same
         * address as the first existing one. If not or if some overlapping
         * slot comes around later, we will fail (not seen in practice so far)
         * - and actually require a recent KVM version. */
        if (s->broken_set_mem_region &&
            old.start_addr == start_addr && old.memory_size < size &&
            flags < IO_MEM_UNASSIGNED) {
            mem = kvm_alloc_slot(s);
            mem->memory_size = old.memory_size;
            mem->start_addr = old.start_addr;
            mem->phys_offset = old.phys_offset;
            mem->flags = kvm_mem_flags(s, log_dirty);

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error updating slot: %s\n", __func__,
                        strerror(-err));
                abort();
            }

            start_addr += old.memory_size;
            phys_offset += old.memory_size;
            size -= old.memory_size;
            continue;
        }

        /* register prefix slot */
        if (old.start_addr < start_addr) {
            mem = kvm_alloc_slot(s);
            mem->memory_size = start_addr - old.start_addr;
            mem->start_addr = old.start_addr;
            mem->phys_offset = old.phys_offset;
            mem->flags = kvm_mem_flags(s, log_dirty);

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error registering prefix slot: %s\n",
                        __func__, strerror(-err));
#ifdef TARGET_PPC
                fprintf(stderr, "%s: This is probably because your kernel's " \
                                "PAGE_SIZE is too big. Please try to use 4k " \
                                "PAGE_SIZE!\n", __func__);
#endif
                abort();
            }
        }

        /* register suffix slot */
        if (old.start_addr + old.memory_size > start_addr + size) {
            ram_addr_t size_delta;

            mem = kvm_alloc_slot(s);
            mem->start_addr = start_addr + size;
            size_delta = mem->start_addr - old.start_addr;
            mem->memory_size = old.memory_size - size_delta;
            mem->phys_offset = old.phys_offset + size_delta;
            mem->flags = kvm_mem_flags(s, log_dirty);

            err = kvm_set_user_memory_region(s, mem);
            if (err) {
                fprintf(stderr, "%s: error registering suffix slot: %s\n",
                        __func__, strerror(-err));
                abort();
            }
        }
    }

    /* in case the KVM bug workaround already "consumed" the new slot */
    if (!size) {
        return;
    }
    /* KVM does not need to know about this memory */
    if (flags >= IO_MEM_UNASSIGNED) {
        return;
    }
    mem = kvm_alloc_slot(s);
    mem->memory_size = size;
    mem->start_addr = start_addr;
    mem->phys_offset = phys_offset;
    mem->flags = kvm_mem_flags(s, log_dirty);

    err = kvm_set_user_memory_region(s, mem);
    if (err) {
        fprintf(stderr, "%s: error registering slot: %s\n", __func__,
                strerror(-err));
        abort();
    }
}

static void kvm_client_set_memory(struct CPUPhysMemoryClient *client,
                                  target_phys_addr_t start_addr,
                                  ram_addr_t size, ram_addr_t phys_offset,
                                  bool log_dirty)
{
    kvm_set_phys_mem(start_addr, size, phys_offset, log_dirty);
}

static int kvm_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
                                        target_phys_addr_t start_addr,
                                        target_phys_addr_t end_addr)
{
    return kvm_physical_sync_dirty_bitmap(start_addr, end_addr);
}

static int kvm_client_migration_log(struct CPUPhysMemoryClient *client,
                                    int enable)
{
    return kvm_set_migration_log(enable);
}

static CPUPhysMemoryClient kvm_cpu_phys_memory_client = {
    .set_memory = kvm_client_set_memory,
    .sync_dirty_bitmap = kvm_client_sync_dirty_bitmap,
    .migration_log = kvm_client_migration_log,
    .log_start = kvm_log_start,
    .log_stop = kvm_log_stop,
};

static void kvm_handle_interrupt(CPUState *env, int mask)
{
    env->interrupt_request |= mask;

    if (!qemu_cpu_is_self(env)) {
        qemu_cpu_kick(env);
    }
}

int kvm_init(void)
{
    static const char upgrade_note[] =
        "Please upgrade to at least kernel 2.6.29 or recent kvm-kmod\n"
        "(see http://sourceforge.net/projects/kvm).\n";
    KVMState *s;
    const KVMCapabilityInfo *missing_cap;
    int ret;
    int i;

    s = qemu_mallocz(sizeof(KVMState));

#ifdef KVM_CAP_SET_GUEST_DEBUG
    QTAILQ_INIT(&s->kvm_sw_breakpoints);
#endif
    for (i = 0; i < ARRAY_SIZE(s->slots); i++) {
        s->slots[i].slot = i;
    }
    s->vmfd = -1;
    s->fd = qemu_open("/dev/kvm", O_RDWR);
    if (s->fd == -1) {
        fprintf(stderr, "Could not access KVM kernel module: %m\n");
        ret = -errno;
        goto err;
    }

    ret = kvm_ioctl(s, KVM_GET_API_VERSION, 0);
    if (ret < KVM_API_VERSION) {
        if (ret > 0) {
            ret = -EINVAL;
        }
        fprintf(stderr, "kvm version too old\n");
        goto err;
    }

    if (ret > KVM_API_VERSION) {
        ret = -EINVAL;
        fprintf(stderr, "kvm version not supported\n");
        goto err;
    }

    s->vmfd = kvm_ioctl(s, KVM_CREATE_VM, 0);
    if (s->vmfd < 0) {
#ifdef TARGET_S390X
        fprintf(stderr, "Please add the 'switch_amode' kernel parameter to "
                        "your host kernel command line\n");
#endif
        ret = s->vmfd;
        goto err;
    }

    missing_cap = kvm_check_extension_list(s, kvm_required_capabilities);
    if (!missing_cap) {
        missing_cap =
            kvm_check_extension_list(s, kvm_arch_required_capabilities);
    }
    if (missing_cap) {
        ret = -EINVAL;
        fprintf(stderr, "kvm does not support %s\n%s",
                missing_cap->name, upgrade_note);
        goto err;
    }

    s->coalesced_mmio = kvm_check_extension(s, KVM_CAP_COALESCED_MMIO);

    s->broken_set_mem_region = 1;
    ret = kvm_check_extension(s, KVM_CAP_JOIN_MEMORY_REGIONS_WORKS);
    if (ret > 0) {
        s->broken_set_mem_region = 0;
    }

#ifdef KVM_CAP_VCPU_EVENTS
    s->vcpu_events = kvm_check_extension(s, KVM_CAP_VCPU_EVENTS);
#endif

    s->robust_singlestep =
        kvm_check_extension(s, KVM_CAP_X86_ROBUST_SINGLESTEP);

#ifdef KVM_CAP_DEBUGREGS
    s->debugregs = kvm_check_extension(s, KVM_CAP_DEBUGREGS);
#endif

#ifdef KVM_CAP_XSAVE
    s->xsave = kvm_check_extension(s, KVM_CAP_XSAVE);
#endif

#ifdef KVM_CAP_XCRS
    s->xcrs = kvm_check_extension(s, KVM_CAP_XCRS);
#endif

    ret = kvm_arch_init(s);
    if (ret < 0) {
        goto err;
    }

    kvm_state = s;
    cpu_register_phys_memory_client(&kvm_cpu_phys_memory_client);

    s->many_ioeventfds = kvm_check_many_ioeventfds();

    cpu_interrupt_handler = kvm_handle_interrupt;

    return 0;

err:
    if (s) {
        if (s->vmfd != -1) {
            close(s->vmfd);
        }
        if (s->fd != -1) {
            close(s->fd);
        }
    }
    qemu_free(s);

    return ret;
}

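/* Replay a (possibly repeated) port I/O access described by the shared
 * kvm_run area through QEMU's ioport layer. */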
static void kvm_handle_io(uint16_t port, void *data, int direction, int size,
                          uint32_t count)
{
    int i;
    uint8_t *ptr = data;

    for (i = 0; i < count; i++) {
        if (direction == KVM_EXIT_IO_IN) {
            switch (size) {
            case 1:
                stb_p(ptr, cpu_inb(port));
                break;
            case 2:
                stw_p(ptr, cpu_inw(port));
                break;
            case 4:
                stl_p(ptr, cpu_inl(port));
                break;
            }
        } else {
            switch (size) {
            case 1:
                cpu_outb(port, ldub_p(ptr));
                break;
            case 2:
                cpu_outw(port, lduw_p(ptr));
                break;
            case 4:
                cpu_outl(port, ldl_p(ptr));
                break;
            }
        }

        ptr += size;
    }
}

static int kvm_handle_internal_error(CPUState *env, struct kvm_run *run)
{
    fprintf(stderr, "KVM internal error.");
    if (kvm_check_extension(kvm_state, KVM_CAP_INTERNAL_ERROR_DATA)) {
        int i;

        fprintf(stderr, " Suberror: %d\n", run->internal.suberror);
        for (i = 0; i < run->internal.ndata; ++i) {
            fprintf(stderr, "extra data[%d]: %"PRIx64"\n",
                    i, (uint64_t)run->internal.data[i]);
        }
    } else {
        fprintf(stderr, "\n");
    }
    if (run->internal.suberror == KVM_INTERNAL_ERROR_EMULATION) {
        fprintf(stderr, "emulation failure\n");
        if (!kvm_arch_stop_on_emulation_error(env)) {
            cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
            return EXCP_INTERRUPT;
        }
    }
    /* FIXME: Should trigger a qmp message to let management know
     * something went wrong.
     */
    return -1;
}

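/* Drain the ring of MMIO writes the kernel has batched up, replaying each
 * entry through the normal memory path. */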
void kvm_flush_coalesced_mmio_buffer(void)
{
    KVMState *s = kvm_state;
    if (s->coalesced_mmio_ring) {
        struct kvm_coalesced_mmio_ring *ring = s->coalesced_mmio_ring;
        while (ring->first != ring->last) {
            struct kvm_coalesced_mmio *ent;

            ent = &ring->coalesced_mmio[ring->first];

            cpu_physical_memory_write(ent->phys_addr, ent->data, ent->len);
            smp_wmb();
            ring->first = (ring->first + 1) % KVM_COALESCED_MMIO_MAX;
        }
    }
}

static void do_kvm_cpu_synchronize_state(void *_env)
{
    CPUState *env = _env;

    if (!env->kvm_vcpu_dirty) {
        kvm_arch_get_registers(env);
        env->kvm_vcpu_dirty = 1;
    }
}

void kvm_cpu_synchronize_state(CPUState *env)
{
    if (!env->kvm_vcpu_dirty) {
        run_on_cpu(env, do_kvm_cpu_synchronize_state, env);
    }
}

void kvm_cpu_synchronize_post_reset(CPUState *env)
{
    kvm_arch_put_registers(env, KVM_PUT_RESET_STATE);
    env->kvm_vcpu_dirty = 0;
}

void kvm_cpu_synchronize_post_init(CPUState *env)
{
    kvm_arch_put_registers(env, KVM_PUT_FULL_STATE);
    env->kvm_vcpu_dirty = 0;
}

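/* Main vcpu loop: flush dirty register state into the kernel, call KVM_RUN
 * with the iothread lock dropped, then dispatch on run->exit_reason until an
 * exit back to the main loop is required (ret != 0). */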
int kvm_cpu_exec(CPUState *env)
{
    struct kvm_run *run = env->kvm_run;
    int ret, run_ret;

    DPRINTF("kvm_cpu_exec()\n");

    if (kvm_arch_process_async_events(env)) {
        env->exit_request = 0;
        return EXCP_HLT;
    }

    cpu_single_env = env;

    do {
        if (env->kvm_vcpu_dirty) {
            kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
            env->kvm_vcpu_dirty = 0;
        }

        kvm_arch_pre_run(env, run);
        if (env->exit_request) {
            DPRINTF("interrupt exit requested\n");
            /*
             * KVM requires us to reenter the kernel after IO exits to complete
             * instruction emulation. This self-signal will ensure that we
             * leave ASAP again.
             */
            qemu_cpu_kick_self();
        }
        cpu_single_env = NULL;
        qemu_mutex_unlock_iothread();

        run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);

        qemu_mutex_lock_iothread();
        cpu_single_env = env;
        kvm_arch_post_run(env, run);

        kvm_flush_coalesced_mmio_buffer();

        if (run_ret < 0) {
            if (run_ret == -EINTR || run_ret == -EAGAIN) {
                DPRINTF("io window exit\n");
                ret = EXCP_INTERRUPT;
                break;
            }
            DPRINTF("kvm run failed %s\n", strerror(-run_ret));
            abort();
        }

        switch (run->exit_reason) {
        case KVM_EXIT_IO:
            DPRINTF("handle_io\n");
            kvm_handle_io(run->io.port,
                          (uint8_t *)run + run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 0;
            break;
        case KVM_EXIT_MMIO:
            DPRINTF("handle_mmio\n");
            cpu_physical_memory_rw(run->mmio.phys_addr,
                                   run->mmio.data,
                                   run->mmio.len,
                                   run->mmio.is_write);
            ret = 0;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            DPRINTF("irq_window_open\n");
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_SHUTDOWN:
            DPRINTF("shutdown\n");
            qemu_system_reset_request();
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64 "\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            ret = -1;
            break;
        case KVM_EXIT_INTERNAL_ERROR:
            ret = kvm_handle_internal_error(env, run);
            break;
        default:
            DPRINTF("kvm_arch_handle_exit\n");
            ret = kvm_arch_handle_exit(env, run);
            break;
        }
    } while (ret == 0);

    if (ret < 0) {
        cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
        vm_stop(VMSTOP_PANIC);
    }

    env->exit_request = 0;
    cpu_single_env = NULL;
    return ret;
}

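/* Thin wrappers around ioctl(2) for the /dev/kvm, VM and vcpu file
 * descriptors; all of them convert the -1/errno failure convention into a
 * negative errno return value. */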
int kvm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(s->fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vm_ioctl(KVMState *s, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(s->vmfd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_vcpu_ioctl(CPUState *env, int type, ...)
{
    int ret;
    void *arg;
    va_list ap;

    va_start(ap, type);
    arg = va_arg(ap, void *);
    va_end(ap);

    ret = ioctl(env->kvm_fd, type, arg);
    if (ret == -1) {
        ret = -errno;
    }
    return ret;
}

int kvm_has_sync_mmu(void)
{
    return kvm_check_extension(kvm_state, KVM_CAP_SYNC_MMU);
}

int kvm_has_vcpu_events(void)
{
    return kvm_state->vcpu_events;
}

int kvm_has_robust_singlestep(void)
{
    return kvm_state->robust_singlestep;
}

int kvm_has_debugregs(void)
{
    return kvm_state->debugregs;
}

int kvm_has_xsave(void)
{
    return kvm_state->xsave;
}

int kvm_has_xcrs(void)
{
    return kvm_state->xcrs;
}

int kvm_has_many_ioeventfds(void)
{
    if (!kvm_enabled()) {
        return 0;
    }
    return kvm_state->many_ioeventfds;
}

void kvm_setup_guest_memory(void *start, size_t size)
{
    if (!kvm_has_sync_mmu()) {
        int ret = qemu_madvise(start, size, QEMU_MADV_DONTFORK);

        if (ret) {
            perror("qemu_madvise");
            fprintf(stderr,
                    "Need MADV_DONTFORK in absence of synchronous KVM MMU\n");
            exit(1);
        }
    }
}

#ifdef KVM_CAP_SET_GUEST_DEBUG
struct kvm_sw_breakpoint *kvm_find_sw_breakpoint(CPUState *env,
                                                 target_ulong pc)
{
    struct kvm_sw_breakpoint *bp;

    QTAILQ_FOREACH(bp, &env->kvm_state->kvm_sw_breakpoints, entry) {
        if (bp->pc == pc) {
            return bp;
        }
    }
    return NULL;
}

int kvm_sw_breakpoints_active(CPUState *env)
{
    return !QTAILQ_EMPTY(&env->kvm_state->kvm_sw_breakpoints);
}

struct kvm_set_guest_debug_data {
    struct kvm_guest_debug dbg;
    CPUState *env;
    int err;
};

static void kvm_invoke_set_guest_debug(void *data)
{
    struct kvm_set_guest_debug_data *dbg_data = data;
    CPUState *env = dbg_data->env;

    dbg_data->err = kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, &dbg_data->dbg);
}

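/* Build the guest-debug control block for env and apply it via run_on_cpu,
 * so that KVM_SET_GUEST_DEBUG, like any vcpu ioctl, is issued from the
 * vcpu's own thread. */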
int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    struct kvm_set_guest_debug_data data;

    data.dbg.control = reinject_trap;

    if (env->singlestep_enabled) {
        data.dbg.control |= KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
    }
    kvm_arch_update_guest_debug(env, &data.dbg);
    data.env = env;

    run_on_cpu(env, kvm_invoke_set_guest_debug, &data);
    return data.err;
}

int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (bp) {
            bp->use_count++;
            return 0;
        }

        bp = qemu_malloc(sizeof(struct kvm_sw_breakpoint));
        if (!bp) {
            return -ENOMEM;
        }

        bp->pc = addr;
        bp->use_count = 1;
        err = kvm_arch_insert_sw_breakpoint(current_env, bp);
        if (err) {
            qemu_free(bp);
            return err;
        }

        QTAILQ_INSERT_HEAD(&current_env->kvm_state->kvm_sw_breakpoints,
                          bp, entry);
    } else {
        err = kvm_arch_insert_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    struct kvm_sw_breakpoint *bp;
    CPUState *env;
    int err;

    if (type == GDB_BREAKPOINT_SW) {
        bp = kvm_find_sw_breakpoint(current_env, addr);
        if (!bp) {
            return -ENOENT;
        }

        if (bp->use_count > 1) {
            bp->use_count--;
            return 0;
        }

        err = kvm_arch_remove_sw_breakpoint(current_env, bp);
        if (err) {
            return err;
        }

        QTAILQ_REMOVE(&current_env->kvm_state->kvm_sw_breakpoints, bp, entry);
        qemu_free(bp);
    } else {
        err = kvm_arch_remove_hw_breakpoint(addr, len, type);
        if (err) {
            return err;
        }
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        err = kvm_update_guest_debug(env, 0);
        if (err) {
            return err;
        }
    }
    return 0;
}

void kvm_remove_all_breakpoints(CPUState *current_env)
{
    struct kvm_sw_breakpoint *bp, *next;
    KVMState *s = current_env->kvm_state;
    CPUState *env;

    QTAILQ_FOREACH_SAFE(bp, &s->kvm_sw_breakpoints, entry, next) {
        if (kvm_arch_remove_sw_breakpoint(current_env, bp) != 0) {
            /* Try harder to find a CPU that currently sees the breakpoint. */
            for (env = first_cpu; env != NULL; env = env->next_cpu) {
                if (kvm_arch_remove_sw_breakpoint(env, bp) == 0) {
                    break;
                }
            }
        }
    }
    kvm_arch_remove_all_hw_breakpoints();

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        kvm_update_guest_debug(env, 0);
    }
}

#else /* !KVM_CAP_SET_GUEST_DEBUG */

int kvm_update_guest_debug(CPUState *env, unsigned long reinject_trap)
{
    return -EINVAL;
}

int kvm_insert_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

int kvm_remove_breakpoint(CPUState *current_env, target_ulong addr,
                          target_ulong len, int type)
{
    return -EINVAL;
}

void kvm_remove_all_breakpoints(CPUState *current_env)
{
}
#endif /* !KVM_CAP_SET_GUEST_DEBUG */

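/* Set the signal mask the kernel applies while KVM_RUN is executing for this
 * vcpu. The kernel expects the size of its own sigset (8 bytes on Linux),
 * not glibc's sizeof(sigset_t), hence the hard-coded len. */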
int kvm_set_signal_mask(CPUState *env, const sigset_t *sigset)
{
    struct kvm_signal_mask *sigmask;
    int r;

    if (!sigset) {
        return kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, NULL);
    }

    sigmask = qemu_malloc(sizeof(*sigmask) + sizeof(*sigset));

    sigmask->len = 8;
    memcpy(sigmask->sigset, sigset, sizeof(*sigset));
    r = kvm_vcpu_ioctl(env, KVM_SET_SIGNAL_MASK, sigmask);
    qemu_free(sigmask);

    return r;
}

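/* (De)assign an eventfd that the kernel signals on a 4-byte guest MMIO write
 * of 'val' to 'addr', so the notification is handled entirely in the kernel
 * instead of causing a heavyweight exit to userspace. The _pio_word variant
 * below does the same for a 2-byte port write. */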
int kvm_set_ioeventfd_mmio_long(int fd, uint32_t addr, uint32_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    int ret;
    struct kvm_ioeventfd iofd;

    iofd.datamatch = val;
    iofd.addr = addr;
    iofd.len = 4;
    iofd.flags = KVM_IOEVENTFD_FLAG_DATAMATCH;
    iofd.fd = fd;

    if (!kvm_enabled()) {
        return -ENOSYS;
    }

    if (!assign) {
        iofd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }

    ret = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &iofd);

    if (ret < 0) {
        return ret;
    }

    return 0;
#else
    return -ENOSYS;
#endif
}

int kvm_set_ioeventfd_pio_word(int fd, uint16_t addr, uint16_t val, bool assign)
{
#ifdef KVM_IOEVENTFD
    struct kvm_ioeventfd kick = {
        .datamatch = val,
        .addr = addr,
        .len = 2,
        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
        .fd = fd,
    };
    int r;
    if (!kvm_enabled()) {
        return -ENOSYS;
    }
    if (!assign) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
    if (r < 0) {
        return r;
    }
    return 0;
#else
    return -ENOSYS;
#endif
}

int kvm_on_sigbus_vcpu(CPUState *env, int code, void *addr)
{
    return kvm_arch_on_sigbus_vcpu(env, code, addr);
}

int kvm_on_sigbus(int code, void *addr)
{
    return kvm_arch_on_sigbus(code, addr);
}