qemu/cpus.c
/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* Needed early for CONFIG_BSD etc. */
#include "config-host.h"

#include "monitor.h"
#include "sysemu.h"
#include "gdbstub.h"
#include "dma.h"
#include "kvm.h"
#include "qmp-commands.h"

#include "qemu-thread.h"
#include "cpus.h"
#include "qtest.h"
#include "main-loop.h"
#include "bitmap.h"

#ifndef _WIN32
#include "compatfd.h"
#endif

#ifdef CONFIG_LINUX

#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */

static CPUArchState *next_cpu;

static bool cpu_thread_is_idle(CPUArchState *env)
{
    CPUState *cpu = ENV_GET_CPU(env);

    if (cpu->stop || cpu->queued_work_first) {
        return false;
    }
    if (cpu->stopped || !runstate_is_running()) {
        return true;
    }
    if (!env->halted || qemu_cpu_has_work(cpu) ||
        kvm_async_interrupts_enabled()) {
        return false;
    }
    return true;
}

static bool all_cpu_threads_idle(void)
{
    CPUArchState *env;

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        if (!cpu_thread_is_idle(env)) {
            return false;
        }
    }
    return true;
}

/***********************************************************/
/* guest cycle counter */

/* Conversion factor from emulated instructions to virtual clock ticks.  */
static int icount_time_shift;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
#define MAX_ICOUNT_SHIFT 10
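/* Illustrative arithmetic (not part of the original source): with
   icount_time_shift == 3, each completed guest instruction accounts for
   2^3 = 8 ns of vm_clock time, i.e. 125 MIPS; the bound of 10 above
   corresponds to 2^10 = 1024 ns per instruction, which is roughly the
   1 MIPS floor mentioned in the preceding comment.  */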
/* Compensate for varying guest execution speed.  */
static int64_t qemu_icount_bias;
static QEMUTimer *icount_rt_timer;
static QEMUTimer *icount_vm_timer;
static QEMUTimer *icount_warp_timer;
static int64_t vm_clock_warp_start;
static int64_t qemu_icount;

typedef struct TimersState {
    int64_t cpu_ticks_prev;
    int64_t cpu_ticks_offset;
    int64_t cpu_clock_offset;
    int32_t cpu_ticks_enabled;
    int64_t dummy;
} TimersState;

TimersState timers_state;

/* Return the virtual CPU time, based on the instruction counter.  */
int64_t cpu_get_icount(void)
{
    int64_t icount;
    CPUArchState *env = cpu_single_env;

    icount = qemu_icount;
    if (env) {
        if (!can_do_io(env)) {
            fprintf(stderr, "Bad clock read\n");
        }
        icount -= (env->icount_decr.u16.low + env->icount_extra);
    }
    return qemu_icount_bias + (icount << icount_time_shift);
}

/* return the host CPU cycle counter and handle stop/restart */
int64_t cpu_get_ticks(void)
{
    if (use_icount) {
        return cpu_get_icount();
    }
    if (!timers_state.cpu_ticks_enabled) {
        return timers_state.cpu_ticks_offset;
    } else {
        int64_t ticks;
        ticks = cpu_get_real_ticks();
        if (timers_state.cpu_ticks_prev > ticks) {
            /* Note: non increasing ticks may happen if the host uses
               software suspend */
            timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
        }
        timers_state.cpu_ticks_prev = ticks;
        return ticks + timers_state.cpu_ticks_offset;
    }
}

/* return the host CPU monotonic timer and handle stop/restart */
int64_t cpu_get_clock(void)
{
    int64_t ti;
    if (!timers_state.cpu_ticks_enabled) {
        return timers_state.cpu_clock_offset;
    } else {
        ti = get_clock();
        return ti + timers_state.cpu_clock_offset;
    }
}

/* enable cpu_get_ticks() */
void cpu_enable_ticks(void)
{
    if (!timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset -= cpu_get_real_ticks();
        timers_state.cpu_clock_offset -= get_clock();
        timers_state.cpu_ticks_enabled = 1;
    }
}

/* disable cpu_get_ticks() : the clock is stopped. You must not call
   cpu_get_ticks() after that.  */
void cpu_disable_ticks(void)
{
    if (timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset = cpu_get_ticks();
        timers_state.cpu_clock_offset = cpu_get_clock();
        timers_state.cpu_ticks_enabled = 0;
    }
}

/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop.  */
#define ICOUNT_WOBBLE (get_ticks_per_sec() / 10)
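
/* Worked example for icount_adjust() below (illustrative only): if the
   guest is running ahead of real time (delta > 0) and the gap is large
   compared with the previous sample plus the ICOUNT_WOBBLE allowance,
   the shift is decremented, halving the virtual nanoseconds charged per
   instruction and so slowing vm_clock relative to the instruction
   stream; the symmetric case speeds it up.  */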
static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;
    static int64_t last_delta;
    /* If the VM is not running, then do nothing.  */
    if (!runstate_is_running()) {
        return;
    }
    cur_time = cpu_get_clock();
    cur_icount = qemu_get_clock_ns(vm_clock);
    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && icount_time_shift > 0) {
        /* The guest is getting too far ahead.  Slow time down.  */
        icount_time_shift--;
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind.  Speed time up.  */
        icount_time_shift++;
    }
    last_delta = delta;
    qemu_icount_bias = cur_icount - (qemu_icount << icount_time_shift);
}

static void icount_adjust_rt(void *opaque)
{
    qemu_mod_timer(icount_rt_timer,
                   qemu_get_clock_ms(rt_clock) + 1000);
    icount_adjust();
}

static void icount_adjust_vm(void *opaque)
{
    qemu_mod_timer(icount_vm_timer,
                   qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 10);
    icount_adjust();
}
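
/* qemu_icount_round() below converts a vm_clock deadline in nanoseconds
   into a whole number of instructions, rounding up.  Worked example
   (illustrative only): with icount_time_shift == 3, a 100 ns deadline
   yields (100 + 7) >> 3 = 13 instructions.  */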
static int64_t qemu_icount_round(int64_t count)
{
    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
}

static void icount_warp_rt(void *opaque)
{
    if (vm_clock_warp_start == -1) {
        return;
    }

    if (runstate_is_running()) {
        int64_t clock = qemu_get_clock_ns(rt_clock);
        int64_t warp_delta = clock - vm_clock_warp_start;
        if (use_icount == 1) {
            qemu_icount_bias += warp_delta;
        } else {
            /*
             * In adaptive mode, do not let the vm_clock run too
             * far ahead of real time.
             */
            int64_t cur_time = cpu_get_clock();
            int64_t cur_icount = qemu_get_clock_ns(vm_clock);
            int64_t delta = cur_time - cur_icount;
            qemu_icount_bias += MIN(warp_delta, delta);
        }
        if (qemu_clock_expired(vm_clock)) {
            qemu_notify_event();
        }
    }
    vm_clock_warp_start = -1;
}

void qtest_clock_warp(int64_t dest)
{
    int64_t clock = qemu_get_clock_ns(vm_clock);
    assert(qtest_enabled());
    while (clock < dest) {
        int64_t deadline = qemu_clock_deadline(vm_clock);
        int64_t warp = MIN(dest - clock, deadline);
        qemu_icount_bias += warp;
        qemu_run_timers(vm_clock);
        clock = qemu_get_clock_ns(vm_clock);
    }
    qemu_notify_event();
}

void qemu_clock_warp(QEMUClock *clock)
{
    int64_t deadline;

    /*
     * There are too many global variables to make the "warp" behavior
     * applicable to other clocks.  But a clock argument removes the
     * need for if statements all over the place.
     */
    if (clock != vm_clock || !use_icount) {
        return;
    }

    /*
     * If the CPUs have been sleeping, advance the vm_clock timer now.  This
     * ensures that the deadline for the timer is computed correctly below.
     * This also makes sure that the insn counter is synchronized before the
     * CPU starts running, in case the CPU is woken by an event other than
     * the earliest vm_clock timer.
     */
    icount_warp_rt(NULL);
    if (!all_cpu_threads_idle() || !qemu_clock_has_timers(vm_clock)) {
        qemu_del_timer(icount_warp_timer);
        return;
    }

    if (qtest_enabled()) {
        /* When testing, qtest commands advance icount.  */
        return;
    }

    vm_clock_warp_start = qemu_get_clock_ns(rt_clock);
    deadline = qemu_clock_deadline(vm_clock);
    if (deadline > 0) {
        /*
         * Ensure the vm_clock proceeds even when the virtual CPU goes to
         * sleep.  Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * vm_clock.
         *
         * An extreme solution for this problem would be to never let VCPUs
         * sleep in icount mode if there is a pending vm_clock timer; rather
         * time could just advance to the next vm_clock event.  Instead, we
         * do stop VCPUs and only advance vm_clock after some "real" time
         * (related to the time left until the next event) has passed.  The
         * rt_clock timer set here will do this.  This keeps the warps from
         * being too visible externally---for example, you will not send
         * network packets continuously instead of every 100ms.
         */
        qemu_mod_timer(icount_warp_timer, vm_clock_warp_start + deadline);
    } else {
        qemu_notify_event();
    }
}

static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .minimum_version_id_old = 1,
    .fields      = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    }
};
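
/* Layout note: cpu_clock_offset is marked _V(..., 2), so it is only
   present in version 2 streams; the "dummy" slot appears to preserve
   the position of a field that version 1 streams carried, keeping
   migration from older versions working.  */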

void configure_icount(const char *option)
{
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    if (!option) {
        return;
    }

    icount_warp_timer = qemu_new_timer_ns(rt_clock, icount_warp_rt, NULL);
    if (strcmp(option, "auto") != 0) {
        icount_time_shift = strtol(option, NULL, 0);
        use_icount = 1;
        return;
    }

    use_icount = 2;

    /* 125MIPS seems a reasonable initial guess at the guest speed.
       It will be corrected fairly quickly anyway.  */
    icount_time_shift = 3;

    /* Have both realtime and virtual time triggers for speed adjustment.
       The realtime trigger catches emulated time passing too slowly,
       the virtual time trigger catches emulated time passing too fast.
       Realtime triggers occur even when idle, so use them less frequently
       than VM triggers.  */
    icount_rt_timer = qemu_new_timer_ms(rt_clock, icount_adjust_rt, NULL);
    qemu_mod_timer(icount_rt_timer,
                   qemu_get_clock_ms(rt_clock) + 1000);
    icount_vm_timer = qemu_new_timer_ns(vm_clock, icount_adjust_vm, NULL);
    qemu_mod_timer(icount_vm_timer,
                   qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 10);
}
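
/* configure_icount() is normally reached via the -icount command-line
   option: "-icount N" pins the cost of one instruction at 2^N ns of
   vm_clock time (use_icount == 1), while "-icount auto" selects the
   adaptive mode set up above (use_icount == 2).  */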

/***********************************************************/
void hw_error(const char *fmt, ...)
{
    va_list ap;
    CPUArchState *env;

    va_start(ap, fmt);
    fprintf(stderr, "qemu: hardware error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        fprintf(stderr, "CPU #%d:\n", env->cpu_index);
        cpu_dump_state(env, stderr, fprintf, CPU_DUMP_FPU);
    }
    va_end(ap);
    abort();
}

void cpu_synchronize_all_states(void)
{
    CPUArchState *cpu;

    for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) {
        cpu_synchronize_state(cpu);
    }
}

void cpu_synchronize_all_post_reset(void)
{
    CPUArchState *cpu;

    for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) {
        cpu_synchronize_post_reset(cpu);
    }
}

void cpu_synchronize_all_post_init(void)
{
    CPUArchState *cpu;

    for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) {
        cpu_synchronize_post_init(cpu);
    }
}

bool cpu_is_stopped(CPUState *cpu)
{
    return !runstate_is_running() || cpu->stopped;
}

static void do_vm_stop(RunState state)
{
    if (runstate_is_running()) {
        cpu_disable_ticks();
        pause_all_vcpus();
        runstate_set(state);
        vm_state_notify(0, state);
        bdrv_drain_all();
        bdrv_flush_all();
        monitor_protocol_event(QEVENT_STOP, NULL);
    }
}

static bool cpu_can_run(CPUState *cpu)
{
    if (cpu->stop) {
        return false;
    }
    if (cpu->stopped || !runstate_is_running()) {
        return false;
    }
    return true;
}

static void cpu_handle_guest_debug(CPUArchState *env)
{
    CPUState *cpu = ENV_GET_CPU(env);

    gdb_set_stop_cpu(env);
    qemu_system_debug_request();
    cpu->stopped = true;
}

static void cpu_signal(int sig)
{
    if (cpu_single_env) {
        cpu_exit(cpu_single_env);
    }
    exit_request = 1;
}

#ifdef CONFIG_LINUX
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        sigprocmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}

static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
                           void *ctx)
{
    if (kvm_on_sigbus(siginfo->ssi_code,
                      (void *)(intptr_t)siginfo->ssi_addr)) {
        sigbus_reraise();
    }
}

static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}

static void qemu_kvm_eat_signals(CPUArchState *env)
{
    struct timespec ts = { 0, 0 };
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;
    int r;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
    sigaddset(&waitset, SIGBUS);

    do {
        r = sigtimedwait(&waitset, &siginfo, &ts);
        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
            perror("sigtimedwait");
            exit(1);
        }

        switch (r) {
        case SIGBUS:
            if (kvm_on_sigbus_vcpu(env, siginfo.si_code, siginfo.si_addr)) {
                sigbus_reraise();
            }
            break;
        default:
            break;
        }

        r = sigpending(&chkset);
        if (r == -1) {
            perror("sigpending");
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
}

#else /* !CONFIG_LINUX */

static void qemu_init_sigbus(void)
{
}

static void qemu_kvm_eat_signals(CPUArchState *env)
{
}
#endif /* !CONFIG_LINUX */

#ifndef _WIN32
static void dummy_signal(int sig)
{
}

static void qemu_kvm_init_cpu_signals(CPUArchState *env)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = dummy_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    sigdelset(&set, SIGBUS);
    r = kvm_set_signal_mask(env, &set);
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}

static void qemu_tcg_init_cpu_signals(void)
{
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = cpu_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    sigemptyset(&set);
    sigaddset(&set, SIG_IPI);
    pthread_sigmask(SIG_UNBLOCK, &set, NULL);
}

#else /* _WIN32 */
static void qemu_kvm_init_cpu_signals(CPUArchState *env)
{
    abort();
}

static void qemu_tcg_init_cpu_signals(void)
{
}
#endif /* _WIN32 */

static QemuMutex qemu_global_mutex;
static QemuCond qemu_io_proceeded_cond;
static bool iothread_requesting_mutex;

static QemuThread io_thread;

static QemuThread *tcg_cpu_thread;
static QemuCond *tcg_halt_cond;

/* cpu creation */
static QemuCond qemu_cpu_cond;
/* system init */
static QemuCond qemu_pause_cond;
static QemuCond qemu_work_cond;

void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_cond_init(&qemu_work_cond);
    qemu_cond_init(&qemu_io_proceeded_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}

void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
{
    struct qemu_work_item wi;

    if (qemu_cpu_is_self(cpu)) {
        func(data);
        return;
    }

    wi.func = func;
    wi.data = data;
    if (cpu->queued_work_first == NULL) {
        cpu->queued_work_first = &wi;
    } else {
        cpu->queued_work_last->next = &wi;
    }
    cpu->queued_work_last = &wi;
    wi.next = NULL;
    wi.done = false;

    qemu_cpu_kick(cpu);
    while (!wi.done) {
        CPUArchState *self_env = cpu_single_env;

        qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
        cpu_single_env = self_env;
    }
}
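
/* Note: the qemu_work_item above lives on run_on_cpu()'s stack.  That is
   safe only because the function does not return until wi.done has been
   set by flush_queued_work() running on the target CPU's thread.  */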

static void flush_queued_work(CPUState *cpu)
{
    struct qemu_work_item *wi;

    if (cpu->queued_work_first == NULL) {
        return;
    }

    while ((wi = cpu->queued_work_first)) {
        cpu->queued_work_first = wi->next;
        wi->func(wi->data);
        wi->done = true;
    }
    cpu->queued_work_last = NULL;
    qemu_cond_broadcast(&qemu_work_cond);
}

static void qemu_wait_io_event_common(CPUState *cpu)
{
    if (cpu->stop) {
        cpu->stop = false;
        cpu->stopped = true;
        qemu_cond_signal(&qemu_pause_cond);
    }
    flush_queued_work(cpu);
    cpu->thread_kicked = false;
}

static void qemu_tcg_wait_io_event(void)
{
    CPUArchState *env;

    while (all_cpu_threads_idle()) {
        /* Start accounting real time to the virtual clock if the CPUs
           are idle.  */
        qemu_clock_warp(vm_clock);
        qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);
    }

    while (iothread_requesting_mutex) {
        qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        qemu_wait_io_event_common(ENV_GET_CPU(env));
    }
}

static void qemu_kvm_wait_io_event(CPUArchState *env)
{
    CPUState *cpu = ENV_GET_CPU(env);

    while (cpu_thread_is_idle(env)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    qemu_kvm_eat_signals(env);
    qemu_wait_io_event_common(cpu);
}

static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUArchState *env = arg;
    CPUState *cpu = ENV_GET_CPU(env);
    int r;

    qemu_mutex_lock(&qemu_global_mutex);
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu_single_env = env;

    r = kvm_init_vcpu(env);
    if (r < 0) {
        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    qemu_kvm_init_cpu_signals(env);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    while (1) {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(env);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(env);
            }
        }
        qemu_kvm_wait_io_event(env);
    }

    return NULL;
}

static void *qemu_dummy_cpu_thread_fn(void *arg)
{
#ifdef _WIN32
    fprintf(stderr, "qtest is not supported under Windows\n");
    exit(1);
#else
    CPUArchState *env = arg;
    CPUState *cpu = ENV_GET_CPU(env);
    sigset_t waitset;
    int r;

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    cpu_single_env = env;
    while (1) {
        cpu_single_env = NULL;
        qemu_mutex_unlock_iothread();
        do {
            int sig;
            r = sigwait(&waitset, &sig);
        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
        if (r == -1) {
            perror("sigwait");
            exit(1);
        }
        qemu_mutex_lock_iothread();
        cpu_single_env = env;
        qemu_wait_io_event_common(cpu);
    }

    return NULL;
#endif
}

static void tcg_exec_all(void);

static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    CPUArchState *env;

    qemu_tcg_init_cpu_signals();
    qemu_thread_get_self(cpu->thread);

    /* signal CPU creation */
    qemu_mutex_lock(&qemu_global_mutex);
    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        cpu = ENV_GET_CPU(env);
        cpu->thread_id = qemu_get_thread_id();
        cpu->created = true;
    }
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (ENV_GET_CPU(first_cpu)->stopped) {
        qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);

        /* process any pending work */
        for (env = first_cpu; env != NULL; env = env->next_cpu) {
            qemu_wait_io_event_common(ENV_GET_CPU(env));
        }
    }

    while (1) {
        tcg_exec_all();
        if (use_icount && qemu_clock_deadline(vm_clock) <= 0) {
            qemu_notify_event();
        }
        qemu_tcg_wait_io_event();
    }

    return NULL;
}

static void qemu_cpu_kick_thread(CPUState *cpu)
{
#ifndef _WIN32
    int err;

    err = pthread_kill(cpu->thread->thread, SIG_IPI);
    if (err) {
        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
        exit(1);
    }
#else /* _WIN32 */
    if (!qemu_cpu_is_self(cpu)) {
        SuspendThread(cpu->hThread);
        cpu_signal(0);
        ResumeThread(cpu->hThread);
    }
#endif
}
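
/* On POSIX hosts the kick is a SIG_IPI delivered to the vCPU thread.
   Windows has no thread signals, so the target is suspended, the shared
   cpu_signal() handler is invoked from the kicking thread, and the
   target is resumed; the effect (exit_request set, cpu_exit() called)
   is the same.  */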

void qemu_cpu_kick(CPUState *cpu)
{
    qemu_cond_broadcast(cpu->halt_cond);
    if (!tcg_enabled() && !cpu->thread_kicked) {
        qemu_cpu_kick_thread(cpu);
        cpu->thread_kicked = true;
    }
}

void qemu_cpu_kick_self(void)
{
#ifndef _WIN32
    assert(cpu_single_env);
    CPUState *cpu_single_cpu = ENV_GET_CPU(cpu_single_env);

    if (!cpu_single_cpu->thread_kicked) {
        qemu_cpu_kick_thread(cpu_single_cpu);
        cpu_single_cpu->thread_kicked = true;
    }
#else
    abort();
#endif
}

bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}

static bool qemu_in_vcpu_thread(void)
{
    return cpu_single_env && qemu_cpu_is_self(ENV_GET_CPU(cpu_single_env));
}

void qemu_mutex_lock_iothread(void)
{
    if (!tcg_enabled()) {
        qemu_mutex_lock(&qemu_global_mutex);
    } else {
        iothread_requesting_mutex = true;
        if (qemu_mutex_trylock(&qemu_global_mutex)) {
            qemu_cpu_kick_thread(ENV_GET_CPU(first_cpu));
            qemu_mutex_lock(&qemu_global_mutex);
        }
        iothread_requesting_mutex = false;
        qemu_cond_broadcast(&qemu_io_proceeded_cond);
    }
}
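
/* The trylock/kick dance above handles TCG: the single TCG thread can
   hold qemu_global_mutex for long stretches while running guest code, so
   on a failed trylock the I/O thread kicks the vCPU out of its execution
   loop; iothread_requesting_mutex then makes qemu_tcg_wait_io_event()
   block on qemu_io_proceeded_cond instead of immediately re-taking the
   lock.  */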

void qemu_mutex_unlock_iothread(void)
{
    qemu_mutex_unlock(&qemu_global_mutex);
}

static int all_vcpus_paused(void)
{
    CPUArchState *penv = first_cpu;

    while (penv) {
        CPUState *pcpu = ENV_GET_CPU(penv);
        if (!pcpu->stopped) {
            return 0;
        }
        penv = penv->next_cpu;
    }

    return 1;
}

void pause_all_vcpus(void)
{
    CPUArchState *penv = first_cpu;

    qemu_clock_enable(vm_clock, false);
    while (penv) {
        CPUState *pcpu = ENV_GET_CPU(penv);
        pcpu->stop = true;
        qemu_cpu_kick(pcpu);
        penv = penv->next_cpu;
    }

    if (qemu_in_vcpu_thread()) {
        cpu_stop_current();
        if (!kvm_enabled()) {
            /* Restart the walk; penv reached NULL in the loop above.  */
            penv = first_cpu;
            while (penv) {
                CPUState *pcpu = ENV_GET_CPU(penv);
                pcpu->stop = false;
                pcpu->stopped = true;
                penv = penv->next_cpu;
            }
            return;
        }
    }

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        penv = first_cpu;
        while (penv) {
            qemu_cpu_kick(ENV_GET_CPU(penv));
            penv = penv->next_cpu;
        }
    }
}

void resume_all_vcpus(void)
{
    CPUArchState *penv = first_cpu;

    qemu_clock_enable(vm_clock, true);
    while (penv) {
        CPUState *pcpu = ENV_GET_CPU(penv);
        pcpu->stop = false;
        pcpu->stopped = false;
        qemu_cpu_kick(pcpu);
        penv = penv->next_cpu;
    }
}

static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    /* share a single thread for all cpus with TCG */
    if (!tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);
        tcg_halt_cond = cpu->halt_cond;
        qemu_thread_create(cpu->thread, qemu_tcg_cpu_thread_fn, cpu,
                           QEMU_THREAD_JOINABLE);
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
        while (!cpu->created) {
            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
        }
        tcg_cpu_thread = cpu->thread;
    } else {
        cpu->thread = tcg_cpu_thread;
        cpu->halt_cond = tcg_halt_cond;
    }
}

static void qemu_kvm_start_vcpu(CPUArchState *env)
{
    CPUState *cpu = ENV_GET_CPU(env);

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    qemu_thread_create(cpu->thread, qemu_kvm_cpu_thread_fn, env,
                       QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

static void qemu_dummy_start_vcpu(CPUArchState *env)
{
    CPUState *cpu = ENV_GET_CPU(env);

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    qemu_thread_create(cpu->thread, qemu_dummy_cpu_thread_fn, env,
                       QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

void qemu_init_vcpu(void *_env)
{
    CPUArchState *env = _env;
    CPUState *cpu = ENV_GET_CPU(env);

    env->nr_cores = smp_cores;
    env->nr_threads = smp_threads;
    cpu->stopped = true;
    if (kvm_enabled()) {
        qemu_kvm_start_vcpu(env);
    } else if (tcg_enabled()) {
        qemu_tcg_init_vcpu(cpu);
    } else {
        qemu_dummy_start_vcpu(env);
    }
}

void cpu_stop_current(void)
{
    if (cpu_single_env) {
        CPUState *cpu_single_cpu = ENV_GET_CPU(cpu_single_env);
        cpu_single_cpu->stop = false;
        cpu_single_cpu->stopped = true;
        cpu_exit(cpu_single_env);
        qemu_cond_signal(&qemu_pause_cond);
    }
}

void vm_stop(RunState state)
{
    if (qemu_in_vcpu_thread()) {
        qemu_system_vmstop_request(state);
        /*
         * FIXME: should not return to device code in case
         * vm_stop() has been requested.
         */
        cpu_stop_current();
        return;
    }
    do_vm_stop(state);
}

/* does a state transition even if the VM is already stopped,
   current state is forgotten forever */
void vm_stop_force_state(RunState state)
{
    if (runstate_is_running()) {
        vm_stop(state);
    } else {
        runstate_set(state);
    }
}

static int tcg_cpu_exec(CPUArchState *env)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    if (use_icount) {
        int64_t count;
        int decr;
        qemu_icount -= (env->icount_decr.u16.low + env->icount_extra);
        env->icount_decr.u16.low = 0;
        env->icount_extra = 0;
        count = qemu_icount_round(qemu_clock_deadline(vm_clock));
        qemu_icount += count;
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
        env->icount_decr.u16.low = decr;
        env->icount_extra = count;
    }
    ret = cpu_exec(env);
#ifdef CONFIG_PROFILER
    qemu_time += profile_getclock() - ti;
#endif
    if (use_icount) {
        /* Fold pending instructions back into the
           instruction counter, and clear the interrupt flag.  */
        qemu_icount -= (env->icount_decr.u16.low
                        + env->icount_extra);
        env->icount_decr.u32 = 0;
        env->icount_extra = 0;
    }
    return ret;
}
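
/* Example of the icount budget split above (illustrative only): for a
   budget of count == 0x12345 instructions, decr == 0xffff fills the
   16-bit icount_decr field that the generated code decrements, and the
   remaining icount_extra == 0x2346 is handed out on later refills.  */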

static void tcg_exec_all(void)
{
    int r;

    /* Account partial waits to the vm_clock.  */
    qemu_clock_warp(vm_clock);

    if (next_cpu == NULL) {
        next_cpu = first_cpu;
    }
    for (; next_cpu != NULL && !exit_request; next_cpu = next_cpu->next_cpu) {
        CPUArchState *env = next_cpu;
        CPUState *cpu = ENV_GET_CPU(env);

        qemu_clock_enable(vm_clock,
                          (env->singlestep_enabled & SSTEP_NOTIMER) == 0);

        if (cpu_can_run(cpu)) {
            r = tcg_cpu_exec(env);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(env);
                break;
            }
        } else if (cpu->stop || cpu->stopped) {
            break;
        }
    }
    exit_request = 0;
}
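
/* Scheduling note: next_cpu is static, so each call to tcg_exec_all()
   resumes the round-robin walk where the previous call stopped; when
   exit_request (set e.g. from cpu_signal()) ends a sweep early, the
   rotation is preserved for the next call.  */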

void set_numa_modes(void)
{
    CPUArchState *env;
    int i;

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        for (i = 0; i < nb_numa_nodes; i++) {
            if (test_bit(env->cpu_index, node_cpumask[i])) {
                env->numa_node = i;
            }
        }
    }
}

void set_cpu_log(const char *optarg)
{
    int mask;
    const CPULogItem *item;

    mask = cpu_str_to_log_mask(optarg);
    if (!mask) {
        printf("Log items (comma separated):\n");
        for (item = cpu_log_items; item->mask != 0; item++) {
            printf("%-10s %s\n", item->name, item->help);
        }
        exit(1);
    }
    cpu_set_log(mask);
}

void set_cpu_log_filename(const char *optarg)
{
    cpu_set_log_filename(optarg);
}

void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#if defined(cpu_list)
    cpu_list(f, cpu_fprintf);
#endif
}

CpuInfoList *qmp_query_cpus(Error **errp)
{
    CpuInfoList *head = NULL, *cur_item = NULL;
    CPUArchState *env;

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        CPUState *cpu = ENV_GET_CPU(env);
        CpuInfoList *info;

        cpu_synchronize_state(env);

        info = g_malloc0(sizeof(*info));
        info->value = g_malloc0(sizeof(*info->value));
        info->value->CPU = env->cpu_index;
        info->value->current = (env == first_cpu);
        info->value->halted = env->halted;
        info->value->thread_id = cpu->thread_id;
#if defined(TARGET_I386)
        info->value->has_pc = true;
        info->value->pc = env->eip + env->segs[R_CS].base;
#elif defined(TARGET_PPC)
        info->value->has_nip = true;
        info->value->nip = env->nip;
#elif defined(TARGET_SPARC)
        info->value->has_pc = true;
        info->value->pc = env->pc;
        info->value->has_npc = true;
        info->value->npc = env->npc;
#elif defined(TARGET_MIPS)
        info->value->has_PC = true;
        info->value->PC = env->active_tc.PC;
#endif

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

void qmp_memsave(int64_t addr, int64_t size, const char *filename,
                 bool has_cpu, int64_t cpu_index, Error **errp)
{
    FILE *f;
    uint32_t l;
    CPUArchState *env;
    uint8_t buf[1024];

    if (!has_cpu) {
        cpu_index = 0;
    }

    for (env = first_cpu; env; env = env->next_cpu) {
        if (cpu_index == env->cpu_index) {
            break;
        }
    }

    if (env == NULL) {
        error_set(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
                  "a CPU number");
        return;
    }

    f = fopen(filename, "wb");
    if (!f) {
        error_set(errp, QERR_OPEN_FILE_FAILED, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        cpu_memory_rw_debug(env, addr, buf, l, 0);
        if (fwrite(buf, 1, l, f) != l) {
            error_set(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
                  Error **errp)
{
    FILE *f;
    uint32_t l;
    uint8_t buf[1024];

    f = fopen(filename, "wb");
    if (!f) {
        error_set(errp, QERR_OPEN_FILE_FAILED, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        cpu_physical_memory_rw(addr, buf, l, 0);
        if (fwrite(buf, 1, l, f) != l) {
            error_set(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_inject_nmi(Error **errp)
{
#if defined(TARGET_I386)
    CPUArchState *env;

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        if (!env->apic_state) {
            cpu_interrupt(env, CPU_INTERRUPT_NMI);
        } else {
            apic_deliver_nmi(env->apic_state);
        }
    }
#else
    error_set(errp, QERR_UNSUPPORTED);
#endif
}