/* qemu/cpus.c */
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25/* Needed early for CONFIG_BSD etc. */
  26#include "config-host.h"
  27
  28#include "monitor/monitor.h"
  29#include "sysemu/sysemu.h"
  30#include "exec/gdbstub.h"
  31#include "sysemu/dma.h"
  32#include "sysemu/kvm.h"
  33#include "qmp-commands.h"
  34
  35#include "qemu/thread.h"
  36#include "sysemu/cpus.h"
  37#include "sysemu/qtest.h"
  38#include "qemu/main-loop.h"
  39#include "qemu/bitmap.h"
  40
  41#ifndef _WIN32
  42#include "qemu/compatfd.h"
  43#endif
  44
  45#ifdef CONFIG_LINUX
  46
  47#include <sys/prctl.h>
  48
  49#ifndef PR_MCE_KILL
  50#define PR_MCE_KILL 33
  51#endif
  52
  53#ifndef PR_MCE_KILL_SET
  54#define PR_MCE_KILL_SET 1
  55#endif
  56
  57#ifndef PR_MCE_KILL_EARLY
  58#define PR_MCE_KILL_EARLY 1
  59#endif
  60
  61#endif /* CONFIG_LINUX */
  62
  63static CPUArchState *next_cpu;
  64
  65static bool cpu_thread_is_idle(CPUArchState *env)
  66{
  67    CPUState *cpu = ENV_GET_CPU(env);
  68
  69    if (cpu->stop || cpu->queued_work_first) {
  70        return false;
  71    }
  72    if (cpu->stopped || !runstate_is_running()) {
  73        return true;
  74    }
  75    if (!env->halted || qemu_cpu_has_work(cpu) ||
  76        kvm_async_interrupts_enabled()) {
  77        return false;
  78    }
  79    return true;
  80}
  81
  82static bool all_cpu_threads_idle(void)
  83{
  84    CPUArchState *env;
  85
  86    for (env = first_cpu; env != NULL; env = env->next_cpu) {
  87        if (!cpu_thread_is_idle(env)) {
  88            return false;
  89        }
  90    }
  91    return true;
  92}
  93
/***********************************************************/
/* guest cycle counter */

/* Conversion factor from emulated instructions to virtual clock ticks.  */
static int icount_time_shift;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
#define MAX_ICOUNT_SHIFT 10
/* Compensate for varying guest execution speed.  */
static int64_t qemu_icount_bias;
/* Realtime and virtual-time triggers for adaptive icount adjustment.  */
static QEMUTimer *icount_rt_timer;
static QEMUTimer *icount_vm_timer;
/* Timer that advances vm_clock while all VCPUs sleep (icount warp).  */
static QEMUTimer *icount_warp_timer;
/* rt_clock timestamp when the current warp started; -1 if no warp.  */
static int64_t vm_clock_warp_start;
/* Total instructions executed so far (icount mode).  */
static int64_t qemu_icount;

/* State of the host-tick and host-clock based guest timers.  */
typedef struct TimersState {
    int64_t cpu_ticks_prev;     /* last raw tick value seen (suspend guard) */
    int64_t cpu_ticks_offset;   /* adjustment added to raw host ticks */
    int64_t cpu_clock_offset;   /* adjustment added to host monotonic clock */
    int32_t cpu_ticks_enabled;  /* nonzero while the VM clock is running */
    int64_t dummy;              /* placeholder kept for migration format */
} TimersState;

TimersState timers_state;
 118
/* Return the virtual CPU time, based on the instruction counter.  */
int64_t cpu_get_icount(void)
{
    int64_t icount;
    CPUArchState *env = cpu_single_env;

    icount = qemu_icount;
    if (env) {
        /* Reading the clock from translated code is only valid at an
           I/O boundary; anything else indicates a translation bug.  */
        if (!can_do_io(env)) {
            fprintf(stderr, "Bad clock read\n");
        }
        /* Subtract the instructions bought in advance but not yet run.  */
        icount -= (env->icount_decr.u16.low + env->icount_extra);
    }
    return qemu_icount_bias + (icount << icount_time_shift);
}
 134
 135/* return the host CPU cycle counter and handle stop/restart */
 136int64_t cpu_get_ticks(void)
 137{
 138    if (use_icount) {
 139        return cpu_get_icount();
 140    }
 141    if (!timers_state.cpu_ticks_enabled) {
 142        return timers_state.cpu_ticks_offset;
 143    } else {
 144        int64_t ticks;
 145        ticks = cpu_get_real_ticks();
 146        if (timers_state.cpu_ticks_prev > ticks) {
 147            /* Note: non increasing ticks may happen if the host uses
 148               software suspend */
 149            timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 150        }
 151        timers_state.cpu_ticks_prev = ticks;
 152        return ticks + timers_state.cpu_ticks_offset;
 153    }
 154}
 155
 156/* return the host CPU monotonic timer and handle stop/restart */
 157int64_t cpu_get_clock(void)
 158{
 159    int64_t ti;
 160    if (!timers_state.cpu_ticks_enabled) {
 161        return timers_state.cpu_clock_offset;
 162    } else {
 163        ti = get_clock();
 164        return ti + timers_state.cpu_clock_offset;
 165    }
 166}
 167
 168/* enable cpu_get_ticks() */
 169void cpu_enable_ticks(void)
 170{
 171    if (!timers_state.cpu_ticks_enabled) {
 172        timers_state.cpu_ticks_offset -= cpu_get_real_ticks();
 173        timers_state.cpu_clock_offset -= get_clock();
 174        timers_state.cpu_ticks_enabled = 1;
 175    }
 176}
 177
 178/* disable cpu_get_ticks() : the clock is stopped. You must not call
 179   cpu_get_ticks() after that.  */
 180void cpu_disable_ticks(void)
 181{
 182    if (timers_state.cpu_ticks_enabled) {
 183        timers_state.cpu_ticks_offset = cpu_get_ticks();
 184        timers_state.cpu_clock_offset = cpu_get_clock();
 185        timers_state.cpu_ticks_enabled = 0;
 186    }
 187}
 188
/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop.  */
#define ICOUNT_WOBBLE (get_ticks_per_sec() / 10)

/* Adaptive icount: nudge icount_time_shift so virtual time tracks
   real time, then recompute the bias to keep vm_clock continuous. */
static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;
    static int64_t last_delta;   /* delta from the previous adjustment */
    /* If the VM is not running, then do nothing.  */
    if (!runstate_is_running()) {
        return;
    }
    cur_time = cpu_get_clock();
    cur_icount = qemu_get_clock_ns(vm_clock);
    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && icount_time_shift > 0) {
        /* The guest is getting too far ahead.  Slow time down.  */
        icount_time_shift--;
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind.  Speed time up.  */
        icount_time_shift++;
    }
    last_delta = delta;
    /* Re-anchor the bias so the shift change does not step vm_clock. */
    qemu_icount_bias = cur_icount - (qemu_icount << icount_time_shift);
}
 224
/* Realtime trigger: re-arm for one second ahead, then adjust. */
static void icount_adjust_rt(void *opaque)
{
    qemu_mod_timer(icount_rt_timer,
                   qemu_get_clock_ms(rt_clock) + 1000);
    icount_adjust();
}
 231
/* Virtual-time trigger: re-arm for 100ms of guest time, then adjust. */
static void icount_adjust_vm(void *opaque)
{
    qemu_mod_timer(icount_vm_timer,
                   qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 10);
    icount_adjust();
}
 238
 239static int64_t qemu_icount_round(int64_t count)
 240{
 241    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
 242}
 243
/* End the current icount warp: credit the real time that elapsed while
   the CPUs slept to the virtual clock (fully in mode 1, capped in
   adaptive mode) and fire any timers that became due. */
static void icount_warp_rt(void *opaque)
{
    /* No warp in progress. */
    if (vm_clock_warp_start == -1) {
        return;
    }

    if (runstate_is_running()) {
        int64_t clock = qemu_get_clock_ns(rt_clock);
        int64_t warp_delta = clock - vm_clock_warp_start;
        if (use_icount == 1) {
            qemu_icount_bias += warp_delta;
        } else {
            /*
             * In adaptive mode, do not let the vm_clock run too
             * far ahead of real time.
             */
            int64_t cur_time = cpu_get_clock();
            int64_t cur_icount = qemu_get_clock_ns(vm_clock);
            int64_t delta = cur_time - cur_icount;
            qemu_icount_bias += MIN(warp_delta, delta);
        }
        if (qemu_clock_expired(vm_clock)) {
            qemu_notify_event();
        }
    }
    vm_clock_warp_start = -1;
}
 271
/* qtest only: advance vm_clock directly to 'dest', running each
   intervening timer deadline in order. */
void qtest_clock_warp(int64_t dest)
{
    int64_t clock = qemu_get_clock_ns(vm_clock);
    assert(qtest_enabled());
    while (clock < dest) {
        int64_t deadline = qemu_clock_deadline(vm_clock);
        /* Jump to the next timer deadline, but never past dest. */
        int64_t warp = MIN(dest - clock, deadline);
        qemu_icount_bias += warp;
        qemu_run_timers(vm_clock);
        clock = qemu_get_clock_ns(vm_clock);
    }
    qemu_notify_event();
}
 285
/* Start (or cancel) an icount warp: while all VCPUs are idle, schedule
   an rt_clock timer that will later credit the slept real time to
   vm_clock so pending vm_clock timers still fire. */
void qemu_clock_warp(QEMUClock *clock)
{
    int64_t deadline;

    /*
     * There are too many global variables to make the "warp" behavior
     * applicable to other clocks.  But a clock argument removes the
     * need for if statements all over the place.
     */
    if (clock != vm_clock || !use_icount) {
        return;
    }

    /*
     * If the CPUs have been sleeping, advance the vm_clock timer now.  This
     * ensures that the deadline for the timer is computed correctly below.
     * This also makes sure that the insn counter is synchronized before the
     * CPU starts running, in case the CPU is woken by an event other than
     * the earliest vm_clock timer.
     */
    icount_warp_rt(NULL);
    if (!all_cpu_threads_idle() || !qemu_clock_has_timers(vm_clock)) {
        /* Some CPU is runnable or nothing is waiting: no warp needed. */
        qemu_del_timer(icount_warp_timer);
        return;
    }

    if (qtest_enabled()) {
        /* When testing, qtest commands advance icount.  */
        return;
    }

    vm_clock_warp_start = qemu_get_clock_ns(rt_clock);
    deadline = qemu_clock_deadline(vm_clock);
    if (deadline > 0) {
        /*
         * Ensure the vm_clock proceeds even when the virtual CPU goes to
         * sleep.  Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * vm_clock.
         *
         * An extreme solution for this problem would be to never let VCPUs
         * sleep in icount mode if there is a pending vm_clock timer; rather
         * time could just advance to the next vm_clock event.  Instead, we
         * do stop VCPUs and only advance vm_clock after some "real" time,
         * (related to the time left until the next event) has passed.  This
         * rt_clock timer will do this.  This avoids that the warps are too
         * visible externally---for example, you will not be sending network
         * packets continuously instead of every 100ms.
         */
        qemu_mod_timer(icount_warp_timer, vm_clock_warp_start + deadline);
    } else {
        /* A timer is already due: wake the iothread to run it. */
        qemu_notify_event();
    }
}
 341
/* Migration format for TimersState.  'dummy' preserves the layout of
   version 1 streams; cpu_clock_offset was added in version 2. */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .minimum_version_id_old = 1,
    .fields      = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    }
};
 354
/* Parse the -icount option: NULL disables icount, a number fixes the
   shift, "auto" enables adaptive mode with periodic adjustment. */
void configure_icount(const char *option)
{
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    if (!option) {
        return;
    }

    icount_warp_timer = qemu_new_timer_ns(rt_clock, icount_warp_rt, NULL);
    if (strcmp(option, "auto") != 0) {
        /* Fixed shift given on the command line. */
        icount_time_shift = strtol(option, NULL, 0);
        use_icount = 1;
        return;
    }

    use_icount = 2;

    /* 125MIPS seems a reasonable initial guess at the guest speed.
       It will be corrected fairly quickly anyway.  */
    icount_time_shift = 3;

    /* Have both realtime and virtual time triggers for speed adjustment.
       The realtime trigger catches emulated time passing too slowly,
       the virtual time trigger catches emulated time passing too fast.
       Realtime triggers occur even when idle, so use them less frequently
       than VM triggers.  */
    icount_rt_timer = qemu_new_timer_ms(rt_clock, icount_adjust_rt, NULL);
    qemu_mod_timer(icount_rt_timer,
                   qemu_get_clock_ms(rt_clock) + 1000);
    icount_vm_timer = qemu_new_timer_ns(vm_clock, icount_adjust_vm, NULL);
    qemu_mod_timer(icount_vm_timer,
                   qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 10);
}
 387
/***********************************************************/
/* Report a fatal emulated-hardware error: print the message and the
   state of every CPU to stderr, then abort.  Never returns. */
void hw_error(const char *fmt, ...)
{
    va_list ap;
    CPUArchState *env;
    CPUState *cpu;

    va_start(ap, fmt);
    fprintf(stderr, "qemu: hardware error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        cpu = ENV_GET_CPU(env);
        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
        cpu_dump_state(env, stderr, fprintf, CPU_DUMP_FPU);
    }
    va_end(ap);
    abort();
}
 407
 408void cpu_synchronize_all_states(void)
 409{
 410    CPUArchState *cpu;
 411
 412    for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) {
 413        cpu_synchronize_state(cpu);
 414    }
 415}
 416
 417void cpu_synchronize_all_post_reset(void)
 418{
 419    CPUArchState *cpu;
 420
 421    for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) {
 422        cpu_synchronize_post_reset(cpu);
 423    }
 424}
 425
 426void cpu_synchronize_all_post_init(void)
 427{
 428    CPUArchState *cpu;
 429
 430    for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) {
 431        cpu_synchronize_post_init(cpu);
 432    }
 433}
 434
 435bool cpu_is_stopped(CPUState *cpu)
 436{
 437    return !runstate_is_running() || cpu->stopped;
 438}
 439
/* Synchronously stop the VM from the iothread: freeze the clocks,
   park every VCPU, flush block devices, then announce the stop.
   The ordering below is deliberate. */
static void do_vm_stop(RunState state)
{
    if (runstate_is_running()) {
        cpu_disable_ticks();
        pause_all_vcpus();
        runstate_set(state);
        vm_state_notify(0, state);
        /* Quiesce outstanding I/O before flushing to disk. */
        bdrv_drain_all();
        bdrv_flush_all();
        monitor_protocol_event(QEVENT_STOP, NULL);
    }
}
 452
 453static bool cpu_can_run(CPUState *cpu)
 454{
 455    if (cpu->stop) {
 456        return false;
 457    }
 458    if (cpu->stopped || !runstate_is_running()) {
 459        return false;
 460    }
 461    return true;
 462}
 463
/* A VCPU hit a debug event: route it to gdb and park the CPU until
   the debugger resumes the machine. */
static void cpu_handle_guest_debug(CPUArchState *env)
{
    CPUState *cpu = ENV_GET_CPU(env);

    gdb_set_stop_cpu(env);
    qemu_system_debug_request();
    cpu->stopped = true;
}
 472
/* SIG_IPI handler for the TCG thread: force the currently executing
   CPU (if any) out of the translated-code loop. */
static void cpu_signal(int sig)
{
    if (cpu_single_env) {
        cpu_exit(cpu_single_env);
    }
    exit_request = 1;
}
 480
 481#ifdef CONFIG_LINUX
/* Re-deliver SIGBUS with the default (fatal) disposition.  Used when
   KVM cannot recover from a hardware memory error. */
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        /* Unblock so the pending default-action SIGBUS kills us. */
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        sigprocmask(SIG_UNBLOCK, &set, NULL);
    }
    /* Only reached if re-raising failed to terminate the process. */
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}
 498
/* signalfd-style SIGBUS handler for the iothread: hand the faulting
   address to KVM; if it cannot handle it, die with the default action. */
static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
                           void *ctx)
{
    if (kvm_on_sigbus(siginfo->ssi_code,
                      (void *)(intptr_t)siginfo->ssi_addr)) {
        sigbus_reraise();
    }
}
 507
/* Install the SIGBUS handler and ask the kernel to deliver machine
   check errors early (while pages are still recoverable). */
static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    /* Cast: sigbus_handler takes qemu_signalfd_siginfo, compatible layout. */
    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}
 519
/* Drain pending SIG_IPI/SIGBUS for a KVM VCPU thread without blocking.
   SIGBUS is forwarded to KVM; SIG_IPI is simply consumed.  Loops until
   neither signal remains pending. */
static void qemu_kvm_eat_signals(CPUState *cpu)
{
    struct timespec ts = { 0, 0 };   /* zero timeout: poll, don't wait */
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;
    int r;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
    sigaddset(&waitset, SIGBUS);

    do {
        r = sigtimedwait(&waitset, &siginfo, &ts);
        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
            perror("sigtimedwait");
            exit(1);
        }

        switch (r) {
        case SIGBUS:
            if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
                sigbus_reraise();
            }
            break;
        default:
            /* SIG_IPI (just a kick), EAGAIN, or EINTR: nothing to do. */
            break;
        }

        r = sigpending(&chkset);
        if (r == -1) {
            perror("sigpending");
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
}
 556
 557#else /* !CONFIG_LINUX */
 558
/* No SIGBUS/MCE handling outside Linux. */
static void qemu_init_sigbus(void)
{
}
 562
/* Nothing to drain outside Linux. */
static void qemu_kvm_eat_signals(CPUState *cpu)
{
}
 566#endif /* !CONFIG_LINUX */
 567
 568#ifndef _WIN32
/* No-op handler: SIG_IPI must be caught (not ignored) so it interrupts
   KVM_RUN via the signal mask set below. */
static void dummy_signal(int sig)
{
}
 572
/* Configure per-thread signals for a KVM VCPU: catch SIG_IPI with a
   no-op handler and tell KVM to unblock SIG_IPI/SIGBUS only while the
   VCPU is inside KVM_RUN. */
static void qemu_kvm_init_cpu_signals(CPUArchState *env)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = dummy_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    /* Start from the thread's current mask, minus SIG_IPI and SIGBUS. */
    pthread_sigmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    sigdelset(&set, SIGBUS);
    r = kvm_set_signal_mask(env, &set);
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}
 592
/* Configure SIG_IPI for the TCG thread: cpu_signal() kicks the
   executing CPU out of the translation loop. */
static void qemu_tcg_init_cpu_signals(void)
{
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = cpu_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    sigemptyset(&set);
    sigaddset(&set, SIG_IPI);
    pthread_sigmask(SIG_UNBLOCK, &set, NULL);
}
 606
 607#else /* _WIN32 */
/* KVM is not available on Windows; reaching this is a logic error. */
static void qemu_kvm_init_cpu_signals(CPUArchState *env)
{
    abort();
}
 612
/* Windows kicks CPUs via SuspendThread/ResumeThread, not signals. */
static void qemu_tcg_init_cpu_signals(void)
{
}
 616#endif /* _WIN32 */
 617
/* Big QEMU lock: serializes the iothread and all VCPU threads. */
static QemuMutex qemu_global_mutex;
/* Broadcast when the iothread has finished acquiring/releasing the lock. */
static QemuCond qemu_io_proceeded_cond;
/* Set while the iothread is contending for the global mutex. */
static bool iothread_requesting_mutex;

static QemuThread io_thread;

/* TCG runs every VCPU in one shared thread with a shared halt cond. */
static QemuThread *tcg_cpu_thread;
static QemuCond *tcg_halt_cond;

/* cpu creation */
static QemuCond qemu_cpu_cond;
/* system init */
static QemuCond qemu_pause_cond;
static QemuCond qemu_work_cond;
 632
 633void qemu_init_cpu_loop(void)
 634{
 635    qemu_init_sigbus();
 636    qemu_cond_init(&qemu_cpu_cond);
 637    qemu_cond_init(&qemu_pause_cond);
 638    qemu_cond_init(&qemu_work_cond);
 639    qemu_cond_init(&qemu_io_proceeded_cond);
 640    qemu_mutex_init(&qemu_global_mutex);
 641
 642    qemu_thread_get_self(&io_thread);
 643}
 644
/* Run func(data) on the given CPU's thread and wait for completion.
   Called with the global mutex held.  The work item lives on this
   stack frame; it is safe because we block until wi.done is set. */
void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
{
    struct qemu_work_item wi;

    /* Already on the target thread: just call it. */
    if (qemu_cpu_is_self(cpu)) {
        func(data);
        return;
    }

    wi.func = func;
    wi.data = data;
    if (cpu->queued_work_first == NULL) {
        cpu->queued_work_first = &wi;
    } else {
        cpu->queued_work_last->next = &wi;
    }
    cpu->queued_work_last = &wi;
    wi.next = NULL;
    wi.done = false;

    qemu_cpu_kick(cpu);
    while (!wi.done) {
        /* cpu_single_env may be clobbered while we sleep; restore it. */
        CPUArchState *self_env = cpu_single_env;

        qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
        cpu_single_env = self_env;
    }
}
 673
 674static void flush_queued_work(CPUState *cpu)
 675{
 676    struct qemu_work_item *wi;
 677
 678    if (cpu->queued_work_first == NULL) {
 679        return;
 680    }
 681
 682    while ((wi = cpu->queued_work_first)) {
 683        cpu->queued_work_first = wi->next;
 684        wi->func(wi->data);
 685        wi->done = true;
 686    }
 687    cpu->queued_work_last = NULL;
 688    qemu_cond_broadcast(&qemu_work_cond);
 689}
 690
/* Per-CPU housekeeping after waking from the halt condvar: acknowledge
   stop requests, run queued work, and clear the kick flag. */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    if (cpu->stop) {
        cpu->stop = false;
        cpu->stopped = true;
        /* Tell pause_all_vcpus() this CPU is now parked. */
        qemu_cond_signal(&qemu_pause_cond);
    }
    flush_queued_work(cpu);
    cpu->thread_kicked = false;
}
 701
/* TCG thread idle loop: sleep while every CPU is idle, yield to the
   iothread if it wants the lock, then do per-CPU housekeeping. */
static void qemu_tcg_wait_io_event(void)
{
    CPUArchState *env;

    while (all_cpu_threads_idle()) {
       /* Start accounting real time to the virtual clock if the CPUs
          are idle.  */
        qemu_clock_warp(vm_clock);
        qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);
    }

    while (iothread_requesting_mutex) {
        qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
    }

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        qemu_wait_io_event_common(ENV_GET_CPU(env));
    }
}
 721
/* KVM VCPU idle loop: sleep on this CPU's halt condvar while idle,
   then drain pending signals and do per-CPU housekeeping. */
static void qemu_kvm_wait_io_event(CPUArchState *env)
{
    CPUState *cpu = ENV_GET_CPU(env);

    while (cpu_thread_is_idle(env)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    qemu_kvm_eat_signals(cpu);
    qemu_wait_io_event_common(cpu);
}
 733
/* Thread body for one KVM VCPU: initialize the vcpu, announce creation,
   then loop executing guest code and servicing I/O events forever. */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUArchState *env = arg;
    CPUState *cpu = ENV_GET_CPU(env);
    int r;

    qemu_mutex_lock(&qemu_global_mutex);
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu_single_env = env;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    qemu_kvm_init_cpu_signals(env);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    while (1) {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(env);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(env);
            }
        }
        qemu_kvm_wait_io_event(env);
    }

    /* Not reached. */
    return NULL;
}
 769
/* Thread body for a "dummy" VCPU (qtest: no accelerator executes guest
   code).  The thread just sleeps in sigwait() for SIG_IPI kicks and
   services queued work. */
static void *qemu_dummy_cpu_thread_fn(void *arg)
{
#ifdef _WIN32
    fprintf(stderr, "qtest is not supported under Windows\n");
    exit(1);
#else
    CPUArchState *env = arg;
    CPUState *cpu = ENV_GET_CPU(env);
    sigset_t waitset;
    int r;

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    cpu_single_env = env;
    while (1) {
        /* Drop the lock (and cpu_single_env) while blocked in sigwait. */
        cpu_single_env = NULL;
        qemu_mutex_unlock_iothread();
        do {
            int sig;
            r = sigwait(&waitset, &sig);
        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
        if (r == -1) {
            perror("sigwait");
            exit(1);
        }
        qemu_mutex_lock_iothread();
        cpu_single_env = env;
        qemu_wait_io_event_common(cpu);
    }

    /* Not reached. */
    return NULL;
#endif
}
 812
 813static void tcg_exec_all(void);
 814
/* Thread body for the single TCG thread, which round-robins all CPUs:
   announce creation of every CPU, wait for machine start, then loop
   executing guest code and servicing I/O events forever. */
static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    CPUArchState *env;

    qemu_tcg_init_cpu_signals();
    qemu_thread_get_self(cpu->thread);

    /* signal CPU creation */
    qemu_mutex_lock(&qemu_global_mutex);
    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        cpu = ENV_GET_CPU(env);
        cpu->thread_id = qemu_get_thread_id();
        cpu->created = true;
    }
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (ENV_GET_CPU(first_cpu)->stopped) {
        qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);

        /* process any pending work */
        for (env = first_cpu; env != NULL; env = env->next_cpu) {
            qemu_wait_io_event_common(ENV_GET_CPU(env));
        }
    }

    while (1) {
        tcg_exec_all();
        /* In icount mode, wake the iothread when a deadline has passed. */
        if (use_icount && qemu_clock_deadline(vm_clock) <= 0) {
            qemu_notify_event();
        }
        qemu_tcg_wait_io_event();
    }

    /* Not reached. */
    return NULL;
}
 852
/* Interrupt a VCPU thread: SIG_IPI on POSIX; on Windows, suspend the
   thread, raise the exit request, and resume it. */
static void qemu_cpu_kick_thread(CPUState *cpu)
{
#ifndef _WIN32
    int err;

    err = pthread_kill(cpu->thread->thread, SIG_IPI);
    if (err) {
        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
        exit(1);
    }
#else /* _WIN32 */
    if (!qemu_cpu_is_self(cpu)) {
        SuspendThread(cpu->hThread);
        cpu_signal(0);
        ResumeThread(cpu->hThread);
    }
#endif
}
 871
/* Wake a CPU: broadcast its halt condvar and, for non-TCG, interrupt
   the thread itself (at most once until the kick is acknowledged). */
void qemu_cpu_kick(CPUState *cpu)
{
    qemu_cond_broadcast(cpu->halt_cond);
    if (!tcg_enabled() && !cpu->thread_kicked) {
        qemu_cpu_kick_thread(cpu);
        cpu->thread_kicked = true;
    }
}
 880
/* Kick the currently executing CPU (must be called from a VCPU thread).
   Not supported on Windows. */
void qemu_cpu_kick_self(void)
{
#ifndef _WIN32
    assert(cpu_single_env);
    CPUState *cpu_single_cpu = ENV_GET_CPU(cpu_single_env);

    if (!cpu_single_cpu->thread_kicked) {
        qemu_cpu_kick_thread(cpu_single_cpu);
        cpu_single_cpu->thread_kicked = true;
    }
#else
    abort();
#endif
}
 895
 896bool qemu_cpu_is_self(CPUState *cpu)
 897{
 898    return qemu_thread_is_self(cpu->thread);
 899}
 900
 901static bool qemu_in_vcpu_thread(void)
 902{
 903    return cpu_single_env && qemu_cpu_is_self(ENV_GET_CPU(cpu_single_env));
 904}
 905
/* Acquire the big QEMU lock from the iothread.  With TCG, if the lock
   is contended, kick the TCG thread out of the execution loop so it
   releases the lock promptly, and flag the contention so it yields. */
void qemu_mutex_lock_iothread(void)
{
    if (!tcg_enabled()) {
        qemu_mutex_lock(&qemu_global_mutex);
    } else {
        iothread_requesting_mutex = true;
        if (qemu_mutex_trylock(&qemu_global_mutex)) {
            qemu_cpu_kick_thread(ENV_GET_CPU(first_cpu));
            qemu_mutex_lock(&qemu_global_mutex);
        }
        iothread_requesting_mutex = false;
        qemu_cond_broadcast(&qemu_io_proceeded_cond);
    }
}
 920
/* Release the big QEMU lock. */
void qemu_mutex_unlock_iothread(void)
{
    qemu_mutex_unlock(&qemu_global_mutex);
}
 925
 926static int all_vcpus_paused(void)
 927{
 928    CPUArchState *penv = first_cpu;
 929
 930    while (penv) {
 931        CPUState *pcpu = ENV_GET_CPU(penv);
 932        if (!pcpu->stopped) {
 933            return 0;
 934        }
 935        penv = penv->next_cpu;
 936    }
 937
 938    return 1;
 939}
 940
 941void pause_all_vcpus(void)
 942{
 943    CPUArchState *penv = first_cpu;
 944
 945    qemu_clock_enable(vm_clock, false);
 946    while (penv) {
 947        CPUState *pcpu = ENV_GET_CPU(penv);
 948        pcpu->stop = true;
 949        qemu_cpu_kick(pcpu);
 950        penv = penv->next_cpu;
 951    }
 952
 953    if (qemu_in_vcpu_thread()) {
 954        cpu_stop_current();
 955        if (!kvm_enabled()) {
 956            while (penv) {
 957                CPUState *pcpu = ENV_GET_CPU(penv);
 958                pcpu->stop = 0;
 959                pcpu->stopped = true;
 960                penv = penv->next_cpu;
 961            }
 962            return;
 963        }
 964    }
 965
 966    while (!all_vcpus_paused()) {
 967        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
 968        penv = first_cpu;
 969        while (penv) {
 970            qemu_cpu_kick(ENV_GET_CPU(penv));
 971            penv = penv->next_cpu;
 972        }
 973    }
 974}
 975
 976void resume_all_vcpus(void)
 977{
 978    CPUArchState *penv = first_cpu;
 979
 980    qemu_clock_enable(vm_clock, true);
 981    while (penv) {
 982        CPUState *pcpu = ENV_GET_CPU(penv);
 983        pcpu->stop = false;
 984        pcpu->stopped = false;
 985        qemu_cpu_kick(pcpu);
 986        penv = penv->next_cpu;
 987    }
 988}
 989
/* Attach a CPU to the TCG execution thread, creating that thread (and
   its shared halt condvar) on first use. */
static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    /* share a single thread for all cpus with TCG */
    if (!tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);
        tcg_halt_cond = cpu->halt_cond;
        qemu_thread_create(cpu->thread, qemu_tcg_cpu_thread_fn, cpu,
                           QEMU_THREAD_JOINABLE);
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
        /* Wait until the thread has announced itself. */
        while (!cpu->created) {
            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
        }
        tcg_cpu_thread = cpu->thread;
    } else {
        /* Subsequent CPUs reuse the existing thread and condvar. */
        cpu->thread = tcg_cpu_thread;
        cpu->halt_cond = tcg_halt_cond;
    }
}
1012
/* Create a dedicated thread for one KVM VCPU and wait until it has
   announced itself. */
static void qemu_kvm_start_vcpu(CPUArchState *env)
{
    CPUState *cpu = ENV_GET_CPU(env);

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    qemu_thread_create(cpu->thread, qemu_kvm_cpu_thread_fn, env,
                       QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}
1026
/* Create a dummy VCPU thread (qtest mode) and wait until it has
   announced itself. */
static void qemu_dummy_start_vcpu(CPUArchState *env)
{
    CPUState *cpu = ENV_GET_CPU(env);

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    qemu_thread_create(cpu->thread, qemu_dummy_cpu_thread_fn, env,
                       QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}
1040
/* Initialize a VCPU and start its execution thread via whichever
   accelerator is active (KVM, TCG, or the qtest dummy). */
void qemu_init_vcpu(void *_env)
{
    CPUArchState *env = _env;
    CPUState *cpu = ENV_GET_CPU(env);

    cpu->nr_cores = smp_cores;
    cpu->nr_threads = smp_threads;
    /* Start parked; resume_all_vcpus() releases it after machine init. */
    cpu->stopped = true;
    if (kvm_enabled()) {
        qemu_kvm_start_vcpu(env);
    } else if (tcg_enabled()) {
        qemu_tcg_init_vcpu(cpu);
    } else {
        qemu_dummy_start_vcpu(env);
    }
}
1057
/* Park the currently executing CPU (no-op outside a VCPU thread) and
   notify waiters on the pause condvar. */
void cpu_stop_current(void)
{
    if (cpu_single_env) {
        CPUState *cpu_single_cpu = ENV_GET_CPU(cpu_single_env);
        cpu_single_cpu->stop = false;
        cpu_single_cpu->stopped = true;
        cpu_exit(cpu_single_env);
        qemu_cond_signal(&qemu_pause_cond);
    }
}
1068
1069void vm_stop(RunState state)
1070{
1071    if (qemu_in_vcpu_thread()) {
1072        qemu_system_vmstop_request(state);
1073        /*
1074         * FIXME: should not return to device code in case
1075         * vm_stop() has been requested.
1076         */
1077        cpu_stop_current();
1078        return;
1079    }
1080    do_vm_stop(state);
1081}
1082
1083/* does a state transition even if the VM is already stopped,
1084   current state is forgotten forever */
1085void vm_stop_force_state(RunState state)
1086{
1087    if (runstate_is_running()) {
1088        vm_stop(state);
1089    } else {
1090        runstate_set(state);
1091    }
1092}
1093
/* Run one TCG execution slice for @env and return the cpu_exec() exit
 * reason (e.g. EXCP_DEBUG).
 *
 * With -icount enabled, the global instruction counter qemu_icount is
 * kept consistent across the slice: the leftover per-CPU budget is
 * subtracted before refilling it from the next vm_clock deadline, and
 * any unexecuted budget is folded back after cpu_exec() returns.
 */
static int tcg_cpu_exec(CPUArchState *env)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    if (use_icount) {
        int64_t count;
        int decr;
        /* Remove whatever budget is still pending from the previous
         * slice before computing a fresh one.  */
        qemu_icount -= (env->icount_decr.u16.low + env->icount_extra);
        env->icount_decr.u16.low = 0;
        env->icount_extra = 0;
        /* Budget enough instructions to reach the next timer deadline.  */
        count = qemu_icount_round(qemu_clock_deadline(vm_clock));
        qemu_icount += count;
        /* u16.low can hold at most 0xffff; the remainder is tracked in
         * icount_extra and refilled by the translated code.  */
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
        env->icount_decr.u16.low = decr;
        env->icount_extra = count;
    }
    ret = cpu_exec(env);
#ifdef CONFIG_PROFILER
    qemu_time += profile_getclock() - ti;
#endif
    if (use_icount) {
        /* Fold pending instructions back into the
           instruction counter, and clear the interrupt flag.  */
        qemu_icount -= (env->icount_decr.u16.low
                        + env->icount_extra);
        /* Writing u32 (not u16.low) also clears the high-half interrupt
         * request flag in the same union.  */
        env->icount_decr.u32 = 0;
        env->icount_extra = 0;
    }
    return ret;
}
1131
/* Give every runnable vcpu one TCG execution slice, round-robin.
 *
 * Iteration resumes from the global next_cpu, so a vcpu that forced an
 * early exit last time does not starve the others.  The loop stops
 * early on a debug exception, on a stop(ped) vcpu, or when
 * exit_request is raised by another thread.
 */
static void tcg_exec_all(void)
{
    int r;

    /* Account partial waits to the vm_clock.  */
    qemu_clock_warp(vm_clock);

    if (next_cpu == NULL) {
        next_cpu = first_cpu;
    }
    for (; next_cpu != NULL && !exit_request; next_cpu = next_cpu->next_cpu) {
        CPUArchState *env = next_cpu;
        CPUState *cpu = ENV_GET_CPU(env);

        /* Disable vm_clock while single-stepping with SSTEP_NOTIMER so
         * timers do not fire between steps.  */
        qemu_clock_enable(vm_clock,
                          (env->singlestep_enabled & SSTEP_NOTIMER) == 0);

        if (cpu_can_run(cpu)) {
            r = tcg_cpu_exec(env);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(env);
                break;
            }
        } else if (cpu->stop || cpu->stopped) {
            break;
        }
    }
    exit_request = 0;
}
1161
1162void set_numa_modes(void)
1163{
1164    CPUArchState *env;
1165    CPUState *cpu;
1166    int i;
1167
1168    for (env = first_cpu; env != NULL; env = env->next_cpu) {
1169        cpu = ENV_GET_CPU(env);
1170        for (i = 0; i < nb_numa_nodes; i++) {
1171            if (test_bit(cpu->cpu_index, node_cpumask[i])) {
1172                cpu->numa_node = i;
1173            }
1174        }
1175    }
1176}
1177
1178void set_cpu_log(const char *optarg)
1179{
1180    int mask;
1181    const CPULogItem *item;
1182
1183    mask = cpu_str_to_log_mask(optarg);
1184    if (!mask) {
1185        printf("Log items (comma separated):\n");
1186        for (item = cpu_log_items; item->mask != 0; item++) {
1187            printf("%-10s %s\n", item->name, item->help);
1188        }
1189        exit(1);
1190    }
1191    cpu_set_log(mask);
1192}
1193
/* Redirect the CPU log to the file named by @optarg; thin wrapper
 * around cpu_set_log_filename().  */
void set_cpu_log_filename(const char *optarg)
{
    cpu_set_log_filename(optarg);
}
1198
/* Print the target's supported CPU models to @f using @cpu_fprintf.
 * No-op for targets that do not define cpu_list(); @optarg is unused
 * here (targets with a cpu_list take selection criteria elsewhere). */
void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#if defined(cpu_list)
    cpu_list(f, cpu_fprintf);
#endif
}
1206
/* QMP 'query-cpus' handler: build a CpuInfoList entry for every vcpu.
 *
 * Each CPU's state is synchronized from the accelerator first so the
 * reported registers are current.  The program-counter field name is
 * target-specific (pc/nip/npc/PC), selected at compile time below.
 * Caller owns the returned list.
 */
CpuInfoList *qmp_query_cpus(Error **errp)
{
    CpuInfoList *head = NULL, *cur_item = NULL;
    CPUArchState *env;

    for (env = first_cpu; env != NULL; env = env->next_cpu) {
        CPUState *cpu = ENV_GET_CPU(env);
        CpuInfoList *info;

        /* Pull the latest register state in from KVM/TCG.  */
        cpu_synchronize_state(env);

        info = g_malloc0(sizeof(*info));
        info->value = g_malloc0(sizeof(*info->value));
        info->value->CPU = cpu->cpu_index;
        /* "current" marks the first CPU in the list, not the one the
         * monitor happens to run on.  */
        info->value->current = (env == first_cpu);
        info->value->halted = env->halted;
        info->value->thread_id = cpu->thread_id;
#if defined(TARGET_I386)
        info->value->has_pc = true;
        info->value->pc = env->eip + env->segs[R_CS].base;
#elif defined(TARGET_PPC)
        info->value->has_nip = true;
        info->value->nip = env->nip;
#elif defined(TARGET_SPARC)
        info->value->has_pc = true;
        info->value->pc = env->pc;
        info->value->has_npc = true;
        info->value->npc = env->npc;
#elif defined(TARGET_MIPS)
        info->value->has_PC = true;
        info->value->PC = env->active_tc.PC;
#endif

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}
1251
1252void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1253                 bool has_cpu, int64_t cpu_index, Error **errp)
1254{
1255    FILE *f;
1256    uint32_t l;
1257    CPUArchState *env;
1258    CPUState *cpu;
1259    uint8_t buf[1024];
1260
1261    if (!has_cpu) {
1262        cpu_index = 0;
1263    }
1264
1265    for (env = first_cpu; env; env = env->next_cpu) {
1266        cpu = ENV_GET_CPU(env);
1267        if (cpu_index == cpu->cpu_index) {
1268            break;
1269        }
1270    }
1271
1272    if (env == NULL) {
1273        error_set(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1274                  "a CPU number");
1275        return;
1276    }
1277
1278    f = fopen(filename, "wb");
1279    if (!f) {
1280        error_set(errp, QERR_OPEN_FILE_FAILED, filename);
1281        return;
1282    }
1283
1284    while (size != 0) {
1285        l = sizeof(buf);
1286        if (l > size)
1287            l = size;
1288        cpu_memory_rw_debug(env, addr, buf, l, 0);
1289        if (fwrite(buf, 1, l, f) != l) {
1290            error_set(errp, QERR_IO_ERROR);
1291            goto exit;
1292        }
1293        addr += l;
1294        size -= l;
1295    }
1296
1297exit:
1298    fclose(f);
1299}
1300
1301void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1302                  Error **errp)
1303{
1304    FILE *f;
1305    uint32_t l;
1306    uint8_t buf[1024];
1307
1308    f = fopen(filename, "wb");
1309    if (!f) {
1310        error_set(errp, QERR_OPEN_FILE_FAILED, filename);
1311        return;
1312    }
1313
1314    while (size != 0) {
1315        l = sizeof(buf);
1316        if (l > size)
1317            l = size;
1318        cpu_physical_memory_rw(addr, buf, l, 0);
1319        if (fwrite(buf, 1, l, f) != l) {
1320            error_set(errp, QERR_IO_ERROR);
1321            goto exit;
1322        }
1323        addr += l;
1324        size -= l;
1325    }
1326
1327exit:
1328    fclose(f);
1329}
1330
1331void qmp_inject_nmi(Error **errp)
1332{
1333#if defined(TARGET_I386)
1334    CPUArchState *env;
1335
1336    for (env = first_cpu; env != NULL; env = env->next_cpu) {
1337        if (!env->apic_state) {
1338            cpu_interrupt(env, CPU_INTERRUPT_NMI);
1339        } else {
1340            apic_deliver_nmi(env->apic_state);
1341        }
1342    }
1343#else
1344    error_set(errp, QERR_UNSUPPORTED);
1345#endif
1346}
1347