qemu/cpus.c
<<
>>
Prefs
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25/* Needed early for CONFIG_BSD etc. */
  26#include "config-host.h"
  27
  28#include "monitor/monitor.h"
  29#include "sysemu/sysemu.h"
  30#include "exec/gdbstub.h"
  31#include "sysemu/dma.h"
  32#include "sysemu/kvm.h"
  33#include "qmp-commands.h"
  34
  35#include "qemu/thread.h"
  36#include "sysemu/cpus.h"
  37#include "sysemu/qtest.h"
  38#include "qemu/main-loop.h"
  39#include "qemu/bitmap.h"
  40
  41#ifndef _WIN32
  42#include "qemu/compatfd.h"
  43#endif
  44
  45#ifdef CONFIG_LINUX
  46
  47#include <sys/prctl.h>
  48
  49#ifndef PR_MCE_KILL
  50#define PR_MCE_KILL 33
  51#endif
  52
  53#ifndef PR_MCE_KILL_SET
  54#define PR_MCE_KILL_SET 1
  55#endif
  56
  57#ifndef PR_MCE_KILL_EARLY
  58#define PR_MCE_KILL_EARLY 1
  59#endif
  60
  61#endif /* CONFIG_LINUX */
  62
  63static CPUState *next_cpu;
  64
  65static bool cpu_thread_is_idle(CPUState *cpu)
  66{
  67    if (cpu->stop || cpu->queued_work_first) {
  68        return false;
  69    }
  70    if (cpu->stopped || !runstate_is_running()) {
  71        return true;
  72    }
  73    if (!cpu->halted || qemu_cpu_has_work(cpu) ||
  74        kvm_halt_in_kernel()) {
  75        return false;
  76    }
  77    return true;
  78}
  79
  80static bool all_cpu_threads_idle(void)
  81{
  82    CPUState *cpu;
  83
  84    for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) {
  85        if (!cpu_thread_is_idle(cpu)) {
  86            return false;
  87        }
  88    }
  89    return true;
  90}
  91
  92/***********************************************************/
  93/* guest cycle counter */
  94
  95/* Conversion factor from emulated instructions to virtual clock ticks.  */
  96static int icount_time_shift;
  97/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
  98#define MAX_ICOUNT_SHIFT 10
  99/* Compensate for varying guest execution speed.  */
 100static int64_t qemu_icount_bias;
 101static QEMUTimer *icount_rt_timer;
 102static QEMUTimer *icount_vm_timer;
 103static QEMUTimer *icount_warp_timer;
 104static int64_t vm_clock_warp_start;
 105static int64_t qemu_icount;
 106
 107typedef struct TimersState {
 108    int64_t cpu_ticks_prev;
 109    int64_t cpu_ticks_offset;
 110    int64_t cpu_clock_offset;
 111    int32_t cpu_ticks_enabled;
 112    int64_t dummy;
 113} TimersState;
 114
 115static TimersState timers_state;
 116
 117/* Return the virtual CPU time, based on the instruction counter.  */
 118int64_t cpu_get_icount(void)
 119{
 120    int64_t icount;
 121    CPUState *cpu = current_cpu;
 122
 123    icount = qemu_icount;
 124    if (cpu) {
 125        CPUArchState *env = cpu->env_ptr;
 126        if (!can_do_io(env)) {
 127            fprintf(stderr, "Bad clock read\n");
 128        }
 129        icount -= (env->icount_decr.u16.low + env->icount_extra);
 130    }
 131    return qemu_icount_bias + (icount << icount_time_shift);
 132}
 133
 134/* return the host CPU cycle counter and handle stop/restart */
 135int64_t cpu_get_ticks(void)
 136{
 137    if (use_icount) {
 138        return cpu_get_icount();
 139    }
 140    if (!timers_state.cpu_ticks_enabled) {
 141        return timers_state.cpu_ticks_offset;
 142    } else {
 143        int64_t ticks;
 144        ticks = cpu_get_real_ticks();
 145        if (timers_state.cpu_ticks_prev > ticks) {
 146            /* Note: non increasing ticks may happen if the host uses
 147               software suspend */
 148            timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 149        }
 150        timers_state.cpu_ticks_prev = ticks;
 151        return ticks + timers_state.cpu_ticks_offset;
 152    }
 153}
 154
 155/* return the host CPU monotonic timer and handle stop/restart */
 156int64_t cpu_get_clock(void)
 157{
 158    int64_t ti;
 159    if (!timers_state.cpu_ticks_enabled) {
 160        return timers_state.cpu_clock_offset;
 161    } else {
 162        ti = get_clock();
 163        return ti + timers_state.cpu_clock_offset;
 164    }
 165}
 166
 167/* enable cpu_get_ticks() */
 168void cpu_enable_ticks(void)
 169{
 170    if (!timers_state.cpu_ticks_enabled) {
 171        timers_state.cpu_ticks_offset -= cpu_get_real_ticks();
 172        timers_state.cpu_clock_offset -= get_clock();
 173        timers_state.cpu_ticks_enabled = 1;
 174    }
 175}
 176
 177/* disable cpu_get_ticks() : the clock is stopped. You must not call
 178   cpu_get_ticks() after that.  */
 179void cpu_disable_ticks(void)
 180{
 181    if (timers_state.cpu_ticks_enabled) {
 182        timers_state.cpu_ticks_offset = cpu_get_ticks();
 183        timers_state.cpu_clock_offset = cpu_get_clock();
 184        timers_state.cpu_ticks_enabled = 0;
 185    }
 186}
 187
 188/* Correlation between real and virtual time is always going to be
 189   fairly approximate, so ignore small variation.
 190   When the guest is idle real and virtual time will be aligned in
 191   the IO wait loop.  */
 192#define ICOUNT_WOBBLE (get_ticks_per_sec() / 10)
 193
 194static void icount_adjust(void)
 195{
 196    int64_t cur_time;
 197    int64_t cur_icount;
 198    int64_t delta;
 199    static int64_t last_delta;
 200    /* If the VM is not running, then do nothing.  */
 201    if (!runstate_is_running()) {
 202        return;
 203    }
 204    cur_time = cpu_get_clock();
 205    cur_icount = qemu_get_clock_ns(vm_clock);
 206    delta = cur_icount - cur_time;
 207    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 208    if (delta > 0
 209        && last_delta + ICOUNT_WOBBLE < delta * 2
 210        && icount_time_shift > 0) {
 211        /* The guest is getting too far ahead.  Slow time down.  */
 212        icount_time_shift--;
 213    }
 214    if (delta < 0
 215        && last_delta - ICOUNT_WOBBLE > delta * 2
 216        && icount_time_shift < MAX_ICOUNT_SHIFT) {
 217        /* The guest is getting too far behind.  Speed time up.  */
 218        icount_time_shift++;
 219    }
 220    last_delta = delta;
 221    qemu_icount_bias = cur_icount - (qemu_icount << icount_time_shift);
 222}
 223
 224static void icount_adjust_rt(void *opaque)
 225{
 226    qemu_mod_timer(icount_rt_timer,
 227                   qemu_get_clock_ms(rt_clock) + 1000);
 228    icount_adjust();
 229}
 230
 231static void icount_adjust_vm(void *opaque)
 232{
 233    qemu_mod_timer(icount_vm_timer,
 234                   qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 10);
 235    icount_adjust();
 236}
 237
 238static int64_t qemu_icount_round(int64_t count)
 239{
 240    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
 241}
 242
 243static void icount_warp_rt(void *opaque)
 244{
 245    if (vm_clock_warp_start == -1) {
 246        return;
 247    }
 248
 249    if (runstate_is_running()) {
 250        int64_t clock = qemu_get_clock_ns(rt_clock);
 251        int64_t warp_delta = clock - vm_clock_warp_start;
 252        if (use_icount == 1) {
 253            qemu_icount_bias += warp_delta;
 254        } else {
 255            /*
 256             * In adaptive mode, do not let the vm_clock run too
 257             * far ahead of real time.
 258             */
 259            int64_t cur_time = cpu_get_clock();
 260            int64_t cur_icount = qemu_get_clock_ns(vm_clock);
 261            int64_t delta = cur_time - cur_icount;
 262            qemu_icount_bias += MIN(warp_delta, delta);
 263        }
 264        if (qemu_clock_expired(vm_clock)) {
 265            qemu_notify_event();
 266        }
 267    }
 268    vm_clock_warp_start = -1;
 269}
 270
 271void qtest_clock_warp(int64_t dest)
 272{
 273    int64_t clock = qemu_get_clock_ns(vm_clock);
 274    assert(qtest_enabled());
 275    while (clock < dest) {
 276        int64_t deadline = qemu_clock_deadline(vm_clock);
 277        int64_t warp = MIN(dest - clock, deadline);
 278        qemu_icount_bias += warp;
 279        qemu_run_timers(vm_clock);
 280        clock = qemu_get_clock_ns(vm_clock);
 281    }
 282    qemu_notify_event();
 283}
 284
 285void qemu_clock_warp(QEMUClock *clock)
 286{
 287    int64_t deadline;
 288
 289    /*
 290     * There are too many global variables to make the "warp" behavior
 291     * applicable to other clocks.  But a clock argument removes the
 292     * need for if statements all over the place.
 293     */
 294    if (clock != vm_clock || !use_icount) {
 295        return;
 296    }
 297
 298    /*
 299     * If the CPUs have been sleeping, advance the vm_clock timer now.  This
 300     * ensures that the deadline for the timer is computed correctly below.
 301     * This also makes sure that the insn counter is synchronized before the
 302     * CPU starts running, in case the CPU is woken by an event other than
 303     * the earliest vm_clock timer.
 304     */
 305    icount_warp_rt(NULL);
 306    if (!all_cpu_threads_idle() || !qemu_clock_has_timers(vm_clock)) {
 307        qemu_del_timer(icount_warp_timer);
 308        return;
 309    }
 310
 311    if (qtest_enabled()) {
 312        /* When testing, qtest commands advance icount.  */
 313        return;
 314    }
 315
 316    vm_clock_warp_start = qemu_get_clock_ns(rt_clock);
 317    deadline = qemu_clock_deadline(vm_clock);
 318    if (deadline > 0) {
 319        /*
 320         * Ensure the vm_clock proceeds even when the virtual CPU goes to
 321         * sleep.  Otherwise, the CPU might be waiting for a future timer
 322         * interrupt to wake it up, but the interrupt never comes because
 323         * the vCPU isn't running any insns and thus doesn't advance the
 324         * vm_clock.
 325         *
 326         * An extreme solution for this problem would be to never let VCPUs
 327         * sleep in icount mode if there is a pending vm_clock timer; rather
 328         * time could just advance to the next vm_clock event.  Instead, we
 329         * do stop VCPUs and only advance vm_clock after some "real" time,
 330         * (related to the time left until the next event) has passed.  This
 331         * rt_clock timer will do this.  This avoids that the warps are too
 332         * visible externally---for example, you will not be sending network
 333         * packets continuously instead of every 100ms.
 334         */
 335        qemu_mod_timer(icount_warp_timer, vm_clock_warp_start + deadline);
 336    } else {
 337        qemu_notify_event();
 338    }
 339}
 340
 341static const VMStateDescription vmstate_timers = {
 342    .name = "timer",
 343    .version_id = 2,
 344    .minimum_version_id = 1,
 345    .minimum_version_id_old = 1,
 346    .fields      = (VMStateField[]) {
 347        VMSTATE_INT64(cpu_ticks_offset, TimersState),
 348        VMSTATE_INT64(dummy, TimersState),
 349        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
 350        VMSTATE_END_OF_LIST()
 351    }
 352};
 353
 354void configure_icount(const char *option)
 355{
 356    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 357    if (!option) {
 358        return;
 359    }
 360
 361    icount_warp_timer = qemu_new_timer_ns(rt_clock, icount_warp_rt, NULL);
 362    if (strcmp(option, "auto") != 0) {
 363        icount_time_shift = strtol(option, NULL, 0);
 364        use_icount = 1;
 365        return;
 366    }
 367
 368    use_icount = 2;
 369
 370    /* 125MIPS seems a reasonable initial guess at the guest speed.
 371       It will be corrected fairly quickly anyway.  */
 372    icount_time_shift = 3;
 373
 374    /* Have both realtime and virtual time triggers for speed adjustment.
 375       The realtime trigger catches emulated time passing too slowly,
 376       the virtual time trigger catches emulated time passing too fast.
 377       Realtime triggers occur even when idle, so use them less frequently
 378       than VM triggers.  */
 379    icount_rt_timer = qemu_new_timer_ms(rt_clock, icount_adjust_rt, NULL);
 380    qemu_mod_timer(icount_rt_timer,
 381                   qemu_get_clock_ms(rt_clock) + 1000);
 382    icount_vm_timer = qemu_new_timer_ns(vm_clock, icount_adjust_vm, NULL);
 383    qemu_mod_timer(icount_vm_timer,
 384                   qemu_get_clock_ns(vm_clock) + get_ticks_per_sec() / 10);
 385}
 386
 387/***********************************************************/
 388void hw_error(const char *fmt, ...)
 389{
 390    va_list ap;
 391    CPUState *cpu;
 392
 393    va_start(ap, fmt);
 394    fprintf(stderr, "qemu: hardware error: ");
 395    vfprintf(stderr, fmt, ap);
 396    fprintf(stderr, "\n");
 397    for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) {
 398        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
 399        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
 400    }
 401    va_end(ap);
 402    abort();
 403}
 404
 405void cpu_synchronize_all_states(void)
 406{
 407    CPUState *cpu;
 408
 409    for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) {
 410        cpu_synchronize_state(cpu);
 411    }
 412}
 413
 414void cpu_synchronize_all_post_reset(void)
 415{
 416    CPUState *cpu;
 417
 418    for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) {
 419        cpu_synchronize_post_reset(cpu);
 420    }
 421}
 422
 423void cpu_synchronize_all_post_init(void)
 424{
 425    CPUState *cpu;
 426
 427    for (cpu = first_cpu; cpu; cpu = cpu->next_cpu) {
 428        cpu_synchronize_post_init(cpu);
 429    }
 430}
 431
 432bool cpu_is_stopped(CPUState *cpu)
 433{
 434    return !runstate_is_running() || cpu->stopped;
 435}
 436
 437static int do_vm_stop(RunState state)
 438{
 439    int ret = 0;
 440
 441    if (runstate_is_running()) {
 442        cpu_disable_ticks();
 443        pause_all_vcpus();
 444        runstate_set(state);
 445        vm_state_notify(0, state);
 446        monitor_protocol_event(QEVENT_STOP, NULL);
 447    }
 448
 449    bdrv_drain_all();
 450    ret = bdrv_flush_all();
 451
 452    return ret;
 453}
 454
 455static bool cpu_can_run(CPUState *cpu)
 456{
 457    if (cpu->stop) {
 458        return false;
 459    }
 460    if (cpu->stopped || !runstate_is_running()) {
 461        return false;
 462    }
 463    return true;
 464}
 465
 466static void cpu_handle_guest_debug(CPUState *cpu)
 467{
 468    gdb_set_stop_cpu(cpu);
 469    qemu_system_debug_request();
 470    cpu->stopped = true;
 471}
 472
 473static void cpu_signal(int sig)
 474{
 475    if (current_cpu) {
 476        cpu_exit(current_cpu);
 477    }
 478    exit_request = 1;
 479}
 480
 481#ifdef CONFIG_LINUX
 482static void sigbus_reraise(void)
 483{
 484    sigset_t set;
 485    struct sigaction action;
 486
 487    memset(&action, 0, sizeof(action));
 488    action.sa_handler = SIG_DFL;
 489    if (!sigaction(SIGBUS, &action, NULL)) {
 490        raise(SIGBUS);
 491        sigemptyset(&set);
 492        sigaddset(&set, SIGBUS);
 493        sigprocmask(SIG_UNBLOCK, &set, NULL);
 494    }
 495    perror("Failed to re-raise SIGBUS!\n");
 496    abort();
 497}
 498
 499static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
 500                           void *ctx)
 501{
 502    if (kvm_on_sigbus(siginfo->ssi_code,
 503                      (void *)(intptr_t)siginfo->ssi_addr)) {
 504        sigbus_reraise();
 505    }
 506}
 507
 508static void qemu_init_sigbus(void)
 509{
 510    struct sigaction action;
 511
 512    memset(&action, 0, sizeof(action));
 513    action.sa_flags = SA_SIGINFO;
 514    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
 515    sigaction(SIGBUS, &action, NULL);
 516
 517    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
 518}
 519
 520static void qemu_kvm_eat_signals(CPUState *cpu)
 521{
 522    struct timespec ts = { 0, 0 };
 523    siginfo_t siginfo;
 524    sigset_t waitset;
 525    sigset_t chkset;
 526    int r;
 527
 528    sigemptyset(&waitset);
 529    sigaddset(&waitset, SIG_IPI);
 530    sigaddset(&waitset, SIGBUS);
 531
 532    do {
 533        r = sigtimedwait(&waitset, &siginfo, &ts);
 534        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
 535            perror("sigtimedwait");
 536            exit(1);
 537        }
 538
 539        switch (r) {
 540        case SIGBUS:
 541            if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
 542                sigbus_reraise();
 543            }
 544            break;
 545        default:
 546            break;
 547        }
 548
 549        r = sigpending(&chkset);
 550        if (r == -1) {
 551            perror("sigpending");
 552            exit(1);
 553        }
 554    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
 555}
 556
 557#else /* !CONFIG_LINUX */
 558
 559static void qemu_init_sigbus(void)
 560{
 561}
 562
 563static void qemu_kvm_eat_signals(CPUState *cpu)
 564{
 565}
 566#endif /* !CONFIG_LINUX */
 567
 568#ifndef _WIN32
 569static void dummy_signal(int sig)
 570{
 571}
 572
 573static void qemu_kvm_init_cpu_signals(CPUState *cpu)
 574{
 575    int r;
 576    sigset_t set;
 577    struct sigaction sigact;
 578
 579    memset(&sigact, 0, sizeof(sigact));
 580    sigact.sa_handler = dummy_signal;
 581    sigaction(SIG_IPI, &sigact, NULL);
 582
 583    pthread_sigmask(SIG_BLOCK, NULL, &set);
 584    sigdelset(&set, SIG_IPI);
 585    sigdelset(&set, SIGBUS);
 586    r = kvm_set_signal_mask(cpu, &set);
 587    if (r) {
 588        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
 589        exit(1);
 590    }
 591}
 592
 593static void qemu_tcg_init_cpu_signals(void)
 594{
 595    sigset_t set;
 596    struct sigaction sigact;
 597
 598    memset(&sigact, 0, sizeof(sigact));
 599    sigact.sa_handler = cpu_signal;
 600    sigaction(SIG_IPI, &sigact, NULL);
 601
 602    sigemptyset(&set);
 603    sigaddset(&set, SIG_IPI);
 604    pthread_sigmask(SIG_UNBLOCK, &set, NULL);
 605}
 606
 607#else /* _WIN32 */
 608static void qemu_kvm_init_cpu_signals(CPUState *cpu)
 609{
 610    abort();
 611}
 612
 613static void qemu_tcg_init_cpu_signals(void)
 614{
 615}
 616#endif /* _WIN32 */
 617
 618static QemuMutex qemu_global_mutex;
 619static QemuCond qemu_io_proceeded_cond;
 620static bool iothread_requesting_mutex;
 621
 622static QemuThread io_thread;
 623
 624static QemuThread *tcg_cpu_thread;
 625static QemuCond *tcg_halt_cond;
 626
 627/* cpu creation */
 628static QemuCond qemu_cpu_cond;
 629/* system init */
 630static QemuCond qemu_pause_cond;
 631static QemuCond qemu_work_cond;
 632
 633void qemu_init_cpu_loop(void)
 634{
 635    qemu_init_sigbus();
 636    qemu_cond_init(&qemu_cpu_cond);
 637    qemu_cond_init(&qemu_pause_cond);
 638    qemu_cond_init(&qemu_work_cond);
 639    qemu_cond_init(&qemu_io_proceeded_cond);
 640    qemu_mutex_init(&qemu_global_mutex);
 641
 642    qemu_thread_get_self(&io_thread);
 643}
 644
 645void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
 646{
 647    struct qemu_work_item wi;
 648
 649    if (qemu_cpu_is_self(cpu)) {
 650        func(data);
 651        return;
 652    }
 653
 654    wi.func = func;
 655    wi.data = data;
 656    wi.free = false;
 657    if (cpu->queued_work_first == NULL) {
 658        cpu->queued_work_first = &wi;
 659    } else {
 660        cpu->queued_work_last->next = &wi;
 661    }
 662    cpu->queued_work_last = &wi;
 663    wi.next = NULL;
 664    wi.done = false;
 665
 666    qemu_cpu_kick(cpu);
 667    while (!wi.done) {
 668        CPUState *self_cpu = current_cpu;
 669
 670        qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
 671        current_cpu = self_cpu;
 672    }
 673}
 674
 675void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
 676{
 677    struct qemu_work_item *wi;
 678
 679    if (qemu_cpu_is_self(cpu)) {
 680        func(data);
 681        return;
 682    }
 683
 684    wi = g_malloc0(sizeof(struct qemu_work_item));
 685    wi->func = func;
 686    wi->data = data;
 687    wi->free = true;
 688    if (cpu->queued_work_first == NULL) {
 689        cpu->queued_work_first = wi;
 690    } else {
 691        cpu->queued_work_last->next = wi;
 692    }
 693    cpu->queued_work_last = wi;
 694    wi->next = NULL;
 695    wi->done = false;
 696
 697    qemu_cpu_kick(cpu);
 698}
 699
 700static void flush_queued_work(CPUState *cpu)
 701{
 702    struct qemu_work_item *wi;
 703
 704    if (cpu->queued_work_first == NULL) {
 705        return;
 706    }
 707
 708    while ((wi = cpu->queued_work_first)) {
 709        cpu->queued_work_first = wi->next;
 710        wi->func(wi->data);
 711        wi->done = true;
 712        if (wi->free) {
 713            g_free(wi);
 714        }
 715    }
 716    cpu->queued_work_last = NULL;
 717    qemu_cond_broadcast(&qemu_work_cond);
 718}
 719
 720static void qemu_wait_io_event_common(CPUState *cpu)
 721{
 722    if (cpu->stop) {
 723        cpu->stop = false;
 724        cpu->stopped = true;
 725        qemu_cond_signal(&qemu_pause_cond);
 726    }
 727    flush_queued_work(cpu);
 728    cpu->thread_kicked = false;
 729}
 730
 731static void qemu_tcg_wait_io_event(void)
 732{
 733    CPUState *cpu;
 734
 735    while (all_cpu_threads_idle()) {
 736       /* Start accounting real time to the virtual clock if the CPUs
 737          are idle.  */
 738        qemu_clock_warp(vm_clock);
 739        qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);
 740    }
 741
 742    while (iothread_requesting_mutex) {
 743        qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
 744    }
 745
 746    for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) {
 747        qemu_wait_io_event_common(cpu);
 748    }
 749}
 750
 751static void qemu_kvm_wait_io_event(CPUState *cpu)
 752{
 753    while (cpu_thread_is_idle(cpu)) {
 754        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
 755    }
 756
 757    qemu_kvm_eat_signals(cpu);
 758    qemu_wait_io_event_common(cpu);
 759}
 760
 761static void *qemu_kvm_cpu_thread_fn(void *arg)
 762{
 763    CPUState *cpu = arg;
 764    int r;
 765
 766    qemu_mutex_lock(&qemu_global_mutex);
 767    qemu_thread_get_self(cpu->thread);
 768    cpu->thread_id = qemu_get_thread_id();
 769    current_cpu = cpu;
 770
 771    r = kvm_init_vcpu(cpu);
 772    if (r < 0) {
 773        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
 774        exit(1);
 775    }
 776
 777    qemu_kvm_init_cpu_signals(cpu);
 778
 779    /* signal CPU creation */
 780    cpu->created = true;
 781    qemu_cond_signal(&qemu_cpu_cond);
 782
 783    while (1) {
 784        if (cpu_can_run(cpu)) {
 785            r = kvm_cpu_exec(cpu);
 786            if (r == EXCP_DEBUG) {
 787                cpu_handle_guest_debug(cpu);
 788            }
 789        }
 790        qemu_kvm_wait_io_event(cpu);
 791    }
 792
 793    return NULL;
 794}
 795
 796static void *qemu_dummy_cpu_thread_fn(void *arg)
 797{
 798#ifdef _WIN32
 799    fprintf(stderr, "qtest is not supported under Windows\n");
 800    exit(1);
 801#else
 802    CPUState *cpu = arg;
 803    sigset_t waitset;
 804    int r;
 805
 806    qemu_mutex_lock_iothread();
 807    qemu_thread_get_self(cpu->thread);
 808    cpu->thread_id = qemu_get_thread_id();
 809
 810    sigemptyset(&waitset);
 811    sigaddset(&waitset, SIG_IPI);
 812
 813    /* signal CPU creation */
 814    cpu->created = true;
 815    qemu_cond_signal(&qemu_cpu_cond);
 816
 817    current_cpu = cpu;
 818    while (1) {
 819        current_cpu = NULL;
 820        qemu_mutex_unlock_iothread();
 821        do {
 822            int sig;
 823            r = sigwait(&waitset, &sig);
 824        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
 825        if (r == -1) {
 826            perror("sigwait");
 827            exit(1);
 828        }
 829        qemu_mutex_lock_iothread();
 830        current_cpu = cpu;
 831        qemu_wait_io_event_common(cpu);
 832    }
 833
 834    return NULL;
 835#endif
 836}
 837
 838static void tcg_exec_all(void);
 839
 840static void tcg_signal_cpu_creation(CPUState *cpu, void *data)
 841{
 842    cpu->thread_id = qemu_get_thread_id();
 843    cpu->created = true;
 844}
 845
 846static void *qemu_tcg_cpu_thread_fn(void *arg)
 847{
 848    CPUState *cpu = arg;
 849
 850    qemu_tcg_init_cpu_signals();
 851    qemu_thread_get_self(cpu->thread);
 852
 853    qemu_mutex_lock(&qemu_global_mutex);
 854    qemu_for_each_cpu(tcg_signal_cpu_creation, NULL);
 855    qemu_cond_signal(&qemu_cpu_cond);
 856
 857    /* wait for initial kick-off after machine start */
 858    while (first_cpu->stopped) {
 859        qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex);
 860
 861        /* process any pending work */
 862        for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) {
 863            qemu_wait_io_event_common(cpu);
 864        }
 865    }
 866
 867    while (1) {
 868        tcg_exec_all();
 869        if (use_icount && qemu_clock_deadline(vm_clock) <= 0) {
 870            qemu_notify_event();
 871        }
 872        qemu_tcg_wait_io_event();
 873    }
 874
 875    return NULL;
 876}
 877
 878static void qemu_cpu_kick_thread(CPUState *cpu)
 879{
 880#ifndef _WIN32
 881    int err;
 882
 883    err = pthread_kill(cpu->thread->thread, SIG_IPI);
 884    if (err) {
 885        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
 886        exit(1);
 887    }
 888#else /* _WIN32 */
 889    if (!qemu_cpu_is_self(cpu)) {
 890        CONTEXT tcgContext;
 891
 892        if (SuspendThread(cpu->hThread) == (DWORD)-1) {
 893            fprintf(stderr, "qemu:%s: GetLastError:%lu\n", __func__,
 894                    GetLastError());
 895            exit(1);
 896        }
 897
 898        /* On multi-core systems, we are not sure that the thread is actually
 899         * suspended until we can get the context.
 900         */
 901        tcgContext.ContextFlags = CONTEXT_CONTROL;
 902        while (GetThreadContext(cpu->hThread, &tcgContext) != 0) {
 903            continue;
 904        }
 905
 906        cpu_signal(0);
 907
 908        if (ResumeThread(cpu->hThread) == (DWORD)-1) {
 909            fprintf(stderr, "qemu:%s: GetLastError:%lu\n", __func__,
 910                    GetLastError());
 911            exit(1);
 912        }
 913    }
 914#endif
 915}
 916
 917void qemu_cpu_kick(CPUState *cpu)
 918{
 919    qemu_cond_broadcast(cpu->halt_cond);
 920    if (!tcg_enabled() && !cpu->thread_kicked) {
 921        qemu_cpu_kick_thread(cpu);
 922        cpu->thread_kicked = true;
 923    }
 924}
 925
 926void qemu_cpu_kick_self(void)
 927{
 928#ifndef _WIN32
 929    assert(current_cpu);
 930
 931    if (!current_cpu->thread_kicked) {
 932        qemu_cpu_kick_thread(current_cpu);
 933        current_cpu->thread_kicked = true;
 934    }
 935#else
 936    abort();
 937#endif
 938}
 939
 940bool qemu_cpu_is_self(CPUState *cpu)
 941{
 942    return qemu_thread_is_self(cpu->thread);
 943}
 944
 945static bool qemu_in_vcpu_thread(void)
 946{
 947    return current_cpu && qemu_cpu_is_self(current_cpu);
 948}
 949
 950void qemu_mutex_lock_iothread(void)
 951{
 952    if (!tcg_enabled()) {
 953        qemu_mutex_lock(&qemu_global_mutex);
 954    } else {
 955        iothread_requesting_mutex = true;
 956        if (qemu_mutex_trylock(&qemu_global_mutex)) {
 957            qemu_cpu_kick_thread(first_cpu);
 958            qemu_mutex_lock(&qemu_global_mutex);
 959        }
 960        iothread_requesting_mutex = false;
 961        qemu_cond_broadcast(&qemu_io_proceeded_cond);
 962    }
 963}
 964
 965void qemu_mutex_unlock_iothread(void)
 966{
 967    qemu_mutex_unlock(&qemu_global_mutex);
 968}
 969
 970static int all_vcpus_paused(void)
 971{
 972    CPUState *cpu = first_cpu;
 973
 974    while (cpu) {
 975        if (!cpu->stopped) {
 976            return 0;
 977        }
 978        cpu = cpu->next_cpu;
 979    }
 980
 981    return 1;
 982}
 983
 984void pause_all_vcpus(void)
 985{
 986    CPUState *cpu = first_cpu;
 987
 988    qemu_clock_enable(vm_clock, false);
 989    while (cpu) {
 990        cpu->stop = true;
 991        qemu_cpu_kick(cpu);
 992        cpu = cpu->next_cpu;
 993    }
 994
 995    if (qemu_in_vcpu_thread()) {
 996        cpu_stop_current();
 997        if (!kvm_enabled()) {
 998            cpu = first_cpu;
 999            while (cpu) {
1000                cpu->stop = false;
1001                cpu->stopped = true;
1002                cpu = cpu->next_cpu;
1003            }
1004            return;
1005        }
1006    }
1007
1008    while (!all_vcpus_paused()) {
1009        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1010        cpu = first_cpu;
1011        while (cpu) {
1012            qemu_cpu_kick(cpu);
1013            cpu = cpu->next_cpu;
1014        }
1015    }
1016}
1017
1018void cpu_resume(CPUState *cpu)
1019{
1020    cpu->stop = false;
1021    cpu->stopped = false;
1022    qemu_cpu_kick(cpu);
1023}
1024
1025void resume_all_vcpus(void)
1026{
1027    CPUState *cpu = first_cpu;
1028
1029    qemu_clock_enable(vm_clock, true);
1030    while (cpu) {
1031        cpu_resume(cpu);
1032        cpu = cpu->next_cpu;
1033    }
1034}
1035
1036static void qemu_tcg_init_vcpu(CPUState *cpu)
1037{
1038    /* share a single thread for all cpus with TCG */
1039    if (!tcg_cpu_thread) {
1040        cpu->thread = g_malloc0(sizeof(QemuThread));
1041        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1042        qemu_cond_init(cpu->halt_cond);
1043        tcg_halt_cond = cpu->halt_cond;
1044        qemu_thread_create(cpu->thread, qemu_tcg_cpu_thread_fn, cpu,
1045                           QEMU_THREAD_JOINABLE);
1046#ifdef _WIN32
1047        cpu->hThread = qemu_thread_get_handle(cpu->thread);
1048#endif
1049        while (!cpu->created) {
1050            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1051        }
1052        tcg_cpu_thread = cpu->thread;
1053    } else {
1054        cpu->thread = tcg_cpu_thread;
1055        cpu->halt_cond = tcg_halt_cond;
1056    }
1057}
1058
1059static void qemu_kvm_start_vcpu(CPUState *cpu)
1060{
1061    cpu->thread = g_malloc0(sizeof(QemuThread));
1062    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1063    qemu_cond_init(cpu->halt_cond);
1064    qemu_thread_create(cpu->thread, qemu_kvm_cpu_thread_fn, cpu,
1065                       QEMU_THREAD_JOINABLE);
1066    while (!cpu->created) {
1067        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1068    }
1069}
1070
1071static void qemu_dummy_start_vcpu(CPUState *cpu)
1072{
1073    cpu->thread = g_malloc0(sizeof(QemuThread));
1074    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1075    qemu_cond_init(cpu->halt_cond);
1076    qemu_thread_create(cpu->thread, qemu_dummy_cpu_thread_fn, cpu,
1077                       QEMU_THREAD_JOINABLE);
1078    while (!cpu->created) {
1079        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1080    }
1081}
1082
1083void qemu_init_vcpu(CPUState *cpu)
1084{
1085    cpu->nr_cores = smp_cores;
1086    cpu->nr_threads = smp_threads;
1087    cpu->stopped = true;
1088    if (kvm_enabled()) {
1089        qemu_kvm_start_vcpu(cpu);
1090    } else if (tcg_enabled()) {
1091        qemu_tcg_init_vcpu(cpu);
1092    } else {
1093        qemu_dummy_start_vcpu(cpu);
1094    }
1095}
1096
1097void cpu_stop_current(void)
1098{
1099    if (current_cpu) {
1100        current_cpu->stop = false;
1101        current_cpu->stopped = true;
1102        cpu_exit(current_cpu);
1103        qemu_cond_signal(&qemu_pause_cond);
1104    }
1105}
1106
1107int vm_stop(RunState state)
1108{
1109    if (qemu_in_vcpu_thread()) {
1110        qemu_system_vmstop_request(state);
1111        /*
1112         * FIXME: should not return to device code in case
1113         * vm_stop() has been requested.
1114         */
1115        cpu_stop_current();
1116        return 0;
1117    }
1118
1119    return do_vm_stop(state);
1120}
1121
1122/* does a state transition even if the VM is already stopped,
1123   current state is forgotten forever */
1124int vm_stop_force_state(RunState state)
1125{
1126    if (runstate_is_running()) {
1127        return vm_stop(state);
1128    } else {
1129        runstate_set(state);
1130        /* Make sure to return an error if the flush in a previous vm_stop()
1131         * failed. */
1132        return bdrv_flush_all();
1133    }
1134}
1135
1136static int tcg_cpu_exec(CPUArchState *env)
1137{
1138    int ret;
1139#ifdef CONFIG_PROFILER
1140    int64_t ti;
1141#endif
1142
1143#ifdef CONFIG_PROFILER
1144    ti = profile_getclock();
1145#endif
1146    if (use_icount) {
1147        int64_t count;
1148        int decr;
1149        qemu_icount -= (env->icount_decr.u16.low + env->icount_extra);
1150        env->icount_decr.u16.low = 0;
1151        env->icount_extra = 0;
1152        count = qemu_icount_round(qemu_clock_deadline(vm_clock));
1153        qemu_icount += count;
1154        decr = (count > 0xffff) ? 0xffff : count;
1155        count -= decr;
1156        env->icount_decr.u16.low = decr;
1157        env->icount_extra = count;
1158    }
1159    ret = cpu_exec(env);
1160#ifdef CONFIG_PROFILER
1161    qemu_time += profile_getclock() - ti;
1162#endif
1163    if (use_icount) {
1164        /* Fold pending instructions back into the
1165           instruction counter, and clear the interrupt flag.  */
1166        qemu_icount -= (env->icount_decr.u16.low
1167                        + env->icount_extra);
1168        env->icount_decr.u32 = 0;
1169        env->icount_extra = 0;
1170    }
1171    return ret;
1172}
1173
1174static void tcg_exec_all(void)
1175{
1176    int r;
1177
1178    /* Account partial waits to the vm_clock.  */
1179    qemu_clock_warp(vm_clock);
1180
1181    if (next_cpu == NULL) {
1182        next_cpu = first_cpu;
1183    }
1184    for (; next_cpu != NULL && !exit_request; next_cpu = next_cpu->next_cpu) {
1185        CPUState *cpu = next_cpu;
1186        CPUArchState *env = cpu->env_ptr;
1187
1188        qemu_clock_enable(vm_clock,
1189                          (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1190
1191        if (cpu_can_run(cpu)) {
1192            r = tcg_cpu_exec(env);
1193            if (r == EXCP_DEBUG) {
1194                cpu_handle_guest_debug(cpu);
1195                break;
1196            }
1197        } else if (cpu->stop || cpu->stopped) {
1198            break;
1199        }
1200    }
1201    exit_request = 0;
1202}
1203
1204void set_numa_modes(void)
1205{
1206    CPUState *cpu;
1207    int i;
1208
1209    for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) {
1210        for (i = 0; i < nb_numa_nodes; i++) {
1211            if (test_bit(cpu->cpu_index, node_cpumask[i])) {
1212                cpu->numa_node = i;
1213            }
1214        }
1215    }
1216}
1217
1218void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1219{
1220    /* XXX: implement xxx_cpu_list for targets that still miss it */
1221#if defined(cpu_list)
1222    cpu_list(f, cpu_fprintf);
1223#endif
1224}
1225
1226CpuInfoList *qmp_query_cpus(Error **errp)
1227{
1228    CpuInfoList *head = NULL, *cur_item = NULL;
1229    CPUState *cpu;
1230
1231    for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) {
1232        CpuInfoList *info;
1233#if defined(TARGET_I386)
1234        X86CPU *x86_cpu = X86_CPU(cpu);
1235        CPUX86State *env = &x86_cpu->env;
1236#elif defined(TARGET_PPC)
1237        PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1238        CPUPPCState *env = &ppc_cpu->env;
1239#elif defined(TARGET_SPARC)
1240        SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1241        CPUSPARCState *env = &sparc_cpu->env;
1242#elif defined(TARGET_MIPS)
1243        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1244        CPUMIPSState *env = &mips_cpu->env;
1245#endif
1246
1247        cpu_synchronize_state(cpu);
1248
1249        info = g_malloc0(sizeof(*info));
1250        info->value = g_malloc0(sizeof(*info->value));
1251        info->value->CPU = cpu->cpu_index;
1252        info->value->current = (cpu == first_cpu);
1253        info->value->halted = cpu->halted;
1254        info->value->thread_id = cpu->thread_id;
1255#if defined(TARGET_I386)
1256        info->value->has_pc = true;
1257        info->value->pc = env->eip + env->segs[R_CS].base;
1258#elif defined(TARGET_PPC)
1259        info->value->has_nip = true;
1260        info->value->nip = env->nip;
1261#elif defined(TARGET_SPARC)
1262        info->value->has_pc = true;
1263        info->value->pc = env->pc;
1264        info->value->has_npc = true;
1265        info->value->npc = env->npc;
1266#elif defined(TARGET_MIPS)
1267        info->value->has_PC = true;
1268        info->value->PC = env->active_tc.PC;
1269#endif
1270
1271        /* XXX: waiting for the qapi to support GSList */
1272        if (!cur_item) {
1273            head = cur_item = info;
1274        } else {
1275            cur_item->next = info;
1276            cur_item = info;
1277        }
1278    }
1279
1280    return head;
1281}
1282
1283void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1284                 bool has_cpu, int64_t cpu_index, Error **errp)
1285{
1286    FILE *f;
1287    uint32_t l;
1288    CPUState *cpu;
1289    uint8_t buf[1024];
1290
1291    if (!has_cpu) {
1292        cpu_index = 0;
1293    }
1294
1295    cpu = qemu_get_cpu(cpu_index);
1296    if (cpu == NULL) {
1297        error_set(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1298                  "a CPU number");
1299        return;
1300    }
1301
1302    f = fopen(filename, "wb");
1303    if (!f) {
1304        error_setg_file_open(errp, errno, filename);
1305        return;
1306    }
1307
1308    while (size != 0) {
1309        l = sizeof(buf);
1310        if (l > size)
1311            l = size;
1312        cpu_memory_rw_debug(cpu, addr, buf, l, 0);
1313        if (fwrite(buf, 1, l, f) != l) {
1314            error_set(errp, QERR_IO_ERROR);
1315            goto exit;
1316        }
1317        addr += l;
1318        size -= l;
1319    }
1320
1321exit:
1322    fclose(f);
1323}
1324
1325void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1326                  Error **errp)
1327{
1328    FILE *f;
1329    uint32_t l;
1330    uint8_t buf[1024];
1331
1332    f = fopen(filename, "wb");
1333    if (!f) {
1334        error_setg_file_open(errp, errno, filename);
1335        return;
1336    }
1337
1338    while (size != 0) {
1339        l = sizeof(buf);
1340        if (l > size)
1341            l = size;
1342        cpu_physical_memory_rw(addr, buf, l, 0);
1343        if (fwrite(buf, 1, l, f) != l) {
1344            error_set(errp, QERR_IO_ERROR);
1345            goto exit;
1346        }
1347        addr += l;
1348        size -= l;
1349    }
1350
1351exit:
1352    fclose(f);
1353}
1354
1355void qmp_inject_nmi(Error **errp)
1356{
1357#if defined(TARGET_I386)
1358    CPUState *cs;
1359
1360    for (cs = first_cpu; cs != NULL; cs = cs->next_cpu) {
1361        X86CPU *cpu = X86_CPU(cs);
1362        CPUX86State *env = &cpu->env;
1363
1364        if (!env->apic_state) {
1365            cpu_interrupt(cs, CPU_INTERRUPT_NMI);
1366        } else {
1367            apic_deliver_nmi(env->apic_state);
1368        }
1369    }
1370#else
1371    error_set(errp, QERR_UNSUPPORTED);
1372#endif
1373}
1374