qemu/cpus.c
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25/* Needed early for CONFIG_BSD etc. */
  26#include "qemu/osdep.h"
  27
  28#include "monitor/monitor.h"
  29#include "qapi/qmp/qerror.h"
  30#include "qemu/error-report.h"
  31#include "sysemu/sysemu.h"
  32#include "sysemu/block-backend.h"
  33#include "exec/gdbstub.h"
  34#include "sysemu/dma.h"
  35#include "sysemu/kvm.h"
  36#include "qmp-commands.h"
  37
  38#include "qemu/thread.h"
  39#include "sysemu/cpus.h"
  40#include "sysemu/qtest.h"
  41#include "qemu/main-loop.h"
  42#include "qemu/bitmap.h"
  43#include "qemu/seqlock.h"
  44#include "qapi-event.h"
  45#include "hw/nmi.h"
  46#include "sysemu/replay.h"
  47
  48#ifndef _WIN32
  49#include "qemu/compatfd.h"
  50#endif
  51
  52#ifdef CONFIG_LINUX
  53
  54#include <sys/prctl.h>
  55
  56#ifndef PR_MCE_KILL
  57#define PR_MCE_KILL 33
  58#endif
  59
  60#ifndef PR_MCE_KILL_SET
  61#define PR_MCE_KILL_SET 1
  62#endif
  63
  64#ifndef PR_MCE_KILL_EARLY
  65#define PR_MCE_KILL_EARLY 1
  66#endif
  67
  68#endif /* CONFIG_LINUX */
  69
  70static CPUState *next_cpu;
  71int64_t max_delay;
  72int64_t max_advance;
  73
  74/* vcpu throttling controls */
  75static QEMUTimer *throttle_timer;
  76static unsigned int throttle_percentage;
  77
  78#define CPU_THROTTLE_PCT_MIN 1
  79#define CPU_THROTTLE_PCT_MAX 99
  80#define CPU_THROTTLE_TIMESLICE_NS 10000000
  81
  82bool cpu_is_stopped(CPUState *cpu)
  83{
  84    return cpu->stopped || !runstate_is_running();
  85}
  86
  87static bool cpu_thread_is_idle(CPUState *cpu)
  88{
  89    if (cpu->stop || cpu->queued_work_first) {
  90        return false;
  91    }
  92    if (cpu_is_stopped(cpu)) {
  93        return true;
  94    }
  95    if (!cpu->halted || cpu_has_work(cpu) ||
  96        kvm_halt_in_kernel()) {
  97        return false;
  98    }
  99    return true;
 100}
 101
 102static bool all_cpu_threads_idle(void)
 103{
 104    CPUState *cpu;
 105
 106    CPU_FOREACH(cpu) {
 107        if (!cpu_thread_is_idle(cpu)) {
 108            return false;
 109        }
 110    }
 111    return true;
 112}
 113
 114/***********************************************************/
 115/* guest cycle counter */
 116
 117/* Protected by TimersState seqlock */
 118
 119static bool icount_sleep = true;
 120static int64_t vm_clock_warp_start = -1;
 121/* Conversion factor from emulated instructions to virtual clock ticks.  */
 122static int icount_time_shift;
 123/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
 124#define MAX_ICOUNT_SHIFT 10
 125
 126static QEMUTimer *icount_rt_timer;
 127static QEMUTimer *icount_vm_timer;
 128static QEMUTimer *icount_warp_timer;
 129
 130typedef struct TimersState {
 131    /* Protected by BQL.  */
 132    int64_t cpu_ticks_prev;
 133    int64_t cpu_ticks_offset;
 134
 135    /* cpu_clock_offset can be read out of BQL, so protect it with
 136     * this lock.
 137     */
 138    QemuSeqLock vm_clock_seqlock;
 139    int64_t cpu_clock_offset;
 140    int32_t cpu_ticks_enabled;
 141    int64_t dummy;
 142
 143    /* Compensate for varying guest execution speed.  */
 144    int64_t qemu_icount_bias;
 145    /* Only written by TCG thread */
 146    int64_t qemu_icount;
 147} TimersState;
 148
 149static TimersState timers_state;
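    /* A minimal sketch (for illustration; assumes the writer holds the BQL,
     * which serves as the seqlock's mutex) of the seqlock pattern used for
     * timers_state throughout this file: the writer brackets its update,
     * while readers retry until they observe an unchanged, even sequence:
     *
     *     seqlock_write_lock(&timers_state.vm_clock_seqlock);
     *     ...update cpu_clock_offset / qemu_icount_bias...
     *     seqlock_write_unlock(&timers_state.vm_clock_seqlock);
     *
     *     unsigned seq;
     *     do {
     *         seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
     *         ...read the protected fields...
     *     } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
     */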
 150
 151int64_t cpu_get_icount_raw(void)
 152{
 153    int64_t icount;
 154    CPUState *cpu = current_cpu;
 155
 156    icount = timers_state.qemu_icount;
 157    if (cpu) {
 158        if (!cpu->can_do_io) {
 159            fprintf(stderr, "Bad icount read\n");
 160            exit(1);
 161        }
 162        icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
 163    }
 164    return icount;
 165}
 166
 167/* Return the virtual CPU time, based on the instruction counter.  */
 168static int64_t cpu_get_icount_locked(void)
 169{
 170    int64_t icount = cpu_get_icount_raw();
 171    return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
 172}
 173
 174int64_t cpu_get_icount(void)
 175{
 176    int64_t icount;
 177    unsigned start;
 178
 179    do {
 180        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 181        icount = cpu_get_icount_locked();
 182    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 183
 184    return icount;
 185}
 186
 187int64_t cpu_icount_to_ns(int64_t icount)
 188{
 189    return icount << icount_time_shift;
 190}
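    /* Worked example (illustrative comment only): with icount_time_shift == 3,
     * each emulated instruction accounts for 1 << 3 = 8 ns of virtual time,
     * i.e. about 125 MIPS; at MAX_ICOUNT_SHIFT (10) an instruction costs
     * 1024 ns, roughly the 1 MIPS minimum mentioned above.
     */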
 191
 192/* return the host CPU cycle counter and handle stop/restart */
 193/* Caller must hold the BQL */
 194int64_t cpu_get_ticks(void)
 195{
 196    int64_t ticks;
 197
 198    if (use_icount) {
 199        return cpu_get_icount();
 200    }
 201
 202    ticks = timers_state.cpu_ticks_offset;
 203    if (timers_state.cpu_ticks_enabled) {
 204        ticks += cpu_get_host_ticks();
 205    }
 206
 207    if (timers_state.cpu_ticks_prev > ticks) {
 208    /* Note: non-increasing ticks may happen if the host uses
 209           software suspend */
 210        timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 211        ticks = timers_state.cpu_ticks_prev;
 212    }
 213
 214    timers_state.cpu_ticks_prev = ticks;
 215    return ticks;
 216}
 217
 218static int64_t cpu_get_clock_locked(void)
 219{
 220    int64_t ticks;
 221
 222    ticks = timers_state.cpu_clock_offset;
 223    if (timers_state.cpu_ticks_enabled) {
 224        ticks += get_clock();
 225    }
 226
 227    return ticks;
 228}
 229
 230/* return the host CPU monotonic timer and handle stop/restart */
 231int64_t cpu_get_clock(void)
 232{
 233    int64_t ti;
 234    unsigned start;
 235
 236    do {
 237        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 238        ti = cpu_get_clock_locked();
 239    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 240
 241    return ti;
 242}
 243
 244/* enable cpu_get_ticks()
 245 * Caller must hold BQL, which serves as mutex for vm_clock_seqlock.
 246 */
 247void cpu_enable_ticks(void)
 248{
 249    /* Here, the thing really protected by the seqlock is cpu_clock_offset. */
 250    seqlock_write_lock(&timers_state.vm_clock_seqlock);
 251    if (!timers_state.cpu_ticks_enabled) {
 252        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 253        timers_state.cpu_clock_offset -= get_clock();
 254        timers_state.cpu_ticks_enabled = 1;
 255    }
 256    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
 257}
 258
 259/* disable cpu_get_ticks() : the clock is stopped. You must not call
 260 * cpu_get_ticks() after that.
 261 * Caller must hold BQL, which serves as mutex for vm_clock_seqlock.
 262 */
 263void cpu_disable_ticks(void)
 264{
 265    /* Here, the thing really protected by the seqlock is cpu_clock_offset. */
 266    seqlock_write_lock(&timers_state.vm_clock_seqlock);
 267    if (timers_state.cpu_ticks_enabled) {
 268        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 269        timers_state.cpu_clock_offset = cpu_get_clock_locked();
 270        timers_state.cpu_ticks_enabled = 0;
 271    }
 272    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
 273}
 274
 275/* Correlation between real and virtual time is always going to be
 276   fairly approximate, so ignore small variations.
 277   When the guest is idle, real and virtual time will be aligned in
 278   the IO wait loop.  */
 279#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 280
 281static void icount_adjust(void)
 282{
 283    int64_t cur_time;
 284    int64_t cur_icount;
 285    int64_t delta;
 286
 287    /* Protected by TimersState mutex.  */
 288    static int64_t last_delta;
 289
 290    /* If the VM is not running, then do nothing.  */
 291    if (!runstate_is_running()) {
 292        return;
 293    }
 294
 295    seqlock_write_lock(&timers_state.vm_clock_seqlock);
 296    cur_time = cpu_get_clock_locked();
 297    cur_icount = cpu_get_icount_locked();
 298
 299    delta = cur_icount - cur_time;
 300    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 301    if (delta > 0
 302        && last_delta + ICOUNT_WOBBLE < delta * 2
 303        && icount_time_shift > 0) {
 304        /* The guest is getting too far ahead.  Slow time down.  */
 305        icount_time_shift--;
 306    }
 307    if (delta < 0
 308        && last_delta - ICOUNT_WOBBLE > delta * 2
 309        && icount_time_shift < MAX_ICOUNT_SHIFT) {
 310        /* The guest is getting too far behind.  Speed time up.  */
 311        icount_time_shift++;
 312    }
 313    last_delta = delta;
 314    timers_state.qemu_icount_bias = cur_icount
 315                              - (timers_state.qemu_icount << icount_time_shift);
 316    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
 317}
 318
 319static void icount_adjust_rt(void *opaque)
 320{
 321    timer_mod(icount_rt_timer,
 322              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 323    icount_adjust();
 324}
 325
 326static void icount_adjust_vm(void *opaque)
 327{
 328    timer_mod(icount_vm_timer,
 329                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 330                   NANOSECONDS_PER_SECOND / 10);
 331    icount_adjust();
 332}
 333
 334static int64_t qemu_icount_round(int64_t count)
 335{
 336    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
 337}
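    /* Worked example (illustrative comment only): with icount_time_shift == 3,
     * a 20 ns deadline rounds up to (20 + 7) >> 3 = 3 instructions, so the
     * instruction budget never undershoots the next QEMU_CLOCK_VIRTUAL timer.
     */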
 338
 339static void icount_warp_rt(void)
 340{
 341    unsigned seq;
 342    int64_t warp_start;
 343
 344    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 345     * changes from -1 to another value, so the race here is okay.
 346     */
 347    do {
 348        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 349        warp_start = vm_clock_warp_start;
 350    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 351
 352    if (warp_start == -1) {
 353        return;
 354    }
 355
 356    seqlock_write_lock(&timers_state.vm_clock_seqlock);
 357    if (runstate_is_running()) {
 358        int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
 359                                     cpu_get_clock_locked());
 360        int64_t warp_delta;
 361
 362        warp_delta = clock - vm_clock_warp_start;
 363        if (use_icount == 2) {
 364            /*
 365             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 366             * far ahead of real time.
 367             */
 368            int64_t cur_icount = cpu_get_icount_locked();
 369            int64_t delta = clock - cur_icount;
 370            warp_delta = MIN(warp_delta, delta);
 371        }
 372        timers_state.qemu_icount_bias += warp_delta;
 373    }
 374    vm_clock_warp_start = -1;
 375    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
 376
 377    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 378        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 379    }
 380}
 381
 382static void icount_timer_cb(void *opaque)
 383{
 384    /* No need for a checkpoint because the timer already synchronizes
 385     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 386     */
 387    icount_warp_rt();
 388}
 389
 390void qtest_clock_warp(int64_t dest)
 391{
 392    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 393    AioContext *aio_context;
 394    assert(qtest_enabled());
 395    aio_context = qemu_get_aio_context();
 396    while (clock < dest) {
 397        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 398        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 399
 400        seqlock_write_lock(&timers_state.vm_clock_seqlock);
 401        timers_state.qemu_icount_bias += warp;
 402        seqlock_write_unlock(&timers_state.vm_clock_seqlock);
 403
 404        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 405        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 406        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 407    }
 408    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 409}
 410
 411void qemu_start_warp_timer(void)
 412{
 413    int64_t clock;
 414    int64_t deadline;
 415
 416    if (!use_icount) {
 417        return;
 418    }
 419
 420    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 421     * do not fire, so computing the deadline does not make sense.
 422     */
 423    if (!runstate_is_running()) {
 424        return;
 425    }
 426
 427    /* warp clock deterministically in record/replay mode */
 428    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
 429        return;
 430    }
 431
 432    if (!all_cpu_threads_idle()) {
 433        return;
 434    }
 435
 436    if (qtest_enabled()) {
 437        /* When testing, qtest commands advance icount.  */
 438        return;
 439    }
 440
 441    /* We want to use the earliest deadline from ALL vm_clocks */
 442    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 443    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 444    if (deadline < 0) {
 445        static bool notified;
 446        if (!icount_sleep && !notified) {
 447            error_report("WARNING: icount sleep disabled and no active timers");
 448            notified = true;
 449        }
 450        return;
 451    }
 452
 453    if (deadline > 0) {
 454        /*
 455         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 456         * sleep.  Otherwise, the CPU might be waiting for a future timer
 457         * interrupt to wake it up, but the interrupt never comes because
 458         * the vCPU isn't running any insns and thus doesn't advance the
 459         * QEMU_CLOCK_VIRTUAL.
 460         */
 461        if (!icount_sleep) {
 462            /*
 463             * We never let VCPUs sleep in no-sleep icount mode.
 464             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 465             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 466             * It is useful when we want a deterministic execution time,
 467             * isolated from host latencies.
 468             */
 469            seqlock_write_lock(&timers_state.vm_clock_seqlock);
 470            timers_state.qemu_icount_bias += deadline;
 471            seqlock_write_unlock(&timers_state.vm_clock_seqlock);
 472            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 473        } else {
 474            /*
 475             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
 476             * "real" time (related to the time left until the next event) has
 477             * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
 478             * This keeps the warps from being visible externally; for
 479             * example, you will not be sending network packets continuously
 480             * instead of every 100ms.
 481             */
 482            seqlock_write_lock(&timers_state.vm_clock_seqlock);
 483            if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
 484                vm_clock_warp_start = clock;
 485            }
 486            seqlock_write_unlock(&timers_state.vm_clock_seqlock);
 487            timer_mod_anticipate(icount_warp_timer, clock + deadline);
 488        }
 489    } else if (deadline == 0) {
 490        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 491    }
 492}
 493
 494static void qemu_account_warp_timer(void)
 495{
 496    if (!use_icount || !icount_sleep) {
 497        return;
 498    }
 499
 500    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 501     * do not fire, so computing the deadline does not make sense.
 502     */
 503    if (!runstate_is_running()) {
 504        return;
 505    }
 506
 507    /* warp clock deterministically in record/replay mode */
 508    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 509        return;
 510    }
 511
 512    timer_del(icount_warp_timer);
 513    icount_warp_rt();
 514}
 515
 516static bool icount_state_needed(void *opaque)
 517{
 518    return use_icount;
 519}
 520
 521/*
 522 * This is a subsection for icount migration.
 523 */
 524static const VMStateDescription icount_vmstate_timers = {
 525    .name = "timer/icount",
 526    .version_id = 1,
 527    .minimum_version_id = 1,
 528    .needed = icount_state_needed,
 529    .fields = (VMStateField[]) {
 530        VMSTATE_INT64(qemu_icount_bias, TimersState),
 531        VMSTATE_INT64(qemu_icount, TimersState),
 532        VMSTATE_END_OF_LIST()
 533    }
 534};
 535
 536static const VMStateDescription vmstate_timers = {
 537    .name = "timer",
 538    .version_id = 2,
 539    .minimum_version_id = 1,
 540    .fields = (VMStateField[]) {
 541        VMSTATE_INT64(cpu_ticks_offset, TimersState),
 542        VMSTATE_INT64(dummy, TimersState),
 543        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
 544        VMSTATE_END_OF_LIST()
 545    },
 546    .subsections = (const VMStateDescription*[]) {
 547        &icount_vmstate_timers,
 548        NULL
 549    }
 550};
 551
 552static void cpu_throttle_thread(void *opaque)
 553{
 554    CPUState *cpu = opaque;
 555    double pct;
 556    double throttle_ratio;
 557    long sleeptime_ns;
 558
 559    if (!cpu_throttle_get_percentage()) {
 560        return;
 561    }
 562
 563    pct = (double)cpu_throttle_get_percentage()/100;
 564    throttle_ratio = pct / (1 - pct);
 565    sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
 566
 567    qemu_mutex_unlock_iothread();
 568    atomic_set(&cpu->throttle_thread_scheduled, 0);
 569    g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
 570    qemu_mutex_lock_iothread();
 571}
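    /* Worked example (illustrative comment only): at a 60% throttle,
     * pct = 0.6 and throttle_ratio = 0.6 / 0.4 = 1.5, so the vCPU sleeps
     * 1.5 * CPU_THROTTLE_TIMESLICE_NS = 15 ms for every 10 ms timeslice,
     * i.e. it runs for only 40% of wall-clock time.
     */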
 572
 573static void cpu_throttle_timer_tick(void *opaque)
 574{
 575    CPUState *cpu;
 576    double pct;
 577
 578    /* Stop the timer if needed */
 579    if (!cpu_throttle_get_percentage()) {
 580        return;
 581    }
 582    CPU_FOREACH(cpu) {
 583        if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 584            async_run_on_cpu(cpu, cpu_throttle_thread, cpu);
 585        }
 586    }
 587
 588    pct = (double)cpu_throttle_get_percentage()/100;
 589    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 590                                   CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 591}
 592
 593void cpu_throttle_set(int new_throttle_pct)
 594{
 595    /* Ensure throttle percentage is within valid range */
 596    new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 597    new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 598
 599    atomic_set(&throttle_percentage, new_throttle_pct);
 600
 601    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 602                                       CPU_THROTTLE_TIMESLICE_NS);
 603}
 604
 605void cpu_throttle_stop(void)
 606{
 607    atomic_set(&throttle_percentage, 0);
 608}
 609
 610bool cpu_throttle_active(void)
 611{
 612    return (cpu_throttle_get_percentage() != 0);
 613}
 614
 615int cpu_throttle_get_percentage(void)
 616{
 617    return atomic_read(&throttle_percentage);
 618}
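    /* Usage sketch (hypothetical caller; migration auto-converge is one
     * real user of this API):
     *
     *     cpu_throttle_set(20);        // sleep vCPUs ~20% of each timeslice
     *     ...
     *     if (cpu_throttle_active()) {
     *         cpu_throttle_stop();     // back to 0%, timer stops re-arming
     *     }
     */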
 619
 620void cpu_ticks_init(void)
 621{
 622    seqlock_init(&timers_state.vm_clock_seqlock, NULL);
 623    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 624    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 625                                           cpu_throttle_timer_tick, NULL);
 626}
 627
 628void configure_icount(QemuOpts *opts, Error **errp)
 629{
 630    const char *option;
 631    char *rem_str = NULL;
 632
 633    option = qemu_opt_get(opts, "shift");
 634    if (!option) {
 635        if (qemu_opt_get(opts, "align") != NULL) {
 636            error_setg(errp, "Please specify shift option when using align");
 637        }
 638        return;
 639    }
 640
 641    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 642    if (icount_sleep) {
 643        icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 644                                         icount_timer_cb, NULL);
 645    }
 646
 647    icount_align_option = qemu_opt_get_bool(opts, "align", false);
 648
 649    if (icount_align_option && !icount_sleep) {
 650        error_setg(errp, "align=on and sleep=off are incompatible");
 651    }
 652    if (strcmp(option, "auto") != 0) {
 653        errno = 0;
 654        icount_time_shift = strtol(option, &rem_str, 0);
 655        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 656            error_setg(errp, "icount: Invalid shift value");
 657        }
 658        use_icount = 1;
 659        return;
 660    } else if (icount_align_option) {
 661        error_setg(errp, "shift=auto and align=on are incompatible");
 662    } else if (!icount_sleep) {
 663        error_setg(errp, "shift=auto and sleep=off are incompatible");
 664    }
 665
 666    use_icount = 2;
 667
 668    /* 125MIPS seems a reasonable initial guess at the guest speed.
 669       It will be corrected fairly quickly anyway.  */
 670    icount_time_shift = 3;
 671
 672    /* Have both realtime and virtual time triggers for speed adjustment.
 673       The realtime trigger catches emulated time passing too slowly,
 674       the virtual time trigger catches emulated time passing too fast.
 675       Realtime triggers occur even when idle, so use them less frequently
 676       than VM triggers.  */
 677    icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 678                                   icount_adjust_rt, NULL);
 679    timer_mod(icount_rt_timer,
 680                   qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 681    icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 682                                        icount_adjust_vm, NULL);
 683    timer_mod(icount_vm_timer,
 684                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 685                   NANOSECONDS_PER_SECOND / 10);
 686}
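    /* Example command lines this parser accepts (a sketch; see the -icount
     * documentation for the authoritative syntax):
     *
     *     -icount shift=7               fixed rate: 1 insn = 128 ns
     *     -icount shift=auto            adaptive rate, tuned by the timers above
     *     -icount shift=auto,sleep=off  deterministic, vCPUs never sleep
     */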
 687
 688/***********************************************************/
 689void hw_error(const char *fmt, ...)
 690{
 691    va_list ap;
 692    CPUState *cpu;
 693
 694    va_start(ap, fmt);
 695    fprintf(stderr, "qemu: hardware error: ");
 696    vfprintf(stderr, fmt, ap);
 697    fprintf(stderr, "\n");
 698    CPU_FOREACH(cpu) {
 699        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
 700        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
 701    }
 702    va_end(ap);
 703    abort();
 704}
 705
 706void cpu_synchronize_all_states(void)
 707{
 708    CPUState *cpu;
 709
 710    CPU_FOREACH(cpu) {
 711        cpu_synchronize_state(cpu);
 712    }
 713}
 714
 715void cpu_synchronize_all_post_reset(void)
 716{
 717    CPUState *cpu;
 718
 719    CPU_FOREACH(cpu) {
 720        cpu_synchronize_post_reset(cpu);
 721    }
 722}
 723
 724void cpu_synchronize_all_post_init(void)
 725{
 726    CPUState *cpu;
 727
 728    CPU_FOREACH(cpu) {
 729        cpu_synchronize_post_init(cpu);
 730    }
 731}
 732
 733static int do_vm_stop(RunState state)
 734{
 735    int ret = 0;
 736
 737    if (runstate_is_running()) {
 738        cpu_disable_ticks();
 739        pause_all_vcpus();
 740        runstate_set(state);
 741        vm_state_notify(0, state);
 742        qapi_event_send_stop(&error_abort);
 743    }
 744
 745    bdrv_drain_all();
 746    ret = blk_flush_all();
 747
 748    return ret;
 749}
 750
 751static bool cpu_can_run(CPUState *cpu)
 752{
 753    if (cpu->stop) {
 754        return false;
 755    }
 756    if (cpu_is_stopped(cpu)) {
 757        return false;
 758    }
 759    return true;
 760}
 761
 762static void cpu_handle_guest_debug(CPUState *cpu)
 763{
 764    gdb_set_stop_cpu(cpu);
 765    qemu_system_debug_request();
 766    cpu->stopped = true;
 767}
 768
 769#ifdef CONFIG_LINUX
 770static void sigbus_reraise(void)
 771{
 772    sigset_t set;
 773    struct sigaction action;
 774
 775    memset(&action, 0, sizeof(action));
 776    action.sa_handler = SIG_DFL;
 777    if (!sigaction(SIGBUS, &action, NULL)) {
 778        raise(SIGBUS);
 779        sigemptyset(&set);
 780        sigaddset(&set, SIGBUS);
 781        sigprocmask(SIG_UNBLOCK, &set, NULL);
 782    }
 783    perror("Failed to re-raise SIGBUS!\n");
 784    abort();
 785}
 786
 787static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
 788                           void *ctx)
 789{
 790    if (kvm_on_sigbus(siginfo->ssi_code,
 791                      (void *)(intptr_t)siginfo->ssi_addr)) {
 792        sigbus_reraise();
 793    }
 794}
 795
 796static void qemu_init_sigbus(void)
 797{
 798    struct sigaction action;
 799
 800    memset(&action, 0, sizeof(action));
 801    action.sa_flags = SA_SIGINFO;
 802    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
 803    sigaction(SIGBUS, &action, NULL);
 804
 805    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
 806}
 807
 808static void qemu_kvm_eat_signals(CPUState *cpu)
 809{
 810    struct timespec ts = { 0, 0 };
 811    siginfo_t siginfo;
 812    sigset_t waitset;
 813    sigset_t chkset;
 814    int r;
 815
 816    sigemptyset(&waitset);
 817    sigaddset(&waitset, SIG_IPI);
 818    sigaddset(&waitset, SIGBUS);
 819
 820    do {
 821        r = sigtimedwait(&waitset, &siginfo, &ts);
 822        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
 823            perror("sigtimedwait");
 824            exit(1);
 825        }
 826
 827        switch (r) {
 828        case SIGBUS:
 829            if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
 830                sigbus_reraise();
 831            }
 832            break;
 833        default:
 834            break;
 835        }
 836
 837        r = sigpending(&chkset);
 838        if (r == -1) {
 839            perror("sigpending");
 840            exit(1);
 841        }
 842    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
 843}
 844
 845#else /* !CONFIG_LINUX */
 846
 847static void qemu_init_sigbus(void)
 848{
 849}
 850
 851static void qemu_kvm_eat_signals(CPUState *cpu)
 852{
 853}
 854#endif /* !CONFIG_LINUX */
 855
 856#ifndef _WIN32
 857static void dummy_signal(int sig)
 858{
 859}
 860
 861static void qemu_kvm_init_cpu_signals(CPUState *cpu)
 862{
 863    int r;
 864    sigset_t set;
 865    struct sigaction sigact;
 866
 867    memset(&sigact, 0, sizeof(sigact));
 868    sigact.sa_handler = dummy_signal;
 869    sigaction(SIG_IPI, &sigact, NULL);
 870
 871    pthread_sigmask(SIG_BLOCK, NULL, &set);
 872    sigdelset(&set, SIG_IPI);
 873    sigdelset(&set, SIGBUS);
 874    r = kvm_set_signal_mask(cpu, &set);
 875    if (r) {
 876        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
 877        exit(1);
 878    }
 879}
 880
 881#else /* _WIN32 */
 882static void qemu_kvm_init_cpu_signals(CPUState *cpu)
 883{
 884    abort();
 885}
 886#endif /* _WIN32 */
 887
 888static QemuMutex qemu_global_mutex;
 889static QemuCond qemu_io_proceeded_cond;
 890static unsigned iothread_requesting_mutex;
 891
 892static QemuThread io_thread;
 893
 894/* cpu creation */
 895static QemuCond qemu_cpu_cond;
 896/* system init */
 897static QemuCond qemu_pause_cond;
 898static QemuCond qemu_work_cond;
 899
 900void qemu_init_cpu_loop(void)
 901{
 902    qemu_init_sigbus();
 903    qemu_cond_init(&qemu_cpu_cond);
 904    qemu_cond_init(&qemu_pause_cond);
 905    qemu_cond_init(&qemu_work_cond);
 906    qemu_cond_init(&qemu_io_proceeded_cond);
 907    qemu_mutex_init(&qemu_global_mutex);
 908
 909    qemu_thread_get_self(&io_thread);
 910}
 911
 912void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
 913{
 914    struct qemu_work_item wi;
 915
 916    if (qemu_cpu_is_self(cpu)) {
 917        func(data);
 918        return;
 919    }
 920
 921    wi.func = func;
 922    wi.data = data;
 923    wi.free = false;
 924
 925    qemu_mutex_lock(&cpu->work_mutex);
 926    if (cpu->queued_work_first == NULL) {
 927        cpu->queued_work_first = &wi;
 928    } else {
 929        cpu->queued_work_last->next = &wi;
 930    }
 931    cpu->queued_work_last = &wi;
 932    wi.next = NULL;
 933    wi.done = false;
 934    qemu_mutex_unlock(&cpu->work_mutex);
 935
 936    qemu_cpu_kick(cpu);
 937    while (!atomic_mb_read(&wi.done)) {
 938        CPUState *self_cpu = current_cpu;
 939
 940        qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
 941        current_cpu = self_cpu;
 942    }
 943}
 944
 945void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
 946{
 947    struct qemu_work_item *wi;
 948
 949    if (qemu_cpu_is_self(cpu)) {
 950        func(data);
 951        return;
 952    }
 953
 954    wi = g_malloc0(sizeof(struct qemu_work_item));
 955    wi->func = func;
 956    wi->data = data;
 957    wi->free = true;
 958
 959    qemu_mutex_lock(&cpu->work_mutex);
 960    if (cpu->queued_work_first == NULL) {
 961        cpu->queued_work_first = wi;
 962    } else {
 963        cpu->queued_work_last->next = wi;
 964    }
 965    cpu->queued_work_last = wi;
 966    wi->next = NULL;
 967    wi->done = false;
 968    qemu_mutex_unlock(&cpu->work_mutex);
 969
 970    qemu_cpu_kick(cpu);
 971}
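    /* Usage sketch (hypothetical payload): run_on_cpu() blocks until the work
     * item has run in the target vCPU's context, while async_run_on_cpu() is
     * the fire-and-forget variant used by cpu_throttle_timer_tick() below:
     *
     *     static void set_halted(void *data)    // hypothetical helper
     *     {
     *         CPUState *cpu = data;
     *         cpu->halted = 1;
     *     }
     *     ...
     *     run_on_cpu(cpu, set_halted, cpu);
     */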
 972
 973static void flush_queued_work(CPUState *cpu)
 974{
 975    struct qemu_work_item *wi;
 976
 977    if (cpu->queued_work_first == NULL) {
 978        return;
 979    }
 980
 981    qemu_mutex_lock(&cpu->work_mutex);
 982    while (cpu->queued_work_first != NULL) {
 983        wi = cpu->queued_work_first;
 984        cpu->queued_work_first = wi->next;
 985        if (!cpu->queued_work_first) {
 986            cpu->queued_work_last = NULL;
 987        }
 988        qemu_mutex_unlock(&cpu->work_mutex);
 989        wi->func(wi->data);
 990        qemu_mutex_lock(&cpu->work_mutex);
 991        if (wi->free) {
 992            g_free(wi);
 993        } else {
 994            atomic_mb_set(&wi->done, true);
 995        }
 996    }
 997    qemu_mutex_unlock(&cpu->work_mutex);
 998    qemu_cond_broadcast(&qemu_work_cond);
 999}
1000
1001static void qemu_wait_io_event_common(CPUState *cpu)
1002{
1003    if (cpu->stop) {
1004        cpu->stop = false;
1005        cpu->stopped = true;
1006        qemu_cond_broadcast(&qemu_pause_cond);
1007    }
1008    flush_queued_work(cpu);
1009    cpu->thread_kicked = false;
1010}
1011
1012static void qemu_tcg_wait_io_event(CPUState *cpu)
1013{
1014    while (all_cpu_threads_idle()) {
1015        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1016    }
1017
1018    while (iothread_requesting_mutex) {
1019        qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
1020    }
1021
1022    CPU_FOREACH(cpu) {
1023        qemu_wait_io_event_common(cpu);
1024    }
1025}
1026
1027static void qemu_kvm_wait_io_event(CPUState *cpu)
1028{
1029    while (cpu_thread_is_idle(cpu)) {
1030        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1031    }
1032
1033    qemu_kvm_eat_signals(cpu);
1034    qemu_wait_io_event_common(cpu);
1035}
1036
1037static void *qemu_kvm_cpu_thread_fn(void *arg)
1038{
1039    CPUState *cpu = arg;
1040    int r;
1041
1042    rcu_register_thread();
1043
1044    qemu_mutex_lock_iothread();
1045    qemu_thread_get_self(cpu->thread);
1046    cpu->thread_id = qemu_get_thread_id();
1047    cpu->can_do_io = 1;
1048    current_cpu = cpu;
1049
1050    r = kvm_init_vcpu(cpu);
1051    if (r < 0) {
1052        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
1053        exit(1);
1054    }
1055
1056    qemu_kvm_init_cpu_signals(cpu);
1057
1058    /* signal CPU creation */
1059    cpu->created = true;
1060    qemu_cond_signal(&qemu_cpu_cond);
1061
1062    while (1) {
1063        if (cpu_can_run(cpu)) {
1064            r = kvm_cpu_exec(cpu);
1065            if (r == EXCP_DEBUG) {
1066                cpu_handle_guest_debug(cpu);
1067            }
1068        }
1069        qemu_kvm_wait_io_event(cpu);
1070    }
1071
1072    return NULL;
1073}
1074
1075static void *qemu_dummy_cpu_thread_fn(void *arg)
1076{
1077#ifdef _WIN32
1078    fprintf(stderr, "qtest is not supported under Windows\n");
1079    exit(1);
1080#else
1081    CPUState *cpu = arg;
1082    sigset_t waitset;
1083    int r;
1084
1085    rcu_register_thread();
1086
1087    qemu_mutex_lock_iothread();
1088    qemu_thread_get_self(cpu->thread);
1089    cpu->thread_id = qemu_get_thread_id();
1090    cpu->can_do_io = 1;
1091
1092    sigemptyset(&waitset);
1093    sigaddset(&waitset, SIG_IPI);
1094
1095    /* signal CPU creation */
1096    cpu->created = true;
1097    qemu_cond_signal(&qemu_cpu_cond);
1098
1099    current_cpu = cpu;
1100    while (1) {
1101        current_cpu = NULL;
1102        qemu_mutex_unlock_iothread();
1103        do {
1104            int sig;
1105            r = sigwait(&waitset, &sig);
1106        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1107        if (r == -1) {
1108            perror("sigwait");
1109            exit(1);
1110        }
1111        qemu_mutex_lock_iothread();
1112        current_cpu = cpu;
1113        qemu_wait_io_event_common(cpu);
1114    }
1115
1116    return NULL;
1117#endif
1118}
1119
1120static void tcg_exec_all(void);
1121
1122static void *qemu_tcg_cpu_thread_fn(void *arg)
1123{
1124    CPUState *cpu = arg;
1125
1126    rcu_register_thread();
1127
1128    qemu_mutex_lock_iothread();
1129    qemu_thread_get_self(cpu->thread);
1130
1131    CPU_FOREACH(cpu) {
1132        cpu->thread_id = qemu_get_thread_id();
1133        cpu->created = true;
1134        cpu->can_do_io = 1;
1135    }
1136    qemu_cond_signal(&qemu_cpu_cond);
1137
1138    /* wait for initial kick-off after machine start */
1139    while (first_cpu->stopped) {
1140        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1141
1142        /* process any pending work */
1143        CPU_FOREACH(cpu) {
1144            qemu_wait_io_event_common(cpu);
1145        }
1146    }
1147
1148    /* start with exit_request set so pending work is processed promptly */
1149    atomic_mb_set(&exit_request, 1);
1150
1151    while (1) {
1152        tcg_exec_all();
1153
1154        if (use_icount) {
1155            int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1156
1157            if (deadline == 0) {
1158                qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1159            }
1160        }
1161        qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
1162    }
1163
1164    return NULL;
1165}
1166
1167static void qemu_cpu_kick_thread(CPUState *cpu)
1168{
1169#ifndef _WIN32
1170    int err;
1171
1172    if (cpu->thread_kicked) {
1173        return;
1174    }
1175    cpu->thread_kicked = true;
1176    err = pthread_kill(cpu->thread->thread, SIG_IPI);
1177    if (err) {
1178        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1179        exit(1);
1180    }
1181#else /* _WIN32 */
1182    abort();
1183#endif
1184}
1185
1186static void qemu_cpu_kick_no_halt(void)
1187{
1188    CPUState *cpu;
1189    /* Ensure whatever caused the exit has reached the CPU threads before
1190     * writing exit_request.
1191     */
1192    atomic_mb_set(&exit_request, 1);
1193    cpu = atomic_mb_read(&tcg_current_cpu);
1194    if (cpu) {
1195        cpu_exit(cpu);
1196    }
1197}
1198
1199void qemu_cpu_kick(CPUState *cpu)
1200{
1201    qemu_cond_broadcast(cpu->halt_cond);
1202    if (tcg_enabled()) {
1203        qemu_cpu_kick_no_halt();
1204    } else {
1205        qemu_cpu_kick_thread(cpu);
1206    }
1207}
1208
1209void qemu_cpu_kick_self(void)
1210{
1211    assert(current_cpu);
1212    qemu_cpu_kick_thread(current_cpu);
1213}
1214
1215bool qemu_cpu_is_self(CPUState *cpu)
1216{
1217    return qemu_thread_is_self(cpu->thread);
1218}
1219
1220bool qemu_in_vcpu_thread(void)
1221{
1222    return current_cpu && qemu_cpu_is_self(current_cpu);
1223}
1224
1225static __thread bool iothread_locked = false;
1226
1227bool qemu_mutex_iothread_locked(void)
1228{
1229    return iothread_locked;
1230}
1231
1232void qemu_mutex_lock_iothread(void)
1233{
1234    atomic_inc(&iothread_requesting_mutex);
1235    /* In the simple case there is no need to bump the VCPU thread out of
1236     * TCG code execution.
1237     */
1238    if (!tcg_enabled() || qemu_in_vcpu_thread() ||
1239        !first_cpu || !first_cpu->created) {
1240        qemu_mutex_lock(&qemu_global_mutex);
1241        atomic_dec(&iothread_requesting_mutex);
1242    } else {
1243        if (qemu_mutex_trylock(&qemu_global_mutex)) {
1244            qemu_cpu_kick_no_halt();
1245            qemu_mutex_lock(&qemu_global_mutex);
1246        }
1247        atomic_dec(&iothread_requesting_mutex);
1248        qemu_cond_broadcast(&qemu_io_proceeded_cond);
1249    }
1250    iothread_locked = true;
1251}
1252
1253void qemu_mutex_unlock_iothread(void)
1254{
1255    iothread_locked = false;
1256    qemu_mutex_unlock(&qemu_global_mutex);
1257}
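    /* Usage sketch: a thread running outside the BQL (e.g. a vCPU thread
     * about to touch the device model) brackets the access like this:
     *
     *     qemu_mutex_lock_iothread();
     *     ...access main-loop / device state...
     *     qemu_mutex_unlock_iothread();
     *
     * qemu_mutex_iothread_locked() lets shared code avoid recursive locking.
     */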
1258
1259static int all_vcpus_paused(void)
1260{
1261    CPUState *cpu;
1262
1263    CPU_FOREACH(cpu) {
1264        if (!cpu->stopped) {
1265            return 0;
1266        }
1267    }
1268
1269    return 1;
1270}
1271
1272void pause_all_vcpus(void)
1273{
1274    CPUState *cpu;
1275
1276    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1277    CPU_FOREACH(cpu) {
1278        cpu->stop = true;
1279        qemu_cpu_kick(cpu);
1280    }
1281
1282    if (qemu_in_vcpu_thread()) {
1283        cpu_stop_current();
1284        if (!kvm_enabled()) {
1285            CPU_FOREACH(cpu) {
1286                cpu->stop = false;
1287                cpu->stopped = true;
1288            }
1289            return;
1290        }
1291    }
1292
1293    while (!all_vcpus_paused()) {
1294        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1295        CPU_FOREACH(cpu) {
1296            qemu_cpu_kick(cpu);
1297        }
1298    }
1299}
1300
1301void cpu_resume(CPUState *cpu)
1302{
1303    cpu->stop = false;
1304    cpu->stopped = false;
1305    qemu_cpu_kick(cpu);
1306}
1307
1308void resume_all_vcpus(void)
1309{
1310    CPUState *cpu;
1311
1312    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1313    CPU_FOREACH(cpu) {
1314        cpu_resume(cpu);
1315    }
1316}
1317
1318/* For temporary buffers used to form vCPU thread names */
1319#define VCPU_THREAD_NAME_SIZE 16
1320
1321static void qemu_tcg_init_vcpu(CPUState *cpu)
1322{
1323    char thread_name[VCPU_THREAD_NAME_SIZE];
1324    static QemuCond *tcg_halt_cond;
1325    static QemuThread *tcg_cpu_thread;
1326
1327    /* share a single thread for all cpus with TCG */
1328    if (!tcg_cpu_thread) {
1329        cpu->thread = g_malloc0(sizeof(QemuThread));
1330        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1331        qemu_cond_init(cpu->halt_cond);
1332        tcg_halt_cond = cpu->halt_cond;
1333        snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1334                 cpu->cpu_index);
1335        qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1336                           cpu, QEMU_THREAD_JOINABLE);
1337#ifdef _WIN32
1338        cpu->hThread = qemu_thread_get_handle(cpu->thread);
1339#endif
1340        while (!cpu->created) {
1341            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1342        }
1343        tcg_cpu_thread = cpu->thread;
1344    } else {
1345        cpu->thread = tcg_cpu_thread;
1346        cpu->halt_cond = tcg_halt_cond;
1347    }
1348}
1349
1350static void qemu_kvm_start_vcpu(CPUState *cpu)
1351{
1352    char thread_name[VCPU_THREAD_NAME_SIZE];
1353
1354    cpu->thread = g_malloc0(sizeof(QemuThread));
1355    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1356    qemu_cond_init(cpu->halt_cond);
1357    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1358             cpu->cpu_index);
1359    qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1360                       cpu, QEMU_THREAD_JOINABLE);
1361    while (!cpu->created) {
1362        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1363    }
1364}
1365
1366static void qemu_dummy_start_vcpu(CPUState *cpu)
1367{
1368    char thread_name[VCPU_THREAD_NAME_SIZE];
1369
1370    cpu->thread = g_malloc0(sizeof(QemuThread));
1371    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1372    qemu_cond_init(cpu->halt_cond);
1373    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1374             cpu->cpu_index);
1375    qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1376                       QEMU_THREAD_JOINABLE);
1377    while (!cpu->created) {
1378        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
1379    }
1380}
1381
1382void qemu_init_vcpu(CPUState *cpu)
1383{
1384    cpu->nr_cores = smp_cores;
1385    cpu->nr_threads = smp_threads;
1386    cpu->stopped = true;
1387
1388    if (!cpu->as) {
1389        /* If the target cpu hasn't set up any address spaces itself,
1390         * give it the default one.
1391         */
1392        AddressSpace *as = address_space_init_shareable(cpu->memory,
1393                                                        "cpu-memory");
1394        cpu->num_ases = 1;
1395        cpu_address_space_init(cpu, as, 0);
1396    }
1397
1398    if (kvm_enabled()) {
1399        qemu_kvm_start_vcpu(cpu);
1400    } else if (tcg_enabled()) {
1401        qemu_tcg_init_vcpu(cpu);
1402    } else {
1403        qemu_dummy_start_vcpu(cpu);
1404    }
1405}
1406
1407void cpu_stop_current(void)
1408{
1409    if (current_cpu) {
1410        current_cpu->stop = false;
1411        current_cpu->stopped = true;
1412        cpu_exit(current_cpu);
1413        qemu_cond_broadcast(&qemu_pause_cond);
1414    }
1415}
1416
1417int vm_stop(RunState state)
1418{
1419    if (qemu_in_vcpu_thread()) {
1420        qemu_system_vmstop_request_prepare();
1421        qemu_system_vmstop_request(state);
1422        /*
1423         * FIXME: should not return to device code in case
1424         * vm_stop() has been requested.
1425         */
1426        cpu_stop_current();
1427        return 0;
1428    }
1429
1430    return do_vm_stop(state);
1431}
1432
1433/* Does a state transition even if the VM is already stopped; the
1434   current state is forgotten forever.  */
1435int vm_stop_force_state(RunState state)
1436{
1437    if (runstate_is_running()) {
1438        return vm_stop(state);
1439    } else {
1440        runstate_set(state);
1441
1442        bdrv_drain_all();
1443        /* Make sure to return an error if the flush in a previous vm_stop()
1444         * failed. */
1445        return blk_flush_all();
1446    }
1447}
1448
1449static int64_t tcg_get_icount_limit(void)
1450{
1451    int64_t deadline;
1452
1453    if (replay_mode != REPLAY_MODE_PLAY) {
1454        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1455
1456        /* Maintain prior (possibly buggy) behaviour where if no deadline
1457         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1458         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1459         * nanoseconds.
1460         */
1461        if ((deadline < 0) || (deadline > INT32_MAX)) {
1462            deadline = INT32_MAX;
1463        }
1464
1465        return qemu_icount_round(deadline);
1466    } else {
1467        return replay_get_instructions();
1468    }
1469}
1470
1471static int tcg_cpu_exec(CPUState *cpu)
1472{
1473    int ret;
1474#ifdef CONFIG_PROFILER
1475    int64_t ti;
1476#endif
1477
1478#ifdef CONFIG_PROFILER
1479    ti = profile_getclock();
1480#endif
1481    if (use_icount) {
1482        int64_t count;
1483        int decr;
1484        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1485                                    + cpu->icount_extra);
1486        cpu->icount_decr.u16.low = 0;
1487        cpu->icount_extra = 0;
1488        count = tcg_get_icount_limit();
1489        timers_state.qemu_icount += count;
1490        decr = (count > 0xffff) ? 0xffff : count;
1491        count -= decr;
1492        cpu->icount_decr.u16.low = decr;
1493        cpu->icount_extra = count;
1494    }
1495    ret = cpu_exec(cpu);
1496#ifdef CONFIG_PROFILER
1497    tcg_time += profile_getclock() - ti;
1498#endif
1499    if (use_icount) {
1500        /* Fold pending instructions back into the
1501           instruction counter, and clear the interrupt flag.  */
1502        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
1503                        + cpu->icount_extra);
1504        cpu->icount_decr.u32 = 0;
1505        cpu->icount_extra = 0;
1506        replay_account_executed_instructions();
1507    }
1508    return ret;
1509}
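    /* Worked example (illustrative comment only): if tcg_get_icount_limit()
     * returns 0x12345 instructions, the 16-bit budget becomes decr = 0xffff
     * and the remaining 0x12345 - 0xffff = 0x2346 instructions are parked in
     * cpu->icount_extra for later iterations.
     */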
1510
1511static void tcg_exec_all(void)
1512{
1513    int r;
1514
1515    /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1516    qemu_account_warp_timer();
1517
1518    if (next_cpu == NULL) {
1519        next_cpu = first_cpu;
1520    }
1521    for (; next_cpu != NULL && !exit_request; next_cpu = CPU_NEXT(next_cpu)) {
1522        CPUState *cpu = next_cpu;
1523
1524        qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1525                          (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1526
1527        if (cpu_can_run(cpu)) {
1528            r = tcg_cpu_exec(cpu);
1529            if (r == EXCP_DEBUG) {
1530                cpu_handle_guest_debug(cpu);
1531                break;
1532            }
1533        } else if (cpu->stop || cpu->stopped) {
1534            break;
1535        }
1536    }
1537
1538    /* Pairs with smp_wmb in qemu_cpu_kick.  */
1539    atomic_mb_set(&exit_request, 0);
1540}
1541
1542void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
1543{
1544    /* XXX: implement xxx_cpu_list for targets that still lack it */
1545#if defined(cpu_list)
1546    cpu_list(f, cpu_fprintf);
1547#endif
1548}
1549
1550CpuInfoList *qmp_query_cpus(Error **errp)
1551{
1552    CpuInfoList *head = NULL, *cur_item = NULL;
1553    CPUState *cpu;
1554
1555    CPU_FOREACH(cpu) {
1556        CpuInfoList *info;
1557#if defined(TARGET_I386)
1558        X86CPU *x86_cpu = X86_CPU(cpu);
1559        CPUX86State *env = &x86_cpu->env;
1560#elif defined(TARGET_PPC)
1561        PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
1562        CPUPPCState *env = &ppc_cpu->env;
1563#elif defined(TARGET_SPARC)
1564        SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
1565        CPUSPARCState *env = &sparc_cpu->env;
1566#elif defined(TARGET_MIPS)
1567        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
1568        CPUMIPSState *env = &mips_cpu->env;
1569#elif defined(TARGET_TRICORE)
1570        TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
1571        CPUTriCoreState *env = &tricore_cpu->env;
1572#endif
1573
1574        cpu_synchronize_state(cpu);
1575
1576        info = g_malloc0(sizeof(*info));
1577        info->value = g_malloc0(sizeof(*info->value));
1578        info->value->CPU = cpu->cpu_index;
1579        info->value->current = (cpu == first_cpu);
1580        info->value->halted = cpu->halted;
1581        info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
1582        info->value->thread_id = cpu->thread_id;
1583#if defined(TARGET_I386)
1584        info->value->arch = CPU_INFO_ARCH_X86;
1585        info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
1586#elif defined(TARGET_PPC)
1587        info->value->arch = CPU_INFO_ARCH_PPC;
1588        info->value->u.ppc.nip = env->nip;
1589#elif defined(TARGET_SPARC)
1590        info->value->arch = CPU_INFO_ARCH_SPARC;
1591        info->value->u.q_sparc.pc = env->pc;
1592        info->value->u.q_sparc.npc = env->npc;
1593#elif defined(TARGET_MIPS)
1594        info->value->arch = CPU_INFO_ARCH_MIPS;
1595        info->value->u.q_mips.PC = env->active_tc.PC;
1596#elif defined(TARGET_TRICORE)
1597        info->value->arch = CPU_INFO_ARCH_TRICORE;
1598        info->value->u.tricore.PC = env->PC;
1599#else
1600        info->value->arch = CPU_INFO_ARCH_OTHER;
1601#endif
1602
1603        /* XXX: waiting for the qapi to support GSList */
1604        if (!cur_item) {
1605            head = cur_item = info;
1606        } else {
1607            cur_item->next = info;
1608            cur_item = info;
1609        }
1610    }
1611
1612    return head;
1613}
1614
1615void qmp_memsave(int64_t addr, int64_t size, const char *filename,
1616                 bool has_cpu, int64_t cpu_index, Error **errp)
1617{
1618    FILE *f;
1619    uint32_t l;
1620    CPUState *cpu;
1621    uint8_t buf[1024];
1622    int64_t orig_addr = addr, orig_size = size;
1623
1624    if (!has_cpu) {
1625        cpu_index = 0;
1626    }
1627
1628    cpu = qemu_get_cpu(cpu_index);
1629    if (cpu == NULL) {
1630        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
1631                   "a CPU number");
1632        return;
1633    }
1634
1635    f = fopen(filename, "wb");
1636    if (!f) {
1637        error_setg_file_open(errp, errno, filename);
1638        return;
1639    }
1640
1641    while (size != 0) {
1642        l = sizeof(buf);
1643        if (l > size)
1644            l = size;
1645        if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
1646            error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
1647                             " specified", orig_addr, orig_size);
1648            goto exit;
1649        }
1650        if (fwrite(buf, 1, l, f) != l) {
1651            error_setg(errp, QERR_IO_ERROR);
1652            goto exit;
1653        }
1654        addr += l;
1655        size -= l;
1656    }
1657
1658exit:
1659    fclose(f);
1660}
1661
1662void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
1663                  Error **errp)
1664{
1665    FILE *f;
1666    uint32_t l;
1667    uint8_t buf[1024];
1668
1669    f = fopen(filename, "wb");
1670    if (!f) {
1671        error_setg_file_open(errp, errno, filename);
1672        return;
1673    }
1674
1675    while (size != 0) {
1676        l = sizeof(buf);
1677        if (l > size)
1678            l = size;
1679        cpu_physical_memory_read(addr, buf, l);
1680        if (fwrite(buf, 1, l, f) != l) {
1681            error_setg(errp, QERR_IO_ERROR);
1682            goto exit;
1683        }
1684        addr += l;
1685        size -= l;
1686    }
1687
1688exit:
1689    fclose(f);
1690}
1691
1692void qmp_inject_nmi(Error **errp)
1693{
1694#if defined(TARGET_I386)
1695    CPUState *cs;
1696
1697    CPU_FOREACH(cs) {
1698        X86CPU *cpu = X86_CPU(cs);
1699
1700        if (!cpu->apic_state) {
1701            cpu_interrupt(cs, CPU_INTERRUPT_NMI);
1702        } else {
1703            apic_deliver_nmi(cpu->apic_state);
1704        }
1705    }
1706#else
1707    nmi_monitor_handle(monitor_get_cpu_index(), errp);
1708#endif
1709}
1710
1711void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
1712{
1713    if (!use_icount) {
1714        return;
1715    }
1716
1717    cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
1718                (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
1719    if (icount_align_option) {
1720        cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
1721        cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
1722    } else {
1723        cpu_fprintf(f, "Max guest delay     NA\n");
1724        cpu_fprintf(f, "Max guest advance   NA\n");
1725    }
1726}
1727