qemu/cpus.c
/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

/* Needed early for CONFIG_BSD etc. */
#include "qemu/osdep.h"

#include "monitor/monitor.h"
#include "qapi/qmp/qerror.h"
#include "qemu/error-report.h"
#include "sysemu/sysemu.h"
#include "sysemu/block-backend.h"
#include "exec/gdbstub.h"
#include "sysemu/dma.h"
#include "sysemu/kvm.h"
#include "qmp-commands.h"

#include "qemu/thread.h"
#include "sysemu/cpus.h"
#include "sysemu/qtest.h"
#include "qemu/main-loop.h"
#include "qemu/bitmap.h"
#include "qemu/seqlock.h"
#include "qapi-event.h"
#include "hw/nmi.h"
#include "sysemu/replay.h"

#ifndef _WIN32
#include "qemu/compatfd.h"
#endif

#ifdef CONFIG_LINUX

#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */

static CPUState *next_cpu;
int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
static QEMUTimer *throttle_timer;
static unsigned int throttle_percentage;

#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
#define CPU_THROTTLE_TIMESLICE_NS 10000000

bool cpu_is_stopped(CPUState *cpu)
{
    return cpu->stopped || !runstate_is_running();
}

static bool cpu_thread_is_idle(CPUState *cpu)
{
    if (cpu->stop || cpu->queued_work_first) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return true;
    }
    if (!cpu->halted || cpu_has_work(cpu) ||
        kvm_halt_in_kernel()) {
        return false;
    }
    return true;
}

bool all_cpu_threads_idle(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu_thread_is_idle(cpu)) {
            return false;
        }
    }
    return true;
}

/***********************************************************/
/* guest cycle counter */

/* Protected by TimersState seqlock */

static bool icount_sleep = true;
static int64_t vm_clock_warp_start = -1;
/* Conversion factor from emulated instructions to virtual clock ticks.  */
static int icount_time_shift;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
#define MAX_ICOUNT_SHIFT 10
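
/* Worked example of the shift: one emulated instruction accounts for
 * 2^icount_time_shift virtual nanoseconds (see cpu_icount_to_ns()).
 * shift=0 models 1 ns/insn (1000 MIPS), the shift=3 default set in
 * configure_icount() models 8 ns/insn (125 MIPS), and the maximum shift
 * of 10 models 1024 ns/insn, roughly the 1 MIPS floor mentioned above.
 */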

static QEMUTimer *icount_rt_timer;
static QEMUTimer *icount_vm_timer;
static QEMUTimer *icount_warp_timer;

typedef struct TimersState {
    /* Protected by BQL.  */
    int64_t cpu_ticks_prev;
    int64_t cpu_ticks_offset;

    /* cpu_clock_offset can be read out of BQL, so protect it with
     * this lock.
     */
    QemuSeqLock vm_clock_seqlock;
    int64_t cpu_clock_offset;
    int32_t cpu_ticks_enabled;
    int64_t dummy;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;
    /* Only written by TCG thread */
    int64_t qemu_icount;
} TimersState;

static TimersState timers_state;

int64_t cpu_get_icount_raw(void)
{
    int64_t icount;
    CPUState *cpu = current_cpu;

    icount = timers_state.qemu_icount;
    if (cpu) {
        if (!cpu->can_do_io) {
            fprintf(stderr, "Bad icount read\n");
            exit(1);
        }
        icount -= (cpu->icount_decr.u16.low + cpu->icount_extra);
    }
    return icount;
}

/* Return the virtual CPU time, based on the instruction counter.  */
static int64_t cpu_get_icount_locked(void)
{
    int64_t icount = cpu_get_icount_raw();
    return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
}

int64_t cpu_get_icount(void)
{
    int64_t icount;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        icount = cpu_get_icount_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return icount;
}
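
/* The read side above is the usual seqlock pattern: sample the sequence
 * count, read the protected fields, and retry if seqlock_read_retry()
 * reports that a writer ran concurrently.  Readers never block, which is
 * what makes cpu_get_icount() usable outside the BQL; the writers
 * (icount_adjust(), the warp paths) serialize on the BQL instead.
 */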

int64_t cpu_icount_to_ns(int64_t icount)
{
    return icount << icount_time_shift;
}

/* return the host CPU cycle counter and handle stop/restart */
/* Caller must hold the BQL */
int64_t cpu_get_ticks(void)
{
    int64_t ticks;

    if (use_icount) {
        return cpu_get_icount();
    }

    ticks = timers_state.cpu_ticks_offset;
    if (timers_state.cpu_ticks_enabled) {
        ticks += cpu_get_host_ticks();
    }

    if (timers_state.cpu_ticks_prev > ticks) {
        /* Note: non-increasing ticks may happen if the host uses
           software suspend.  */
        timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
        ticks = timers_state.cpu_ticks_prev;
    }

    timers_state.cpu_ticks_prev = ticks;
    return ticks;
}

static int64_t cpu_get_clock_locked(void)
{
    int64_t ticks;

    ticks = timers_state.cpu_clock_offset;
    if (timers_state.cpu_ticks_enabled) {
        ticks += get_clock();
    }

    return ticks;
}

/* return the host CPU monotonic timer and handle stop/restart */
int64_t cpu_get_clock(void)
{
    int64_t ti;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        ti = cpu_get_clock_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return ti;
}

/* enable cpu_get_ticks()
 * Caller must hold the BQL, which serves as the mutex for vm_clock_seqlock.
 */
void cpu_enable_ticks(void)
{
    /* Here, the real thing protected by the seqlock is cpu_clock_offset. */
    seqlock_write_lock(&timers_state.vm_clock_seqlock);
    if (!timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
        timers_state.cpu_clock_offset -= get_clock();
        timers_state.cpu_ticks_enabled = 1;
    }
    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
}

/* disable cpu_get_ticks(): the clock is stopped.  You must not call
 * cpu_get_ticks() after that.
 * Caller must hold the BQL, which serves as the mutex for vm_clock_seqlock.
 */
void cpu_disable_ticks(void)
{
    /* Here, the real thing protected by the seqlock is cpu_clock_offset. */
    seqlock_write_lock(&timers_state.vm_clock_seqlock);
    if (timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
        timers_state.cpu_clock_offset = cpu_get_clock_locked();
        timers_state.cpu_ticks_enabled = 0;
    }
    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
}

/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop.  */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)

static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;

    /* Protected by TimersState mutex.  */
    static int64_t last_delta;

    /* If the VM is not running, then do nothing.  */
    if (!runstate_is_running()) {
        return;
    }

    seqlock_write_lock(&timers_state.vm_clock_seqlock);
    cur_time = cpu_get_clock_locked();
    cur_icount = cpu_get_icount_locked();

    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && icount_time_shift > 0) {
        /* The guest is getting too far ahead.  Slow time down.  */
        icount_time_shift--;
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind.  Speed time up.  */
        icount_time_shift++;
    }
    last_delta = delta;
    timers_state.qemu_icount_bias = cur_icount
                              - (timers_state.qemu_icount << icount_time_shift);
    seqlock_write_unlock(&timers_state.vm_clock_seqlock);
}
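
/* The feedback loop above in words: delta > 0 means the icount clock is
 * running ahead of real time, so decrementing the shift makes each
 * instruction account for fewer virtual nanoseconds and virtual time
 * slows down; a persistently negative delta speeds it up instead.  The
 * bias is then recomputed so the externally visible clock value does not
 * jump at the moment the scaling factor changes.
 */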

static void icount_adjust_rt(void *opaque)
{
    timer_mod(icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_adjust();
}

static void icount_adjust_vm(void *opaque)
{
    timer_mod(icount_vm_timer,
                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
                   NANOSECONDS_PER_SECOND / 10);
    icount_adjust();
}

static int64_t qemu_icount_round(int64_t count)
{
    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
}
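
/* qemu_icount_round() is a ceiling division by 2^icount_time_shift,
 * converting a nanosecond deadline into a whole number of instructions.
 * For example, with shift=3 (8 ns/insn) a 20 ns deadline rounds up to
 * (20 + 7) >> 3 = 3 instructions, so execution never stops short of a
 * timer deadline because of truncation.
 */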

static bool icount_idle_timewarps = true;
void qemu_icount_enable_idle_timewarps(bool enable)
{
    icount_idle_timewarps = enable;
}

static void icount_warp_rt(void)
{
    unsigned seq;
    int64_t warp_start;

    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
     * changes from -1 to another value, so the race here is okay.
     */
    do {
        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        warp_start = vm_clock_warp_start;
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));

    if (warp_start == -1) {
        return;
    }

    seqlock_write_lock(&timers_state.vm_clock_seqlock);
    if (runstate_is_running()) {
        int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
                                     cpu_get_clock_locked());
        int64_t warp_delta;

        warp_delta = clock - vm_clock_warp_start;
        if (use_icount == 2) {
            /*
             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
             * far ahead of real time.
             */
            int64_t cur_icount = cpu_get_icount_locked();
            int64_t delta = clock - cur_icount;
            warp_delta = MIN(warp_delta, delta);
        }
        timers_state.qemu_icount_bias += warp_delta;
    }
    vm_clock_warp_start = -1;
    seqlock_write_unlock(&timers_state.vm_clock_seqlock);

    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

static void icount_timer_cb(void *opaque)
{
    /* No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}

void tcg_clock_warp(int64_t dest)
{
    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);

    if (clock < dest) {
        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    }
    qemu_notify_event();
}

bool tcg_idle_clock_warp(int64_t dest)
{
    if (!all_cpu_threads_idle()) {
        return false;
    }

    tcg_clock_warp(dest);
    return true;
}

void qtest_clock_warp(int64_t dest)
{
    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    AioContext *aio_context;
    assert(qtest_enabled());
    aio_context = qemu_get_aio_context();
    while (clock < dest) {
        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);

        seqlock_write_lock(&timers_state.vm_clock_seqlock);
        timers_state.qemu_icount_bias += warp;
        seqlock_write_unlock(&timers_state.vm_clock_seqlock);

        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    }
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
}
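
/* Unlike the warp-timer path below, qtest_clock_warp() advances the
 * clock synchronously: each loop iteration adds at most one timer
 * deadline's worth of nanoseconds to qemu_icount_bias and then runs the
 * timers that became due, so tests reach 'dest' without any real time
 * passing.
 */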

void qemu_start_warp_timer(void)
{
    int64_t clock;
    int64_t deadline;

    if (!use_icount) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
        return;
    }

    if (!all_cpu_threads_idle()) {
        return;
    }

    if (qtest_enabled()) {
        /* When testing, qtest commands advance icount.  */
        return;
    }

    /* We want to use the earliest deadline from ALL vm_clocks */
    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
    if (deadline < 0) {
        static bool notified;
        if (!icount_sleep && !notified) {
            error_report("WARNING: icount sleep disabled and no active timers");
            notified = true;
        }
        return;
    }

    if (deadline > 0) {
        /*
         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
         * sleep.  Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * QEMU_CLOCK_VIRTUAL.
         */
        if (!icount_sleep) {
            /*
             * We never let VCPUs sleep in no-sleep icount mode.
             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
             * This is useful when we want a deterministic execution time,
             * isolated from host latencies.
             */
            seqlock_write_lock(&timers_state.vm_clock_seqlock);
            timers_state.qemu_icount_bias += deadline;
            seqlock_write_unlock(&timers_state.vm_clock_seqlock);
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        } else {
            /*
             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
             * "real" time (related to the time left until the next event) has
             * passed.  The QEMU_CLOCK_VIRTUAL_RT clock will do this.
             * This prevents the warps from being visible externally; for
             * example, we will not send network packets continuously instead
             * of every 100 ms.
             */
            seqlock_write_lock(&timers_state.vm_clock_seqlock);
            if (vm_clock_warp_start == -1 || vm_clock_warp_start > clock) {
                vm_clock_warp_start = clock;
            }
            seqlock_write_unlock(&timers_state.vm_clock_seqlock);
            timer_mod_anticipate(icount_warp_timer, clock + deadline);
        }
    } else if (deadline == 0) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

static void qemu_account_warp_timer(void)
{
    if (!use_icount || !icount_sleep) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
        return;
    }

    timer_del(icount_warp_timer);
    icount_warp_rt();
}

static bool icount_state_needed(void *opaque)
{
    return use_icount;
}

/*
 * This is a subsection for icount migration.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    }
};

static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_INT64(dummy, TimersState),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
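
/* The icount subsection above is only put on the wire when
 * icount_state_needed() returns true, so a migration stream from a
 * non-icount guest keeps the plain "timer" format and should remain
 * compatible with QEMU versions that predate the subsection.
 */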

static void cpu_throttle_thread(void *opaque)
{
    CPUState *cpu = opaque;
    double pct;
    double throttle_ratio;
    long sleeptime_ns;

    if (!cpu_throttle_get_percentage()) {
        return;
    }

    pct = (double)cpu_throttle_get_percentage()/100;
    throttle_ratio = pct / (1 - pct);
    sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);

    qemu_mutex_unlock_iothread();
    atomic_set(&cpu->throttle_thread_scheduled, 0);
    g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
    qemu_mutex_lock_iothread();
}
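
/* Worked example of the throttle math: at a 50% throttle, pct = 0.5 and
 * throttle_ratio = 0.5 / (1 - 0.5) = 1.0, so the vCPU sleeps one 10 ms
 * timeslice for every 10 ms it runs and gets half of real time.  At the
 * 99% maximum, throttle_ratio = 99 and each timeslice costs 990 ms of
 * sleep.
 */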

static void cpu_throttle_timer_tick(void *opaque)
{
    CPUState *cpu;
    double pct;

    /* Stop the timer if needed */
    if (!cpu_throttle_get_percentage()) {
        return;
    }
    CPU_FOREACH(cpu) {
        if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
            async_run_on_cpu(cpu, cpu_throttle_thread, cpu);
        }
    }

    pct = (double)cpu_throttle_get_percentage()/100;
    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                                   CPU_THROTTLE_TIMESLICE_NS / (1-pct));
}

void cpu_throttle_set(int new_throttle_pct)
{
    /* Ensure throttle percentage is within valid range */
    new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
    new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);

    atomic_set(&throttle_percentage, new_throttle_pct);

    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                                       CPU_THROTTLE_TIMESLICE_NS);
}

void cpu_throttle_stop(void)
{
    atomic_set(&throttle_percentage, 0);
}

bool cpu_throttle_active(void)
{
    return (cpu_throttle_get_percentage() != 0);
}

int cpu_throttle_get_percentage(void)
{
    return atomic_read(&throttle_percentage);
}
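
/* A minimal usage sketch of the throttle API above; the caller shown
 * here is hypothetical (in-tree, migration auto-converge is the main
 * user):
 *
 *     cpu_throttle_set(50);            // vCPUs sleep ~50% of the time
 *     if (cpu_throttle_active()) {
 *         int pct = cpu_throttle_get_percentage();
 *         ...
 *     }
 *     cpu_throttle_stop();             // back to full speed
 */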

void cpu_ticks_init(void)
{
    seqlock_init(&timers_state.vm_clock_seqlock, NULL);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                  cpu_throttle_timer_tick, NULL);
}

void configure_icount(QemuOpts *opts, Error **errp)
{
    const char *option;
    char *rem_str = NULL;

    option = qemu_opt_get(opts, "shift");
    if (!option) {
        if (qemu_opt_get(opts, "align") != NULL) {
            error_setg(errp, "Please specify shift option when using align");
        }
        return;
    }

    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
    if (icount_sleep) {
        icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                         icount_timer_cb, NULL);
    }

    icount_align_option = qemu_opt_get_bool(opts, "align", false);

    if (icount_align_option && !icount_sleep) {
        error_setg(errp, "align=on and sleep=off are incompatible");
    }
    if (strcmp(option, "auto") != 0) {
        errno = 0;
        icount_time_shift = strtol(option, &rem_str, 0);
        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
            error_setg(errp, "icount: Invalid shift value");
        }
        use_icount = 1;
        return;
    } else if (icount_align_option) {
        error_setg(errp, "shift=auto and align=on are incompatible");
    } else if (!icount_sleep) {
        error_setg(errp, "shift=auto and sleep=off are incompatible");
    }

    use_icount = 2;

    /* 125MIPS seems a reasonable initial guess at the guest speed.
       It will be corrected fairly quickly anyway.  */
    icount_time_shift = 3;

    /* Have both realtime and virtual time triggers for speed adjustment.
       The realtime trigger catches emulated time passing too slowly,
       the virtual time trigger catches emulated time passing too fast.
       Realtime triggers occur even when idle, so use them less frequently
       than VM triggers.  */
    icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
                                   icount_adjust_rt, NULL);
    timer_mod(icount_rt_timer,
                   qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                        icount_adjust_vm, NULL);
    timer_mod(icount_vm_timer,
                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
                   NANOSECONDS_PER_SECOND / 10);
}
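
/* Example command lines accepted (and rejected) by the checks above;
 * these illustrate the option grammar rather than adding new behaviour:
 *
 *     -icount shift=3                  fixed rate, 8 ns per instruction
 *     -icount shift=auto               adaptive rate (use_icount == 2)
 *     -icount shift=3,align=on         allowed; align needs a fixed shift
 *     -icount shift=auto,sleep=off     rejected as incompatible
 */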

/***********************************************************/
void hw_error(const char *fmt, ...)
{
    va_list ap;
    CPUState *cpu;

    va_start(ap, fmt);
    fprintf(stderr, "qemu: hardware error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    CPU_FOREACH(cpu) {
        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
    }
    va_end(ap);
    abort();
}

void cpu_synchronize_all_states(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_state(cpu);
    }
}

void cpu_synchronize_all_post_reset(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_reset(cpu);
    }
}

void cpu_synchronize_all_post_init(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_init(cpu);
    }
}

static int do_vm_stop(RunState state)
{
    int ret = 0;

    if (runstate_is_running()) {
        cpu_disable_ticks();
        pause_all_vcpus();
        runstate_set(state);
        vm_state_notify(0, state);
        qapi_event_send_stop(&error_abort);
    }

    bdrv_drain_all();
    ret = blk_flush_all();

    return ret;
}

static bool cpu_can_run(CPUState *cpu)
{
    if (cpu->stop) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return false;
    }
    return true;
}

static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}

#ifdef CONFIG_LINUX
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        sigprocmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}

static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo,
                           void *ctx)
{
    if (kvm_on_sigbus(siginfo->ssi_code,
                      (void *)(intptr_t)siginfo->ssi_addr)) {
        sigbus_reraise();
    }
}

static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = (void (*)(int, siginfo_t*, void*))sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}

static void qemu_kvm_eat_signals(CPUState *cpu)
{
    struct timespec ts = { 0, 0 };
    siginfo_t siginfo;
    sigset_t waitset;
    sigset_t chkset;
    int r;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);
    sigaddset(&waitset, SIGBUS);

    do {
        r = sigtimedwait(&waitset, &siginfo, &ts);
        if (r == -1 && !(errno == EAGAIN || errno == EINTR)) {
            perror("sigtimedwait");
            exit(1);
        }

        switch (r) {
        case SIGBUS:
            if (kvm_on_sigbus_vcpu(cpu, siginfo.si_code, siginfo.si_addr)) {
                sigbus_reraise();
            }
            break;
        default:
            break;
        }

        r = sigpending(&chkset);
        if (r == -1) {
            perror("sigpending");
            exit(1);
        }
    } while (sigismember(&chkset, SIG_IPI) || sigismember(&chkset, SIGBUS));
}

#else /* !CONFIG_LINUX */

static void qemu_init_sigbus(void)
{
}

static void qemu_kvm_eat_signals(CPUState *cpu)
{
}
#endif /* !CONFIG_LINUX */

#ifndef _WIN32
static void dummy_signal(int sig)
{
}

static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    int r;
    sigset_t set;
    struct sigaction sigact;

    memset(&sigact, 0, sizeof(sigact));
    sigact.sa_handler = dummy_signal;
    sigaction(SIG_IPI, &sigact, NULL);

    pthread_sigmask(SIG_BLOCK, NULL, &set);
    sigdelset(&set, SIG_IPI);
    sigdelset(&set, SIGBUS);
    r = kvm_set_signal_mask(cpu, &set);
    if (r) {
        fprintf(stderr, "kvm_set_signal_mask: %s\n", strerror(-r));
        exit(1);
    }
}

#else /* _WIN32 */
static void qemu_kvm_init_cpu_signals(CPUState *cpu)
{
    abort();
}
#endif /* _WIN32 */

static QemuMutex qemu_global_mutex;
static QemuCond qemu_io_proceeded_cond;
static unsigned iothread_requesting_mutex;

static QemuThread io_thread;

/* cpu creation */
static QemuCond qemu_cpu_cond;
/* system init */
static QemuCond qemu_pause_cond;
static QemuCond qemu_work_cond;

void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_cond_init(&qemu_work_cond);
    qemu_cond_init(&qemu_io_proceeded_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}

void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
{
    struct qemu_work_item wi;

    if (qemu_cpu_is_self(cpu)) {
        func(data);
        return;
    }

    wi.func = func;
    wi.data = data;
    wi.free = false;

    qemu_mutex_lock(&cpu->work_mutex);
    if (cpu->queued_work_first == NULL) {
        cpu->queued_work_first = &wi;
    } else {
        cpu->queued_work_last->next = &wi;
    }
    cpu->queued_work_last = &wi;
    wi.next = NULL;
    wi.done = false;
    qemu_mutex_unlock(&cpu->work_mutex);

    qemu_cpu_kick(cpu);
    while (!atomic_mb_read(&wi.done)) {
        CPUState *self_cpu = current_cpu;

        qemu_cond_wait(&qemu_work_cond, &qemu_global_mutex);
        current_cpu = self_cpu;
    }
}

void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
{
    struct qemu_work_item *wi;

    if (qemu_cpu_is_self(cpu)) {
        func(data);
        return;
    }

    wi = g_malloc0(sizeof(struct qemu_work_item));
    wi->func = func;
    wi->data = data;
    wi->free = true;

    qemu_mutex_lock(&cpu->work_mutex);
    if (cpu->queued_work_first == NULL) {
        cpu->queued_work_first = wi;
    } else {
        cpu->queued_work_last->next = wi;
    }
    cpu->queued_work_last = wi;
    wi->next = NULL;
    wi->done = false;
    qemu_mutex_unlock(&cpu->work_mutex);

    qemu_cpu_kick(cpu);
}
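
/* Contrast with run_on_cpu() above: the synchronous variant
 * stack-allocates its work item and waits on qemu_work_cond until
 * flush_queued_work() marks it done, while this asynchronous variant
 * heap-allocates the item with free = true so that flush_queued_work()
 * g_free()s it after running the function.
 */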

static void flush_queued_work(CPUState *cpu)
{
    struct qemu_work_item *wi;

    if (cpu->queued_work_first == NULL) {
        return;
    }

    qemu_mutex_lock(&cpu->work_mutex);
    while (cpu->queued_work_first != NULL) {
        wi = cpu->queued_work_first;
        cpu->queued_work_first = wi->next;
        if (!cpu->queued_work_first) {
            cpu->queued_work_last = NULL;
        }
        qemu_mutex_unlock(&cpu->work_mutex);
        wi->func(wi->data);
        qemu_mutex_lock(&cpu->work_mutex);
        if (wi->free) {
            g_free(wi);
        } else {
            atomic_mb_set(&wi->done, true);
        }
    }
    qemu_mutex_unlock(&cpu->work_mutex);
    qemu_cond_broadcast(&qemu_work_cond);
}

static void qemu_wait_io_event_common(CPUState *cpu)
{
    if (cpu->stop) {
        cpu->stop = false;
        cpu->stopped = true;
        qemu_cond_broadcast(&qemu_pause_cond);
    }
    flush_queued_work(cpu);
    cpu->thread_kicked = false;
}

static void qemu_tcg_wait_io_event(CPUState *cpu)
{
    while (all_cpu_threads_idle()) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    while (iothread_requesting_mutex) {
        qemu_cond_wait(&qemu_io_proceeded_cond, &qemu_global_mutex);
    }

    CPU_FOREACH(cpu) {
        qemu_wait_io_event_common(cpu);
    }
}

static void qemu_kvm_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

    qemu_kvm_eat_signals(cpu);
    qemu_wait_io_event_common(cpu);
}

static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    qemu_kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    while (1) {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_kvm_wait_io_event(cpu);
    }

    return NULL;
}

static void *qemu_dummy_cpu_thread_fn(void *arg)
{
#ifdef _WIN32
    fprintf(stderr, "qtest is not supported under Windows\n");
    exit(1);
#else
    CPUState *cpu = arg;
    sigset_t waitset;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    current_cpu = cpu;
    while (1) {
        current_cpu = NULL;
        qemu_mutex_unlock_iothread();
        do {
            int sig;
            r = sigwait(&waitset, &sig);
        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
        if (r == -1) {
            perror("sigwait");
            exit(1);
        }
        qemu_mutex_lock_iothread();
        current_cpu = cpu;
        qemu_wait_io_event_common(cpu);
    }

    return NULL;
#endif
}

static void tcg_exec_all(void);

static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    CPU_FOREACH(cpu) {
        cpu->thread_id = qemu_get_thread_id();
        cpu->created = true;
        cpu->can_do_io = 1;
    }
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            qemu_wait_io_event_common(cpu);
        }
    }

    /* process any pending work */
    atomic_mb_set(&exit_request, 1);

    while (1) {
        tcg_exec_all();

        if (use_icount) {
            int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

            if (deadline == 0) {
                qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
            }
        }
        qemu_tcg_wait_io_event(QTAILQ_FIRST(&cpus));
    }

    return NULL;
}

static void qemu_cpu_kick_thread(CPUState *cpu)
{
#ifndef _WIN32
    int err;

    if (cpu->thread_kicked) {
        return;
    }
    cpu->thread_kicked = true;
    err = pthread_kill(cpu->thread->thread, SIG_IPI);
    if (err) {
        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
        exit(1);
    }
#else /* _WIN32 */
    abort();
#endif
}

static void qemu_cpu_kick_no_halt(void)
{
    CPUState *cpu;
    /* Ensure whatever caused the exit has reached the CPU threads before
     * writing exit_request.
     */
    atomic_mb_set(&exit_request, 1);
    cpu = atomic_mb_read(&tcg_current_cpu);
    if (cpu) {
        cpu_exit(cpu);
    }
}

void qemu_cpu_kick(CPUState *cpu)
{
    qemu_cond_broadcast(cpu->halt_cond);
    if (tcg_enabled()) {
        qemu_cpu_kick_no_halt();
    } else {
        qemu_cpu_kick_thread(cpu);
    }
}

void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}

bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}

bool qemu_in_vcpu_thread(void)
{
    return current_cpu && qemu_cpu_is_self(current_cpu);
}

static __thread bool iothread_locked = false;

bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}

void qemu_mutex_lock_iothread(void)
{
    atomic_inc(&iothread_requesting_mutex);
    /* In the simple case there is no need to bump the VCPU thread out of
     * TCG code execution.
     */
    if (!tcg_enabled() || qemu_in_vcpu_thread() ||
        !first_cpu || !first_cpu->created) {
        qemu_mutex_lock(&qemu_global_mutex);
        atomic_dec(&iothread_requesting_mutex);
    } else {
        if (qemu_mutex_trylock(&qemu_global_mutex)) {
            qemu_cpu_kick_no_halt();
            qemu_mutex_lock(&qemu_global_mutex);
        }
        atomic_dec(&iothread_requesting_mutex);
        qemu_cond_broadcast(&qemu_io_proceeded_cond);
    }
    iothread_locked = true;
}
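
/* The trylock dance above exists because a TCG vCPU thread can hold the
 * BQL while it executes translated code.  If the mutex is contended,
 * qemu_cpu_kick_no_halt() raises exit_request so the vCPU leaves the
 * execution loop and releases the lock promptly, rather than making the
 * I/O thread wait for the current execution run to finish on its own.
 */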

void qemu_mutex_unlock_iothread(void)
{
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}

static int all_vcpus_paused(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu->stopped) {
            return 0;
        }
    }

    return 1;
}

void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        cpu->stop = true;
        qemu_cpu_kick(cpu);
    }

    if (qemu_in_vcpu_thread()) {
        cpu_stop_current();
        if (!kvm_enabled()) {
            CPU_FOREACH(cpu) {
                cpu->stop = false;
                cpu->stopped = true;
            }
            return;
        }
    }

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }
}

void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}

void resume_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
    CPU_FOREACH(cpu) {
        cpu_resume(cpu);
    }
}

/* For temporary buffers for forming a name */
#define VCPU_THREAD_NAME_SIZE 16

static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    static QemuCond *tcg_halt_cond;
    static QemuThread *tcg_cpu_thread;

    /* share a single thread for all cpus with TCG */
    if (!tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);
        tcg_halt_cond = cpu->halt_cond;
        snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);
        qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                           cpu, QEMU_THREAD_JOINABLE);
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
        while (!cpu->created) {
            qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
        }
        tcg_cpu_thread = cpu->thread;
    } else {
        cpu->thread = tcg_cpu_thread;
        cpu->halt_cond = tcg_halt_cond;
    }
}

static void qemu_kvm_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
                       cpu, QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

static void qemu_dummy_start_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];

    cpu->thread = g_malloc0(sizeof(QemuThread));
    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(cpu->halt_cond);
    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
             cpu->cpu_index);
    qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
                       QEMU_THREAD_JOINABLE);
    while (!cpu->created) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

void qemu_init_vcpu(CPUState *cpu)
{
    cpu->nr_cores = smp_cores;
    cpu->nr_threads = smp_threads;
    cpu->stopped = true;

    if (!cpu->as) {
        /* If the target cpu hasn't set up any address spaces itself,
         * give it the default one.
         */
        AddressSpace *as = address_space_init_shareable(cpu->memory,
                                                        "cpu-memory");
        cpu->num_ases = 1;
        cpu_address_space_init(cpu, as, 0);
    }

    if (kvm_enabled()) {
        qemu_kvm_start_vcpu(cpu);
    } else if (tcg_enabled()) {
        qemu_tcg_init_vcpu(cpu);
    } else {
        qemu_dummy_start_vcpu(cpu);
    }
}

void cpu_stop_current(void)
{
    if (current_cpu) {
        current_cpu->stop = false;
        current_cpu->stopped = true;
        cpu_exit(current_cpu);
        qemu_cond_broadcast(&qemu_pause_cond);
    }
}

void vm_stop_from_timer(RunState state)
{
    qemu_system_vmstop_request_prepare();
    qemu_system_vmstop_request(state);
    /*
     * FIXME: should not return to device code in case
     * vm_stop() has been requested.
     */
    cpu_stop_current();
}

int vm_stop(RunState state)
{
    if (qemu_in_vcpu_thread()) {
        qemu_system_vmstop_request_prepare();
        qemu_system_vmstop_request(state);
        /*
         * FIXME: should not return to device code in case
         * vm_stop() has been requested.
         */
        cpu_stop_current();
        return 0;
    }

    return do_vm_stop(state);
}

/* does a state transition even if the VM is already stopped,
   current state is forgotten forever */
int vm_stop_force_state(RunState state)
{
    if (runstate_is_running()) {
        return vm_stop(state);
    } else {
        runstate_set(state);

        bdrv_drain_all();
        /* Make sure to return an error if the flush in a previous vm_stop()
         * failed. */
        return blk_flush_all();
    }
}

static int64_t tcg_get_icount_limit(void)
{
    int64_t deadline;

    if (replay_mode != REPLAY_MODE_PLAY) {
        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        /* Maintain prior (possibly buggy) behaviour where if no deadline
         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
         * nanoseconds.
         */
        if ((deadline < 0) || (deadline > INT32_MAX)) {
            deadline = INT32_MAX;
        }

        return qemu_icount_round(deadline);
    } else {
        return replay_get_instructions();
    }
}

static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    if (use_icount) {
        int64_t count;
        int decr;
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                                    + cpu->icount_extra);
        cpu->icount_decr.u16.low = 0;
        cpu->icount_extra = 0;
        count = tcg_get_icount_limit();
        timers_state.qemu_icount += count;
        decr = (count > 0xffff) ? 0xffff : count;
        count -= decr;
        cpu->icount_decr.u16.low = decr;
        cpu->icount_extra = count;
    }
    ret = cpu_exec(cpu);
#ifdef CONFIG_PROFILER
    tcg_time += profile_getclock() - ti;
#endif
    if (use_icount) {
        /* Fold pending instructions back into the
           instruction counter, and clear the interrupt flag.  */
        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                        + cpu->icount_extra);
        cpu->icount_decr.u32 = 0;
        cpu->icount_extra = 0;
        replay_account_executed_instructions();
    }
    return ret;
}
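
/* Example of the icount budget split above: icount_decr.u16.low is only
 * 16 bits wide, so a limit of 100000 instructions becomes decr = 0xffff
 * (65535) with the remaining 34465 parked in icount_extra; cpu_exec()
 * is expected to top u16.low back up from icount_extra as it drains.
 */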

static void tcg_exec_all(void)
{
    int r;

    /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
    qemu_account_warp_timer();

    if (next_cpu == NULL) {
        next_cpu = first_cpu;
    }
    for (; next_cpu != NULL && !exit_request; next_cpu = CPU_NEXT(next_cpu)) {
        CPUState *cpu = next_cpu;

        qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                          (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

        if (cpu_can_run(cpu)) {
            r = tcg_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
                break;
            }
        } else if (cpu->stop || cpu->stopped) {
            break;
        }
        /* Xilinx: Randomize whether to advance to the next CPU on an exit
         * request.  If exit requests repeatedly occur in a regular pattern,
         * starvation can result under either the "never-advance" or the
         * "always-advance" policy.  So roll the dice.
         */
        if (exit_request && (rand() & 1)) {
            break;
        }
    }

    /* Pairs with smp_wmb in qemu_cpu_kick.  */
    atomic_mb_set(&exit_request, 0);
}

void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
{
    /* XXX: implement xxx_cpu_list for targets that still miss it */
#if defined(cpu_list)
    cpu_list(f, cpu_fprintf);
#endif
}

CpuInfoList *qmp_query_cpus(Error **errp)
{
    CpuInfoList *head = NULL, *cur_item = NULL;
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        CpuInfoList *info;
#if defined(TARGET_I386)
        X86CPU *x86_cpu = X86_CPU(cpu);
        CPUX86State *env = &x86_cpu->env;
#elif defined(TARGET_PPC)
        PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
        CPUPPCState *env = &ppc_cpu->env;
#elif defined(TARGET_SPARC)
        SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
        CPUSPARCState *env = &sparc_cpu->env;
#elif defined(TARGET_MIPS)
        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
        CPUMIPSState *env = &mips_cpu->env;
#elif defined(TARGET_TRICORE)
        TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
        CPUTriCoreState *env = &tricore_cpu->env;
#endif

        cpu_synchronize_state(cpu);

        info = g_malloc0(sizeof(*info));
        info->value = g_malloc0(sizeof(*info->value));
        info->value->CPU = cpu->cpu_index;
        info->value->current = (cpu == first_cpu);
        info->value->halted = cpu->halted;
        info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
        info->value->thread_id = cpu->thread_id;
#if defined(TARGET_I386)
        info->value->arch = CPU_INFO_ARCH_X86;
        info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
#elif defined(TARGET_PPC)
        info->value->arch = CPU_INFO_ARCH_PPC;
        info->value->u.ppc.nip = env->nip;
#elif defined(TARGET_SPARC)
        info->value->arch = CPU_INFO_ARCH_SPARC;
        info->value->u.q_sparc.pc = env->pc;
        info->value->u.q_sparc.npc = env->npc;
#elif defined(TARGET_MIPS)
        info->value->arch = CPU_INFO_ARCH_MIPS;
        info->value->u.q_mips.PC = env->active_tc.PC;
#elif defined(TARGET_TRICORE)
        info->value->arch = CPU_INFO_ARCH_TRICORE;
        info->value->u.tricore.PC = env->PC;
#else
        info->value->arch = CPU_INFO_ARCH_OTHER;
#endif

        /* XXX: waiting for the qapi to support GSList */
        if (!cur_item) {
            head = cur_item = info;
        } else {
            cur_item->next = info;
            cur_item = info;
        }
    }

    return head;
}

void qmp_memsave(int64_t addr, int64_t size, const char *filename,
                 bool has_cpu, int64_t cpu_index, Error **errp)
{
    FILE *f;
    uint32_t l;
    CPUState *cpu;
    uint8_t buf[1024];
    int64_t orig_addr = addr, orig_size = size;

    if (!has_cpu) {
        cpu_index = 0;
    }

    cpu = qemu_get_cpu(cpu_index);
    if (cpu == NULL) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
                   "a CPU number");
        return;
    }

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
            error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
                             " specified", orig_addr, orig_size);
            goto exit;
        }
        if (fwrite(buf, 1, l, f) != l) {
            error_setg(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
                  Error **errp)
{
    FILE *f;
    uint32_t l;
    uint8_t buf[1024];

    f = fopen(filename, "wb");
    if (!f) {
        error_setg_file_open(errp, errno, filename);
        return;
    }

    while (size != 0) {
        l = sizeof(buf);
        if (l > size) {
            l = size;
        }
        cpu_physical_memory_read(addr, buf, l);
        if (fwrite(buf, 1, l, f) != l) {
            error_setg(errp, QERR_IO_ERROR);
            goto exit;
        }
        addr += l;
        size -= l;
    }

exit:
    fclose(f);
}

void qmp_inject_nmi(Error **errp)
{
#if defined(TARGET_I386)
    CPUState *cs;

    CPU_FOREACH(cs) {
        X86CPU *cpu = X86_CPU(cs);

        if (!cpu->apic_state) {
            cpu_interrupt(cs, CPU_INTERRUPT_NMI);
        } else {
            apic_deliver_nmi(cpu->apic_state);
        }
    }
#else
    nmi_monitor_handle(monitor_get_cpu_index(), errp);
#endif
}

void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
{
    if (!use_icount) {
        return;
    }

    cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
                (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
    if (icount_align_option) {
        cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
        cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
    } else {
        cpu_fprintf(f, "Max guest delay     NA\n");
        cpu_fprintf(f, "Max guest advance   NA\n");
    }
}