qemu/cpus.c
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "qemu-common.h"
  27#include "qemu/config-file.h"
  28#include "migration/vmstate.h"
  29#include "monitor/monitor.h"
  30#include "qapi/error.h"
  31#include "qapi/qapi-commands-misc.h"
  32#include "qapi/qapi-events-run-state.h"
  33#include "qapi/qmp/qerror.h"
  34#include "qemu/error-report.h"
  35#include "qemu/qemu-print.h"
  36#include "sysemu/tcg.h"
  37#include "sysemu/block-backend.h"
  38#include "exec/gdbstub.h"
  39#include "sysemu/dma.h"
  40#include "sysemu/hw_accel.h"
  41#include "sysemu/kvm.h"
  42#include "sysemu/hax.h"
  43#include "sysemu/hvf.h"
  44#include "sysemu/whpx.h"
  45#include "exec/exec-all.h"
  46
  47#include "qemu/thread.h"
  48#include "qemu/plugin.h"
  49#include "sysemu/cpus.h"
  50#include "sysemu/qtest.h"
  51#include "qemu/main-loop.h"
  52#include "qemu/option.h"
  53#include "qemu/bitmap.h"
  54#include "qemu/seqlock.h"
  55#include "qemu/guest-random.h"
  56#include "tcg/tcg.h"
  57#include "hw/nmi.h"
  58#include "sysemu/replay.h"
  59#include "sysemu/runstate.h"
  60#include "hw/boards.h"
  61#include "hw/hw.h"
  62
  63#ifdef CONFIG_LINUX
  64
  65#include <sys/prctl.h>
  66
  67#ifndef PR_MCE_KILL
  68#define PR_MCE_KILL 33
  69#endif
  70
  71#ifndef PR_MCE_KILL_SET
  72#define PR_MCE_KILL_SET 1
  73#endif
  74
  75#ifndef PR_MCE_KILL_EARLY
  76#define PR_MCE_KILL_EARLY 1
  77#endif
  78
  79#endif /* CONFIG_LINUX */
  80
  81static QemuMutex qemu_global_mutex;
  82
  83int64_t max_delay;
  84int64_t max_advance;
  85
  86/* vcpu throttling controls */
  87static QEMUTimer *throttle_timer;
  88static unsigned int throttle_percentage;
  89
  90#define CPU_THROTTLE_PCT_MIN 1
  91#define CPU_THROTTLE_PCT_MAX 99
  92#define CPU_THROTTLE_TIMESLICE_NS 10000000
  93
  94bool cpu_is_stopped(CPUState *cpu)
  95{
  96    return cpu->stopped || !runstate_is_running();
  97}
  98
  99static bool cpu_thread_is_idle(CPUState *cpu)
 100{
 101    if (cpu->stop || cpu->queued_work_first) {
 102        return false;
 103    }
 104    if (cpu_is_stopped(cpu)) {
 105        return true;
 106    }
 107    if (!cpu->halted || cpu_has_work(cpu) ||
 108        kvm_halt_in_kernel()) {
 109        return false;
 110    }
 111    return true;
 112}
 113
 114static bool all_cpu_threads_idle(void)
 115{
 116    CPUState *cpu;
 117
 118    CPU_FOREACH(cpu) {
 119        if (!cpu_thread_is_idle(cpu)) {
 120            return false;
 121        }
 122    }
 123    return true;
 124}
 125
 126/***********************************************************/
 127/* guest cycle counter */
 128
 129/* Protected by TimersState seqlock */
 130
 131static bool icount_sleep = true;
 132/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
 133#define MAX_ICOUNT_SHIFT 10
 134
 135typedef struct TimersState {
 136    /* Protected by BQL.  */
 137    int64_t cpu_ticks_prev;
 138    int64_t cpu_ticks_offset;
 139
  140    /* Protect fields that can be read outside the BQL and
  141     * written from multiple threads.
  142     */
 143    QemuSeqLock vm_clock_seqlock;
 144    QemuSpin vm_clock_lock;
 145
 146    int16_t cpu_ticks_enabled;
 147
 148    /* Conversion factor from emulated instructions to virtual clock ticks.  */
 149    int16_t icount_time_shift;
 150
 151    /* Compensate for varying guest execution speed.  */
 152    int64_t qemu_icount_bias;
 153
 154    int64_t vm_clock_warp_start;
 155    int64_t cpu_clock_offset;
 156
 157    /* Only written by TCG thread */
 158    int64_t qemu_icount;
 159
 160    /* for adjusting icount */
 161    QEMUTimer *icount_rt_timer;
 162    QEMUTimer *icount_vm_timer;
 163    QEMUTimer *icount_warp_timer;
 164} TimersState;
 165
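     /*
      * Locking discipline, as used by the accessors below: readers sample
      * the clock/icount fields inside a seqlock_read_begin()/
      * seqlock_read_retry() loop, while writers pair seqlock_write_lock()/
      * seqlock_write_unlock() with the vm_clock_lock spinlock so that
      * updates from multiple threads do not race.
      */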
 166static TimersState timers_state;
 167bool mttcg_enabled;
 168
 169
 170/* The current number of executed instructions is based on what we
 171 * originally budgeted minus the current state of the decrementing
  172 * icount counters in icount_extra and icount_decr.u16.low.
 173 */
 174static int64_t cpu_get_icount_executed(CPUState *cpu)
 175{
 176    return (cpu->icount_budget -
 177            (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
 178}
 179
 180/*
 181 * Update the global shared timer_state.qemu_icount to take into
 182 * account executed instructions. This is done by the TCG vCPU
 183 * thread so the main-loop can see time has moved forward.
 184 */
 185static void cpu_update_icount_locked(CPUState *cpu)
 186{
 187    int64_t executed = cpu_get_icount_executed(cpu);
 188    cpu->icount_budget -= executed;
 189
 190    atomic_set_i64(&timers_state.qemu_icount,
 191                   timers_state.qemu_icount + executed);
 192}
 193
 194/*
 195 * Update the global shared timer_state.qemu_icount to take into
 196 * account executed instructions. This is done by the TCG vCPU
 197 * thread so the main-loop can see time has moved forward.
 198 */
 199void cpu_update_icount(CPUState *cpu)
 200{
 201    seqlock_write_lock(&timers_state.vm_clock_seqlock,
 202                       &timers_state.vm_clock_lock);
 203    cpu_update_icount_locked(cpu);
 204    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 205                         &timers_state.vm_clock_lock);
 206}
 207
 208static int64_t cpu_get_icount_raw_locked(void)
 209{
 210    CPUState *cpu = current_cpu;
 211
 212    if (cpu && cpu->running) {
 213        if (!cpu->can_do_io) {
 214            error_report("Bad icount read");
 215            exit(1);
 216        }
 217        /* Take into account what has run */
 218        cpu_update_icount_locked(cpu);
 219    }
 220    /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
 221    return atomic_read_i64(&timers_state.qemu_icount);
 222}
 223
 224static int64_t cpu_get_icount_locked(void)
 225{
 226    int64_t icount = cpu_get_icount_raw_locked();
 227    return atomic_read_i64(&timers_state.qemu_icount_bias) +
 228        cpu_icount_to_ns(icount);
 229}
 230
 231int64_t cpu_get_icount_raw(void)
 232{
 233    int64_t icount;
 234    unsigned start;
 235
 236    do {
 237        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 238        icount = cpu_get_icount_raw_locked();
 239    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 240
 241    return icount;
 242}
 243
 244/* Return the virtual CPU time, based on the instruction counter.  */
 245int64_t cpu_get_icount(void)
 246{
 247    int64_t icount;
 248    unsigned start;
 249
 250    do {
 251        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 252        icount = cpu_get_icount_locked();
 253    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 254
 255    return icount;
 256}
 257
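     /*
      * With icount, each executed guest instruction accounts for
      * 2^icount_time_shift nanoseconds of virtual time; this helper
      * performs that conversion.
      */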
 258int64_t cpu_icount_to_ns(int64_t icount)
 259{
 260    return icount << atomic_read(&timers_state.icount_time_shift);
 261}
 262
 263static int64_t cpu_get_ticks_locked(void)
 264{
 265    int64_t ticks = timers_state.cpu_ticks_offset;
 266    if (timers_state.cpu_ticks_enabled) {
 267        ticks += cpu_get_host_ticks();
 268    }
 269
 270    if (timers_state.cpu_ticks_prev > ticks) {
  271        /* Non-increasing ticks may happen if the host uses software suspend.  */
 272        timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 273        ticks = timers_state.cpu_ticks_prev;
 274    }
 275
 276    timers_state.cpu_ticks_prev = ticks;
 277    return ticks;
 278}
 279
  280/* Return the time elapsed in the VM between vm_start and vm_stop.  Unless
 281 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 282 * counter.
 283 */
 284int64_t cpu_get_ticks(void)
 285{
 286    int64_t ticks;
 287
 288    if (use_icount) {
 289        return cpu_get_icount();
 290    }
 291
 292    qemu_spin_lock(&timers_state.vm_clock_lock);
 293    ticks = cpu_get_ticks_locked();
 294    qemu_spin_unlock(&timers_state.vm_clock_lock);
 295    return ticks;
 296}
 297
 298static int64_t cpu_get_clock_locked(void)
 299{
 300    int64_t time;
 301
 302    time = timers_state.cpu_clock_offset;
 303    if (timers_state.cpu_ticks_enabled) {
 304        time += get_clock();
 305    }
 306
 307    return time;
 308}
 309
  310/* Return the monotonic time elapsed in the VM, i.e.,
 311 * the time between vm_start and vm_stop
 312 */
 313int64_t cpu_get_clock(void)
 314{
 315    int64_t ti;
 316    unsigned start;
 317
 318    do {
 319        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 320        ti = cpu_get_clock_locked();
 321    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 322
 323    return ti;
 324}
 325
 326/* enable cpu_get_ticks()
 327 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 328 */
 329void cpu_enable_ticks(void)
 330{
 331    seqlock_write_lock(&timers_state.vm_clock_seqlock,
 332                       &timers_state.vm_clock_lock);
 333    if (!timers_state.cpu_ticks_enabled) {
 334        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 335        timers_state.cpu_clock_offset -= get_clock();
 336        timers_state.cpu_ticks_enabled = 1;
 337    }
 338    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 339                       &timers_state.vm_clock_lock);
 340}
 341
 342/* disable cpu_get_ticks() : the clock is stopped. You must not call
 343 * cpu_get_ticks() after that.
 344 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 345 */
 346void cpu_disable_ticks(void)
 347{
 348    seqlock_write_lock(&timers_state.vm_clock_seqlock,
 349                       &timers_state.vm_clock_lock);
 350    if (timers_state.cpu_ticks_enabled) {
 351        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 352        timers_state.cpu_clock_offset = cpu_get_clock_locked();
 353        timers_state.cpu_ticks_enabled = 0;
 354    }
 355    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 356                         &timers_state.vm_clock_lock);
 357}
 358
 359/* Correlation between real and virtual time is always going to be
 360   fairly approximate, so ignore small variation.
 361   When the guest is idle real and virtual time will be aligned in
 362   the IO wait loop.  */
 363#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 364
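     /*
      * Periodically compare virtual time (derived from icount) with real
      * time and nudge icount_time_shift by one in the appropriate
      * direction, then recompute qemu_icount_bias so the two clocks stay
      * roughly aligned.  Called from the rt/vm adjustment timers below.
      */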
 365static void icount_adjust(void)
 366{
 367    int64_t cur_time;
 368    int64_t cur_icount;
 369    int64_t delta;
 370
 371    /* Protected by TimersState mutex.  */
 372    static int64_t last_delta;
 373
 374    /* If the VM is not running, then do nothing.  */
 375    if (!runstate_is_running()) {
 376        return;
 377    }
 378
 379    seqlock_write_lock(&timers_state.vm_clock_seqlock,
 380                       &timers_state.vm_clock_lock);
 381    cur_time = cpu_get_clock_locked();
 382    cur_icount = cpu_get_icount_locked();
 383
 384    delta = cur_icount - cur_time;
 385    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 386    if (delta > 0
 387        && last_delta + ICOUNT_WOBBLE < delta * 2
 388        && timers_state.icount_time_shift > 0) {
 389        /* The guest is getting too far ahead.  Slow time down.  */
 390        atomic_set(&timers_state.icount_time_shift,
 391                   timers_state.icount_time_shift - 1);
 392    }
 393    if (delta < 0
 394        && last_delta - ICOUNT_WOBBLE > delta * 2
 395        && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
 396        /* The guest is getting too far behind.  Speed time up.  */
 397        atomic_set(&timers_state.icount_time_shift,
 398                   timers_state.icount_time_shift + 1);
 399    }
 400    last_delta = delta;
 401    atomic_set_i64(&timers_state.qemu_icount_bias,
 402                   cur_icount - (timers_state.qemu_icount
 403                                 << timers_state.icount_time_shift));
 404    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 405                         &timers_state.vm_clock_lock);
 406}
 407
 408static void icount_adjust_rt(void *opaque)
 409{
 410    timer_mod(timers_state.icount_rt_timer,
 411              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 412    icount_adjust();
 413}
 414
 415static void icount_adjust_vm(void *opaque)
 416{
 417    timer_mod(timers_state.icount_vm_timer,
 418                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 419                   NANOSECONDS_PER_SECOND / 10);
 420    icount_adjust();
 421}
 422
 423static int64_t qemu_icount_round(int64_t count)
 424{
 425    int shift = atomic_read(&timers_state.icount_time_shift);
 426    return (count + (1 << shift) - 1) >> shift;
 427}
 428
 429static void icount_warp_rt(void)
 430{
 431    unsigned seq;
 432    int64_t warp_start;
 433
 434    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 435     * changes from -1 to another value, so the race here is okay.
 436     */
 437    do {
 438        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 439        warp_start = timers_state.vm_clock_warp_start;
 440    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 441
 442    if (warp_start == -1) {
 443        return;
 444    }
 445
 446    seqlock_write_lock(&timers_state.vm_clock_seqlock,
 447                       &timers_state.vm_clock_lock);
 448    if (runstate_is_running()) {
 449        int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
 450                                            cpu_get_clock_locked());
 451        int64_t warp_delta;
 452
 453        warp_delta = clock - timers_state.vm_clock_warp_start;
 454        if (use_icount == 2) {
 455            /*
 456             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 457             * far ahead of real time.
 458             */
 459            int64_t cur_icount = cpu_get_icount_locked();
 460            int64_t delta = clock - cur_icount;
 461            warp_delta = MIN(warp_delta, delta);
 462        }
 463        atomic_set_i64(&timers_state.qemu_icount_bias,
 464                       timers_state.qemu_icount_bias + warp_delta);
 465    }
 466    timers_state.vm_clock_warp_start = -1;
 467    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 468                       &timers_state.vm_clock_lock);
 469
 470    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 471        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 472    }
 473}
 474
 475static void icount_timer_cb(void *opaque)
 476{
 477    /* No need for a checkpoint because the timer already synchronizes
 478     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 479     */
 480    icount_warp_rt();
 481}
 482
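     /*
      * Warp QEMU_CLOCK_VIRTUAL forward to 'dest' by adding to
      * qemu_icount_bias in steps bounded by the next timer deadline,
      * running any expired timers along the way; only valid under qtest
      * (see the assert below).
      */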
 483void qtest_clock_warp(int64_t dest)
 484{
 485    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 486    AioContext *aio_context;
 487    assert(qtest_enabled());
 488    aio_context = qemu_get_aio_context();
 489    while (clock < dest) {
 490        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 491                                                      QEMU_TIMER_ATTR_ALL);
 492        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 493
 494        seqlock_write_lock(&timers_state.vm_clock_seqlock,
 495                           &timers_state.vm_clock_lock);
 496        atomic_set_i64(&timers_state.qemu_icount_bias,
 497                       timers_state.qemu_icount_bias + warp);
 498        seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 499                             &timers_state.vm_clock_lock);
 500
 501        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 502        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 503        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 504    }
 505    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 506}
 507
 508void qemu_start_warp_timer(void)
 509{
 510    int64_t clock;
 511    int64_t deadline;
 512
 513    if (!use_icount) {
 514        return;
 515    }
 516
 517    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 518     * do not fire, so computing the deadline does not make sense.
 519     */
 520    if (!runstate_is_running()) {
 521        return;
 522    }
 523
 524    if (replay_mode != REPLAY_MODE_PLAY) {
 525        if (!all_cpu_threads_idle()) {
 526            return;
 527        }
 528
 529        if (qtest_enabled()) {
 530            /* When testing, qtest commands advance icount.  */
 531            return;
 532        }
 533
 534        replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
 535    } else {
 536        /* warp clock deterministically in record/replay mode */
 537        if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
  538            /* The vCPU is sleeping and the warp can't be started.
  539               It is probably a race condition: the notification sent
  540               to the vCPU was processed in advance and the vCPU went to
  541               sleep.  Therefore we have to wake it up to do something. */
 542            if (replay_has_checkpoint()) {
 543                qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 544            }
 545            return;
 546        }
 547    }
 548
 549    /* We want to use the earliest deadline from ALL vm_clocks */
 550    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 551    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 552                                          ~QEMU_TIMER_ATTR_EXTERNAL);
 553    if (deadline < 0) {
 554        static bool notified;
 555        if (!icount_sleep && !notified) {
 556            warn_report("icount sleep disabled and no active timers");
 557            notified = true;
 558        }
 559        return;
 560    }
 561
 562    if (deadline > 0) {
 563        /*
 564         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 565         * sleep.  Otherwise, the CPU might be waiting for a future timer
 566         * interrupt to wake it up, but the interrupt never comes because
 567         * the vCPU isn't running any insns and thus doesn't advance the
 568         * QEMU_CLOCK_VIRTUAL.
 569         */
 570        if (!icount_sleep) {
 571            /*
  572             * We never let VCPUs sleep in no-sleep icount mode.
 573             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 574             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 575             * It is useful when we want a deterministic execution time,
 576             * isolated from host latencies.
 577             */
 578            seqlock_write_lock(&timers_state.vm_clock_seqlock,
 579                               &timers_state.vm_clock_lock);
 580            atomic_set_i64(&timers_state.qemu_icount_bias,
 581                           timers_state.qemu_icount_bias + deadline);
 582            seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 583                                 &timers_state.vm_clock_lock);
 584            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 585        } else {
 586            /*
 587             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
  588             * "real" time (related to the time left until the next event) has
  589             * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
  590             * This keeps the warps from being visible externally; for example,
 591             * you will not be sending network packets continuously instead of
 592             * every 100ms.
 593             */
 594            seqlock_write_lock(&timers_state.vm_clock_seqlock,
 595                               &timers_state.vm_clock_lock);
 596            if (timers_state.vm_clock_warp_start == -1
 597                || timers_state.vm_clock_warp_start > clock) {
 598                timers_state.vm_clock_warp_start = clock;
 599            }
 600            seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 601                                 &timers_state.vm_clock_lock);
 602            timer_mod_anticipate(timers_state.icount_warp_timer,
 603                                 clock + deadline);
 604        }
 605    } else if (deadline == 0) {
 606        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 607    }
 608}
 609
 610static void qemu_account_warp_timer(void)
 611{
 612    if (!use_icount || !icount_sleep) {
 613        return;
 614    }
 615
 616    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 617     * do not fire, so computing the deadline does not make sense.
 618     */
 619    if (!runstate_is_running()) {
 620        return;
 621    }
 622
 623    /* warp clock deterministically in record/replay mode */
 624    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 625        return;
 626    }
 627
 628    timer_del(timers_state.icount_warp_timer);
 629    icount_warp_rt();
 630}
 631
 632static bool icount_state_needed(void *opaque)
 633{
 634    return use_icount;
 635}
 636
 637static bool warp_timer_state_needed(void *opaque)
 638{
 639    TimersState *s = opaque;
 640    return s->icount_warp_timer != NULL;
 641}
 642
 643static bool adjust_timers_state_needed(void *opaque)
 644{
 645    TimersState *s = opaque;
 646    return s->icount_rt_timer != NULL;
 647}
 648
 649/*
  650 * Subsection for warp timer migration is optional, because the timer may not have been created
 651 */
 652static const VMStateDescription icount_vmstate_warp_timer = {
 653    .name = "timer/icount/warp_timer",
 654    .version_id = 1,
 655    .minimum_version_id = 1,
 656    .needed = warp_timer_state_needed,
 657    .fields = (VMStateField[]) {
 658        VMSTATE_INT64(vm_clock_warp_start, TimersState),
 659        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
 660        VMSTATE_END_OF_LIST()
 661    }
 662};
 663
 664static const VMStateDescription icount_vmstate_adjust_timers = {
 665    .name = "timer/icount/timers",
 666    .version_id = 1,
 667    .minimum_version_id = 1,
 668    .needed = adjust_timers_state_needed,
 669    .fields = (VMStateField[]) {
 670        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
 671        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
 672        VMSTATE_END_OF_LIST()
 673    }
 674};
 675
 676/*
 677 * This is a subsection for icount migration.
 678 */
 679static const VMStateDescription icount_vmstate_timers = {
 680    .name = "timer/icount",
 681    .version_id = 1,
 682    .minimum_version_id = 1,
 683    .needed = icount_state_needed,
 684    .fields = (VMStateField[]) {
 685        VMSTATE_INT64(qemu_icount_bias, TimersState),
 686        VMSTATE_INT64(qemu_icount, TimersState),
 687        VMSTATE_END_OF_LIST()
 688    },
 689    .subsections = (const VMStateDescription*[]) {
 690        &icount_vmstate_warp_timer,
 691        &icount_vmstate_adjust_timers,
 692        NULL
 693    }
 694};
 695
 696static const VMStateDescription vmstate_timers = {
 697    .name = "timer",
 698    .version_id = 2,
 699    .minimum_version_id = 1,
 700    .fields = (VMStateField[]) {
 701        VMSTATE_INT64(cpu_ticks_offset, TimersState),
 702        VMSTATE_UNUSED(8),
 703        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
 704        VMSTATE_END_OF_LIST()
 705    },
 706    .subsections = (const VMStateDescription*[]) {
 707        &icount_vmstate_timers,
 708        NULL
 709    }
 710};
 711
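     /*
      * Throttling work item, queued on each vCPU by cpu_throttle_timer_tick.
      * It makes the vCPU sleep for pct / (1 - pct) times the 10 ms
      * timeslice; together with the stretched timer period set in
      * cpu_throttle_timer_tick, this pauses the vCPU for roughly pct of
      * real time.
      */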
 712static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 713{
 714    double pct;
 715    double throttle_ratio;
 716    int64_t sleeptime_ns, endtime_ns;
 717
 718    if (!cpu_throttle_get_percentage()) {
 719        return;
 720    }
 721
 722    pct = (double)cpu_throttle_get_percentage()/100;
 723    throttle_ratio = pct / (1 - pct);
 724    /* Add 1ns to fix double's rounding error (like 0.9999999...) */
 725    sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
 726    endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
 727    while (sleeptime_ns > 0 && !cpu->stop) {
 728        if (sleeptime_ns > SCALE_MS) {
 729            qemu_cond_timedwait(cpu->halt_cond, &qemu_global_mutex,
 730                                sleeptime_ns / SCALE_MS);
 731        } else {
 732            qemu_mutex_unlock_iothread();
 733            g_usleep(sleeptime_ns / SCALE_US);
 734            qemu_mutex_lock_iothread();
 735        }
 736        sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 737    }
 738    atomic_set(&cpu->throttle_thread_scheduled, 0);
 739}
 740
 741static void cpu_throttle_timer_tick(void *opaque)
 742{
 743    CPUState *cpu;
 744    double pct;
 745
 746    /* Stop the timer if needed */
 747    if (!cpu_throttle_get_percentage()) {
 748        return;
 749    }
 750    CPU_FOREACH(cpu) {
 751        if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 752            async_run_on_cpu(cpu, cpu_throttle_thread,
 753                             RUN_ON_CPU_NULL);
 754        }
 755    }
 756
 757    pct = (double)cpu_throttle_get_percentage()/100;
 758    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 759                                   CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 760}
 761
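     /*
      * Set the throttle percentage and (re)arm the throttle timer.  For
      * example, cpu_throttle_set(75) pauses every vCPU for roughly 75% of
      * real time; values are clamped to the 1-99 range.
      */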
 762void cpu_throttle_set(int new_throttle_pct)
 763{
 764    /* Ensure throttle percentage is within valid range */
 765    new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 766    new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 767
 768    atomic_set(&throttle_percentage, new_throttle_pct);
 769
 770    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 771                                       CPU_THROTTLE_TIMESLICE_NS);
 772}
 773
 774void cpu_throttle_stop(void)
 775{
 776    atomic_set(&throttle_percentage, 0);
 777}
 778
 779bool cpu_throttle_active(void)
 780{
 781    return (cpu_throttle_get_percentage() != 0);
 782}
 783
 784int cpu_throttle_get_percentage(void)
 785{
 786    return atomic_read(&throttle_percentage);
 787}
 788
 789void cpu_ticks_init(void)
 790{
 791    seqlock_init(&timers_state.vm_clock_seqlock);
 792    qemu_spin_init(&timers_state.vm_clock_lock);
 793    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 794    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 795                                           cpu_throttle_timer_tick, NULL);
 796}
 797
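     /*
      * Parse the -icount option block: "shift" selects either a fixed
      * 2^N ns-per-instruction ratio or "auto" for the adaptive mode driven
      * by the adjustment timers set up below; "sleep" and "align" further
      * control how virtual time tracks real time.
      */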
 798void configure_icount(QemuOpts *opts, Error **errp)
 799{
 800    const char *option;
 801    char *rem_str = NULL;
 802
 803    option = qemu_opt_get(opts, "shift");
 804    if (!option) {
 805        if (qemu_opt_get(opts, "align") != NULL) {
 806            error_setg(errp, "Please specify shift option when using align");
 807        }
 808        return;
 809    }
 810
 811    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 812    if (icount_sleep) {
 813        timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 814                                         icount_timer_cb, NULL);
 815    }
 816
 817    icount_align_option = qemu_opt_get_bool(opts, "align", false);
 818
 819    if (icount_align_option && !icount_sleep) {
 820        error_setg(errp, "align=on and sleep=off are incompatible");
 821    }
 822    if (strcmp(option, "auto") != 0) {
 823        errno = 0;
 824        timers_state.icount_time_shift = strtol(option, &rem_str, 0);
 825        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 826            error_setg(errp, "icount: Invalid shift value");
 827        }
 828        use_icount = 1;
 829        return;
 830    } else if (icount_align_option) {
 831        error_setg(errp, "shift=auto and align=on are incompatible");
 832    } else if (!icount_sleep) {
 833        error_setg(errp, "shift=auto and sleep=off are incompatible");
 834    }
 835
 836    use_icount = 2;
 837
 838    /* 125MIPS seems a reasonable initial guess at the guest speed.
 839       It will be corrected fairly quickly anyway.  */
 840    timers_state.icount_time_shift = 3;
 841
 842    /* Have both realtime and virtual time triggers for speed adjustment.
 843       The realtime trigger catches emulated time passing too slowly,
 844       the virtual time trigger catches emulated time passing too fast.
 845       Realtime triggers occur even when idle, so use them less frequently
 846       than VM triggers.  */
 847    timers_state.vm_clock_warp_start = -1;
 848    timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 849                                   icount_adjust_rt, NULL);
 850    timer_mod(timers_state.icount_rt_timer,
 851                   qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 852    timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 853                                        icount_adjust_vm, NULL);
 854    timer_mod(timers_state.icount_vm_timer,
 855                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 856                   NANOSECONDS_PER_SECOND / 10);
 857}
 858
 859/***********************************************************/
 860/* TCG vCPU kick timer
 861 *
  862 * The kick timer is responsible for moving single-threaded vCPU
  863 * emulation on to the next vCPU. If more than one vCPU is running, a
  864 * timer event will force a cpu->exit so the next vCPU can get
  865 * scheduled.
  866 *
  867 * The timer is removed if all vCPUs are idle and restarted again once
  868 * the vCPUs stop being idle.
 869 */
 870
 871static QEMUTimer *tcg_kick_vcpu_timer;
 872static CPUState *tcg_current_rr_cpu;
 873
 874#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 875
 876static inline int64_t qemu_tcg_next_kick(void)
 877{
 878    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 879}
 880
  881/* Kick the currently round-robin scheduled vCPU on to the next one */
 882static void qemu_cpu_kick_rr_next_cpu(void)
 883{
 884    CPUState *cpu;
 885    do {
 886        cpu = atomic_mb_read(&tcg_current_rr_cpu);
 887        if (cpu) {
 888            cpu_exit(cpu);
 889        }
 890    } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 891}
 892
 893/* Kick all RR vCPUs */
 894static void qemu_cpu_kick_rr_cpus(void)
 895{
 896    CPUState *cpu;
 897
 898    CPU_FOREACH(cpu) {
 899        cpu_exit(cpu);
  900    }
 901}
 902
 903static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
 904{
 905}
 906
 907void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 908{
 909    if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 910        qemu_notify_event();
 911        return;
 912    }
 913
 914    if (qemu_in_vcpu_thread()) {
 915        /* A CPU is currently running; kick it back out to the
 916         * tcg_cpu_exec() loop so it will recalculate its
 917         * icount deadline immediately.
 918         */
 919        qemu_cpu_kick(current_cpu);
 920    } else if (first_cpu) {
 921        /* qemu_cpu_kick is not enough to kick a halted CPU out of
 922         * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 923         * causes cpu_thread_is_idle to return false.  This way,
 924         * handle_icount_deadline can run.
 925         * If we have no CPUs at all for some reason, we don't
 926         * need to do anything.
 927         */
 928        async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
 929    }
 930}
 931
 932static void kick_tcg_thread(void *opaque)
 933{
 934    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 935    qemu_cpu_kick_rr_next_cpu();
 936}
 937
 938static void start_tcg_kick_timer(void)
 939{
 940    assert(!mttcg_enabled);
 941    if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 942        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 943                                           kick_tcg_thread, NULL);
 944    }
 945    if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
 946        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 947    }
 948}
 949
 950static void stop_tcg_kick_timer(void)
 951{
 952    assert(!mttcg_enabled);
 953    if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
 954        timer_del(tcg_kick_vcpu_timer);
 955    }
 956}
 957
 958/***********************************************************/
 959void hw_error(const char *fmt, ...)
 960{
 961    va_list ap;
 962    CPUState *cpu;
 963
 964    va_start(ap, fmt);
 965    fprintf(stderr, "qemu: hardware error: ");
 966    vfprintf(stderr, fmt, ap);
 967    fprintf(stderr, "\n");
 968    CPU_FOREACH(cpu) {
 969        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
 970        cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
 971    }
 972    va_end(ap);
 973    abort();
 974}
 975
 976void cpu_synchronize_all_states(void)
 977{
 978    CPUState *cpu;
 979
 980    CPU_FOREACH(cpu) {
 981        cpu_synchronize_state(cpu);
 982        /* TODO: move to cpu_synchronize_state() */
 983        if (hvf_enabled()) {
 984            hvf_cpu_synchronize_state(cpu);
 985        }
 986    }
 987}
 988
 989void cpu_synchronize_all_post_reset(void)
 990{
 991    CPUState *cpu;
 992
 993    CPU_FOREACH(cpu) {
 994        cpu_synchronize_post_reset(cpu);
 995        /* TODO: move to cpu_synchronize_post_reset() */
 996        if (hvf_enabled()) {
 997            hvf_cpu_synchronize_post_reset(cpu);
 998        }
 999    }
1000}
1001
1002void cpu_synchronize_all_post_init(void)
1003{
1004    CPUState *cpu;
1005
1006    CPU_FOREACH(cpu) {
1007        cpu_synchronize_post_init(cpu);
1008        /* TODO: move to cpu_synchronize_post_init() */
1009        if (hvf_enabled()) {
1010            hvf_cpu_synchronize_post_init(cpu);
1011        }
1012    }
1013}
1014
1015void cpu_synchronize_all_pre_loadvm(void)
1016{
1017    CPUState *cpu;
1018
1019    CPU_FOREACH(cpu) {
1020        cpu_synchronize_pre_loadvm(cpu);
1021    }
1022}
1023
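     /*
      * Common vm_stop path: move to the new run state, stop the ticks,
      * pause every vCPU, notify listeners (and optionally emit the QMP
      * STOP event), then drain and flush all block devices (even if the
      * VM was already stopped).
      */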
1024static int do_vm_stop(RunState state, bool send_stop)
1025{
1026    int ret = 0;
1027
1028    if (runstate_is_running()) {
1029        runstate_set(state);
1030        cpu_disable_ticks();
1031        pause_all_vcpus();
1032        vm_state_notify(0, state);
1033        if (send_stop) {
1034            qapi_event_send_stop();
1035        }
1036    }
1037
1038    bdrv_drain_all();
1039    ret = bdrv_flush_all();
1040
1041    return ret;
1042}
1043
1044/* Special vm_stop() variant for terminating the process.  Historically clients
1045 * did not expect a QMP STOP event and so we need to retain compatibility.
1046 */
1047int vm_shutdown(void)
1048{
1049    return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1050}
1051
1052static bool cpu_can_run(CPUState *cpu)
1053{
1054    if (cpu->stop) {
1055        return false;
1056    }
1057    if (cpu_is_stopped(cpu)) {
1058        return false;
1059    }
1060    return true;
1061}
1062
1063static void cpu_handle_guest_debug(CPUState *cpu)
1064{
1065    gdb_set_stop_cpu(cpu);
1066    qemu_system_debug_request();
1067    cpu->stopped = true;
1068}
1069
1070#ifdef CONFIG_LINUX
1071static void sigbus_reraise(void)
1072{
1073    sigset_t set;
1074    struct sigaction action;
1075
1076    memset(&action, 0, sizeof(action));
1077    action.sa_handler = SIG_DFL;
1078    if (!sigaction(SIGBUS, &action, NULL)) {
1079        raise(SIGBUS);
1080        sigemptyset(&set);
1081        sigaddset(&set, SIGBUS);
1082        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1083    }
1084    perror("Failed to re-raise SIGBUS!\n");
1085    abort();
1086}
1087
1088static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1089{
1090    if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1091        sigbus_reraise();
1092    }
1093
1094    if (current_cpu) {
1095        /* Called asynchronously in VCPU thread.  */
1096        if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1097            sigbus_reraise();
1098        }
1099    } else {
1100        /* Called synchronously (via signalfd) in main thread.  */
1101        if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1102            sigbus_reraise();
1103        }
1104    }
1105}
1106
1107static void qemu_init_sigbus(void)
1108{
1109    struct sigaction action;
1110
1111    memset(&action, 0, sizeof(action));
1112    action.sa_flags = SA_SIGINFO;
1113    action.sa_sigaction = sigbus_handler;
1114    sigaction(SIGBUS, &action, NULL);
1115
1116    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1117}
1118#else /* !CONFIG_LINUX */
1119static void qemu_init_sigbus(void)
1120{
1121}
1122#endif /* !CONFIG_LINUX */
1123
1124static QemuThread io_thread;
1125
1126/* cpu creation */
1127static QemuCond qemu_cpu_cond;
 1128/* vCPU pause synchronization */
1129static QemuCond qemu_pause_cond;
1130
1131void qemu_init_cpu_loop(void)
1132{
1133    qemu_init_sigbus();
1134    qemu_cond_init(&qemu_cpu_cond);
1135    qemu_cond_init(&qemu_pause_cond);
1136    qemu_mutex_init(&qemu_global_mutex);
1137
1138    qemu_thread_get_self(&io_thread);
1139}
1140
1141void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1142{
1143    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1144}
1145
1146static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1147{
1148    if (kvm_destroy_vcpu(cpu) < 0) {
1149        error_report("kvm_destroy_vcpu failed");
1150        exit(EXIT_FAILURE);
1151    }
1152}
1153
1154static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1155{
1156}
1157
1158static void qemu_cpu_stop(CPUState *cpu, bool exit)
1159{
1160    g_assert(qemu_cpu_is_self(cpu));
1161    cpu->stop = false;
1162    cpu->stopped = true;
1163    if (exit) {
1164        cpu_exit(cpu);
1165    }
1166    qemu_cond_broadcast(&qemu_pause_cond);
1167}
1168
1169static void qemu_wait_io_event_common(CPUState *cpu)
1170{
1171    atomic_mb_set(&cpu->thread_kicked, false);
1172    if (cpu->stop) {
1173        qemu_cpu_stop(cpu, false);
1174    }
1175    process_queued_cpu_work(cpu);
1176}
1177
1178static void qemu_tcg_rr_wait_io_event(void)
1179{
1180    CPUState *cpu;
1181
1182    while (all_cpu_threads_idle()) {
1183        stop_tcg_kick_timer();
1184        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1185    }
1186
1187    start_tcg_kick_timer();
1188
1189    CPU_FOREACH(cpu) {
1190        qemu_wait_io_event_common(cpu);
1191    }
1192}
1193
1194static void qemu_wait_io_event(CPUState *cpu)
1195{
1196    bool slept = false;
1197
1198    while (cpu_thread_is_idle(cpu)) {
1199        if (!slept) {
1200            slept = true;
1201            qemu_plugin_vcpu_idle_cb(cpu);
1202        }
1203        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1204    }
1205    if (slept) {
1206        qemu_plugin_vcpu_resume_cb(cpu);
1207    }
1208
1209#ifdef _WIN32
1210    /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1211    if (!tcg_enabled()) {
1212        SleepEx(0, TRUE);
1213    }
1214#endif
1215    qemu_wait_io_event_common(cpu);
1216}
1217
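     /*
      * Per-vCPU thread body for KVM: create the in-kernel vCPU, signal the
      * creator, then loop running kvm_cpu_exec() and waiting for I/O
      * events until the vCPU is unplugged.
      */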
1218static void *qemu_kvm_cpu_thread_fn(void *arg)
1219{
1220    CPUState *cpu = arg;
1221    int r;
1222
1223    rcu_register_thread();
1224
1225    qemu_mutex_lock_iothread();
1226    qemu_thread_get_self(cpu->thread);
1227    cpu->thread_id = qemu_get_thread_id();
1228    cpu->can_do_io = 1;
1229    current_cpu = cpu;
1230
1231    r = kvm_init_vcpu(cpu);
1232    if (r < 0) {
1233        error_report("kvm_init_vcpu failed: %s", strerror(-r));
1234        exit(1);
1235    }
1236
1237    kvm_init_cpu_signals(cpu);
1238
1239    /* signal CPU creation */
1240    cpu->created = true;
1241    qemu_cond_signal(&qemu_cpu_cond);
1242    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1243
1244    do {
1245        if (cpu_can_run(cpu)) {
1246            r = kvm_cpu_exec(cpu);
1247            if (r == EXCP_DEBUG) {
1248                cpu_handle_guest_debug(cpu);
1249            }
1250        }
1251        qemu_wait_io_event(cpu);
1252    } while (!cpu->unplug || cpu_can_run(cpu));
1253
1254    qemu_kvm_destroy_vcpu(cpu);
1255    cpu->created = false;
1256    qemu_cond_signal(&qemu_cpu_cond);
1257    qemu_mutex_unlock_iothread();
1258    rcu_unregister_thread();
1259    return NULL;
1260}
1261
1262static void *qemu_dummy_cpu_thread_fn(void *arg)
1263{
1264#ifdef _WIN32
1265    error_report("qtest is not supported under Windows");
1266    exit(1);
1267#else
1268    CPUState *cpu = arg;
1269    sigset_t waitset;
1270    int r;
1271
1272    rcu_register_thread();
1273
1274    qemu_mutex_lock_iothread();
1275    qemu_thread_get_self(cpu->thread);
1276    cpu->thread_id = qemu_get_thread_id();
1277    cpu->can_do_io = 1;
1278    current_cpu = cpu;
1279
1280    sigemptyset(&waitset);
1281    sigaddset(&waitset, SIG_IPI);
1282
1283    /* signal CPU creation */
1284    cpu->created = true;
1285    qemu_cond_signal(&qemu_cpu_cond);
1286    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1287
1288    do {
1289        qemu_mutex_unlock_iothread();
1290        do {
1291            int sig;
1292            r = sigwait(&waitset, &sig);
1293        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1294        if (r == -1) {
1295            perror("sigwait");
1296            exit(1);
1297        }
1298        qemu_mutex_lock_iothread();
1299        qemu_wait_io_event(cpu);
1300    } while (!cpu->unplug);
1301
1302    qemu_mutex_unlock_iothread();
1303    rcu_unregister_thread();
1304    return NULL;
1305#endif
1306}
1307
1308static int64_t tcg_get_icount_limit(void)
1309{
1310    int64_t deadline;
1311
1312    if (replay_mode != REPLAY_MODE_PLAY) {
1313        /*
 1314         * Include all the timers, because they may need attention.
 1315         * Overly long CPU execution may create unnecessary delays in the UI.
1316         */
1317        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1318                                              QEMU_TIMER_ATTR_ALL);
1319        /* Check realtime timers, because they help with input processing */
1320        deadline = qemu_soonest_timeout(deadline,
1321                qemu_clock_deadline_ns_all(QEMU_CLOCK_REALTIME,
1322                                           QEMU_TIMER_ATTR_ALL));
1323
1324        /* Maintain prior (possibly buggy) behaviour where if no deadline
1325         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1326         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1327         * nanoseconds.
1328         */
1329        if ((deadline < 0) || (deadline > INT32_MAX)) {
1330            deadline = INT32_MAX;
1331        }
1332
1333        return qemu_icount_round(deadline);
1334    } else {
1335        return replay_get_instructions();
1336    }
1337}
1338
1339static void handle_icount_deadline(void)
1340{
1341    assert(qemu_in_vcpu_thread());
1342    if (use_icount) {
1343        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1344                                                      QEMU_TIMER_ATTR_ALL);
1345
1346        if (deadline == 0) {
1347            /* Wake up other AioContexts.  */
1348            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1349            qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1350        }
1351    }
1352}
1353
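     /*
      * Hand the next instruction budget to the vCPU.  Only the low 16 bits
      * fit in the icount_decr counter consumed while translated code runs,
      * so any remainder above 0xffff is parked in icount_extra.
      */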
1354static void prepare_icount_for_run(CPUState *cpu)
1355{
1356    if (use_icount) {
1357        int insns_left;
1358
1359        /* These should always be cleared by process_icount_data after
 1360         * each vCPU execution. However, u16.high can be raised
 1361         * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt.
1362         */
1363        g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1364        g_assert(cpu->icount_extra == 0);
1365
1366        cpu->icount_budget = tcg_get_icount_limit();
1367        insns_left = MIN(0xffff, cpu->icount_budget);
1368        cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1369        cpu->icount_extra = cpu->icount_budget - insns_left;
1370
1371        replay_mutex_lock();
1372    }
1373}
1374
1375static void process_icount_data(CPUState *cpu)
1376{
1377    if (use_icount) {
1378        /* Account for executed instructions */
1379        cpu_update_icount(cpu);
1380
1381        /* Reset the counters */
1382        cpu_neg(cpu)->icount_decr.u16.low = 0;
1383        cpu->icount_extra = 0;
1384        cpu->icount_budget = 0;
1385
1386        replay_account_executed_instructions();
1387
1388        replay_mutex_unlock();
1389    }
1390}
1391
1392
1393static int tcg_cpu_exec(CPUState *cpu)
1394{
1395    int ret;
1396#ifdef CONFIG_PROFILER
1397    int64_t ti;
1398#endif
1399
1400    assert(tcg_enabled());
1401#ifdef CONFIG_PROFILER
1402    ti = profile_getclock();
1403#endif
1404    cpu_exec_start(cpu);
1405    ret = cpu_exec(cpu);
1406    cpu_exec_end(cpu);
1407#ifdef CONFIG_PROFILER
1408    atomic_set(&tcg_ctx->prof.cpu_exec_time,
1409               tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1410#endif
1411    return ret;
1412}
1413
1414/* Destroy any remaining vCPUs which have been unplugged and have
1415 * finished running
1416 */
1417static void deal_with_unplugged_cpus(void)
1418{
1419    CPUState *cpu;
1420
1421    CPU_FOREACH(cpu) {
1422        if (cpu->unplug && !cpu_can_run(cpu)) {
1423            qemu_tcg_destroy_vcpu(cpu);
1424            cpu->created = false;
1425            qemu_cond_signal(&qemu_cpu_cond);
1426            break;
1427        }
1428    }
1429}
1430
1431/* Single-threaded TCG
1432 *
1433 * In the single-threaded case each vCPU is simulated in turn. If
1434 * there is more than a single vCPU we create a simple timer to kick
1435 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1436 * This is done explicitly rather than relying on side-effects
1437 * elsewhere.
1438 */
1439
1440static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1441{
1442    CPUState *cpu = arg;
1443
1444    assert(tcg_enabled());
1445    rcu_register_thread();
1446    tcg_register_thread();
1447
1448    qemu_mutex_lock_iothread();
1449    qemu_thread_get_self(cpu->thread);
1450
1451    cpu->thread_id = qemu_get_thread_id();
1452    cpu->created = true;
1453    cpu->can_do_io = 1;
1454    qemu_cond_signal(&qemu_cpu_cond);
1455    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1456
1457    /* wait for initial kick-off after machine start */
1458    while (first_cpu->stopped) {
1459        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1460
1461        /* process any pending work */
1462        CPU_FOREACH(cpu) {
1463            current_cpu = cpu;
1464            qemu_wait_io_event_common(cpu);
1465        }
1466    }
1467
1468    start_tcg_kick_timer();
1469
1470    cpu = first_cpu;
1471
1472    /* process any pending work */
1473    cpu->exit_request = 1;
1474
1475    while (1) {
1476        qemu_mutex_unlock_iothread();
1477        replay_mutex_lock();
1478        qemu_mutex_lock_iothread();
1479        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1480        qemu_account_warp_timer();
1481
1482        /* Run the timers here.  This is much more efficient than
1483         * waking up the I/O thread and waiting for completion.
1484         */
1485        handle_icount_deadline();
1486
1487        replay_mutex_unlock();
1488
1489        if (!cpu) {
1490            cpu = first_cpu;
1491        }
1492
1493        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1494
1495            atomic_mb_set(&tcg_current_rr_cpu, cpu);
1496            current_cpu = cpu;
1497
1498            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1499                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1500
1501            if (cpu_can_run(cpu)) {
1502                int r;
1503
1504                qemu_mutex_unlock_iothread();
1505                prepare_icount_for_run(cpu);
1506
1507                r = tcg_cpu_exec(cpu);
1508
1509                process_icount_data(cpu);
1510                qemu_mutex_lock_iothread();
1511
1512                if (r == EXCP_DEBUG) {
1513                    cpu_handle_guest_debug(cpu);
1514                    break;
1515                } else if (r == EXCP_ATOMIC) {
1516                    qemu_mutex_unlock_iothread();
1517                    cpu_exec_step_atomic(cpu);
1518                    qemu_mutex_lock_iothread();
1519                    break;
1520                }
1521            } else if (cpu->stop) {
1522                if (cpu->unplug) {
1523                    cpu = CPU_NEXT(cpu);
1524                }
1525                break;
1526            }
1527
1528            cpu = CPU_NEXT(cpu);
1529        } /* while (cpu && !cpu->exit_request).. */
1530
1531        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1532        atomic_set(&tcg_current_rr_cpu, NULL);
1533
1534        if (cpu && cpu->exit_request) {
1535            atomic_mb_set(&cpu->exit_request, 0);
1536        }
1537
1538        if (use_icount && all_cpu_threads_idle()) {
1539            /*
 1540             * When all CPUs are sleeping (e.g. in WFI), to avoid a deadlock
1541             * in the main_loop, wake it up in order to start the warp timer.
1542             */
1543            qemu_notify_event();
1544        }
1545
1546        qemu_tcg_rr_wait_io_event();
1547        deal_with_unplugged_cpus();
1548    }
1549
1550    rcu_unregister_thread();
1551    return NULL;
1552}
1553
1554static void *qemu_hax_cpu_thread_fn(void *arg)
1555{
1556    CPUState *cpu = arg;
1557    int r;
1558
1559    rcu_register_thread();
1560    qemu_mutex_lock_iothread();
1561    qemu_thread_get_self(cpu->thread);
1562
1563    cpu->thread_id = qemu_get_thread_id();
1564    cpu->created = true;
1565    current_cpu = cpu;
1566
1567    hax_init_vcpu(cpu);
1568    qemu_cond_signal(&qemu_cpu_cond);
1569    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1570
1571    do {
1572        if (cpu_can_run(cpu)) {
1573            r = hax_smp_cpu_exec(cpu);
1574            if (r == EXCP_DEBUG) {
1575                cpu_handle_guest_debug(cpu);
1576            }
1577        }
1578
1579        qemu_wait_io_event(cpu);
1580    } while (!cpu->unplug || cpu_can_run(cpu));
1581    rcu_unregister_thread();
1582    return NULL;
1583}
1584
1585/* The HVF-specific vCPU thread function. This one should only run when the host
1586 * CPU supports the VMX "unrestricted guest" feature. */
1587static void *qemu_hvf_cpu_thread_fn(void *arg)
1588{
1589    CPUState *cpu = arg;
1590
1591    int r;
1592
1593    assert(hvf_enabled());
1594
1595    rcu_register_thread();
1596
1597    qemu_mutex_lock_iothread();
1598    qemu_thread_get_self(cpu->thread);
1599
1600    cpu->thread_id = qemu_get_thread_id();
1601    cpu->can_do_io = 1;
1602    current_cpu = cpu;
1603
1604    hvf_init_vcpu(cpu);
1605
1606    /* signal CPU creation */
1607    cpu->created = true;
1608    qemu_cond_signal(&qemu_cpu_cond);
1609    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1610
1611    do {
1612        if (cpu_can_run(cpu)) {
1613            r = hvf_vcpu_exec(cpu);
1614            if (r == EXCP_DEBUG) {
1615                cpu_handle_guest_debug(cpu);
1616            }
1617        }
1618        qemu_wait_io_event(cpu);
1619    } while (!cpu->unplug || cpu_can_run(cpu));
1620
1621    hvf_vcpu_destroy(cpu);
1622    cpu->created = false;
1623    qemu_cond_signal(&qemu_cpu_cond);
1624    qemu_mutex_unlock_iothread();
1625    rcu_unregister_thread();
1626    return NULL;
1627}
1628
1629static void *qemu_whpx_cpu_thread_fn(void *arg)
1630{
1631    CPUState *cpu = arg;
1632    int r;
1633
1634    rcu_register_thread();
1635
1636    qemu_mutex_lock_iothread();
1637    qemu_thread_get_self(cpu->thread);
1638    cpu->thread_id = qemu_get_thread_id();
1639    current_cpu = cpu;
1640
1641    r = whpx_init_vcpu(cpu);
1642    if (r < 0) {
1643        fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1644        exit(1);
1645    }
1646
1647    /* signal CPU creation */
1648    cpu->created = true;
1649    qemu_cond_signal(&qemu_cpu_cond);
1650    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1651
1652    do {
1653        if (cpu_can_run(cpu)) {
1654            r = whpx_vcpu_exec(cpu);
1655            if (r == EXCP_DEBUG) {
1656                cpu_handle_guest_debug(cpu);
1657            }
1658        }
1659        while (cpu_thread_is_idle(cpu)) {
1660            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1661        }
1662        qemu_wait_io_event_common(cpu);
1663    } while (!cpu->unplug || cpu_can_run(cpu));
1664
1665    whpx_destroy_vcpu(cpu);
1666    cpu->created = false;
1667    qemu_cond_signal(&qemu_cpu_cond);
1668    qemu_mutex_unlock_iothread();
1669    rcu_unregister_thread();
1670    return NULL;
1671}
1672
1673#ifdef _WIN32
1674static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1675{
1676}
1677#endif
1678
1679/* Multi-threaded TCG
1680 *
1681 * In the multi-threaded case each vCPU has its own thread. The TLS
1682 * variable current_cpu can be used deep in the code to find the
1683 * current CPUState for a given thread.
1684 */
1685
1686static void *qemu_tcg_cpu_thread_fn(void *arg)
1687{
1688    CPUState *cpu = arg;
1689
1690    assert(tcg_enabled());
1691    g_assert(!use_icount);
1692
1693    rcu_register_thread();
1694    tcg_register_thread();
1695
1696    qemu_mutex_lock_iothread();
1697    qemu_thread_get_self(cpu->thread);
1698
1699    cpu->thread_id = qemu_get_thread_id();
1700    cpu->created = true;
1701    cpu->can_do_io = 1;
1702    current_cpu = cpu;
1703    qemu_cond_signal(&qemu_cpu_cond);
1704    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1705
1706    /* process any pending work */
1707    cpu->exit_request = 1;
1708
1709    do {
1710        if (cpu_can_run(cpu)) {
1711            int r;
1712            qemu_mutex_unlock_iothread();
1713            r = tcg_cpu_exec(cpu);
1714            qemu_mutex_lock_iothread();
1715            switch (r) {
1716            case EXCP_DEBUG:
1717                cpu_handle_guest_debug(cpu);
1718                break;
1719            case EXCP_HALTED:
 1720                /* During start-up the vCPU is reset and the thread is
 1721                 * kicked several times. If we don't ensure we go back
 1722                 * to sleep in the halted state, we won't start up
 1723                 * cleanly when the vCPU is enabled.
1724                 *
1725                 * cpu->halted should ensure we sleep in wait_io_event
1726                 */
1727                g_assert(cpu->halted);
1728                break;
1729            case EXCP_ATOMIC:
1730                qemu_mutex_unlock_iothread();
1731                cpu_exec_step_atomic(cpu);
1732                qemu_mutex_lock_iothread();
1733            default:
1734                /* Ignore everything else? */
1735                break;
1736            }
1737        }
1738
1739        atomic_mb_set(&cpu->exit_request, 0);
1740        qemu_wait_io_event(cpu);
1741    } while (!cpu->unplug || cpu_can_run(cpu));
1742
1743    qemu_tcg_destroy_vcpu(cpu);
1744    cpu->created = false;
1745    qemu_cond_signal(&qemu_cpu_cond);
1746    qemu_mutex_unlock_iothread();
1747    rcu_unregister_thread();
1748    return NULL;
1749}
1750
1751static void qemu_cpu_kick_thread(CPUState *cpu)
1752{
1753#ifndef _WIN32
1754    int err;
1755
1756    if (cpu->thread_kicked) {
1757        return;
1758    }
1759    cpu->thread_kicked = true;
1760    err = pthread_kill(cpu->thread->thread, SIG_IPI);
1761    if (err && err != ESRCH) {
1762        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1763        exit(1);
1764    }
1765#else /* _WIN32 */
1766    if (!qemu_cpu_is_self(cpu)) {
1767        if (whpx_enabled()) {
1768            whpx_vcpu_kick(cpu);
1769        } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1770            fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1771                    __func__, GetLastError());
1772            exit(1);
1773        }
1774    }
1775#endif
1776}
1777
1778void qemu_cpu_kick(CPUState *cpu)
1779{
1780    qemu_cond_broadcast(cpu->halt_cond);
1781    if (tcg_enabled()) {
1782        if (qemu_tcg_mttcg_enabled()) {
1783            cpu_exit(cpu);
1784        } else {
1785            qemu_cpu_kick_rr_cpus();
1786        }
1787    } else {
1788        if (hax_enabled()) {
1789            /*
1790             * FIXME: race condition with the exit_request check in
1791             * hax_vcpu_hax_exec
1792             */
1793            cpu->exit_request = 1;
1794        }
1795        qemu_cpu_kick_thread(cpu);
1796    }
1797}
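
#if 0   /* Illustrative sketch; hypothetical helper, never compiled. */
/*
 * Typical use of qemu_cpu_kick(): a thread first publishes the state it
 * wants the vCPU to act on (a stop request, queued work, a pending
 * interrupt) and then kicks it so a halted or executing vCPU notices
 * promptly.  example_request_stop() is not a real QEMU function;
 * pause_all_vcpus() below follows the same pattern.
 */
static void example_request_stop(CPUState *cpu)
{
    cpu->stop = true;       /* state the vCPU thread will act on */
    qemu_cpu_kick(cpu);     /* wake it from halt_cond / force a TB exit */
}
#endif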
1798
1799void qemu_cpu_kick_self(void)
1800{
1801    assert(current_cpu);
1802    qemu_cpu_kick_thread(current_cpu);
1803}
1804
1805bool qemu_cpu_is_self(CPUState *cpu)
1806{
1807    return qemu_thread_is_self(cpu->thread);
1808}
1809
1810bool qemu_in_vcpu_thread(void)
1811{
1812    return current_cpu && qemu_cpu_is_self(current_cpu);
1813}
1814
1815static __thread bool iothread_locked = false;
1816
1817bool qemu_mutex_iothread_locked(void)
1818{
1819    return iothread_locked;
1820}
1821
1822/*
1823 * The BQL is taken from so many places that it is worth profiling the
1824 * callers directly, instead of funneling them all through a single function.
1825 */
1826void qemu_mutex_lock_iothread_impl(const char *file, int line)
1827{
1828    QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1829
1830    g_assert(!qemu_mutex_iothread_locked());
1831    bql_lock(&qemu_global_mutex, file, line);
1832    iothread_locked = true;
1833}
1834
1835void qemu_mutex_unlock_iothread(void)
1836{
1837    g_assert(qemu_mutex_iothread_locked());
1838    iothread_locked = false;
1839    qemu_mutex_unlock(&qemu_global_mutex);
1840}
1841
1842void qemu_cond_wait_iothread(QemuCond *cond)
1843{
1844    qemu_cond_wait(cond, &qemu_global_mutex);
1845}
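
#if 0   /* Illustrative sketch; hypothetical helper, never compiled. */
/*
 * Expected BQL usage: code that is not already under the big QEMU lock
 * takes it around accesses to shared machine state and drops it before
 * doing anything long-running, since holding it stalls every thread
 * that needs to re-acquire it.  example_touch_shared_state() is not a
 * real QEMU function.
 */
static void example_touch_shared_state(void)
{
    qemu_mutex_lock_iothread();     /* asserts the BQL was not held */
    /* ... touch state protected by the BQL ... */
    qemu_mutex_unlock_iothread();   /* asserts the BQL was held */
}
#endif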
1846
1847static bool all_vcpus_paused(void)
1848{
1849    CPUState *cpu;
1850
1851    CPU_FOREACH(cpu) {
1852        if (!cpu->stopped) {
1853            return false;
1854        }
1855    }
1856
1857    return true;
1858}
1859
1860void pause_all_vcpus(void)
1861{
1862    CPUState *cpu;
1863
1864    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1865    CPU_FOREACH(cpu) {
1866        if (qemu_cpu_is_self(cpu)) {
1867            qemu_cpu_stop(cpu, true);
1868        } else {
1869            cpu->stop = true;
1870            qemu_cpu_kick(cpu);
1871        }
1872    }
1873
1874    /* We need to drop the replay_lock so any vCPU threads woken up
1875     * can finish their replay tasks
1876     */
1877    replay_mutex_unlock();
1878
1879    while (!all_vcpus_paused()) {
1880        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1881        CPU_FOREACH(cpu) {
1882            qemu_cpu_kick(cpu);
1883        }
1884    }
1885
1886    qemu_mutex_unlock_iothread();
1887    replay_mutex_lock();
1888    qemu_mutex_lock_iothread();
1889}
1890
1891void cpu_resume(CPUState *cpu)
1892{
1893    cpu->stop = false;
1894    cpu->stopped = false;
1895    qemu_cpu_kick(cpu);
1896}
1897
1898void resume_all_vcpus(void)
1899{
1900    CPUState *cpu;
1901
1902    if (!runstate_is_running()) {
1903        return;
1904    }
1905
1906    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1907    CPU_FOREACH(cpu) {
1908        cpu_resume(cpu);
1909    }
1910}
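
#if 0   /* Illustrative sketch; hypothetical helper, never compiled. */
/*
 * Callers that need every vCPU quiescent around a global operation
 * bracket it with pause_all_vcpus()/resume_all_vcpus() while holding
 * the BQL; pause_all_vcpus() only returns once all_vcpus_paused() is
 * true.  example_with_vcpus_paused() is not a real QEMU function.
 */
static void example_with_vcpus_paused(void)
{
    pause_all_vcpus();      /* returns with every vCPU stopped */
    /* ... mutate state no vCPU may observe mid-update ... */
    resume_all_vcpus();
}
#endif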
1911
1912void cpu_remove_sync(CPUState *cpu)
1913{
1914    cpu->stop = true;
1915    cpu->unplug = true;
1916    qemu_cpu_kick(cpu);
1917    qemu_mutex_unlock_iothread();
1918    qemu_thread_join(cpu->thread);
1919    qemu_mutex_lock_iothread();
1920}
1921
1922/* Size of the temporary buffers used to form vCPU thread names */
1923#define VCPU_THREAD_NAME_SIZE 16
1924
1925static void qemu_tcg_init_vcpu(CPUState *cpu)
1926{
1927    char thread_name[VCPU_THREAD_NAME_SIZE];
1928    static QemuCond *single_tcg_halt_cond;
1929    static QemuThread *single_tcg_cpu_thread;
1930    static int tcg_region_inited;
1931
1932    assert(tcg_enabled());
1933    /*
1934     * Initialize TCG regions--once. Now is a good time, because:
1935     * (1) TCG's init context, prologue and target globals have been set up.
1936     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1937     *     -accel flag is processed, so the check doesn't work then).
1938     */
1939    if (!tcg_region_inited) {
1940        tcg_region_inited = 1;
1941        tcg_region_init();
1942    }
1943
1944    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1945        cpu->thread = g_malloc0(sizeof(QemuThread));
1946        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1947        qemu_cond_init(cpu->halt_cond);
1948
1949        if (qemu_tcg_mttcg_enabled()) {
1950            /* create a thread per vCPU with TCG (MTTCG) */
1951            parallel_cpus = true;
1952            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1953                 cpu->cpu_index);
1954
1955            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1956                               cpu, QEMU_THREAD_JOINABLE);
1957
1958        } else {
1959            /* share a single thread for all cpus with TCG */
1960            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1961            qemu_thread_create(cpu->thread, thread_name,
1962                               qemu_tcg_rr_cpu_thread_fn,
1963                               cpu, QEMU_THREAD_JOINABLE);
1964
1965            single_tcg_halt_cond = cpu->halt_cond;
1966            single_tcg_cpu_thread = cpu->thread;
1967        }
1968#ifdef _WIN32
1969        cpu->hThread = qemu_thread_get_handle(cpu->thread);
1970#endif
1971    } else {
1972        /* For non-MTTCG cases we share the thread */
1973        cpu->thread = single_tcg_cpu_thread;
1974        cpu->halt_cond = single_tcg_halt_cond;
1975        cpu->thread_id = first_cpu->thread_id;
1976        cpu->can_do_io = 1;
1977        cpu->created = true;
1978    }
1979}
1980
1981static void qemu_hax_start_vcpu(CPUState *cpu)
1982{
1983    char thread_name[VCPU_THREAD_NAME_SIZE];
1984
1985    cpu->thread = g_malloc0(sizeof(QemuThread));
1986    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1987    qemu_cond_init(cpu->halt_cond);
1988
1989    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1990             cpu->cpu_index);
1991    qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1992                       cpu, QEMU_THREAD_JOINABLE);
1993#ifdef _WIN32
1994    cpu->hThread = qemu_thread_get_handle(cpu->thread);
1995#endif
1996}
1997
1998static void qemu_kvm_start_vcpu(CPUState *cpu)
1999{
2000    char thread_name[VCPU_THREAD_NAME_SIZE];
2001
2002    cpu->thread = g_malloc0(sizeof(QemuThread));
2003    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2004    qemu_cond_init(cpu->halt_cond);
2005    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2006             cpu->cpu_index);
2007    qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2008                       cpu, QEMU_THREAD_JOINABLE);
2009}
2010
2011static void qemu_hvf_start_vcpu(CPUState *cpu)
2012{
2013    char thread_name[VCPU_THREAD_NAME_SIZE];
2014
2015    /* HVF currently does not support TCG, and only runs in
2016     * unrestricted-guest mode. */
2017    assert(hvf_enabled());
2018
2019    cpu->thread = g_malloc0(sizeof(QemuThread));
2020    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2021    qemu_cond_init(cpu->halt_cond);
2022
2023    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2024             cpu->cpu_index);
2025    qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2026                       cpu, QEMU_THREAD_JOINABLE);
2027}
2028
2029static void qemu_whpx_start_vcpu(CPUState *cpu)
2030{
2031    char thread_name[VCPU_THREAD_NAME_SIZE];
2032
2033    cpu->thread = g_malloc0(sizeof(QemuThread));
2034    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2035    qemu_cond_init(cpu->halt_cond);
2036    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2037             cpu->cpu_index);
2038    qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2039                       cpu, QEMU_THREAD_JOINABLE);
2040#ifdef _WIN32
2041    cpu->hThread = qemu_thread_get_handle(cpu->thread);
2042#endif
2043}
2044
2045static void qemu_dummy_start_vcpu(CPUState *cpu)
2046{
2047    char thread_name[VCPU_THREAD_NAME_SIZE];
2048
2049    cpu->thread = g_malloc0(sizeof(QemuThread));
2050    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2051    qemu_cond_init(cpu->halt_cond);
2052    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2053             cpu->cpu_index);
2054    qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2055                       QEMU_THREAD_JOINABLE);
2056}
2057
2058void qemu_init_vcpu(CPUState *cpu)
2059{
2060    MachineState *ms = MACHINE(qdev_get_machine());
2061
2062    cpu->nr_cores = ms->smp.cores;
2063    cpu->nr_threads = ms->smp.threads;
2064    cpu->stopped = true;
2065    cpu->random_seed = qemu_guest_random_seed_thread_part1();
2066
2067    if (!cpu->as) {
2068        /* If the target cpu hasn't set up any address spaces itself,
2069         * give it the default one.
2070         */
2071        cpu->num_ases = 1;
2072        cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2073    }
2074
2075    if (kvm_enabled()) {
2076        qemu_kvm_start_vcpu(cpu);
2077    } else if (hax_enabled()) {
2078        qemu_hax_start_vcpu(cpu);
2079    } else if (hvf_enabled()) {
2080        qemu_hvf_start_vcpu(cpu);
2081    } else if (tcg_enabled()) {
2082        qemu_tcg_init_vcpu(cpu);
2083    } else if (whpx_enabled()) {
2084        qemu_whpx_start_vcpu(cpu);
2085    } else {
2086        qemu_dummy_start_vcpu(cpu);
2087    }
2088
2089    while (!cpu->created) {
2090        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2091    }
2092}
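
#if 0   /* Illustrative sketch; hypothetical helper, never compiled. */
/*
 * qemu_init_vcpu() is normally invoked from a target's CPU realize
 * hook once the CPU object is fully set up; it returns only after the
 * accelerator thread has set cpu->created.  example_cpu_realize() is
 * not a real QEMU function.
 */
static void example_cpu_realize(CPUState *cs)
{
    /* ... target-specific initialisation of cs ... */
    qemu_init_vcpu(cs);     /* spawn the vCPU thread and wait for it */
    cpu_reset(cs);          /* targets typically reset the new vCPU next */
}
#endif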
2093
2094void cpu_stop_current(void)
2095{
2096    if (current_cpu) {
2097        current_cpu->stop = true;
2098        cpu_exit(current_cpu);
2099    }
2100}
2101
2102int vm_stop(RunState state)
2103{
2104    if (qemu_in_vcpu_thread()) {
2105        qemu_system_vmstop_request_prepare();
2106        qemu_system_vmstop_request(state);
2107        /*
2108         * FIXME: should not return to device code in case
2109         * vm_stop() has been requested.
2110         */
2111        cpu_stop_current();
2112        return 0;
2113    }
2114
2115    return do_vm_stop(state, true);
2116}
2117
2118/**
2119 * Prepare for (re)starting the VM.
2120 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2121 * running or in case of an error condition), 0 otherwise.
2122 */
2123int vm_prepare_start(void)
2124{
2125    RunState requested;
2126
2127    qemu_vmstop_requested(&requested);
2128    if (runstate_is_running() && requested == RUN_STATE__MAX) {
2129        return -1;
2130    }
2131
2132    /* Ensure that a STOP/RESUME pair of events is emitted if a
2133     * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2134     * example, is documented to always be followed by the STOP
2135     * event.
2136     */
2137    if (runstate_is_running()) {
2138        qapi_event_send_stop();
2139        qapi_event_send_resume();
2140        return -1;
2141    }
2142
2143    /* We are sending this now, but the CPUs will be resumed shortly afterwards */
2144    qapi_event_send_resume();
2145
2146    cpu_enable_ticks();
2147    runstate_set(RUN_STATE_RUNNING);
2148    vm_state_notify(1, RUN_STATE_RUNNING);
2149    return 0;
2150}
2151
2152void vm_start(void)
2153{
2154    if (!vm_prepare_start()) {
2155        resume_all_vcpus();
2156    }
2157}
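
#if 0   /* Illustrative sketch; hypothetical helper, never compiled. */
/*
 * Callers that must do work between the runstate transition and
 * restarting the vCPUs can call vm_prepare_start() and
 * resume_all_vcpus() separately, which is exactly what vm_start()
 * does.  example_prepare_then_run() is not a real QEMU function.
 */
static void example_prepare_then_run(void)
{
    if (vm_prepare_start() == 0) {
        /* ... work that must complete before any vCPU runs ... */
        resume_all_vcpus();
    }
}
#endif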
2158
2159/* Does a state transition even if the VM is already stopped;
2160   the current state is forgotten forever. */
2161int vm_stop_force_state(RunState state)
2162{
2163    if (runstate_is_running()) {
2164        return vm_stop(state);
2165    } else {
2166        runstate_set(state);
2167
2168        bdrv_drain_all();
2169        /* Make sure to return an error if the flush in a previous vm_stop()
2170         * failed. */
2171        return bdrv_flush_all();
2172    }
2173}
2174
2175void list_cpus(const char *optarg)
2176{
2177    /* XXX: implement xxx_cpu_list for targets that still lack it */
2178#if defined(cpu_list)
2179    cpu_list();
2180#endif
2181}
2182
2183void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2184                 bool has_cpu, int64_t cpu_index, Error **errp)
2185{
2186    FILE *f;
2187    uint32_t l;
2188    CPUState *cpu;
2189    uint8_t buf[1024];
2190    int64_t orig_addr = addr, orig_size = size;
2191
2192    if (!has_cpu) {
2193        cpu_index = 0;
2194    }
2195
2196    cpu = qemu_get_cpu(cpu_index);
2197    if (cpu == NULL) {
2198        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2199                   "a CPU number");
2200        return;
2201    }
2202
2203    f = fopen(filename, "wb");
2204    if (!f) {
2205        error_setg_file_open(errp, errno, filename);
2206        return;
2207    }
2208
2209    while (size != 0) {
2210        l = sizeof(buf);
2211        if (l > size)
2212            l = size;
2213        if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2214            error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2215                             " specified", orig_addr, orig_size);
2216            goto exit;
2217        }
2218        if (fwrite(buf, 1, l, f) != l) {
2219            error_setg(errp, QERR_IO_ERROR);
2220            goto exit;
2221        }
2222        addr += l;
2223        size -= l;
2224    }
2225
2226exit:
2227    fclose(f);
2228}
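
/*
 * Illustrative note: qmp_memsave() reads guest *virtual* memory through
 * the chosen vCPU (cpu_memory_rw_debug), in contrast to qmp_pmemsave()
 * below, which reads guest physical memory.  From the human monitor the
 * command looks roughly like
 *
 *   (qemu) memsave 0x100000 4096 vmem.bin
 *
 * (the exact HMP syntax lives in hmp-commands.hx, not here).
 */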
2229
2230void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2231                  Error **errp)
2232{
2233    FILE *f;
2234    uint32_t l;
2235    uint8_t buf[1024];
2236
2237    f = fopen(filename, "wb");
2238    if (!f) {
2239        error_setg_file_open(errp, errno, filename);
2240        return;
2241    }
2242
2243    while (size != 0) {
2244        l = sizeof(buf);
2245        if (l > size)
2246            l = size;
2247        cpu_physical_memory_read(addr, buf, l);
2248        if (fwrite(buf, 1, l, f) != l) {
2249            error_setg(errp, QERR_IO_ERROR);
2250            goto exit;
2251        }
2252        addr += l;
2253        size -= l;
2254    }
2255
2256exit:
2257    fclose(f);
2258}
2259
2260void qmp_inject_nmi(Error **errp)
2261{
2262    nmi_monitor_handle(monitor_get_cpu_index(), errp);
2263}
2264
2265void dump_drift_info(void)
2266{
2267    if (!use_icount) {
2268        return;
2269    }
2270
2271    qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2272                (cpu_get_clock() - cpu_get_icount()) / SCALE_MS);
2273    if (icount_align_option) {
2274        qemu_printf("Max guest delay     %"PRIi64" ms\n",
2275                    -max_delay / SCALE_MS);
2276        qemu_printf("Max guest advance   %"PRIi64" ms\n",
2277                    max_advance / SCALE_MS);
2278    } else {
2279        qemu_printf("Max guest delay     NA\n");
2280        qemu_printf("Max guest advance   NA\n");
2281    }
2282}
2283