qemu/cpus.c
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "qemu/config-file.h"
  27#include "cpu.h"
  28#include "monitor/monitor.h"
  29#include "qapi/error.h"
  30#include "qapi/qapi-commands-misc.h"
  31#include "qapi/qapi-events-run-state.h"
  32#include "qapi/qmp/qerror.h"
  33#include "qemu/error-report.h"
  34#include "sysemu/sysemu.h"
  35#include "sysemu/block-backend.h"
  36#include "exec/gdbstub.h"
  37#include "sysemu/dma.h"
  38#include "sysemu/hw_accel.h"
  39#include "sysemu/kvm.h"
  40#include "sysemu/hax.h"
  41#include "sysemu/hvf.h"
  42#include "sysemu/whpx.h"
  43#include "exec/exec-all.h"
  44
  45#include "qemu/thread.h"
  46#include "sysemu/cpus.h"
  47#include "sysemu/qtest.h"
  48#include "qemu/main-loop.h"
  49#include "qemu/option.h"
  50#include "qemu/bitmap.h"
  51#include "qemu/seqlock.h"
  52#include "tcg.h"
  53#include "hw/nmi.h"
  54#include "sysemu/replay.h"
  55#include "hw/boards.h"
  56
  57#ifdef CONFIG_LINUX
  58
  59#include <sys/prctl.h>
  60
  61#ifndef PR_MCE_KILL
  62#define PR_MCE_KILL 33
  63#endif
  64
  65#ifndef PR_MCE_KILL_SET
  66#define PR_MCE_KILL_SET 1
  67#endif
  68
  69#ifndef PR_MCE_KILL_EARLY
  70#define PR_MCE_KILL_EARLY 1
  71#endif
  72
  73#endif /* CONFIG_LINUX */
  74
  75int64_t max_delay;
  76int64_t max_advance;
  77
  78/* vcpu throttling controls */
  79static QEMUTimer *throttle_timer;
  80static unsigned int throttle_percentage;
  81
  82#define CPU_THROTTLE_PCT_MIN 1
  83#define CPU_THROTTLE_PCT_MAX 99
  84#define CPU_THROTTLE_TIMESLICE_NS 10000000
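/* CPU_THROTTLE_TIMESLICE_NS is 10,000,000 ns, i.e. throttling works in
 * 10 ms timeslices; cpu_throttle_set() below clamps the percentage to the
 * 1-99% range defined above. */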
  85
  86bool cpu_is_stopped(CPUState *cpu)
  87{
  88    return cpu->stopped || !runstate_is_running();
  89}
  90
  91static bool cpu_thread_is_idle(CPUState *cpu)
  92{
  93    if (cpu->stop || cpu->queued_work_first) {
  94        return false;
  95    }
  96    if (cpu_is_stopped(cpu)) {
  97        return true;
  98    }
  99    if (!cpu->halted || cpu_has_work(cpu) ||
 100        kvm_halt_in_kernel()) {
 101        return false;
 102    }
 103    return true;
 104}
 105
 106static bool all_cpu_threads_idle(void)
 107{
 108    CPUState *cpu;
 109
 110    CPU_FOREACH(cpu) {
 111        if (!cpu_thread_is_idle(cpu)) {
 112            return false;
 113        }
 114    }
 115    return true;
 116}
 117
 118/***********************************************************/
 119/* guest cycle counter */
 120
 121/* Protected by TimersState seqlock */
 122
 123static bool icount_sleep = true;
 124/* Conversion factor from emulated instructions to virtual clock ticks.  */
 125static int icount_time_shift;
 126/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
 127#define MAX_ICOUNT_SHIFT 10
 128
 129typedef struct TimersState {
 130    /* Protected by BQL.  */
 131    int64_t cpu_ticks_prev;
 132    int64_t cpu_ticks_offset;
 133
  134    /* cpu_clock_offset can be read outside the BQL, so protect it with
 135     * this lock.
 136     */
 137    QemuSeqLock vm_clock_seqlock;
 138    int64_t cpu_clock_offset;
 139    int32_t cpu_ticks_enabled;
 140    int64_t dummy;
 141
 142    /* Compensate for varying guest execution speed.  */
 143    int64_t qemu_icount_bias;
 144    /* Only written by TCG thread */
 145    int64_t qemu_icount;
 146    /* for adjusting icount */
 147    int64_t vm_clock_warp_start;
 148    QEMUTimer *icount_rt_timer;
 149    QEMUTimer *icount_vm_timer;
 150    QEMUTimer *icount_warp_timer;
 151} TimersState;
 152
 153static TimersState timers_state;
 154bool mttcg_enabled;
 155
 156/*
 157 * We default to false if we know other options have been enabled
  158 * which are currently incompatible with MTTCG. Otherwise, once a
 159 * guest (target) has been updated to support:
 160 *   - atomic instructions
 161 *   - memory ordering primitives (barriers)
  162 * it can set the appropriate CONFIG flags in ${target}-softmmu.mak
 163 *
 164 * Once a guest architecture has been converted to the new primitives
 165 * there are two remaining limitations to check.
 166 *
 167 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
 168 * - The host must have a stronger memory order than the guest
 169 *
 170 * It may be possible in future to support strong guests on weak hosts
 171 * but that will require tagging all load/stores in a guest with their
 172 * implicit memory order requirements which would likely slow things
 173 * down a lot.
 174 */
 175
 176static bool check_tcg_memory_orders_compatible(void)
 177{
 178#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
 179    return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
 180#else
 181    return false;
 182#endif
 183}
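/* The bitwise test above passes when every ordering guarantee the guest
 * relies on (TCG_GUEST_DEFAULT_MO) is also provided by the host
 * (TCG_TARGET_DEFAULT_MO), i.e. the guest's requirements are a subset of
 * what the host offers.  For example, a strongly-ordered x86 guest on a
 * weakly-ordered Arm host fails the check and MTTCG stays disabled by
 * default. */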
 184
 185static bool default_mttcg_enabled(void)
 186{
 187    if (use_icount || TCG_OVERSIZED_GUEST) {
 188        return false;
 189    } else {
 190#ifdef TARGET_SUPPORTS_MTTCG
 191        return check_tcg_memory_orders_compatible();
 192#else
 193        return false;
 194#endif
 195    }
 196}
 197
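/* Parses the TCG accelerator options; the "thread" property typically comes
 * from the command line, e.g. "-accel tcg,thread=multi" or
 * "-accel tcg,thread=single". */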
 198void qemu_tcg_configure(QemuOpts *opts, Error **errp)
 199{
 200    const char *t = qemu_opt_get(opts, "thread");
 201    if (t) {
 202        if (strcmp(t, "multi") == 0) {
 203            if (TCG_OVERSIZED_GUEST) {
 204                error_setg(errp, "No MTTCG when guest word size > hosts");
 205            } else if (use_icount) {
 206                error_setg(errp, "No MTTCG when icount is enabled");
 207            } else {
 208#ifndef TARGET_SUPPORTS_MTTCG
 209                error_report("Guest not yet converted to MTTCG - "
 210                             "you may get unexpected results");
 211#endif
 212                if (!check_tcg_memory_orders_compatible()) {
 213                    error_report("Guest expects a stronger memory ordering "
 214                                 "than the host provides");
 215                    error_printf("This may cause strange/hard to debug errors\n");
 216                }
 217                mttcg_enabled = true;
 218            }
 219        } else if (strcmp(t, "single") == 0) {
 220            mttcg_enabled = false;
 221        } else {
 222            error_setg(errp, "Invalid 'thread' setting %s", t);
 223        }
 224    } else {
 225        mttcg_enabled = default_mttcg_enabled();
 226    }
 227}
 228
 229/* The current number of executed instructions is based on what we
 230 * originally budgeted minus the current state of the decrementing
 231 * icount counters in extra/u16.low.
 232 */
 233static int64_t cpu_get_icount_executed(CPUState *cpu)
 234{
 235    return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
 236}
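/* Example: with an icount_budget of 10000 instructions, if u16.low has
 * counted down to 100 and icount_extra still holds 2000, then
 * 10000 - (100 + 2000) = 7900 instructions have been executed so far. */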
 237
 238/*
 239 * Update the global shared timer_state.qemu_icount to take into
 240 * account executed instructions. This is done by the TCG vCPU
 241 * thread so the main-loop can see time has moved forward.
 242 */
 243void cpu_update_icount(CPUState *cpu)
 244{
 245    int64_t executed = cpu_get_icount_executed(cpu);
 246    cpu->icount_budget -= executed;
 247
 248#ifdef CONFIG_ATOMIC64
 249    atomic_set__nocheck(&timers_state.qemu_icount,
 250                        atomic_read__nocheck(&timers_state.qemu_icount) +
 251                        executed);
 252#else /* FIXME: we need 64bit atomics to do this safely */
 253    timers_state.qemu_icount += executed;
 254#endif
 255}
 256
 257int64_t cpu_get_icount_raw(void)
 258{
 259    CPUState *cpu = current_cpu;
 260
 261    if (cpu && cpu->running) {
 262        if (!cpu->can_do_io) {
 263            error_report("Bad icount read");
 264            exit(1);
 265        }
 266        /* Take into account what has run */
 267        cpu_update_icount(cpu);
 268    }
 269#ifdef CONFIG_ATOMIC64
 270    return atomic_read__nocheck(&timers_state.qemu_icount);
 271#else /* FIXME: we need 64bit atomics to do this safely */
 272    return timers_state.qemu_icount;
 273#endif
 274}
 275
 276/* Return the virtual CPU time, based on the instruction counter.  */
 277static int64_t cpu_get_icount_locked(void)
 278{
 279    int64_t icount = cpu_get_icount_raw();
 280    return timers_state.qemu_icount_bias + cpu_icount_to_ns(icount);
 281}
 282
 283int64_t cpu_get_icount(void)
 284{
 285    int64_t icount;
 286    unsigned start;
 287
 288    do {
 289        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 290        icount = cpu_get_icount_locked();
 291    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 292
 293    return icount;
 294}
 295
 296int64_t cpu_icount_to_ns(int64_t icount)
 297{
 298    return icount << icount_time_shift;
 299}
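/* Example: with icount_time_shift == 3 every instruction accounts for
 * 1 << 3 = 8 ns of virtual time, i.e. a nominal 125 MIPS guest (the
 * initial guess configure_icount() below uses for shift=auto). */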
 300
 301/* return the time elapsed in VM between vm_start and vm_stop.  Unless
 302 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 303 * counter.
 304 *
 305 * Caller must hold the BQL
 306 */
 307int64_t cpu_get_ticks(void)
 308{
 309    int64_t ticks;
 310
 311    if (use_icount) {
 312        return cpu_get_icount();
 313    }
 314
 315    ticks = timers_state.cpu_ticks_offset;
 316    if (timers_state.cpu_ticks_enabled) {
 317        ticks += cpu_get_host_ticks();
 318    }
 319
 320    if (timers_state.cpu_ticks_prev > ticks) {
  321        /* Note: non-increasing ticks may happen if the host uses
 322           software suspend */
 323        timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 324        ticks = timers_state.cpu_ticks_prev;
 325    }
 326
 327    timers_state.cpu_ticks_prev = ticks;
 328    return ticks;
 329}
 330
 331static int64_t cpu_get_clock_locked(void)
 332{
 333    int64_t time;
 334
 335    time = timers_state.cpu_clock_offset;
 336    if (timers_state.cpu_ticks_enabled) {
 337        time += get_clock();
 338    }
 339
 340    return time;
 341}
 342
 343/* Return the monotonic time elapsed in VM, i.e.,
 344 * the time between vm_start and vm_stop
 345 */
 346int64_t cpu_get_clock(void)
 347{
 348    int64_t ti;
 349    unsigned start;
 350
 351    do {
 352        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 353        ti = cpu_get_clock_locked();
 354    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 355
 356    return ti;
 357}
 358
 359/* enable cpu_get_ticks()
 360 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 361 */
 362void cpu_enable_ticks(void)
 363{
  364    /* Here, the real thing protected by the seqlock is cpu_clock_offset. */
 365    seqlock_write_begin(&timers_state.vm_clock_seqlock);
 366    if (!timers_state.cpu_ticks_enabled) {
 367        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 368        timers_state.cpu_clock_offset -= get_clock();
 369        timers_state.cpu_ticks_enabled = 1;
 370    }
 371    seqlock_write_end(&timers_state.vm_clock_seqlock);
 372}
 373
 374/* disable cpu_get_ticks() : the clock is stopped. You must not call
 375 * cpu_get_ticks() after that.
 376 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 377 */
 378void cpu_disable_ticks(void)
 379{
  380    /* Here, the real thing protected by the seqlock is cpu_clock_offset. */
 381    seqlock_write_begin(&timers_state.vm_clock_seqlock);
 382    if (timers_state.cpu_ticks_enabled) {
 383        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 384        timers_state.cpu_clock_offset = cpu_get_clock_locked();
 385        timers_state.cpu_ticks_enabled = 0;
 386    }
 387    seqlock_write_end(&timers_state.vm_clock_seqlock);
 388}
 389
 390/* Correlation between real and virtual time is always going to be
 391   fairly approximate, so ignore small variation.
 392   When the guest is idle real and virtual time will be aligned in
 393   the IO wait loop.  */
 394#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
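/* ICOUNT_WOBBLE is NANOSECONDS_PER_SECOND / 10, i.e. 100 ms of slack used
 * in the comparisons below before icount_time_shift is adjusted. */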
 395
 396static void icount_adjust(void)
 397{
 398    int64_t cur_time;
 399    int64_t cur_icount;
 400    int64_t delta;
 401
 402    /* Protected by TimersState mutex.  */
 403    static int64_t last_delta;
 404
 405    /* If the VM is not running, then do nothing.  */
 406    if (!runstate_is_running()) {
 407        return;
 408    }
 409
 410    seqlock_write_begin(&timers_state.vm_clock_seqlock);
 411    cur_time = cpu_get_clock_locked();
 412    cur_icount = cpu_get_icount_locked();
 413
 414    delta = cur_icount - cur_time;
 415    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 416    if (delta > 0
 417        && last_delta + ICOUNT_WOBBLE < delta * 2
 418        && icount_time_shift > 0) {
 419        /* The guest is getting too far ahead.  Slow time down.  */
 420        icount_time_shift--;
 421    }
 422    if (delta < 0
 423        && last_delta - ICOUNT_WOBBLE > delta * 2
 424        && icount_time_shift < MAX_ICOUNT_SHIFT) {
 425        /* The guest is getting too far behind.  Speed time up.  */
 426        icount_time_shift++;
 427    }
 428    last_delta = delta;
 429    timers_state.qemu_icount_bias = cur_icount
 430                              - (timers_state.qemu_icount << icount_time_shift);
 431    seqlock_write_end(&timers_state.vm_clock_seqlock);
 432}
 433
 434static void icount_adjust_rt(void *opaque)
 435{
 436    timer_mod(timers_state.icount_rt_timer,
 437              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 438    icount_adjust();
 439}
 440
 441static void icount_adjust_vm(void *opaque)
 442{
 443    timer_mod(timers_state.icount_vm_timer,
 444                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 445                   NANOSECONDS_PER_SECOND / 10);
 446    icount_adjust();
 447}
 448
 449static int64_t qemu_icount_round(int64_t count)
 450{
 451    return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
 452}
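/* Rounds a nanosecond count up to a whole number of instructions.
 * Example: with icount_time_shift == 3 (8 ns per insn),
 * qemu_icount_round(20) == (20 + 7) >> 3 == 3 instructions. */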
 453
 454static void icount_warp_rt(void)
 455{
 456    unsigned seq;
 457    int64_t warp_start;
 458
 459    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 460     * changes from -1 to another value, so the race here is okay.
 461     */
 462    do {
 463        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 464        warp_start = timers_state.vm_clock_warp_start;
 465    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 466
 467    if (warp_start == -1) {
 468        return;
 469    }
 470
 471    seqlock_write_begin(&timers_state.vm_clock_seqlock);
 472    if (runstate_is_running()) {
 473        int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
 474                                     cpu_get_clock_locked());
 475        int64_t warp_delta;
 476
 477        warp_delta = clock - timers_state.vm_clock_warp_start;
 478        if (use_icount == 2) {
 479            /*
 480             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 481             * far ahead of real time.
 482             */
 483            int64_t cur_icount = cpu_get_icount_locked();
 484            int64_t delta = clock - cur_icount;
 485            warp_delta = MIN(warp_delta, delta);
 486        }
 487        timers_state.qemu_icount_bias += warp_delta;
 488    }
 489    timers_state.vm_clock_warp_start = -1;
 490    seqlock_write_end(&timers_state.vm_clock_seqlock);
 491
 492    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 493        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 494    }
 495}
 496
 497static void icount_timer_cb(void *opaque)
 498{
 499    /* No need for a checkpoint because the timer already synchronizes
 500     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 501     */
 502    icount_warp_rt();
 503}
 504
 505void qtest_clock_warp(int64_t dest)
 506{
 507    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 508    AioContext *aio_context;
 509    assert(qtest_enabled());
 510    aio_context = qemu_get_aio_context();
 511    while (clock < dest) {
 512        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 513        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 514
 515        seqlock_write_begin(&timers_state.vm_clock_seqlock);
 516        timers_state.qemu_icount_bias += warp;
 517        seqlock_write_end(&timers_state.vm_clock_seqlock);
 518
 519        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 520        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 521        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 522    }
 523    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 524}
 525
 526void qemu_start_warp_timer(void)
 527{
 528    int64_t clock;
 529    int64_t deadline;
 530
 531    if (!use_icount) {
 532        return;
 533    }
 534
 535    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 536     * do not fire, so computing the deadline does not make sense.
 537     */
 538    if (!runstate_is_running()) {
 539        return;
 540    }
 541
 542    /* warp clock deterministically in record/replay mode */
 543    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
 544        return;
 545    }
 546
 547    if (!all_cpu_threads_idle()) {
 548        return;
 549    }
 550
 551    if (qtest_enabled()) {
 552        /* When testing, qtest commands advance icount.  */
 553        return;
 554    }
 555
 556    /* We want to use the earliest deadline from ALL vm_clocks */
 557    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 558    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 559    if (deadline < 0) {
 560        static bool notified;
 561        if (!icount_sleep && !notified) {
 562            warn_report("icount sleep disabled and no active timers");
 563            notified = true;
 564        }
 565        return;
 566    }
 567
 568    if (deadline > 0) {
 569        /*
 570         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 571         * sleep.  Otherwise, the CPU might be waiting for a future timer
 572         * interrupt to wake it up, but the interrupt never comes because
 573         * the vCPU isn't running any insns and thus doesn't advance the
 574         * QEMU_CLOCK_VIRTUAL.
 575         */
 576        if (!icount_sleep) {
 577            /*
  578             * We never let VCPUs sleep in no-sleep icount mode.
 579             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 580             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 581             * It is useful when we want a deterministic execution time,
 582             * isolated from host latencies.
 583             */
 584            seqlock_write_begin(&timers_state.vm_clock_seqlock);
 585            timers_state.qemu_icount_bias += deadline;
 586            seqlock_write_end(&timers_state.vm_clock_seqlock);
 587            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 588        } else {
 589            /*
 590             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
  591             * "real" time (related to the time left until the next event) has
 592             * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
  593             * This prevents the warps from being visible externally; for example,
 594             * you will not be sending network packets continuously instead of
 595             * every 100ms.
 596             */
 597            seqlock_write_begin(&timers_state.vm_clock_seqlock);
 598            if (timers_state.vm_clock_warp_start == -1
 599                || timers_state.vm_clock_warp_start > clock) {
 600                timers_state.vm_clock_warp_start = clock;
 601            }
 602            seqlock_write_end(&timers_state.vm_clock_seqlock);
 603            timer_mod_anticipate(timers_state.icount_warp_timer,
 604                                 clock + deadline);
 605        }
 606    } else if (deadline == 0) {
 607        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 608    }
 609}
 610
 611static void qemu_account_warp_timer(void)
 612{
 613    if (!use_icount || !icount_sleep) {
 614        return;
 615    }
 616
 617    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 618     * do not fire, so computing the deadline does not make sense.
 619     */
 620    if (!runstate_is_running()) {
 621        return;
 622    }
 623
 624    /* warp clock deterministically in record/replay mode */
 625    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 626        return;
 627    }
 628
 629    timer_del(timers_state.icount_warp_timer);
 630    icount_warp_rt();
 631}
 632
 633static bool icount_state_needed(void *opaque)
 634{
 635    return use_icount;
 636}
 637
 638static bool warp_timer_state_needed(void *opaque)
 639{
 640    TimersState *s = opaque;
 641    return s->icount_warp_timer != NULL;
 642}
 643
 644static bool adjust_timers_state_needed(void *opaque)
 645{
 646    TimersState *s = opaque;
 647    return s->icount_rt_timer != NULL;
 648}
 649
 650/*
  651 * Subsection for warp timer migration is optional, because the timer may not be created
 652 */
 653static const VMStateDescription icount_vmstate_warp_timer = {
 654    .name = "timer/icount/warp_timer",
 655    .version_id = 1,
 656    .minimum_version_id = 1,
 657    .needed = warp_timer_state_needed,
 658    .fields = (VMStateField[]) {
 659        VMSTATE_INT64(vm_clock_warp_start, TimersState),
 660        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
 661        VMSTATE_END_OF_LIST()
 662    }
 663};
 664
 665static const VMStateDescription icount_vmstate_adjust_timers = {
 666    .name = "timer/icount/timers",
 667    .version_id = 1,
 668    .minimum_version_id = 1,
 669    .needed = adjust_timers_state_needed,
 670    .fields = (VMStateField[]) {
 671        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
 672        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
 673        VMSTATE_END_OF_LIST()
 674    }
 675};
 676
 677/*
 678 * This is a subsection for icount migration.
 679 */
 680static const VMStateDescription icount_vmstate_timers = {
 681    .name = "timer/icount",
 682    .version_id = 1,
 683    .minimum_version_id = 1,
 684    .needed = icount_state_needed,
 685    .fields = (VMStateField[]) {
 686        VMSTATE_INT64(qemu_icount_bias, TimersState),
 687        VMSTATE_INT64(qemu_icount, TimersState),
 688        VMSTATE_END_OF_LIST()
 689    },
 690    .subsections = (const VMStateDescription*[]) {
 691        &icount_vmstate_warp_timer,
 692        &icount_vmstate_adjust_timers,
 693        NULL
 694    }
 695};
 696
 697static const VMStateDescription vmstate_timers = {
 698    .name = "timer",
 699    .version_id = 2,
 700    .minimum_version_id = 1,
 701    .fields = (VMStateField[]) {
 702        VMSTATE_INT64(cpu_ticks_offset, TimersState),
 703        VMSTATE_INT64(dummy, TimersState),
 704        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
 705        VMSTATE_END_OF_LIST()
 706    },
 707    .subsections = (const VMStateDescription*[]) {
 708        &icount_vmstate_timers,
 709        NULL
 710    }
 711};
 712
 713static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 714{
 715    double pct;
 716    double throttle_ratio;
 717    long sleeptime_ns;
 718
 719    if (!cpu_throttle_get_percentage()) {
 720        return;
 721    }
 722
 723    pct = (double)cpu_throttle_get_percentage()/100;
 724    throttle_ratio = pct / (1 - pct);
 725    sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
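    /* Example: at a 50% throttle pct = 0.5 and throttle_ratio = 1, so the
     * vCPU sleeps one 10 ms timeslice for every timeslice it runs; at the
     * 99% maximum it sleeps 99 timeslices (990 ms) per timeslice run. */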
 726
 727    qemu_mutex_unlock_iothread();
 728    g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
 729    qemu_mutex_lock_iothread();
 730    atomic_set(&cpu->throttle_thread_scheduled, 0);
 731}
 732
 733static void cpu_throttle_timer_tick(void *opaque)
 734{
 735    CPUState *cpu;
 736    double pct;
 737
 738    /* Stop the timer if needed */
 739    if (!cpu_throttle_get_percentage()) {
 740        return;
 741    }
 742    CPU_FOREACH(cpu) {
 743        if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 744            async_run_on_cpu(cpu, cpu_throttle_thread,
 745                             RUN_ON_CPU_NULL);
 746        }
 747    }
 748
 749    pct = (double)cpu_throttle_get_percentage()/100;
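    /* Re-arm the tick with period CPU_THROTTLE_TIMESLICE_NS / (1 - pct):
     * 20 ms at a 50% throttle, 1 s at the 99% maximum. */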
 750    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 751                                   CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 752}
 753
 754void cpu_throttle_set(int new_throttle_pct)
 755{
 756    /* Ensure throttle percentage is within valid range */
 757    new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 758    new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 759
 760    atomic_set(&throttle_percentage, new_throttle_pct);
 761
 762    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 763                                       CPU_THROTTLE_TIMESLICE_NS);
 764}
 765
 766void cpu_throttle_stop(void)
 767{
 768    atomic_set(&throttle_percentage, 0);
 769}
 770
 771bool cpu_throttle_active(void)
 772{
 773    return (cpu_throttle_get_percentage() != 0);
 774}
 775
 776int cpu_throttle_get_percentage(void)
 777{
 778    return atomic_read(&throttle_percentage);
 779}
 780
 781void cpu_ticks_init(void)
 782{
 783    seqlock_init(&timers_state.vm_clock_seqlock);
 784    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 785    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 786                                           cpu_throttle_timer_tick, NULL);
 787}
 788
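/* Parses the "-icount" option, e.g. "-icount shift=auto" for an adaptive
 * rate or "-icount shift=3" for a fixed 8 ns per instruction. */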
 789void configure_icount(QemuOpts *opts, Error **errp)
 790{
 791    const char *option;
 792    char *rem_str = NULL;
 793
 794    option = qemu_opt_get(opts, "shift");
 795    if (!option) {
 796        if (qemu_opt_get(opts, "align") != NULL) {
 797            error_setg(errp, "Please specify shift option when using align");
 798        }
 799        return;
 800    }
 801
 802    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 803    if (icount_sleep) {
 804        timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 805                                         icount_timer_cb, NULL);
 806    }
 807
 808    icount_align_option = qemu_opt_get_bool(opts, "align", false);
 809
 810    if (icount_align_option && !icount_sleep) {
 811        error_setg(errp, "align=on and sleep=off are incompatible");
 812    }
 813    if (strcmp(option, "auto") != 0) {
 814        errno = 0;
 815        icount_time_shift = strtol(option, &rem_str, 0);
 816        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 817            error_setg(errp, "icount: Invalid shift value");
 818        }
 819        use_icount = 1;
 820        return;
 821    } else if (icount_align_option) {
 822        error_setg(errp, "shift=auto and align=on are incompatible");
 823    } else if (!icount_sleep) {
 824        error_setg(errp, "shift=auto and sleep=off are incompatible");
 825    }
 826
 827    use_icount = 2;
 828
 829    /* 125MIPS seems a reasonable initial guess at the guest speed.
 830       It will be corrected fairly quickly anyway.  */
 831    icount_time_shift = 3;
 832
 833    /* Have both realtime and virtual time triggers for speed adjustment.
 834       The realtime trigger catches emulated time passing too slowly,
 835       the virtual time trigger catches emulated time passing too fast.
 836       Realtime triggers occur even when idle, so use them less frequently
 837       than VM triggers.  */
 838    timers_state.vm_clock_warp_start = -1;
 839    timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 840                                   icount_adjust_rt, NULL);
 841    timer_mod(timers_state.icount_rt_timer,
 842                   qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 843    timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 844                                        icount_adjust_vm, NULL);
 845    timer_mod(timers_state.icount_vm_timer,
 846                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 847                   NANOSECONDS_PER_SECOND / 10);
 848}
 849
 850/***********************************************************/
 851/* TCG vCPU kick timer
 852 *
 853 * The kick timer is responsible for moving single threaded vCPU
  854 * emulation on to the next vCPU. If more than one vCPU is running, a
  855 * timer event will force a cpu->exit so the next vCPU can get
 856 * scheduled.
 857 *
  858 * The timer is removed while all vCPUs are idle and restarted again once
  859 * a vCPU has work to do.
 860 */
 861
 862static QEMUTimer *tcg_kick_vcpu_timer;
 863static CPUState *tcg_current_rr_cpu;
 864
 865#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
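/* i.e. the round-robin scheduler kicks the running vCPU every 100 ms of
 * virtual time so other vCPUs get a turn. */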
 866
 867static inline int64_t qemu_tcg_next_kick(void)
 868{
 869    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 870}
 871
 872/* Kick the currently round-robin scheduled vCPU */
 873static void qemu_cpu_kick_rr_cpu(void)
 874{
 875    CPUState *cpu;
 876    do {
 877        cpu = atomic_mb_read(&tcg_current_rr_cpu);
 878        if (cpu) {
 879            cpu_exit(cpu);
 880        }
 881    } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 882}
 883
 884static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
 885{
 886}
 887
 888void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 889{
 890    if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 891        qemu_notify_event();
 892        return;
 893    }
 894
 895    if (qemu_in_vcpu_thread()) {
 896        /* A CPU is currently running; kick it back out to the
 897         * tcg_cpu_exec() loop so it will recalculate its
 898         * icount deadline immediately.
 899         */
 900        qemu_cpu_kick(current_cpu);
 901    } else if (first_cpu) {
 902        /* qemu_cpu_kick is not enough to kick a halted CPU out of
 903         * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 904         * causes cpu_thread_is_idle to return false.  This way,
 905         * handle_icount_deadline can run.
 906         * If we have no CPUs at all for some reason, we don't
 907         * need to do anything.
 908         */
 909        async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
 910    }
 911}
 912
 913static void kick_tcg_thread(void *opaque)
 914{
 915    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 916    qemu_cpu_kick_rr_cpu();
 917}
 918
 919static void start_tcg_kick_timer(void)
 920{
 921    assert(!mttcg_enabled);
 922    if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 923        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 924                                           kick_tcg_thread, NULL);
 925        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 926    }
 927}
 928
 929static void stop_tcg_kick_timer(void)
 930{
 931    assert(!mttcg_enabled);
 932    if (tcg_kick_vcpu_timer) {
 933        timer_del(tcg_kick_vcpu_timer);
 934        tcg_kick_vcpu_timer = NULL;
 935    }
 936}
 937
 938/***********************************************************/
 939void hw_error(const char *fmt, ...)
 940{
 941    va_list ap;
 942    CPUState *cpu;
 943
 944    va_start(ap, fmt);
 945    fprintf(stderr, "qemu: hardware error: ");
 946    vfprintf(stderr, fmt, ap);
 947    fprintf(stderr, "\n");
 948    CPU_FOREACH(cpu) {
 949        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
 950        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
 951    }
 952    va_end(ap);
 953    abort();
 954}
 955
 956void cpu_synchronize_all_states(void)
 957{
 958    CPUState *cpu;
 959
 960    CPU_FOREACH(cpu) {
 961        cpu_synchronize_state(cpu);
 962        /* TODO: move to cpu_synchronize_state() */
 963        if (hvf_enabled()) {
 964            hvf_cpu_synchronize_state(cpu);
 965        }
 966    }
 967}
 968
 969void cpu_synchronize_all_post_reset(void)
 970{
 971    CPUState *cpu;
 972
 973    CPU_FOREACH(cpu) {
 974        cpu_synchronize_post_reset(cpu);
 975        /* TODO: move to cpu_synchronize_post_reset() */
 976        if (hvf_enabled()) {
 977            hvf_cpu_synchronize_post_reset(cpu);
 978        }
 979    }
 980}
 981
 982void cpu_synchronize_all_post_init(void)
 983{
 984    CPUState *cpu;
 985
 986    CPU_FOREACH(cpu) {
 987        cpu_synchronize_post_init(cpu);
 988        /* TODO: move to cpu_synchronize_post_init() */
 989        if (hvf_enabled()) {
 990            hvf_cpu_synchronize_post_init(cpu);
 991        }
 992    }
 993}
 994
 995void cpu_synchronize_all_pre_loadvm(void)
 996{
 997    CPUState *cpu;
 998
 999    CPU_FOREACH(cpu) {
1000        cpu_synchronize_pre_loadvm(cpu);
1001    }
1002}
1003
1004static int do_vm_stop(RunState state, bool send_stop)
1005{
1006    int ret = 0;
1007
1008    if (runstate_is_running()) {
1009        cpu_disable_ticks();
1010        pause_all_vcpus();
1011        runstate_set(state);
1012        vm_state_notify(0, state);
1013        if (send_stop) {
1014            qapi_event_send_stop(&error_abort);
1015        }
1016    }
1017
1018    bdrv_drain_all();
1019    replay_disable_events();
1020    ret = bdrv_flush_all();
1021
1022    return ret;
1023}
1024
1025/* Special vm_stop() variant for terminating the process.  Historically clients
1026 * did not expect a QMP STOP event and so we need to retain compatibility.
1027 */
1028int vm_shutdown(void)
1029{
1030    return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1031}
1032
1033static bool cpu_can_run(CPUState *cpu)
1034{
1035    if (cpu->stop) {
1036        return false;
1037    }
1038    if (cpu_is_stopped(cpu)) {
1039        return false;
1040    }
1041    return true;
1042}
1043
1044static void cpu_handle_guest_debug(CPUState *cpu)
1045{
1046    gdb_set_stop_cpu(cpu);
1047    qemu_system_debug_request();
1048    cpu->stopped = true;
1049}
1050
1051#ifdef CONFIG_LINUX
1052static void sigbus_reraise(void)
1053{
1054    sigset_t set;
1055    struct sigaction action;
1056
1057    memset(&action, 0, sizeof(action));
1058    action.sa_handler = SIG_DFL;
1059    if (!sigaction(SIGBUS, &action, NULL)) {
1060        raise(SIGBUS);
1061        sigemptyset(&set);
1062        sigaddset(&set, SIGBUS);
1063        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1064    }
1065    perror("Failed to re-raise SIGBUS!\n");
1066    abort();
1067}
1068
1069static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1070{
1071    if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1072        sigbus_reraise();
1073    }
1074
1075    if (current_cpu) {
1076        /* Called asynchronously in VCPU thread.  */
1077        if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1078            sigbus_reraise();
1079        }
1080    } else {
1081        /* Called synchronously (via signalfd) in main thread.  */
1082        if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1083            sigbus_reraise();
1084        }
1085    }
1086}
1087
1088static void qemu_init_sigbus(void)
1089{
1090    struct sigaction action;
1091
1092    memset(&action, 0, sizeof(action));
1093    action.sa_flags = SA_SIGINFO;
1094    action.sa_sigaction = sigbus_handler;
1095    sigaction(SIGBUS, &action, NULL);
1096
1097    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1098}
1099#else /* !CONFIG_LINUX */
1100static void qemu_init_sigbus(void)
1101{
1102}
1103#endif /* !CONFIG_LINUX */
1104
1105static QemuMutex qemu_global_mutex;
1106
1107static QemuThread io_thread;
1108
1109/* cpu creation */
1110static QemuCond qemu_cpu_cond;
1111/* system init */
1112static QemuCond qemu_pause_cond;
1113
1114void qemu_init_cpu_loop(void)
1115{
1116    qemu_init_sigbus();
1117    qemu_cond_init(&qemu_cpu_cond);
1118    qemu_cond_init(&qemu_pause_cond);
1119    qemu_mutex_init(&qemu_global_mutex);
1120
1121    qemu_thread_get_self(&io_thread);
1122}
1123
1124void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1125{
1126    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1127}
1128
1129static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1130{
1131    if (kvm_destroy_vcpu(cpu) < 0) {
1132        error_report("kvm_destroy_vcpu failed");
1133        exit(EXIT_FAILURE);
1134    }
1135}
1136
1137static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1138{
1139}
1140
1141static void qemu_cpu_stop(CPUState *cpu, bool exit)
1142{
1143    g_assert(qemu_cpu_is_self(cpu));
1144    cpu->stop = false;
1145    cpu->stopped = true;
1146    if (exit) {
1147        cpu_exit(cpu);
1148    }
1149    qemu_cond_broadcast(&qemu_pause_cond);
1150}
1151
1152static void qemu_wait_io_event_common(CPUState *cpu)
1153{
1154    atomic_mb_set(&cpu->thread_kicked, false);
1155    if (cpu->stop) {
1156        qemu_cpu_stop(cpu, false);
1157    }
1158    process_queued_cpu_work(cpu);
1159}
1160
1161static void qemu_tcg_rr_wait_io_event(CPUState *cpu)
1162{
1163    while (all_cpu_threads_idle()) {
1164        stop_tcg_kick_timer();
1165        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1166    }
1167
1168    start_tcg_kick_timer();
1169
1170    qemu_wait_io_event_common(cpu);
1171}
1172
1173static void qemu_wait_io_event(CPUState *cpu)
1174{
1175    while (cpu_thread_is_idle(cpu)) {
1176        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1177    }
1178
1179#ifdef _WIN32
1180    /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1181    if (!tcg_enabled()) {
1182        SleepEx(0, TRUE);
1183    }
1184#endif
1185    qemu_wait_io_event_common(cpu);
1186}
1187
1188static void *qemu_kvm_cpu_thread_fn(void *arg)
1189{
1190    CPUState *cpu = arg;
1191    int r;
1192
1193    rcu_register_thread();
1194
1195    qemu_mutex_lock_iothread();
1196    qemu_thread_get_self(cpu->thread);
1197    cpu->thread_id = qemu_get_thread_id();
1198    cpu->can_do_io = 1;
1199    current_cpu = cpu;
1200
1201    r = kvm_init_vcpu(cpu);
1202    if (r < 0) {
1203        error_report("kvm_init_vcpu failed: %s", strerror(-r));
1204        exit(1);
1205    }
1206
1207    kvm_init_cpu_signals(cpu);
1208
1209    /* signal CPU creation */
1210    cpu->created = true;
1211    qemu_cond_signal(&qemu_cpu_cond);
1212
1213    do {
1214        if (cpu_can_run(cpu)) {
1215            r = kvm_cpu_exec(cpu);
1216            if (r == EXCP_DEBUG) {
1217                cpu_handle_guest_debug(cpu);
1218            }
1219        }
1220        qemu_wait_io_event(cpu);
1221    } while (!cpu->unplug || cpu_can_run(cpu));
1222
1223    qemu_kvm_destroy_vcpu(cpu);
1224    cpu->created = false;
1225    qemu_cond_signal(&qemu_cpu_cond);
1226    qemu_mutex_unlock_iothread();
1227    rcu_unregister_thread();
1228    return NULL;
1229}
1230
1231static void *qemu_dummy_cpu_thread_fn(void *arg)
1232{
1233#ifdef _WIN32
1234    error_report("qtest is not supported under Windows");
1235    exit(1);
1236#else
1237    CPUState *cpu = arg;
1238    sigset_t waitset;
1239    int r;
1240
1241    rcu_register_thread();
1242
1243    qemu_mutex_lock_iothread();
1244    qemu_thread_get_self(cpu->thread);
1245    cpu->thread_id = qemu_get_thread_id();
1246    cpu->can_do_io = 1;
1247    current_cpu = cpu;
1248
1249    sigemptyset(&waitset);
1250    sigaddset(&waitset, SIG_IPI);
1251
1252    /* signal CPU creation */
1253    cpu->created = true;
1254    qemu_cond_signal(&qemu_cpu_cond);
1255
1256    do {
1257        qemu_mutex_unlock_iothread();
1258        do {
1259            int sig;
1260            r = sigwait(&waitset, &sig);
1261        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1262        if (r == -1) {
1263            perror("sigwait");
1264            exit(1);
1265        }
1266        qemu_mutex_lock_iothread();
1267        qemu_wait_io_event(cpu);
1268    } while (!cpu->unplug);
1269
1270    rcu_unregister_thread();
1271    return NULL;
1272#endif
1273}
1274
1275static int64_t tcg_get_icount_limit(void)
1276{
1277    int64_t deadline;
1278
1279    if (replay_mode != REPLAY_MODE_PLAY) {
1280        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1281
1282        /* Maintain prior (possibly buggy) behaviour where if no deadline
1283         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1284         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1285         * nanoseconds.
1286         */
1287        if ((deadline < 0) || (deadline > INT32_MAX)) {
1288            deadline = INT32_MAX;
1289        }
1290
1291        return qemu_icount_round(deadline);
1292    } else {
1293        return replay_get_instructions();
1294    }
1295}
1296
1297static void handle_icount_deadline(void)
1298{
1299    assert(qemu_in_vcpu_thread());
1300    if (use_icount) {
1301        int64_t deadline =
1302            qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1303
1304        if (deadline == 0) {
1305            /* Wake up other AioContexts.  */
1306            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1307            qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1308        }
1309    }
1310}
1311
1312static void prepare_icount_for_run(CPUState *cpu)
1313{
1314    if (use_icount) {
1315        int insns_left;
1316
1317        /* These should always be cleared by process_icount_data after
1318         * each vCPU execution. However u16.high can be raised
1319         * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1320         */
1321        g_assert(cpu->icount_decr.u16.low == 0);
1322        g_assert(cpu->icount_extra == 0);
1323
1324        cpu->icount_budget = tcg_get_icount_limit();
1325        insns_left = MIN(0xffff, cpu->icount_budget);
1326        cpu->icount_decr.u16.low = insns_left;
1327        cpu->icount_extra = cpu->icount_budget - insns_left;
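        /* e.g. a budget of 100000 insns is split into u16.low = 65535 (the
         * most the 16-bit decrementer can hold) and icount_extra = 34465. */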
1328
1329        replay_mutex_lock();
1330    }
1331}
1332
1333static void process_icount_data(CPUState *cpu)
1334{
1335    if (use_icount) {
1336        /* Account for executed instructions */
1337        cpu_update_icount(cpu);
1338
1339        /* Reset the counters */
1340        cpu->icount_decr.u16.low = 0;
1341        cpu->icount_extra = 0;
1342        cpu->icount_budget = 0;
1343
1344        replay_account_executed_instructions();
1345
1346        replay_mutex_unlock();
1347    }
1348}
1349
1350
1351static int tcg_cpu_exec(CPUState *cpu)
1352{
1353    int ret;
1354#ifdef CONFIG_PROFILER
1355    int64_t ti;
1356#endif
1357
1358    assert(tcg_enabled());
1359#ifdef CONFIG_PROFILER
1360    ti = profile_getclock();
1361#endif
1362    cpu_exec_start(cpu);
1363    ret = cpu_exec(cpu);
1364    cpu_exec_end(cpu);
1365#ifdef CONFIG_PROFILER
1366    tcg_time += profile_getclock() - ti;
1367#endif
1368    return ret;
1369}
1370
1371/* Destroy any remaining vCPUs which have been unplugged and have
1372 * finished running
1373 */
1374static void deal_with_unplugged_cpus(void)
1375{
1376    CPUState *cpu;
1377
1378    CPU_FOREACH(cpu) {
1379        if (cpu->unplug && !cpu_can_run(cpu)) {
1380            qemu_tcg_destroy_vcpu(cpu);
1381            cpu->created = false;
1382            qemu_cond_signal(&qemu_cpu_cond);
1383            break;
1384        }
1385    }
1386}
1387
1388/* Single-threaded TCG
1389 *
1390 * In the single-threaded case each vCPU is simulated in turn. If
1391 * there is more than a single vCPU we create a simple timer to kick
1392 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1393 * This is done explicitly rather than relying on side-effects
1394 * elsewhere.
1395 */
1396
1397static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1398{
1399    CPUState *cpu = arg;
1400
1401    assert(tcg_enabled());
1402    rcu_register_thread();
1403    tcg_register_thread();
1404
1405    qemu_mutex_lock_iothread();
1406    qemu_thread_get_self(cpu->thread);
1407
1408    cpu->thread_id = qemu_get_thread_id();
1409    cpu->created = true;
1410    cpu->can_do_io = 1;
1411    qemu_cond_signal(&qemu_cpu_cond);
1412
1413    /* wait for initial kick-off after machine start */
1414    while (first_cpu->stopped) {
1415        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1416
1417        /* process any pending work */
1418        CPU_FOREACH(cpu) {
1419            current_cpu = cpu;
1420            qemu_wait_io_event_common(cpu);
1421        }
1422    }
1423
1424    start_tcg_kick_timer();
1425
1426    cpu = first_cpu;
1427
1428    /* process any pending work */
1429    cpu->exit_request = 1;
1430
1431    while (1) {
1432        qemu_mutex_unlock_iothread();
1433        replay_mutex_lock();
1434        qemu_mutex_lock_iothread();
1435        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1436        qemu_account_warp_timer();
1437
1438        /* Run the timers here.  This is much more efficient than
1439         * waking up the I/O thread and waiting for completion.
1440         */
1441        handle_icount_deadline();
1442
1443        replay_mutex_unlock();
1444
1445        if (!cpu) {
1446            cpu = first_cpu;
1447        }
1448
1449        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1450
1451            atomic_mb_set(&tcg_current_rr_cpu, cpu);
1452            current_cpu = cpu;
1453
1454            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1455                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1456
1457            if (cpu_can_run(cpu)) {
1458                int r;
1459
1460                qemu_mutex_unlock_iothread();
1461                prepare_icount_for_run(cpu);
1462
1463                r = tcg_cpu_exec(cpu);
1464
1465                process_icount_data(cpu);
1466                qemu_mutex_lock_iothread();
1467
1468                if (r == EXCP_DEBUG) {
1469                    cpu_handle_guest_debug(cpu);
1470                    break;
1471                } else if (r == EXCP_ATOMIC) {
1472                    qemu_mutex_unlock_iothread();
1473                    cpu_exec_step_atomic(cpu);
1474                    qemu_mutex_lock_iothread();
1475                    break;
1476                }
1477            } else if (cpu->stop) {
1478                if (cpu->unplug) {
1479                    cpu = CPU_NEXT(cpu);
1480                }
1481                break;
1482            }
1483
1484            cpu = CPU_NEXT(cpu);
1485        } /* while (cpu && !cpu->exit_request).. */
1486
1487        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1488        atomic_set(&tcg_current_rr_cpu, NULL);
1489
1490        if (cpu && cpu->exit_request) {
1491            atomic_mb_set(&cpu->exit_request, 0);
1492        }
1493
1494        qemu_tcg_rr_wait_io_event(cpu ? cpu : QTAILQ_FIRST(&cpus));
1495        deal_with_unplugged_cpus();
1496    }
1497
1498    rcu_unregister_thread();
1499    return NULL;
1500}
1501
1502static void *qemu_hax_cpu_thread_fn(void *arg)
1503{
1504    CPUState *cpu = arg;
1505    int r;
1506
1507    rcu_register_thread();
1508    qemu_mutex_lock_iothread();
1509    qemu_thread_get_self(cpu->thread);
1510
1511    cpu->thread_id = qemu_get_thread_id();
1512    cpu->created = true;
1513    cpu->halted = 0;
1514    current_cpu = cpu;
1515
1516    hax_init_vcpu(cpu);
1517    qemu_cond_signal(&qemu_cpu_cond);
1518
1519    do {
1520        if (cpu_can_run(cpu)) {
1521            r = hax_smp_cpu_exec(cpu);
1522            if (r == EXCP_DEBUG) {
1523                cpu_handle_guest_debug(cpu);
1524            }
1525        }
1526
1527        qemu_wait_io_event(cpu);
1528    } while (!cpu->unplug || cpu_can_run(cpu));
1529    rcu_unregister_thread();
1530    return NULL;
1531}
1532
1533/* The HVF-specific vCPU thread function. This one should only run when the host
1534 * CPU supports the VMX "unrestricted guest" feature. */
1535static void *qemu_hvf_cpu_thread_fn(void *arg)
1536{
1537    CPUState *cpu = arg;
1538
1539    int r;
1540
1541    assert(hvf_enabled());
1542
1543    rcu_register_thread();
1544
1545    qemu_mutex_lock_iothread();
1546    qemu_thread_get_self(cpu->thread);
1547
1548    cpu->thread_id = qemu_get_thread_id();
1549    cpu->can_do_io = 1;
1550    current_cpu = cpu;
1551
1552    hvf_init_vcpu(cpu);
1553
1554    /* signal CPU creation */
1555    cpu->created = true;
1556    qemu_cond_signal(&qemu_cpu_cond);
1557
1558    do {
1559        if (cpu_can_run(cpu)) {
1560            r = hvf_vcpu_exec(cpu);
1561            if (r == EXCP_DEBUG) {
1562                cpu_handle_guest_debug(cpu);
1563            }
1564        }
1565        qemu_wait_io_event(cpu);
1566    } while (!cpu->unplug || cpu_can_run(cpu));
1567
1568    hvf_vcpu_destroy(cpu);
1569    cpu->created = false;
1570    qemu_cond_signal(&qemu_cpu_cond);
1571    qemu_mutex_unlock_iothread();
1572    rcu_unregister_thread();
1573    return NULL;
1574}
1575
1576static void *qemu_whpx_cpu_thread_fn(void *arg)
1577{
1578    CPUState *cpu = arg;
1579    int r;
1580
1581    rcu_register_thread();
1582
1583    qemu_mutex_lock_iothread();
1584    qemu_thread_get_self(cpu->thread);
1585    cpu->thread_id = qemu_get_thread_id();
1586    current_cpu = cpu;
1587
1588    r = whpx_init_vcpu(cpu);
1589    if (r < 0) {
1590        fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1591        exit(1);
1592    }
1593
1594    /* signal CPU creation */
1595    cpu->created = true;
1596    qemu_cond_signal(&qemu_cpu_cond);
1597
1598    do {
1599        if (cpu_can_run(cpu)) {
1600            r = whpx_vcpu_exec(cpu);
1601            if (r == EXCP_DEBUG) {
1602                cpu_handle_guest_debug(cpu);
1603            }
1604        }
1605        while (cpu_thread_is_idle(cpu)) {
1606            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1607        }
1608        qemu_wait_io_event_common(cpu);
1609    } while (!cpu->unplug || cpu_can_run(cpu));
1610
1611    whpx_destroy_vcpu(cpu);
1612    cpu->created = false;
1613    qemu_cond_signal(&qemu_cpu_cond);
1614    qemu_mutex_unlock_iothread();
1615    rcu_unregister_thread();
1616    return NULL;
1617}
1618
1619#ifdef _WIN32
1620static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1621{
1622}
1623#endif
1624
1625/* Multi-threaded TCG
1626 *
1627 * In the multi-threaded case each vCPU has its own thread. The TLS
1628 * variable current_cpu can be used deep in the code to find the
1629 * current CPUState for a given thread.
1630 */
1631
1632static void *qemu_tcg_cpu_thread_fn(void *arg)
1633{
1634    CPUState *cpu = arg;
1635
1636    assert(tcg_enabled());
1637    g_assert(!use_icount);
1638
1639    rcu_register_thread();
1640    tcg_register_thread();
1641
1642    qemu_mutex_lock_iothread();
1643    qemu_thread_get_self(cpu->thread);
1644
1645    cpu->thread_id = qemu_get_thread_id();
1646    cpu->created = true;
1647    cpu->can_do_io = 1;
1648    current_cpu = cpu;
1649    qemu_cond_signal(&qemu_cpu_cond);
1650
1651    /* process any pending work */
1652    cpu->exit_request = 1;
1653
1654    do {
1655        if (cpu_can_run(cpu)) {
1656            int r;
1657            qemu_mutex_unlock_iothread();
1658            r = tcg_cpu_exec(cpu);
1659            qemu_mutex_lock_iothread();
1660            switch (r) {
1661            case EXCP_DEBUG:
1662                cpu_handle_guest_debug(cpu);
1663                break;
1664            case EXCP_HALTED:
1665                /* during start-up the vCPU is reset and the thread is
1666                 * kicked several times. If we don't ensure we go back
1667                 * to sleep in the halted state we won't cleanly
 1668                 * to sleep in the halted state, we won't cleanly
 1669                 * start up when the vCPU is enabled.
1670                 * cpu->halted should ensure we sleep in wait_io_event
1671                 */
1672                g_assert(cpu->halted);
1673                break;
1674            case EXCP_ATOMIC:
1675                qemu_mutex_unlock_iothread();
1676                cpu_exec_step_atomic(cpu);
1677                qemu_mutex_lock_iothread();
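                /* fall through to the default case */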
1678            default:
1679                /* Ignore everything else? */
1680                break;
1681            }
1682        }
1683
1684        atomic_mb_set(&cpu->exit_request, 0);
1685        qemu_wait_io_event(cpu);
1686    } while (!cpu->unplug || cpu_can_run(cpu));
1687
1688    qemu_tcg_destroy_vcpu(cpu);
1689    cpu->created = false;
1690    qemu_cond_signal(&qemu_cpu_cond);
1691    qemu_mutex_unlock_iothread();
1692    rcu_unregister_thread();
1693    return NULL;
1694}
1695
1696static void qemu_cpu_kick_thread(CPUState *cpu)
1697{
1698#ifndef _WIN32
1699    int err;
1700
1701    if (cpu->thread_kicked) {
1702        return;
1703    }
1704    cpu->thread_kicked = true;
1705    err = pthread_kill(cpu->thread->thread, SIG_IPI);
1706    if (err) {
1707        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1708        exit(1);
1709    }
1710#else /* _WIN32 */
1711    if (!qemu_cpu_is_self(cpu)) {
1712        if (whpx_enabled()) {
1713            whpx_vcpu_kick(cpu);
1714        } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1715            fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1716                    __func__, GetLastError());
1717            exit(1);
1718        }
1719    }
1720#endif
1721}
1722
1723void qemu_cpu_kick(CPUState *cpu)
1724{
1725    qemu_cond_broadcast(cpu->halt_cond);
1726    if (tcg_enabled()) {
1727        cpu_exit(cpu);
1728        /* NOP unless doing single-thread RR */
1729        qemu_cpu_kick_rr_cpu();
1730    } else {
1731        if (hax_enabled()) {
1732            /*
1733             * FIXME: race condition with the exit_request check in
1734             * hax_vcpu_hax_exec
1735             */
1736            cpu->exit_request = 1;
1737        }
1738        qemu_cpu_kick_thread(cpu);
1739    }
1740}
1741
1742void qemu_cpu_kick_self(void)
1743{
1744    assert(current_cpu);
1745    qemu_cpu_kick_thread(current_cpu);
1746}
1747
1748bool qemu_cpu_is_self(CPUState *cpu)
1749{
1750    return qemu_thread_is_self(cpu->thread);
1751}
1752
1753bool qemu_in_vcpu_thread(void)
1754{
1755    return current_cpu && qemu_cpu_is_self(current_cpu);
1756}
1757
1758static __thread bool iothread_locked = false;
1759
1760bool qemu_mutex_iothread_locked(void)
1761{
1762    return iothread_locked;
1763}
1764
1765void qemu_mutex_lock_iothread(void)
1766{
1767    g_assert(!qemu_mutex_iothread_locked());
1768    qemu_mutex_lock(&qemu_global_mutex);
1769    iothread_locked = true;
1770}
1771
1772void qemu_mutex_unlock_iothread(void)
1773{
1774    g_assert(qemu_mutex_iothread_locked());
1775    iothread_locked = false;
1776    qemu_mutex_unlock(&qemu_global_mutex);
1777}
1778
1779static bool all_vcpus_paused(void)
1780{
1781    CPUState *cpu;
1782
1783    CPU_FOREACH(cpu) {
1784        if (!cpu->stopped) {
1785            return false;
1786        }
1787    }
1788
1789    return true;
1790}
1791
1792void pause_all_vcpus(void)
1793{
1794    CPUState *cpu;
1795
1796    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1797    CPU_FOREACH(cpu) {
1798        if (qemu_cpu_is_self(cpu)) {
1799            qemu_cpu_stop(cpu, true);
1800        } else {
1801            cpu->stop = true;
1802            qemu_cpu_kick(cpu);
1803        }
1804    }
1805
1806    /* We need to drop the replay_lock so any vCPU threads woken up
1807     * can finish their replay tasks
1808     */
1809    replay_mutex_unlock();
1810
1811    while (!all_vcpus_paused()) {
1812        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1813        CPU_FOREACH(cpu) {
1814            qemu_cpu_kick(cpu);
1815        }
1816    }
1817
1818    qemu_mutex_unlock_iothread();
1819    replay_mutex_lock();
1820    qemu_mutex_lock_iothread();
1821}
1822
1823void cpu_resume(CPUState *cpu)
1824{
1825    cpu->stop = false;
1826    cpu->stopped = false;
1827    qemu_cpu_kick(cpu);
1828}
1829
1830void resume_all_vcpus(void)
1831{
1832    CPUState *cpu;
1833
1834    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1835    CPU_FOREACH(cpu) {
1836        cpu_resume(cpu);
1837    }
1838}
1839
1840void cpu_remove_sync(CPUState *cpu)
1841{
1842    cpu->stop = true;
1843    cpu->unplug = true;
1844    qemu_cpu_kick(cpu);
1845    qemu_mutex_unlock_iothread();
1846    qemu_thread_join(cpu->thread);
1847    qemu_mutex_lock_iothread();
1848}
1849
1850/* Size of the temporary buffer used to form a vCPU thread name */
1851#define VCPU_THREAD_NAME_SIZE 16
1852
1853static void qemu_tcg_init_vcpu(CPUState *cpu)
1854{
1855    char thread_name[VCPU_THREAD_NAME_SIZE];
1856    static QemuCond *single_tcg_halt_cond;
1857    static QemuThread *single_tcg_cpu_thread;
1858    static int tcg_region_inited;
1859
1860    assert(tcg_enabled());
1861    /*
1862     * Initialize TCG regions--once. Now is a good time, because:
1863     * (1) TCG's init context, prologue and target globals have been set up.
1864     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1865     *     -accel flag is processed, so the check doesn't work then).
1866     */
1867    if (!tcg_region_inited) {
1868        tcg_region_inited = 1;
1869        tcg_region_init();
1870    }
1871
1872    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1873        cpu->thread = g_malloc0(sizeof(QemuThread));
1874        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1875        qemu_cond_init(cpu->halt_cond);
1876
1877        if (qemu_tcg_mttcg_enabled()) {
1878            /* create a thread per vCPU with TCG (MTTCG) */
1879            parallel_cpus = true;
1880            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1881                     cpu->cpu_index);
1882
1883            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1884                               cpu, QEMU_THREAD_JOINABLE);
1885
1886        } else {
1887            /* share a single thread for all cpus with TCG */
1888            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1889            qemu_thread_create(cpu->thread, thread_name,
1890                               qemu_tcg_rr_cpu_thread_fn,
1891                               cpu, QEMU_THREAD_JOINABLE);
1892
1893            single_tcg_halt_cond = cpu->halt_cond;
1894            single_tcg_cpu_thread = cpu->thread;
1895        }
1896#ifdef _WIN32
1897        cpu->hThread = qemu_thread_get_handle(cpu->thread);
1898#endif
1899    } else {
1900        /* For non-MTTCG cases we share the thread */
1901        cpu->thread = single_tcg_cpu_thread;
1902        cpu->halt_cond = single_tcg_halt_cond;
1903        cpu->thread_id = first_cpu->thread_id;
1904        cpu->can_do_io = 1;
1905        cpu->created = true;
1906    }
1907}
1908
1909static void qemu_hax_start_vcpu(CPUState *cpu)
1910{
1911    char thread_name[VCPU_THREAD_NAME_SIZE];
1912
1913    cpu->thread = g_malloc0(sizeof(QemuThread));
1914    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1915    qemu_cond_init(cpu->halt_cond);
1916
1917    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1918             cpu->cpu_index);
1919    qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
1920                       cpu, QEMU_THREAD_JOINABLE);
1921#ifdef _WIN32
1922    cpu->hThread = qemu_thread_get_handle(cpu->thread);
1923#endif
1924}
1925
1926static void qemu_kvm_start_vcpu(CPUState *cpu)
1927{
1928    char thread_name[VCPU_THREAD_NAME_SIZE];
1929
1930    cpu->thread = g_malloc0(sizeof(QemuThread));
1931    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1932    qemu_cond_init(cpu->halt_cond);
1933    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
1934             cpu->cpu_index);
1935    qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
1936                       cpu, QEMU_THREAD_JOINABLE);
1937}
1938
1939static void qemu_hvf_start_vcpu(CPUState *cpu)
1940{
1941    char thread_name[VCPU_THREAD_NAME_SIZE];
1942
1943    /* HVF currently does not support TCG, and only runs in
1944     * unrestricted-guest mode. */
1945    assert(hvf_enabled());
1946
1947    cpu->thread = g_malloc0(sizeof(QemuThread));
1948    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1949    qemu_cond_init(cpu->halt_cond);
1950
1951    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
1952             cpu->cpu_index);
1953    qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
1954                       cpu, QEMU_THREAD_JOINABLE);
1955}
1956
1957static void qemu_whpx_start_vcpu(CPUState *cpu)
1958{
1959    char thread_name[VCPU_THREAD_NAME_SIZE];
1960
1961    cpu->thread = g_malloc0(sizeof(QemuThread));
1962    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1963    qemu_cond_init(cpu->halt_cond);
1964    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
1965             cpu->cpu_index);
1966    qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
1967                       cpu, QEMU_THREAD_JOINABLE);
1968#ifdef _WIN32
1969    cpu->hThread = qemu_thread_get_handle(cpu->thread);
1970#endif
1971}
1972
1973static void qemu_dummy_start_vcpu(CPUState *cpu)
1974{
1975    char thread_name[VCPU_THREAD_NAME_SIZE];
1976
1977    cpu->thread = g_malloc0(sizeof(QemuThread));
1978    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1979    qemu_cond_init(cpu->halt_cond);
1980    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
1981             cpu->cpu_index);
1982    qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
1983                       QEMU_THREAD_JOINABLE);
1984}
1985
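    /*
     * Bring up the execution thread for @cpu: record the SMP topology,
     * give the CPU a default address space if the target did not create
     * one, start the thread matching the configured accelerator, and wait
     * on qemu_cpu_cond until that thread marks the CPU as created.
     */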
1986void qemu_init_vcpu(CPUState *cpu)
1987{
1988    cpu->nr_cores = smp_cores;
1989    cpu->nr_threads = smp_threads;
1990    cpu->stopped = true;
1991
1992    if (!cpu->as) {
1993        /* If the target cpu hasn't set up any address spaces itself,
1994         * give it the default one.
1995         */
1996        cpu->num_ases = 1;
1997        cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
1998    }
1999
2000    if (kvm_enabled()) {
2001        qemu_kvm_start_vcpu(cpu);
2002    } else if (hax_enabled()) {
2003        qemu_hax_start_vcpu(cpu);
2004    } else if (hvf_enabled()) {
2005        qemu_hvf_start_vcpu(cpu);
2006    } else if (tcg_enabled()) {
2007        qemu_tcg_init_vcpu(cpu);
2008    } else if (whpx_enabled()) {
2009        qemu_whpx_start_vcpu(cpu);
2010    } else {
2011        qemu_dummy_start_vcpu(cpu);
2012    }
2013
2014    while (!cpu->created) {
2015        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2016    }
2017}
2018
2019void cpu_stop_current(void)
2020{
2021    if (current_cpu) {
2022        qemu_cpu_stop(current_cpu, true);
2023    }
2024}
2025
2026int vm_stop(RunState state)
2027{
2028    if (qemu_in_vcpu_thread()) {
2029        qemu_system_vmstop_request_prepare();
2030        qemu_system_vmstop_request(state);
2031        /*
2032         * FIXME: should not return to device code in case
2033         * vm_stop() has been requested.
2034         */
2035        cpu_stop_current();
2036        return 0;
2037    }
2038
2039    return do_vm_stop(state, true);
2040}
2041
2042/**
2043 * Prepare for (re)starting the VM.
2044 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2045 * running or in case of an error condition), 0 otherwise.
2046 */
2047int vm_prepare_start(void)
2048{
2049    RunState requested;
2050
2051    qemu_vmstop_requested(&requested);
2052    if (runstate_is_running() && requested == RUN_STATE__MAX) {
2053        return -1;
2054    }
2055
2056    /* Ensure that a STOP/RESUME pair of events is emitted if a
2057     * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2058     * example, is documented to always be followed by the STOP
2059     * event.
2060     */
2061    if (runstate_is_running()) {
2062        qapi_event_send_stop(&error_abort);
2063        qapi_event_send_resume(&error_abort);
2064        return -1;
2065    }
2066
2067    /* We send this now, but the CPUs will be resumed shortly afterwards */
2068    qapi_event_send_resume(&error_abort);
2069
2070    replay_enable_events();
2071    cpu_enable_ticks();
2072    runstate_set(RUN_STATE_RUNNING);
2073    vm_state_notify(1, RUN_STATE_RUNNING);
2074    return 0;
2075}
2076
2077void vm_start(void)
2078{
2079    if (!vm_prepare_start()) {
2080        resume_all_vcpus();
2081    }
2082}
2083
2084/* Does a state transition even if the VM is already stopped;
2085   the current state is forgotten forever. */
2086int vm_stop_force_state(RunState state)
2087{
2088    if (runstate_is_running()) {
2089        return vm_stop(state);
2090    } else {
2091        runstate_set(state);
2092
2093        bdrv_drain_all();
2094        /* Make sure to return an error if the flush in a previous vm_stop()
2095         * failed. */
2096        return bdrv_flush_all();
2097    }
2098}
2099
2100void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
2101{
2102    /* XXX: implement xxx_cpu_list for targets that still lack it */
2103#if defined(cpu_list)
2104    cpu_list(f, cpu_fprintf);
2105#endif
2106}
2107
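    /*
     * Note: unlike query-cpus-fast below, this synchronizes register state
     * from the accelerator (cpu_synchronize_state()), so it may interrupt
     * running vCPU threads.
     */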
2108CpuInfoList *qmp_query_cpus(Error **errp)
2109{
2110    MachineState *ms = MACHINE(qdev_get_machine());
2111    MachineClass *mc = MACHINE_GET_CLASS(ms);
2112    CpuInfoList *head = NULL, *cur_item = NULL;
2113    CPUState *cpu;
2114
2115    CPU_FOREACH(cpu) {
2116        CpuInfoList *info;
2117#if defined(TARGET_I386)
2118        X86CPU *x86_cpu = X86_CPU(cpu);
2119        CPUX86State *env = &x86_cpu->env;
2120#elif defined(TARGET_PPC)
2121        PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2122        CPUPPCState *env = &ppc_cpu->env;
2123#elif defined(TARGET_SPARC)
2124        SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2125        CPUSPARCState *env = &sparc_cpu->env;
2126#elif defined(TARGET_RISCV)
2127        RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2128        CPURISCVState *env = &riscv_cpu->env;
2129#elif defined(TARGET_MIPS)
2130        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2131        CPUMIPSState *env = &mips_cpu->env;
2132#elif defined(TARGET_TRICORE)
2133        TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2134        CPUTriCoreState *env = &tricore_cpu->env;
2135#elif defined(TARGET_S390X)
2136        S390CPU *s390_cpu = S390_CPU(cpu);
2137        CPUS390XState *env = &s390_cpu->env;
2138#endif
2139
2140        cpu_synchronize_state(cpu);
2141
2142        info = g_malloc0(sizeof(*info));
2143        info->value = g_malloc0(sizeof(*info->value));
2144        info->value->CPU = cpu->cpu_index;
2145        info->value->current = (cpu == first_cpu);
2146        info->value->halted = cpu->halted;
2147        info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2148        info->value->thread_id = cpu->thread_id;
2149#if defined(TARGET_I386)
2150        info->value->arch = CPU_INFO_ARCH_X86;
2151        info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2152#elif defined(TARGET_PPC)
2153        info->value->arch = CPU_INFO_ARCH_PPC;
2154        info->value->u.ppc.nip = env->nip;
2155#elif defined(TARGET_SPARC)
2156        info->value->arch = CPU_INFO_ARCH_SPARC;
2157        info->value->u.q_sparc.pc = env->pc;
2158        info->value->u.q_sparc.npc = env->npc;
2159#elif defined(TARGET_MIPS)
2160        info->value->arch = CPU_INFO_ARCH_MIPS;
2161        info->value->u.q_mips.PC = env->active_tc.PC;
2162#elif defined(TARGET_TRICORE)
2163        info->value->arch = CPU_INFO_ARCH_TRICORE;
2164        info->value->u.tricore.PC = env->PC;
2165#elif defined(TARGET_S390X)
2166        info->value->arch = CPU_INFO_ARCH_S390;
2167        info->value->u.s390.cpu_state = env->cpu_state;
2168#elif defined(TARGET_RISCV)
2169        info->value->arch = CPU_INFO_ARCH_RISCV;
2170        info->value->u.riscv.pc = env->pc;
2171#else
2172        info->value->arch = CPU_INFO_ARCH_OTHER;
2173#endif
2174        info->value->has_props = !!mc->cpu_index_to_instance_props;
2175        if (info->value->has_props) {
2176            CpuInstanceProperties *props;
2177            props = g_malloc0(sizeof(*props));
2178            *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2179            info->value->props = props;
2180        }
2181
2182        /* XXX: waiting for the qapi to support GSList */
2183        if (!cur_item) {
2184            head = cur_item = info;
2185        } else {
2186            cur_item->next = info;
2187            cur_item = info;
2188        }
2189    }
2190
2191    return head;
2192}
2193
2194static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2195{
2196    /*
2197     * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2198     * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2199     */
2200    switch (target) {
2201    case SYS_EMU_TARGET_I386:
2202    case SYS_EMU_TARGET_X86_64:
2203        return CPU_INFO_ARCH_X86;
2204
2205    case SYS_EMU_TARGET_PPC:
2206    case SYS_EMU_TARGET_PPCEMB:
2207    case SYS_EMU_TARGET_PPC64:
2208        return CPU_INFO_ARCH_PPC;
2209
2210    case SYS_EMU_TARGET_SPARC:
2211    case SYS_EMU_TARGET_SPARC64:
2212        return CPU_INFO_ARCH_SPARC;
2213
2214    case SYS_EMU_TARGET_MIPS:
2215    case SYS_EMU_TARGET_MIPSEL:
2216    case SYS_EMU_TARGET_MIPS64:
2217    case SYS_EMU_TARGET_MIPS64EL:
2218        return CPU_INFO_ARCH_MIPS;
2219
2220    case SYS_EMU_TARGET_TRICORE:
2221        return CPU_INFO_ARCH_TRICORE;
2222
2223    case SYS_EMU_TARGET_S390X:
2224        return CPU_INFO_ARCH_S390;
2225
2226    case SYS_EMU_TARGET_RISCV32:
2227    case SYS_EMU_TARGET_RISCV64:
2228        return CPU_INFO_ARCH_RISCV;
2229
2230    default:
2231        return CPU_INFO_ARCH_OTHER;
2232    }
2233}
2234
2235static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2236{
2237#ifdef TARGET_S390X
2238    S390CPU *s390_cpu = S390_CPU(cpu);
2239    CPUS390XState *env = &s390_cpu->env;
2240
2241    info->cpu_state = env->cpu_state;
2242#else
2243    abort();
2244#endif
2245}
2246
2247/*
2248 * fast means: we NEVER interrupt vCPU threads to retrieve
2249 * information from KVM.
2250 */
2251CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2252{
2253    MachineState *ms = MACHINE(qdev_get_machine());
2254    MachineClass *mc = MACHINE_GET_CLASS(ms);
2255    CpuInfoFastList *head = NULL, *cur_item = NULL;
2256    SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2257                                          -1, &error_abort);
2258    CPUState *cpu;
2259
2260    CPU_FOREACH(cpu) {
2261        CpuInfoFastList *info = g_malloc0(sizeof(*info));
2262        info->value = g_malloc0(sizeof(*info->value));
2263
2264        info->value->cpu_index = cpu->cpu_index;
2265        info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2266        info->value->thread_id = cpu->thread_id;
2267
2268        info->value->has_props = !!mc->cpu_index_to_instance_props;
2269        if (info->value->has_props) {
2270            CpuInstanceProperties *props;
2271            props = g_malloc0(sizeof(*props));
2272            *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2273            info->value->props = props;
2274        }
2275
2276        info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2277        info->value->target = target;
2278        if (target == SYS_EMU_TARGET_S390X) {
2279            cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2280        }
2281
2282        if (!cur_item) {
2283            head = cur_item = info;
2284        } else {
2285            cur_item->next = info;
2286            cur_item = info;
2287        }
2288    }
2289
2290    return head;
2291}
2292
2293void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2294                 bool has_cpu, int64_t cpu_index, Error **errp)
2295{
2296    FILE *f;
2297    uint32_t l;
2298    CPUState *cpu;
2299    uint8_t buf[1024];
2300    int64_t orig_addr = addr, orig_size = size;
2301
2302    if (!has_cpu) {
2303        cpu_index = 0;
2304    }
2305
2306    cpu = qemu_get_cpu(cpu_index);
2307    if (cpu == NULL) {
2308        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2309                   "a CPU number");
2310        return;
2311    }
2312
2313    f = fopen(filename, "wb");
2314    if (!f) {
2315        error_setg_file_open(errp, errno, filename);
2316        return;
2317    }
2318
2319    while (size != 0) {
2320        l = sizeof(buf);
2321        if (l > size) {
2322            l = size;
            }
2323        if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2324            error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2325                             " specified", orig_addr, orig_size);
2326            goto exit;
2327        }
2328        if (fwrite(buf, 1, l, f) != l) {
2329            error_setg(errp, QERR_IO_ERROR);
2330            goto exit;
2331        }
2332        addr += l;
2333        size -= l;
2334    }
2335
2336exit:
2337    fclose(f);
2338}
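
    /*
     * Illustrative QMP invocation (roughly; argument names follow the
     * memsave command definition in the QAPI schema):
     *
     *     { "execute": "memsave",
     *       "arguments": { "val": 4096, "size": 1024,
     *                      "filename": "/tmp/vmem.bin" } }
     */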
2339
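    /*
     * Unlike memsave above, which reads through cpu_memory_rw_debug() and
     * therefore uses a vCPU's address translation, pmemsave reads guest
     * physical memory directly via cpu_physical_memory_read().
     */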
2340void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2341                  Error **errp)
2342{
2343    FILE *f;
2344    uint32_t l;
2345    uint8_t buf[1024];
2346
2347    f = fopen(filename, "wb");
2348    if (!f) {
2349        error_setg_file_open(errp, errno, filename);
2350        return;
2351    }
2352
2353    while (size != 0) {
2354        l = sizeof(buf);
2355        if (l > size) {
2356            l = size;
            }
2357        cpu_physical_memory_read(addr, buf, l);
2358        if (fwrite(buf, 1, l, f) != l) {
2359            error_setg(errp, QERR_IO_ERROR);
2360            goto exit;
2361        }
2362        addr += l;
2363        size -= l;
2364    }
2365
2366exit:
2367    fclose(f);
2368}
2369
2370void qmp_inject_nmi(Error **errp)
2371{
2372    nmi_monitor_handle(monitor_get_cpu_index(), errp);
2373}
2374
2375void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2376{
2377    if (!use_icount) {
2378        return;
2379    }
2380
2381    cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
2382                (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2383    if (icount_align_option) {
2384        cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
2385        cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
2386    } else {
2387        cpu_fprintf(f, "Max guest delay     NA\n");
2388        cpu_fprintf(f, "Max guest advance   NA\n");
2389    }
2390}
2391