qemu/cpus.c
/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qemu/config-file.h"
#include "cpu.h"
#include "monitor/monitor.h"
#include "qapi/error.h"
#include "qapi/qapi-commands-misc.h"
#include "qapi/qapi-events-run-state.h"
#include "qapi/qmp/qerror.h"
#include "qemu/error-report.h"
#include "sysemu/sysemu.h"
#include "sysemu/block-backend.h"
#include "exec/gdbstub.h"
#include "sysemu/dma.h"
#include "sysemu/hw_accel.h"
#include "sysemu/kvm.h"
#include "sysemu/hax.h"
#include "sysemu/hvf.h"
#include "sysemu/whpx.h"
#include "exec/exec-all.h"

#include "qemu/thread.h"
#include "sysemu/cpus.h"
#include "sysemu/qtest.h"
#include "qemu/main-loop.h"
#include "qemu/option.h"
#include "qemu/bitmap.h"
#include "qemu/seqlock.h"
#include "tcg.h"
#include "hw/nmi.h"
#include "sysemu/replay.h"
#include "hw/boards.h"

#ifdef CONFIG_LINUX

#include <sys/prctl.h>

#ifndef PR_MCE_KILL
#define PR_MCE_KILL 33
#endif

#ifndef PR_MCE_KILL_SET
#define PR_MCE_KILL_SET 1
#endif

#ifndef PR_MCE_KILL_EARLY
#define PR_MCE_KILL_EARLY 1
#endif

#endif /* CONFIG_LINUX */

int64_t max_delay;
int64_t max_advance;

/* vcpu throttling controls */
static QEMUTimer *throttle_timer;
static unsigned int throttle_percentage;

#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
#define CPU_THROTTLE_TIMESLICE_NS 10000000
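/* Throttling works by making each vCPU sleep for part of every 10 ms
 * timeslice; the percentage bounds above keep the ratio computed in
 * cpu_throttle_thread() finite (a 100% setting would mean an infinite sleep).
 */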

bool cpu_is_stopped(CPUState *cpu)
{
    return cpu->stopped || !runstate_is_running();
}

static bool cpu_thread_is_idle(CPUState *cpu)
{
    if (cpu->stop || cpu->queued_work_first) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return true;
    }
    if (!cpu->halted || cpu_has_work(cpu) ||
        kvm_halt_in_kernel()) {
        return false;
    }
    return true;
}

static bool all_cpu_threads_idle(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (!cpu_thread_is_idle(cpu)) {
            return false;
        }
    }
    return true;
}

/***********************************************************/
/* guest cycle counter */

/* Protected by TimersState seqlock */

static bool icount_sleep = true;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
#define MAX_ICOUNT_SHIFT 10
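/* With icount_time_shift == 10 each instruction accounts for 2^10 ns of
 * virtual time, i.e. roughly one million instructions per second, which is
 * where the 1 MIPS floor mentioned above comes from.
 */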

typedef struct TimersState {
    /* Protected by BQL.  */
    int64_t cpu_ticks_prev;
    int64_t cpu_ticks_offset;

    /* Protect fields that can be read outside the BQL and written from
     * multiple threads.
     */
    QemuSeqLock vm_clock_seqlock;
    QemuSpin vm_clock_lock;

    int16_t cpu_ticks_enabled;

    /* Conversion factor from emulated instructions to virtual clock ticks.  */
    int16_t icount_time_shift;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;

    int64_t vm_clock_warp_start;
    int64_t cpu_clock_offset;

    /* Only written by TCG thread */
    int64_t qemu_icount;

    /* for adjusting icount */
    QEMUTimer *icount_rt_timer;
    QEMUTimer *icount_vm_timer;
    QEMUTimer *icount_warp_timer;
} TimersState;

static TimersState timers_state;
bool mttcg_enabled;

/*
 * We default to false if we know other options have been enabled
 * which are currently incompatible with MTTCG. Otherwise, once a
 * guest (target) has been updated to support:
 *   - atomic instructions
 *   - memory ordering primitives (barriers)
 * it can set the appropriate CONFIG flag in ${target}-softmmu.mak.
 *
 * Once a guest architecture has been converted to the new primitives
 * there are two remaining limitations to check:
 *
 * - The guest can't be oversized (e.g. a 64 bit guest on a 32 bit host)
 * - The host must provide a memory ordering at least as strong as the guest's
 *
 * It may be possible in future to support strong guests on weak hosts
 * but that will require tagging all loads/stores in a guest with their
 * implicit memory order requirements, which would likely slow things
 * down a lot.
 */

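/* MTTCG is only safe if every memory-ordering guarantee the guest relies on
 * (TCG_GUEST_DEFAULT_MO) is also provided by the host TCG backend
 * (TCG_TARGET_DEFAULT_MO); any guest bit not covered by the host mask makes
 * the combination incompatible.
 */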
static bool check_tcg_memory_orders_compatible(void)
{
#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
    return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
#else
    return false;
#endif
}

static bool default_mttcg_enabled(void)
{
    if (use_icount || TCG_OVERSIZED_GUEST) {
        return false;
    } else {
#ifdef TARGET_SUPPORTS_MTTCG
        return check_tcg_memory_orders_compatible();
#else
        return false;
#endif
    }
}

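/* Parse the "thread" suboption of the TCG accelerator (for example
 * "-accel tcg,thread=multi" on the command line, assuming the usual -accel
 * syntax) and decide between single-threaded and multi-threaded TCG.
 */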
void qemu_tcg_configure(QemuOpts *opts, Error **errp)
{
    const char *t = qemu_opt_get(opts, "thread");
    if (t) {
        if (strcmp(t, "multi") == 0) {
            if (TCG_OVERSIZED_GUEST) {
                error_setg(errp, "No MTTCG when guest word size > host's");
            } else if (use_icount) {
                error_setg(errp, "No MTTCG when icount is enabled");
            } else {
#ifndef TARGET_SUPPORTS_MTTCG
                warn_report("Guest not yet converted to MTTCG - "
                            "you may get unexpected results");
#endif
                if (!check_tcg_memory_orders_compatible()) {
                    warn_report("Guest expects a stronger memory ordering "
                                "than the host provides");
                    error_printf("This may cause strange/hard to debug errors\n");
                }
                mttcg_enabled = true;
            }
        } else if (strcmp(t, "single") == 0) {
            mttcg_enabled = false;
        } else {
            error_setg(errp, "Invalid 'thread' setting %s", t);
        }
    } else {
        mttcg_enabled = default_mttcg_enabled();
    }
}

/* The current number of executed instructions is based on what we
 * originally budgeted minus the current state of the decrementing
 * icount counters in extra/u16.low.
 */
static int64_t cpu_get_icount_executed(CPUState *cpu)
{
    return cpu->icount_budget - (cpu->icount_decr.u16.low + cpu->icount_extra);
}

/*
 * Update the global shared timer_state.qemu_icount to take into
 * account executed instructions. This is done by the TCG vCPU
 * thread so the main-loop can see time has moved forward.
 */
static void cpu_update_icount_locked(CPUState *cpu)
{
    int64_t executed = cpu_get_icount_executed(cpu);
    cpu->icount_budget -= executed;

    atomic_set_i64(&timers_state.qemu_icount,
                   timers_state.qemu_icount + executed);
}

/*
 * Update the global shared timer_state.qemu_icount to take into
 * account executed instructions. This is done by the TCG vCPU
 * thread so the main-loop can see time has moved forward.
 */
void cpu_update_icount(CPUState *cpu)
{
    seqlock_write_lock(&timers_state.vm_clock_seqlock,
                       &timers_state.vm_clock_lock);
    cpu_update_icount_locked(cpu);
    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                         &timers_state.vm_clock_lock);
}

static int64_t cpu_get_icount_raw_locked(void)
{
    CPUState *cpu = current_cpu;

    if (cpu && cpu->running) {
        if (!cpu->can_do_io) {
            error_report("Bad icount read");
            exit(1);
        }
        /* Take into account what has run */
        cpu_update_icount_locked(cpu);
    }
    /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
    return atomic_read_i64(&timers_state.qemu_icount);
}

static int64_t cpu_get_icount_locked(void)
{
    int64_t icount = cpu_get_icount_raw_locked();
    return atomic_read_i64(&timers_state.qemu_icount_bias) +
        cpu_icount_to_ns(icount);
}

int64_t cpu_get_icount_raw(void)
{
    int64_t icount;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        icount = cpu_get_icount_raw_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return icount;
}

/* Return the virtual CPU time, based on the instruction counter.  */
int64_t cpu_get_icount(void)
{
    int64_t icount;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        icount = cpu_get_icount_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return icount;
}

int64_t cpu_icount_to_ns(int64_t icount)
{
    return icount << atomic_read(&timers_state.icount_time_shift);
}

static int64_t cpu_get_ticks_locked(void)
{
    int64_t ticks = timers_state.cpu_ticks_offset;
    if (timers_state.cpu_ticks_enabled) {
        ticks += cpu_get_host_ticks();
    }

    if (timers_state.cpu_ticks_prev > ticks) {
        /* Non increasing ticks may happen if the host uses software suspend.  */
        timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
        ticks = timers_state.cpu_ticks_prev;
    }

    timers_state.cpu_ticks_prev = ticks;
    return ticks;
}

/* return the time elapsed in VM between vm_start and vm_stop.  Unless
 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 * counter.
 */
int64_t cpu_get_ticks(void)
{
    int64_t ticks;

    if (use_icount) {
        return cpu_get_icount();
    }

    qemu_spin_lock(&timers_state.vm_clock_lock);
    ticks = cpu_get_ticks_locked();
    qemu_spin_unlock(&timers_state.vm_clock_lock);
    return ticks;
}

static int64_t cpu_get_clock_locked(void)
{
    int64_t time;

    time = timers_state.cpu_clock_offset;
    if (timers_state.cpu_ticks_enabled) {
        time += get_clock();
    }

    return time;
}

/* Return the monotonic time elapsed in VM, i.e.,
 * the time between vm_start and vm_stop
 */
int64_t cpu_get_clock(void)
{
    int64_t ti;
    unsigned start;

    do {
        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        ti = cpu_get_clock_locked();
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));

    return ti;
}

/* enable cpu_get_ticks()
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_enable_ticks(void)
{
    seqlock_write_lock(&timers_state.vm_clock_seqlock,
                       &timers_state.vm_clock_lock);
    if (!timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
        timers_state.cpu_clock_offset -= get_clock();
        timers_state.cpu_ticks_enabled = 1;
    }
    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                       &timers_state.vm_clock_lock);
}

/* disable cpu_get_ticks() : the clock is stopped. You must not call
 * cpu_get_ticks() after that.
 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 */
void cpu_disable_ticks(void)
{
    seqlock_write_lock(&timers_state.vm_clock_seqlock,
                       &timers_state.vm_clock_lock);
    if (timers_state.cpu_ticks_enabled) {
        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
        timers_state.cpu_clock_offset = cpu_get_clock_locked();
        timers_state.cpu_ticks_enabled = 0;
    }
    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                         &timers_state.vm_clock_lock);
}

/* Correlation between real and virtual time is always going to be
   fairly approximate, so ignore small variation.
   When the guest is idle real and virtual time will be aligned in
   the IO wait loop.  */
#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)

static void icount_adjust(void)
{
    int64_t cur_time;
    int64_t cur_icount;
    int64_t delta;

    /* Protected by TimersState mutex.  */
    static int64_t last_delta;

    /* If the VM is not running, then do nothing.  */
    if (!runstate_is_running()) {
        return;
    }

    seqlock_write_lock(&timers_state.vm_clock_seqlock,
                       &timers_state.vm_clock_lock);
    cur_time = cpu_get_clock_locked();
    cur_icount = cpu_get_icount_locked();

    delta = cur_icount - cur_time;
    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
    if (delta > 0
        && last_delta + ICOUNT_WOBBLE < delta * 2
        && timers_state.icount_time_shift > 0) {
        /* The guest is getting too far ahead.  Slow time down.  */
        atomic_set(&timers_state.icount_time_shift,
                   timers_state.icount_time_shift - 1);
    }
    if (delta < 0
        && last_delta - ICOUNT_WOBBLE > delta * 2
        && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
        /* The guest is getting too far behind.  Speed time up.  */
        atomic_set(&timers_state.icount_time_shift,
                   timers_state.icount_time_shift + 1);
    }
    last_delta = delta;
    atomic_set_i64(&timers_state.qemu_icount_bias,
                   cur_icount - (timers_state.qemu_icount
                                 << timers_state.icount_time_shift));
    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                         &timers_state.vm_clock_lock);
}

static void icount_adjust_rt(void *opaque)
{
    timer_mod(timers_state.icount_rt_timer,
              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    icount_adjust();
}

static void icount_adjust_vm(void *opaque)
{
    timer_mod(timers_state.icount_vm_timer,
                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
                   NANOSECONDS_PER_SECOND / 10);
    icount_adjust();
}

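/* Convert a nanosecond deadline into an instruction budget by dividing by
 * 2^icount_time_shift, rounding up so a non-zero deadline never yields a
 * zero budget.
 */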
static int64_t qemu_icount_round(int64_t count)
{
    int shift = atomic_read(&timers_state.icount_time_shift);
    return (count + (1 << shift) - 1) >> shift;
}

static void icount_warp_rt(void)
{
    unsigned seq;
    int64_t warp_start;

    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
     * changes from -1 to another value, so the race here is okay.
     */
    do {
        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
        warp_start = timers_state.vm_clock_warp_start;
    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));

    if (warp_start == -1) {
        return;
    }

    seqlock_write_lock(&timers_state.vm_clock_seqlock,
                       &timers_state.vm_clock_lock);
    if (runstate_is_running()) {
        int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
                                            cpu_get_clock_locked());
        int64_t warp_delta;

        warp_delta = clock - timers_state.vm_clock_warp_start;
        if (use_icount == 2) {
            /*
             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
             * far ahead of real time.
             */
            int64_t cur_icount = cpu_get_icount_locked();
            int64_t delta = clock - cur_icount;
            warp_delta = MIN(warp_delta, delta);
        }
        atomic_set_i64(&timers_state.qemu_icount_bias,
                       timers_state.qemu_icount_bias + warp_delta);
    }
    timers_state.vm_clock_warp_start = -1;
    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                       &timers_state.vm_clock_lock);

    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

static void icount_timer_cb(void *opaque)
{
    /* No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}

void qtest_clock_warp(int64_t dest)
{
    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    AioContext *aio_context;
    assert(qtest_enabled());
    aio_context = qemu_get_aio_context();
    while (clock < dest) {
        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);

        seqlock_write_lock(&timers_state.vm_clock_seqlock,
                           &timers_state.vm_clock_lock);
        atomic_set_i64(&timers_state.qemu_icount_bias,
                       timers_state.qemu_icount_bias + warp);
        seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                             &timers_state.vm_clock_lock);

        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
    }
    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
}

void qemu_start_warp_timer(void)
{
    int64_t clock;
    int64_t deadline;

    if (!use_icount) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    if (replay_mode != REPLAY_MODE_PLAY) {
        if (!all_cpu_threads_idle()) {
            return;
        }

        if (qtest_enabled()) {
            /* When testing, qtest commands advance icount.  */
            return;
        }

        replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
    } else {
        /* warp clock deterministically in record/replay mode */
        if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
            /* vCPU is sleeping and warp can't be started.
               It is probably a race condition: notification sent
               to vCPU was processed in advance and vCPU went to sleep.
               Therefore we have to wake it up to do something. */
            if (replay_has_checkpoint()) {
                qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
            }
            return;
        }
    }

    /* We want to use the earliest deadline from ALL vm_clocks */
    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
    if (deadline < 0) {
        static bool notified;
        if (!icount_sleep && !notified) {
            warn_report("icount sleep disabled and no active timers");
            notified = true;
        }
        return;
    }

    if (deadline > 0) {
        /*
         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
         * sleep.  Otherwise, the CPU might be waiting for a future timer
         * interrupt to wake it up, but the interrupt never comes because
         * the vCPU isn't running any insns and thus doesn't advance the
         * QEMU_CLOCK_VIRTUAL.
         */
        if (!icount_sleep) {
            /*
             * We never let VCPUs sleep in no sleep icount mode.
             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
             * It is useful when we want a deterministic execution time,
             * isolated from host latencies.
             */
            seqlock_write_lock(&timers_state.vm_clock_seqlock,
                               &timers_state.vm_clock_lock);
            atomic_set_i64(&timers_state.qemu_icount_bias,
                           timers_state.qemu_icount_bias + deadline);
            seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                                 &timers_state.vm_clock_lock);
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
        } else {
            /*
             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
             * "real" time (related to the time left until the next event) has
             * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
             * This avoids making the warps visible externally; for example,
             * you will not be sending network packets continuously instead of
             * every 100ms.
             */
            seqlock_write_lock(&timers_state.vm_clock_seqlock,
                               &timers_state.vm_clock_lock);
            if (timers_state.vm_clock_warp_start == -1
                || timers_state.vm_clock_warp_start > clock) {
                timers_state.vm_clock_warp_start = clock;
            }
            seqlock_write_unlock(&timers_state.vm_clock_seqlock,
                                 &timers_state.vm_clock_lock);
            timer_mod_anticipate(timers_state.icount_warp_timer,
                                 clock + deadline);
        }
    } else if (deadline == 0) {
        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
    }
}

static void qemu_account_warp_timer(void)
{
    if (!use_icount || !icount_sleep) {
        return;
    }

    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
     * do not fire, so computing the deadline does not make sense.
     */
    if (!runstate_is_running()) {
        return;
    }

    /* warp clock deterministically in record/replay mode */
    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
        return;
    }

    timer_del(timers_state.icount_warp_timer);
    icount_warp_rt();
}

static bool icount_state_needed(void *opaque)
{
    return use_icount;
}

static bool warp_timer_state_needed(void *opaque)
{
    TimersState *s = opaque;
    return s->icount_warp_timer != NULL;
}

static bool adjust_timers_state_needed(void *opaque)
{
    TimersState *s = opaque;
    return s->icount_rt_timer != NULL;
}

/*
 * Subsection for warp timer migration is optional, because it may not be created
 */
static const VMStateDescription icount_vmstate_warp_timer = {
    .name = "timer/icount/warp_timer",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = warp_timer_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(vm_clock_warp_start, TimersState),
        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};

static const VMStateDescription icount_vmstate_adjust_timers = {
    .name = "timer/icount/timers",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = adjust_timers_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};

/*
 * This is a subsection for icount migration.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_warp_timer,
        &icount_vmstate_adjust_timers,
        NULL
    }
};

static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        VMSTATE_UNUSED(8),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};

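/* Async work item that actually throttles a vCPU: for a throttle percentage
 * pct it sleeps pct/(1-pct) * 10 ms per timeslice, e.g. 75% means 30 ms of
 * sleep for every 10 ms of run time, so the vCPU runs 25% of the time.
 */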
static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
{
    double pct;
    double throttle_ratio;
    long sleeptime_ns;

    if (!cpu_throttle_get_percentage()) {
        return;
    }

    pct = (double)cpu_throttle_get_percentage()/100;
    throttle_ratio = pct / (1 - pct);
    sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);

    qemu_mutex_unlock_iothread();
    g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
    qemu_mutex_lock_iothread();
    atomic_set(&cpu->throttle_thread_scheduled, 0);
}

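/* Periodic timer that schedules cpu_throttle_thread() on every vCPU and then
 * re-arms itself after CPU_THROTTLE_TIMESLICE_NS / (1 - pct), so each period
 * holds ~10 ms of run time plus the sleep injected above (e.g. a 20 ms period
 * at 50% throttle).
 */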
static void cpu_throttle_timer_tick(void *opaque)
{
    CPUState *cpu;
    double pct;

    /* Stop the timer if needed */
    if (!cpu_throttle_get_percentage()) {
        return;
    }
    CPU_FOREACH(cpu) {
        if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
            async_run_on_cpu(cpu, cpu_throttle_thread,
                             RUN_ON_CPU_NULL);
        }
    }

    pct = (double)cpu_throttle_get_percentage()/100;
    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                                   CPU_THROTTLE_TIMESLICE_NS / (1-pct));
}

void cpu_throttle_set(int new_throttle_pct)
{
    /* Ensure throttle percentage is within valid range */
    new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
    new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);

    atomic_set(&throttle_percentage, new_throttle_pct);

    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
                                       CPU_THROTTLE_TIMESLICE_NS);
}

void cpu_throttle_stop(void)
{
    atomic_set(&throttle_percentage, 0);
}

bool cpu_throttle_active(void)
{
    return (cpu_throttle_get_percentage() != 0);
}

int cpu_throttle_get_percentage(void)
{
    return atomic_read(&throttle_percentage);
}

void cpu_ticks_init(void)
{
    seqlock_init(&timers_state.vm_clock_seqlock);
    qemu_spin_init(&timers_state.vm_clock_lock);
    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                           cpu_throttle_timer_tick, NULL);
}

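/* Parse the -icount suboptions (shift, align, sleep; assuming the usual
 * "-icount shift=N|auto[,align=on|off][,sleep=on|off]" syntax).  A fixed
 * shift selects use_icount == 1; "auto" selects the adaptive mode
 * (use_icount == 2) driven by the two adjustment timers set up below.
 */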
void configure_icount(QemuOpts *opts, Error **errp)
{
    const char *option;
    char *rem_str = NULL;

    option = qemu_opt_get(opts, "shift");
    if (!option) {
        if (qemu_opt_get(opts, "align") != NULL) {
            error_setg(errp, "Please specify shift option when using align");
        }
        return;
    }

    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
    if (icount_sleep) {
        timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
                                         icount_timer_cb, NULL);
    }

    icount_align_option = qemu_opt_get_bool(opts, "align", false);

    if (icount_align_option && !icount_sleep) {
        error_setg(errp, "align=on and sleep=off are incompatible");
    }
    if (strcmp(option, "auto") != 0) {
        errno = 0;
        timers_state.icount_time_shift = strtol(option, &rem_str, 0);
        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
            error_setg(errp, "icount: Invalid shift value");
        }
        use_icount = 1;
        return;
    } else if (icount_align_option) {
        error_setg(errp, "shift=auto and align=on are incompatible");
    } else if (!icount_sleep) {
        error_setg(errp, "shift=auto and sleep=off are incompatible");
    }

    use_icount = 2;

    /* 125MIPS seems a reasonable initial guess at the guest speed.
       It will be corrected fairly quickly anyway.  */
    timers_state.icount_time_shift = 3;
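    /* shift == 3 means 2^3 = 8 ns per instruction, i.e. 125 million
     * instructions per virtual second.
     */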

    /* Have both realtime and virtual time triggers for speed adjustment.
       The realtime trigger catches emulated time passing too slowly,
       the virtual time trigger catches emulated time passing too fast.
       Realtime triggers occur even when idle, so use them less frequently
       than VM triggers.  */
    timers_state.vm_clock_warp_start = -1;
    timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
                                   icount_adjust_rt, NULL);
    timer_mod(timers_state.icount_rt_timer,
                   qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
    timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                        icount_adjust_vm, NULL);
    timer_mod(timers_state.icount_vm_timer,
                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
                   NANOSECONDS_PER_SECOND / 10);
}

/***********************************************************/
/* TCG vCPU kick timer
 *
 * The kick timer is responsible for moving single-threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running a
 * timer event will force a cpu->exit so the next vCPU can get
 * scheduled.
 *
 * The timer is removed while all vCPUs are idle and restarted again
 * once the idle period ends.
 */

static QEMUTimer *tcg_kick_vcpu_timer;
static CPUState *tcg_current_rr_cpu;

#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)

static inline int64_t qemu_tcg_next_kick(void)
{
    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
}

/* Kick the currently round-robin scheduled vCPU */
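/* The loop below retries until the scheduled vCPU is stable, so a vCPU that
 * becomes current while we are kicking does not miss the kick. */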
static void qemu_cpu_kick_rr_cpu(void)
{
    CPUState *cpu;
    do {
        cpu = atomic_mb_read(&tcg_current_rr_cpu);
        if (cpu) {
            cpu_exit(cpu);
        }
    } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
}

static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
{
}

void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
{
    if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
        qemu_notify_event();
        return;
    }

    if (qemu_in_vcpu_thread()) {
        /* A CPU is currently running; kick it back out to the
         * tcg_cpu_exec() loop so it will recalculate its
         * icount deadline immediately.
         */
        qemu_cpu_kick(current_cpu);
    } else if (first_cpu) {
        /* qemu_cpu_kick is not enough to kick a halted CPU out of
         * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
         * causes cpu_thread_is_idle to return false.  This way,
         * handle_icount_deadline can run.
         * If we have no CPUs at all for some reason, we don't
         * need to do anything.
         */
        async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
    }
}

static void kick_tcg_thread(void *opaque)
{
    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
    qemu_cpu_kick_rr_cpu();
}

static void start_tcg_kick_timer(void)
{
    assert(!mttcg_enabled);
    if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
                                           kick_tcg_thread, NULL);
    }
    if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
    }
}

static void stop_tcg_kick_timer(void)
{
    assert(!mttcg_enabled);
    if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
        timer_del(tcg_kick_vcpu_timer);
    }
}

/***********************************************************/
void hw_error(const char *fmt, ...)
{
    va_list ap;
    CPUState *cpu;

    va_start(ap, fmt);
    fprintf(stderr, "qemu: hardware error: ");
    vfprintf(stderr, fmt, ap);
    fprintf(stderr, "\n");
    CPU_FOREACH(cpu) {
        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
        cpu_dump_state(cpu, stderr, fprintf, CPU_DUMP_FPU);
    }
    va_end(ap);
    abort();
}

void cpu_synchronize_all_states(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_state(cpu);
        /* TODO: move to cpu_synchronize_state() */
        if (hvf_enabled()) {
            hvf_cpu_synchronize_state(cpu);
        }
    }
}

void cpu_synchronize_all_post_reset(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_reset(cpu);
        /* TODO: move to cpu_synchronize_post_reset() */
        if (hvf_enabled()) {
            hvf_cpu_synchronize_post_reset(cpu);
        }
    }
}

void cpu_synchronize_all_post_init(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_post_init(cpu);
        /* TODO: move to cpu_synchronize_post_init() */
        if (hvf_enabled()) {
            hvf_cpu_synchronize_post_init(cpu);
        }
    }
}

void cpu_synchronize_all_pre_loadvm(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        cpu_synchronize_pre_loadvm(cpu);
    }
}

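/* Stop the VM: freeze the ticks/clock, pause every vCPU, enter the requested
 * runstate (optionally emitting the QMP STOP event), then drain and flush all
 * block devices so no I/O is in flight once we return.
 */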
static int do_vm_stop(RunState state, bool send_stop)
{
    int ret = 0;

    if (runstate_is_running()) {
        cpu_disable_ticks();
        pause_all_vcpus();
        runstate_set(state);
        vm_state_notify(0, state);
        if (send_stop) {
            qapi_event_send_stop();
        }
    }

    bdrv_drain_all();
    replay_disable_events();
    ret = bdrv_flush_all();

    return ret;
}

/* Special vm_stop() variant for terminating the process.  Historically clients
 * did not expect a QMP STOP event and so we need to retain compatibility.
 */
int vm_shutdown(void)
{
    return do_vm_stop(RUN_STATE_SHUTDOWN, false);
}

static bool cpu_can_run(CPUState *cpu)
{
    if (cpu->stop) {
        return false;
    }
    if (cpu_is_stopped(cpu)) {
        return false;
    }
    return true;
}

static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}

#ifdef CONFIG_LINUX
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}

static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
{
    if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
        sigbus_reraise();
    }

    if (current_cpu) {
        /* Called asynchronously in VCPU thread.  */
        if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
            sigbus_reraise();
        }
    } else {
        /* Called synchronously (via signalfd) in main thread.  */
        if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
            sigbus_reraise();
        }
    }
}

static void qemu_init_sigbus(void)
{
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_flags = SA_SIGINFO;
    action.sa_sigaction = sigbus_handler;
    sigaction(SIGBUS, &action, NULL);

    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
}
#else /* !CONFIG_LINUX */
static void qemu_init_sigbus(void)
{
}
#endif /* !CONFIG_LINUX */

static QemuMutex qemu_global_mutex;

static QemuThread io_thread;

/* cpu creation */
static QemuCond qemu_cpu_cond;
/* system init */
static QemuCond qemu_pause_cond;

void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}

void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}

static void qemu_kvm_destroy_vcpu(CPUState *cpu)
{
    if (kvm_destroy_vcpu(cpu) < 0) {
        error_report("kvm_destroy_vcpu failed");
        exit(EXIT_FAILURE);
    }
}

static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}

static void qemu_cpu_stop(CPUState *cpu, bool exit)
{
    g_assert(qemu_cpu_is_self(cpu));
    cpu->stop = false;
    cpu->stopped = true;
    if (exit) {
        cpu_exit(cpu);
    }
    qemu_cond_broadcast(&qemu_pause_cond);
}

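/* Common bookkeeping after a vCPU thread wakes up: clear the kick flag,
 * honour a pending stop request and run any work queued via run_on_cpu().
 */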
static void qemu_wait_io_event_common(CPUState *cpu)
{
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        qemu_cpu_stop(cpu, false);
    }
    process_queued_cpu_work(cpu);
}

static void qemu_tcg_rr_wait_io_event(void)
{
    CPUState *cpu;

    while (all_cpu_threads_idle()) {
        stop_tcg_kick_timer();
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
    }

    start_tcg_kick_timer();

    CPU_FOREACH(cpu) {
        qemu_wait_io_event_common(cpu);
    }
}

static void qemu_wait_io_event(CPUState *cpu)
{
    while (cpu_thread_is_idle(cpu)) {
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }

#ifdef _WIN32
    /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
    if (!tcg_enabled()) {
        SleepEx(0, TRUE);
    }
#endif
    qemu_wait_io_event_common(cpu);
}

static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        error_report("kvm_init_vcpu failed: %s", strerror(-r));
        exit(1);
    }

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}

static void *qemu_dummy_cpu_thread_fn(void *arg)
{
#ifdef _WIN32
    error_report("qtest is not supported under Windows");
    exit(1);
#else
    CPUState *cpu = arg;
    sigset_t waitset;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    sigemptyset(&waitset);
    sigaddset(&waitset, SIG_IPI);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        qemu_mutex_unlock_iothread();
        do {
            int sig;
            r = sigwait(&waitset, &sig);
        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
        if (r == -1) {
            perror("sigwait");
            exit(1);
        }
        qemu_mutex_lock_iothread();
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug);

    rcu_unregister_thread();
    return NULL;
#endif
}

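/* Work out how many instructions the next TCG execution slice may run:
 * normally the time until the next QEMU_CLOCK_VIRTUAL deadline converted
 * into instructions, capped at INT32_MAX ns; in replay mode the budget
 * comes from the recorded instruction stream instead.
 */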
static int64_t tcg_get_icount_limit(void)
{
    int64_t deadline;

    if (replay_mode != REPLAY_MODE_PLAY) {
        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        /* Maintain prior (possibly buggy) behaviour where if no deadline
         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
         * nanoseconds.
         */
        if ((deadline < 0) || (deadline > INT32_MAX)) {
            deadline = INT32_MAX;
        }

        return qemu_icount_round(deadline);
    } else {
        return replay_get_instructions();
    }
}

static void handle_icount_deadline(void)
{
    assert(qemu_in_vcpu_thread());
    if (use_icount) {
        int64_t deadline =
            qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);

        if (deadline == 0) {
            /* Wake up other AioContexts.  */
            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
            qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
        }
    }
}

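/* Hand the icount budget to the vCPU before it runs: the low 16 bits go into
 * the decrementer (at most 0xffff instructions per slice) and the remainder
 * is parked in icount_extra.
 */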
static void prepare_icount_for_run(CPUState *cpu)
{
    if (use_icount) {
        int insns_left;

        /* These should always be cleared by process_icount_data after
         * each vCPU execution. However u16.high can be raised
         * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
         */
        g_assert(cpu->icount_decr.u16.low == 0);
        g_assert(cpu->icount_extra == 0);

        cpu->icount_budget = tcg_get_icount_limit();
        insns_left = MIN(0xffff, cpu->icount_budget);
        cpu->icount_decr.u16.low = insns_left;
        cpu->icount_extra = cpu->icount_budget - insns_left;

        replay_mutex_lock();
    }
}

static void process_icount_data(CPUState *cpu)
{
    if (use_icount) {
        /* Account for executed instructions */
        cpu_update_icount(cpu);

        /* Reset the counters */
        cpu->icount_decr.u16.low = 0;
        cpu->icount_extra = 0;
        cpu->icount_budget = 0;

        replay_account_executed_instructions();

        replay_mutex_unlock();
    }
}


static int tcg_cpu_exec(CPUState *cpu)
{
    int ret;
#ifdef CONFIG_PROFILER
    int64_t ti;
#endif

    assert(tcg_enabled());
#ifdef CONFIG_PROFILER
    ti = profile_getclock();
#endif
    cpu_exec_start(cpu);
    ret = cpu_exec(cpu);
    cpu_exec_end(cpu);
#ifdef CONFIG_PROFILER
    atomic_set(&tcg_ctx->prof.cpu_exec_time,
               tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
#endif
    return ret;
}

/* Destroy any remaining vCPUs which have been unplugged and have
 * finished running
 */
static void deal_with_unplugged_cpus(void)
{
    CPUState *cpu;

    CPU_FOREACH(cpu) {
        if (cpu->unplug && !cpu_can_run(cpu)) {
            qemu_tcg_destroy_vcpu(cpu);
            cpu->created = false;
            qemu_cond_signal(&qemu_cpu_cond);
            break;
        }
    }
}

/* Single-threaded TCG
 *
 * In the single-threaded case each vCPU is simulated in turn. If
 * there is more than a single vCPU we create a simple timer to kick
 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
 * This is done explicitly rather than relying on side-effects
 * elsewhere.
 */

static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    qemu_cond_signal(&qemu_cpu_cond);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        if (!cpu) {
            cpu = first_cpu;
        }

        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        if (use_icount && all_cpu_threads_idle()) {
            /*
             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
             * in the main_loop, wake it up in order to start the warp timer.
             */
            qemu_notify_event();
        }

        qemu_tcg_rr_wait_io_event();
        deal_with_unplugged_cpus();
    }

    rcu_unregister_thread();
    return NULL;
}

static void *qemu_hax_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();
    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->halted = 0;
    current_cpu = cpu;

    hax_init_vcpu(cpu);
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = hax_smp_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }

        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));
    rcu_unregister_thread();
    return NULL;
}

/* The HVF-specific vCPU thread function. This one should only run when the host
 * CPU supports the VMX "unrestricted guest" feature. */
static void *qemu_hvf_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    int r;

    assert(hvf_enabled());

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    hvf_init_vcpu(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = hvf_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    hvf_vcpu_destroy(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}

static void *qemu_whpx_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    current_cpu = cpu;

    r = whpx_init_vcpu(cpu);
    if (r < 0) {
        fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);

    do {
        if (cpu_can_run(cpu)) {
            r = whpx_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
        qemu_wait_io_event_common(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    whpx_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}

#ifdef _WIN32
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
#endif

/* Multi-threaded TCG
 *
 * In the multi-threaded case each vCPU has its own thread. The TLS
 * variable current_cpu can be used deep in the code to find the
 * current CPUState for a given thread.
 */

static void *qemu_tcg_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    g_assert(!use_icount);

    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    current_cpu = cpu;
    qemu_cond_signal(&qemu_cpu_cond);

    /* process any pending work */
    cpu->exit_request = 1;

    do {
        if (cpu_can_run(cpu)) {
            int r;
            qemu_mutex_unlock_iothread();
            r = tcg_cpu_exec(cpu);
            qemu_mutex_lock_iothread();
            switch (r) {
            case EXCP_DEBUG:
                cpu_handle_guest_debug(cpu);
                break;
            case EXCP_HALTED:
                /* During start-up the vCPU is reset and the thread is
                 * kicked several times. If we don't ensure we go back
                 * to sleep in the halted state we won't cleanly
                 * start up when the vCPU is enabled.
                 *
                 * cpu->halted should ensure we sleep in wait_io_event
                 */
                g_assert(cpu->halted);
                break;
            case EXCP_ATOMIC:
                qemu_mutex_unlock_iothread();
                cpu_exec_step_atomic(cpu);
                qemu_mutex_lock_iothread();
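                /* fall through */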
            default:
                /* Ignore everything else? */
                break;
            }
        }

        atomic_mb_set(&cpu->exit_request, 0);
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    qemu_tcg_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}

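/* Wake a vCPU thread that may be blocked in the kernel: on POSIX hosts by
 * sending it SIG_IPI, on Windows via whpx_vcpu_kick() or by queueing a dummy
 * APC that interrupts an alertable SleepEx() wait.
 */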
1771static void qemu_cpu_kick_thread(CPUState *cpu)
1772{
1773#ifndef _WIN32
1774    int err;
1775
1776    if (cpu->thread_kicked) {
1777        return;
1778    }
1779    cpu->thread_kicked = true;
1780    err = pthread_kill(cpu->thread->thread, SIG_IPI);
1781    if (err) {
1782        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1783        exit(1);
1784    }
1785#else /* _WIN32 */
1786    if (!qemu_cpu_is_self(cpu)) {
1787        if (whpx_enabled()) {
1788            whpx_vcpu_kick(cpu);
1789        } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1790            fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1791                    __func__, GetLastError());
1792            exit(1);
1793        }
1794    }
1795#endif
1796}
1797
1798void qemu_cpu_kick(CPUState *cpu)
1799{
1800    qemu_cond_broadcast(cpu->halt_cond);
1801    if (tcg_enabled()) {
1802        cpu_exit(cpu);
1803        /* NOP unless doing single-thread RR */
1804        qemu_cpu_kick_rr_cpu();
1805    } else {
1806        if (hax_enabled()) {
1807            /*
1808             * FIXME: race condition with the exit_request check in
1809             * hax_vcpu_hax_exec
1810             */
1811            cpu->exit_request = 1;
1812        }
1813        qemu_cpu_kick_thread(cpu);
1814    }
1815}
1816
1817void qemu_cpu_kick_self(void)
1818{
1819    assert(current_cpu);
1820    qemu_cpu_kick_thread(current_cpu);
1821}
1822
1823bool qemu_cpu_is_self(CPUState *cpu)
1824{
1825    return qemu_thread_is_self(cpu->thread);
1826}
1827
1828bool qemu_in_vcpu_thread(void)
1829{
1830    return current_cpu && qemu_cpu_is_self(current_cpu);
1831}
1832
1833static __thread bool iothread_locked = false;
1834
1835bool qemu_mutex_iothread_locked(void)
1836{
1837    return iothread_locked;
1838}
1839
1840/*
1841 * The BQL is taken from so many places that it is worth profiling the
1842 * callers directly, instead of funneling them all through a single function.
1843 */
1844void qemu_mutex_lock_iothread_impl(const char *file, int line)
1845{
1846    QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1847
1848    g_assert(!qemu_mutex_iothread_locked());
1849    bql_lock(&qemu_global_mutex, file, line);
1850    iothread_locked = true;
1851}
1852
1853void qemu_mutex_unlock_iothread(void)
1854{
1855    g_assert(qemu_mutex_iothread_locked());
1856    iothread_locked = false;
1857    qemu_mutex_unlock(&qemu_global_mutex);
1858}
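
    /*
     * Illustrative usage sketch (an addition, not part of the original
     * source): code that is not sure whether it already holds the BQL can
     * combine the helpers above as follows.
     *
     *     bool release = false;
     *
     *     if (!qemu_mutex_iothread_locked()) {
     *         qemu_mutex_lock_iothread();
     *         release = true;
     *     }
     *     ... access device or other global state ...
     *     if (release) {
     *         qemu_mutex_unlock_iothread();
     *     }
     *
     * qemu_mutex_lock_iothread() is presumably a wrapper that feeds
     * __FILE__ and __LINE__ into qemu_mutex_lock_iothread_impl() above,
     * which is what allows the per-call-site profiling mentioned in the
     * comment before that function.
     */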
1859
1860static bool all_vcpus_paused(void)
1861{
1862    CPUState *cpu;
1863
1864    CPU_FOREACH(cpu) {
1865        if (!cpu->stopped) {
1866            return false;
1867        }
1868    }
1869
1870    return true;
1871}
1872
1873void pause_all_vcpus(void)
1874{
1875    CPUState *cpu;
1876
1877    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1878    CPU_FOREACH(cpu) {
1879        if (qemu_cpu_is_self(cpu)) {
1880            qemu_cpu_stop(cpu, true);
1881        } else {
1882            cpu->stop = true;
1883            qemu_cpu_kick(cpu);
1884        }
1885    }
1886
1887    /* We need to drop the replay_lock so any vCPU threads woken up
1888     * can finish their replay tasks
1889     */
1890    replay_mutex_unlock();
1891
1892    while (!all_vcpus_paused()) {
1893        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1894        CPU_FOREACH(cpu) {
1895            qemu_cpu_kick(cpu);
1896        }
1897    }
1898
1899    qemu_mutex_unlock_iothread();
1900    replay_mutex_lock();
1901    qemu_mutex_lock_iothread();
1902}
1903
1904void cpu_resume(CPUState *cpu)
1905{
1906    cpu->stop = false;
1907    cpu->stopped = false;
1908    qemu_cpu_kick(cpu);
1909}
1910
1911void resume_all_vcpus(void)
1912{
1913    CPUState *cpu;
1914
1915    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1916    CPU_FOREACH(cpu) {
1917        cpu_resume(cpu);
1918    }
1919}
1920
1921void cpu_remove_sync(CPUState *cpu)
1922{
1923    cpu->stop = true;
1924    cpu->unplug = true;
1925    qemu_cpu_kick(cpu);
1926    qemu_mutex_unlock_iothread();
1927    qemu_thread_join(cpu->thread);
1928    qemu_mutex_lock_iothread();
1929}
1930
1931/* Size of the temporary buffer used to form a vCPU thread name */
1932#define VCPU_THREAD_NAME_SIZE 16
1933
1934static void qemu_tcg_init_vcpu(CPUState *cpu)
1935{
1936    char thread_name[VCPU_THREAD_NAME_SIZE];
1937    static QemuCond *single_tcg_halt_cond;
1938    static QemuThread *single_tcg_cpu_thread;
1939    static int tcg_region_inited;
1940
1941    assert(tcg_enabled());
1942    /*
1943     * Initialize TCG regions, but only once. Now is a good time, because:
1944     * (1) TCG's init context, prologue and target globals have been set up.
1945     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1946     *     -accel flag is processed, so the check doesn't work then).
1947     */
1948    if (!tcg_region_inited) {
1949        tcg_region_inited = 1;
1950        tcg_region_init();
1951    }
1952
1953    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1954        cpu->thread = g_malloc0(sizeof(QemuThread));
1955        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1956        qemu_cond_init(cpu->halt_cond);
1957
1958        if (qemu_tcg_mttcg_enabled()) {
1959            /* create a thread per vCPU with TCG (MTTCG) */
1960            parallel_cpus = true;
1961            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1962                     cpu->cpu_index);
1963
1964            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1965                               cpu, QEMU_THREAD_JOINABLE);
1966
1967        } else {
1968            /* share a single thread for all cpus with TCG */
1969            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1970            qemu_thread_create(cpu->thread, thread_name,
1971                               qemu_tcg_rr_cpu_thread_fn,
1972                               cpu, QEMU_THREAD_JOINABLE);
1973
1974            single_tcg_halt_cond = cpu->halt_cond;
1975            single_tcg_cpu_thread = cpu->thread;
1976        }
1977#ifdef _WIN32
1978        cpu->hThread = qemu_thread_get_handle(cpu->thread);
1979#endif
1980    } else {
1981        /* For non-MTTCG cases we share the thread */
1982        cpu->thread = single_tcg_cpu_thread;
1983        cpu->halt_cond = single_tcg_halt_cond;
1984        cpu->thread_id = first_cpu->thread_id;
1985        cpu->can_do_io = 1;
1986        cpu->created = true;
1987    }
1988}
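
    /*
     * Usage note (an addition for illustration): which of the two paths
     * above is taken depends on qemu_tcg_mttcg_enabled(), i.e. on the
     * "thread" property of the TCG accelerator. On the command line this
     * is typically selected with something along the lines of
     *
     *     qemu-system-x86_64 -accel tcg,thread=multi ...
     *     qemu-system-x86_64 -accel tcg,thread=single ...
     *
     * The exact option spelling and the default depend on the QEMU version
     * and target, so treat these command lines as a sketch only.
     */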
1989
1990static void qemu_hax_start_vcpu(CPUState *cpu)
1991{
1992    char thread_name[VCPU_THREAD_NAME_SIZE];
1993
1994    cpu->thread = g_malloc0(sizeof(QemuThread));
1995    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1996    qemu_cond_init(cpu->halt_cond);
1997
1998    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
1999             cpu->cpu_index);
2000    qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2001                       cpu, QEMU_THREAD_JOINABLE);
2002#ifdef _WIN32
2003    cpu->hThread = qemu_thread_get_handle(cpu->thread);
2004#endif
2005}
2006
2007static void qemu_kvm_start_vcpu(CPUState *cpu)
2008{
2009    char thread_name[VCPU_THREAD_NAME_SIZE];
2010
2011    cpu->thread = g_malloc0(sizeof(QemuThread));
2012    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2013    qemu_cond_init(cpu->halt_cond);
2014    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2015             cpu->cpu_index);
2016    qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2017                       cpu, QEMU_THREAD_JOINABLE);
2018}
2019
2020static void qemu_hvf_start_vcpu(CPUState *cpu)
2021{
2022    char thread_name[VCPU_THREAD_NAME_SIZE];
2023
2024    /* HVF currently does not support TCG, and only runs in
2025     * unrestricted-guest mode. */
2026    assert(hvf_enabled());
2027
2028    cpu->thread = g_malloc0(sizeof(QemuThread));
2029    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2030    qemu_cond_init(cpu->halt_cond);
2031
2032    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2033             cpu->cpu_index);
2034    qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2035                       cpu, QEMU_THREAD_JOINABLE);
2036}
2037
2038static void qemu_whpx_start_vcpu(CPUState *cpu)
2039{
2040    char thread_name[VCPU_THREAD_NAME_SIZE];
2041
2042    cpu->thread = g_malloc0(sizeof(QemuThread));
2043    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2044    qemu_cond_init(cpu->halt_cond);
2045    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2046             cpu->cpu_index);
2047    qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2048                       cpu, QEMU_THREAD_JOINABLE);
2049#ifdef _WIN32
2050    cpu->hThread = qemu_thread_get_handle(cpu->thread);
2051#endif
2052}
2053
2054static void qemu_dummy_start_vcpu(CPUState *cpu)
2055{
2056    char thread_name[VCPU_THREAD_NAME_SIZE];
2057
2058    cpu->thread = g_malloc0(sizeof(QemuThread));
2059    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2060    qemu_cond_init(cpu->halt_cond);
2061    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2062             cpu->cpu_index);
2063    qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2064                       QEMU_THREAD_JOINABLE);
2065}
2066
2067void qemu_init_vcpu(CPUState *cpu)
2068{
2069    cpu->nr_cores = smp_cores;
2070    cpu->nr_threads = smp_threads;
2071    cpu->stopped = true;
2072
2073    if (!cpu->as) {
2074        /* If the target cpu hasn't set up any address spaces itself,
2075         * give it the default one.
2076         */
2077        cpu->num_ases = 1;
2078        cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2079    }
2080
2081    if (kvm_enabled()) {
2082        qemu_kvm_start_vcpu(cpu);
2083    } else if (hax_enabled()) {
2084        qemu_hax_start_vcpu(cpu);
2085    } else if (hvf_enabled()) {
2086        qemu_hvf_start_vcpu(cpu);
2087    } else if (tcg_enabled()) {
2088        qemu_tcg_init_vcpu(cpu);
2089    } else if (whpx_enabled()) {
2090        qemu_whpx_start_vcpu(cpu);
2091    } else {
2092        qemu_dummy_start_vcpu(cpu);
2093    }
2094
2095    while (!cpu->created) {
2096        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2097    }
2098}
2099
2100void cpu_stop_current(void)
2101{
2102    if (current_cpu) {
2103        qemu_cpu_stop(current_cpu, true);
2104    }
2105}
2106
2107int vm_stop(RunState state)
2108{
2109    if (qemu_in_vcpu_thread()) {
2110        qemu_system_vmstop_request_prepare();
2111        qemu_system_vmstop_request(state);
2112        /*
2113         * FIXME: should not return to device code in case
2114         * vm_stop() has been requested.
2115         */
2116        cpu_stop_current();
2117        return 0;
2118    }
2119
2120    return do_vm_stop(state, true);
2121}
2122
2123/**
2124 * Prepare for (re)starting the VM.
2125 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2126 * running or in case of an error condition), 0 otherwise.
2127 */
2128int vm_prepare_start(void)
2129{
2130    RunState requested;
2131
2132    qemu_vmstop_requested(&requested);
2133    if (runstate_is_running() && requested == RUN_STATE__MAX) {
2134        return -1;
2135    }
2136
2137    /* Ensure that a STOP/RESUME pair of events is emitted if a
2138     * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2139     * example, is documented as always being followed by the STOP
2140     * event.
2141     */
2142    if (runstate_is_running()) {
2143        qapi_event_send_stop();
2144        qapi_event_send_resume();
2145        return -1;
2146    }
2147
2148    /* We are sending this now, but the CPUs will be resumed shortly afterwards */
2149    qapi_event_send_resume();
2150
2151    replay_enable_events();
2152    cpu_enable_ticks();
2153    runstate_set(RUN_STATE_RUNNING);
2154    vm_state_notify(1, RUN_STATE_RUNNING);
2155    return 0;
2156}
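
    /*
     * Illustrative sketch (an addition, not part of the original source):
     * seen from a QMP client, the logic above is expected to produce event
     * sequences roughly like
     *
     *     VM running, vmstop request pending (vCPUs are not restarted):
     *         <- { "event": "STOP" }
     *         <- { "event": "RESUME" }
     *     VM stopped, (re)start requested (caller then resumes the vCPUs):
     *         <- { "event": "RESUME" }
     *
     * Event payloads such as timestamps are omitted here.
     */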
2157
2158void vm_start(void)
2159{
2160    if (!vm_prepare_start()) {
2161        resume_all_vcpus();
2162    }
2163}
2164
2165/* Does a state transition even if the VM is already stopped;
2166   the current state is forgotten forever. */
2167int vm_stop_force_state(RunState state)
2168{
2169    if (runstate_is_running()) {
2170        return vm_stop(state);
2171    } else {
2172        runstate_set(state);
2173
2174        bdrv_drain_all();
2175        /* Make sure to return an error if the flush in a previous vm_stop()
2176         * failed. */
2177        return bdrv_flush_all();
2178    }
2179}
2180
2181void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg)
2182{
2183    /* XXX: implement xxx_cpu_list for targets that still lack it */
2184#if defined(cpu_list)
2185    cpu_list(f, cpu_fprintf);
2186#endif
2187}
2188
2189CpuInfoList *qmp_query_cpus(Error **errp)
2190{
2191    MachineState *ms = MACHINE(qdev_get_machine());
2192    MachineClass *mc = MACHINE_GET_CLASS(ms);
2193    CpuInfoList *head = NULL, *cur_item = NULL;
2194    CPUState *cpu;
2195
2196    CPU_FOREACH(cpu) {
2197        CpuInfoList *info;
2198#if defined(TARGET_I386)
2199        X86CPU *x86_cpu = X86_CPU(cpu);
2200        CPUX86State *env = &x86_cpu->env;
2201#elif defined(TARGET_PPC)
2202        PowerPCCPU *ppc_cpu = POWERPC_CPU(cpu);
2203        CPUPPCState *env = &ppc_cpu->env;
2204#elif defined(TARGET_SPARC)
2205        SPARCCPU *sparc_cpu = SPARC_CPU(cpu);
2206        CPUSPARCState *env = &sparc_cpu->env;
2207#elif defined(TARGET_RISCV)
2208        RISCVCPU *riscv_cpu = RISCV_CPU(cpu);
2209        CPURISCVState *env = &riscv_cpu->env;
2210#elif defined(TARGET_MIPS)
2211        MIPSCPU *mips_cpu = MIPS_CPU(cpu);
2212        CPUMIPSState *env = &mips_cpu->env;
2213#elif defined(TARGET_TRICORE)
2214        TriCoreCPU *tricore_cpu = TRICORE_CPU(cpu);
2215        CPUTriCoreState *env = &tricore_cpu->env;
2216#elif defined(TARGET_S390X)
2217        S390CPU *s390_cpu = S390_CPU(cpu);
2218        CPUS390XState *env = &s390_cpu->env;
2219#endif
2220
2221        cpu_synchronize_state(cpu);
2222
2223        info = g_malloc0(sizeof(*info));
2224        info->value = g_malloc0(sizeof(*info->value));
2225        info->value->CPU = cpu->cpu_index;
2226        info->value->current = (cpu == first_cpu);
2227        info->value->halted = cpu->halted;
2228        info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2229        info->value->thread_id = cpu->thread_id;
2230#if defined(TARGET_I386)
2231        info->value->arch = CPU_INFO_ARCH_X86;
2232        info->value->u.x86.pc = env->eip + env->segs[R_CS].base;
2233#elif defined(TARGET_PPC)
2234        info->value->arch = CPU_INFO_ARCH_PPC;
2235        info->value->u.ppc.nip = env->nip;
2236#elif defined(TARGET_SPARC)
2237        info->value->arch = CPU_INFO_ARCH_SPARC;
2238        info->value->u.q_sparc.pc = env->pc;
2239        info->value->u.q_sparc.npc = env->npc;
2240#elif defined(TARGET_MIPS)
2241        info->value->arch = CPU_INFO_ARCH_MIPS;
2242        info->value->u.q_mips.PC = env->active_tc.PC;
2243#elif defined(TARGET_TRICORE)
2244        info->value->arch = CPU_INFO_ARCH_TRICORE;
2245        info->value->u.tricore.PC = env->PC;
2246#elif defined(TARGET_S390X)
2247        info->value->arch = CPU_INFO_ARCH_S390;
2248        info->value->u.s390.cpu_state = env->cpu_state;
2249#elif defined(TARGET_RISCV)
2250        info->value->arch = CPU_INFO_ARCH_RISCV;
2251        info->value->u.riscv.pc = env->pc;
2252#else
2253        info->value->arch = CPU_INFO_ARCH_OTHER;
2254#endif
2255        info->value->has_props = !!mc->cpu_index_to_instance_props;
2256        if (info->value->has_props) {
2257            CpuInstanceProperties *props;
2258            props = g_malloc0(sizeof(*props));
2259            *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2260            info->value->props = props;
2261        }
2262
2263        /* XXX: waiting for the qapi to support GSList */
2264        if (!cur_item) {
2265            head = cur_item = info;
2266        } else {
2267            cur_item->next = info;
2268            cur_item = info;
2269        }
2270    }
2271
2272    return head;
2273}
2274
2275static CpuInfoArch sysemu_target_to_cpuinfo_arch(SysEmuTarget target)
2276{
2277    /*
2278     * The @SysEmuTarget -> @CpuInfoArch mapping below is based on the
2279     * TARGET_ARCH -> TARGET_BASE_ARCH mapping in the "configure" script.
2280     */
2281    switch (target) {
2282    case SYS_EMU_TARGET_I386:
2283    case SYS_EMU_TARGET_X86_64:
2284        return CPU_INFO_ARCH_X86;
2285
2286    case SYS_EMU_TARGET_PPC:
2287    case SYS_EMU_TARGET_PPC64:
2288        return CPU_INFO_ARCH_PPC;
2289
2290    case SYS_EMU_TARGET_SPARC:
2291    case SYS_EMU_TARGET_SPARC64:
2292        return CPU_INFO_ARCH_SPARC;
2293
2294    case SYS_EMU_TARGET_MIPS:
2295    case SYS_EMU_TARGET_MIPSEL:
2296    case SYS_EMU_TARGET_MIPS64:
2297    case SYS_EMU_TARGET_MIPS64EL:
2298        return CPU_INFO_ARCH_MIPS;
2299
2300    case SYS_EMU_TARGET_TRICORE:
2301        return CPU_INFO_ARCH_TRICORE;
2302
2303    case SYS_EMU_TARGET_S390X:
2304        return CPU_INFO_ARCH_S390;
2305
2306    case SYS_EMU_TARGET_RISCV32:
2307    case SYS_EMU_TARGET_RISCV64:
2308        return CPU_INFO_ARCH_RISCV;
2309
2310    default:
2311        return CPU_INFO_ARCH_OTHER;
2312    }
2313}
2314
2315static void cpustate_to_cpuinfo_s390(CpuInfoS390 *info, const CPUState *cpu)
2316{
2317#ifdef TARGET_S390X
2318    S390CPU *s390_cpu = S390_CPU(cpu);
2319    CPUS390XState *env = &s390_cpu->env;
2320
2321    info->cpu_state = env->cpu_state;
2322#else
2323    abort();
2324#endif
2325}
2326
2327/*
2328 * fast means: we NEVER interrupt vCPU threads to retrieve
2329 * information from KVM.
2330 */
2331CpuInfoFastList *qmp_query_cpus_fast(Error **errp)
2332{
2333    MachineState *ms = MACHINE(qdev_get_machine());
2334    MachineClass *mc = MACHINE_GET_CLASS(ms);
2335    CpuInfoFastList *head = NULL, *cur_item = NULL;
2336    SysEmuTarget target = qapi_enum_parse(&SysEmuTarget_lookup, TARGET_NAME,
2337                                          -1, &error_abort);
2338    CPUState *cpu;
2339
2340    CPU_FOREACH(cpu) {
2341        CpuInfoFastList *info = g_malloc0(sizeof(*info));
2342        info->value = g_malloc0(sizeof(*info->value));
2343
2344        info->value->cpu_index = cpu->cpu_index;
2345        info->value->qom_path = object_get_canonical_path(OBJECT(cpu));
2346        info->value->thread_id = cpu->thread_id;
2347
2348        info->value->has_props = !!mc->cpu_index_to_instance_props;
2349        if (info->value->has_props) {
2350            CpuInstanceProperties *props;
2351            props = g_malloc0(sizeof(*props));
2352            *props = mc->cpu_index_to_instance_props(ms, cpu->cpu_index);
2353            info->value->props = props;
2354        }
2355
2356        info->value->arch = sysemu_target_to_cpuinfo_arch(target);
2357        info->value->target = target;
2358        if (target == SYS_EMU_TARGET_S390X) {
2359            cpustate_to_cpuinfo_s390(&info->value->u.s390x, cpu);
2360        }
2361
2362        if (!cur_item) {
2363            head = cur_item = info;
2364        } else {
2365            cur_item->next = info;
2366            cur_item = info;
2367        }
2368    }
2369
2370    return head;
2371}
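
    /*
     * Illustrative QMP exchange (an addition, not part of the original
     * source), assuming the usual wire names for this command:
     *
     *     -> { "execute": "query-cpus-fast" }
     *     <- { "return": [
     *            { "cpu-index": 0, "thread-id": 25627, "arch": "x86",
     *              "target": "x86_64",
     *              "qom-path": "/machine/unattached/device[0]" },
     *            ...
     *          ] }
     *
     * Field names and values are indicative only; the QAPI schema is the
     * authoritative definition.
     */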
2372
2373void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2374                 bool has_cpu, int64_t cpu_index, Error **errp)
2375{
2376    FILE *f;
2377    uint32_t l;
2378    CPUState *cpu;
2379    uint8_t buf[1024];
2380    int64_t orig_addr = addr, orig_size = size;
2381
2382    if (!has_cpu) {
2383        cpu_index = 0;
2384    }
2385
2386    cpu = qemu_get_cpu(cpu_index);
2387    if (cpu == NULL) {
2388        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2389                   "a CPU number");
2390        return;
2391    }
2392
2393    f = fopen(filename, "wb");
2394    if (!f) {
2395        error_setg_file_open(errp, errno, filename);
2396        return;
2397    }
2398
2399    while (size != 0) {
2400        l = sizeof(buf);
2401        if (l > size) {
2402            l = size;
            }
2403        if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2404            error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2405                             " specified", orig_addr, orig_size);
2406            goto exit;
2407        }
2408        if (fwrite(buf, 1, l, f) != l) {
2409            error_setg(errp, QERR_IO_ERROR);
2410            goto exit;
2411        }
2412        addr += l;
2413        size -= l;
2414    }
2415
2416exit:
2417    fclose(f);
2418}
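
    /*
     * Illustrative QMP invocation (an addition, not part of the original
     * source), assuming the usual wire names for this command:
     *
     *     -> { "execute": "memsave",
     *          "arguments": { "val": 4096, "size": 4096,
     *                         "filename": "/tmp/vm.mem", "cpu-index": 0 } }
     *     <- { "return": {} }
     *
     * "val" is the guest virtual address to start from; reads go through
     * cpu_memory_rw_debug() above, so the chosen vCPU's current address
     * translation applies.
     */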
2419
2420void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2421                  Error **errp)
2422{
2423    FILE *f;
2424    uint32_t l;
2425    uint8_t buf[1024];
2426
2427    f = fopen(filename, "wb");
2428    if (!f) {
2429        error_setg_file_open(errp, errno, filename);
2430        return;
2431    }
2432
2433    while (size != 0) {
2434        l = sizeof(buf);
2435        if (l > size) {
2436            l = size;
            }
2437        cpu_physical_memory_read(addr, buf, l);
2438        if (fwrite(buf, 1, l, f) != l) {
2439            error_setg(errp, QERR_IO_ERROR);
2440            goto exit;
2441        }
2442        addr += l;
2443        size -= l;
2444    }
2445
2446exit:
2447    fclose(f);
2448}
2449
2450void qmp_inject_nmi(Error **errp)
2451{
2452    nmi_monitor_handle(monitor_get_cpu_index(), errp);
2453}
2454
2455void dump_drift_info(FILE *f, fprintf_function cpu_fprintf)
2456{
2457    if (!use_icount) {
2458        return;
2459    }
2460
2461    cpu_fprintf(f, "Host - Guest clock  %"PRIi64" ms\n",
2462                (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2463    if (icount_align_option) {
2464        cpu_fprintf(f, "Max guest delay     %"PRIi64" ms\n", -max_delay/SCALE_MS);
2465        cpu_fprintf(f, "Max guest advance   %"PRIi64" ms\n", max_advance/SCALE_MS);
2466    } else {
2467        cpu_fprintf(f, "Max guest delay     NA\n");
2468        cpu_fprintf(f, "Max guest advance   NA\n");
2469    }
2470}
2471