qemu/cpus.c
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "qemu-common.h"
  27#include "qemu/config-file.h"
  28#include "migration/vmstate.h"
  29#include "monitor/monitor.h"
  30#include "qapi/error.h"
  31#include "qapi/qapi-commands-misc.h"
  32#include "qapi/qapi-events-run-state.h"
  33#include "qapi/qmp/qerror.h"
  34#include "qemu/error-report.h"
  35#include "qemu/qemu-print.h"
  36#include "sysemu/tcg.h"
  37#include "sysemu/block-backend.h"
  38#include "exec/gdbstub.h"
  39#include "sysemu/dma.h"
  40#include "sysemu/hw_accel.h"
  41#include "sysemu/kvm.h"
  42#include "sysemu/hax.h"
  43#include "sysemu/hvf.h"
  44#include "sysemu/whpx.h"
  45#include "exec/exec-all.h"
  46
  47#include "qemu/thread.h"
  48#include "qemu/plugin.h"
  49#include "sysemu/cpus.h"
  50#include "sysemu/qtest.h"
  51#include "qemu/main-loop.h"
  52#include "qemu/option.h"
  53#include "qemu/bitmap.h"
  54#include "qemu/seqlock.h"
  55#include "qemu/guest-random.h"
  56#include "tcg.h"
  57#include "hw/nmi.h"
  58#include "sysemu/replay.h"
  59#include "sysemu/runstate.h"
  60#include "hw/boards.h"
  61#include "hw/hw.h"
  62
  63#ifdef CONFIG_LINUX
  64
  65#include <sys/prctl.h>
  66
  67#ifndef PR_MCE_KILL
  68#define PR_MCE_KILL 33
  69#endif
  70
  71#ifndef PR_MCE_KILL_SET
  72#define PR_MCE_KILL_SET 1
  73#endif
  74
  75#ifndef PR_MCE_KILL_EARLY
  76#define PR_MCE_KILL_EARLY 1
  77#endif
  78
  79#endif /* CONFIG_LINUX */
  80
  81static QemuMutex qemu_global_mutex;
  82
  83int64_t max_delay;
  84int64_t max_advance;
  85
  86/* vcpu throttling controls */
  87static QEMUTimer *throttle_timer;
  88static unsigned int throttle_percentage;
  89
  90#define CPU_THROTTLE_PCT_MIN 1
  91#define CPU_THROTTLE_PCT_MAX 99
  92#define CPU_THROTTLE_TIMESLICE_NS 10000000
  93
  94bool cpu_is_stopped(CPUState *cpu)
  95{
  96    return cpu->stopped || !runstate_is_running();
  97}
  98
  99static bool cpu_thread_is_idle(CPUState *cpu)
 100{
 101    if (cpu->stop || cpu->queued_work_first) {
 102        return false;
 103    }
 104    if (cpu_is_stopped(cpu)) {
 105        return true;
 106    }
 107    if (!cpu->halted || cpu_has_work(cpu) ||
 108        kvm_halt_in_kernel()) {
 109        return false;
 110    }
 111    return true;
 112}
 113
 114static bool all_cpu_threads_idle(void)
 115{
 116    CPUState *cpu;
 117
 118    CPU_FOREACH(cpu) {
 119        if (!cpu_thread_is_idle(cpu)) {
 120            return false;
 121        }
 122    }
 123    return true;
 124}
 125
 126/***********************************************************/
 127/* guest cycle counter */
 128
 129/* Protected by TimersState seqlock */
 130
 131static bool icount_sleep = true;
 132/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
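/* With the maximum shift of 10, each instruction accounts for 2^10 ns of
   virtual time, i.e. roughly one million instructions per second. */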
 133#define MAX_ICOUNT_SHIFT 10
 134
 135typedef struct TimersState {
 136    /* Protected by BQL.  */
 137    int64_t cpu_ticks_prev;
 138    int64_t cpu_ticks_offset;
 139
  140    /* Protect fields that can be read outside the BQL and written
  141     * from multiple threads.
 142     */
 143    QemuSeqLock vm_clock_seqlock;
 144    QemuSpin vm_clock_lock;
 145
 146    int16_t cpu_ticks_enabled;
 147
 148    /* Conversion factor from emulated instructions to virtual clock ticks.  */
 149    int16_t icount_time_shift;
 150
 151    /* Compensate for varying guest execution speed.  */
 152    int64_t qemu_icount_bias;
 153
 154    int64_t vm_clock_warp_start;
 155    int64_t cpu_clock_offset;
 156
 157    /* Only written by TCG thread */
 158    int64_t qemu_icount;
 159
 160    /* for adjusting icount */
 161    QEMUTimer *icount_rt_timer;
 162    QEMUTimer *icount_vm_timer;
 163    QEMUTimer *icount_warp_timer;
 164} TimersState;
 165
 166static TimersState timers_state;
 167bool mttcg_enabled;
 168
 169/*
 170 * We default to false if we know other options have been enabled
  171 * which are currently incompatible with MTTCG. Otherwise, once a
  172 * guest (target) has been updated to support:
  173 *   - atomic instructions
  174 *   - memory ordering primitives (barriers)
  175 * it can set the appropriate CONFIG flags in ${target}-softmmu.mak
 176 *
 177 * Once a guest architecture has been converted to the new primitives
 178 * there are two remaining limitations to check.
 179 *
 180 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
 181 * - The host must have a stronger memory order than the guest
 182 *
 183 * It may be possible in future to support strong guests on weak hosts
 184 * but that will require tagging all load/stores in a guest with their
 185 * implicit memory order requirements which would likely slow things
 186 * down a lot.
 187 */
 188
 189static bool check_tcg_memory_orders_compatible(void)
 190{
 191#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
 192    return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
 193#else
 194    return false;
 195#endif
 196}
 197
 198static bool default_mttcg_enabled(void)
 199{
 200    if (use_icount || TCG_OVERSIZED_GUEST) {
 201        return false;
 202    } else {
 203#ifdef TARGET_SUPPORTS_MTTCG
 204        return check_tcg_memory_orders_compatible();
 205#else
 206        return false;
 207#endif
 208    }
 209}
 210
 211void qemu_tcg_configure(QemuOpts *opts, Error **errp)
 212{
 213    const char *t = qemu_opt_get(opts, "thread");
 214    if (t) {
 215        if (strcmp(t, "multi") == 0) {
 216            if (TCG_OVERSIZED_GUEST) {
  217                error_setg(errp, "No MTTCG when guest word size > host's");
 218            } else if (use_icount) {
 219                error_setg(errp, "No MTTCG when icount is enabled");
 220            } else {
 221#ifndef TARGET_SUPPORTS_MTTCG
 222                warn_report("Guest not yet converted to MTTCG - "
 223                            "you may get unexpected results");
 224#endif
 225                if (!check_tcg_memory_orders_compatible()) {
 226                    warn_report("Guest expects a stronger memory ordering "
 227                                "than the host provides");
 228                    error_printf("This may cause strange/hard to debug errors\n");
 229                }
 230                mttcg_enabled = true;
 231            }
 232        } else if (strcmp(t, "single") == 0) {
 233            mttcg_enabled = false;
 234        } else {
 235            error_setg(errp, "Invalid 'thread' setting %s", t);
 236        }
 237    } else {
 238        mttcg_enabled = default_mttcg_enabled();
 239    }
 240}
 241
 242/* The current number of executed instructions is based on what we
 243 * originally budgeted minus the current state of the decrementing
 244 * icount counters in extra/u16.low.
 245 */
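/* E.g. a budget of 10000 instructions with 200 still pending in the low
 * decrementer and 1000 in icount_extra means 8800 have executed so far.
 */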
 246static int64_t cpu_get_icount_executed(CPUState *cpu)
 247{
 248    return (cpu->icount_budget -
 249            (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
 250}
 251
 252/*
 253 * Update the global shared timer_state.qemu_icount to take into
 254 * account executed instructions. This is done by the TCG vCPU
 255 * thread so the main-loop can see time has moved forward.
 256 */
 257static void cpu_update_icount_locked(CPUState *cpu)
 258{
 259    int64_t executed = cpu_get_icount_executed(cpu);
 260    cpu->icount_budget -= executed;
 261
 262    atomic_set_i64(&timers_state.qemu_icount,
 263                   timers_state.qemu_icount + executed);
 264}
 265
 266/*
 267 * Update the global shared timer_state.qemu_icount to take into
 268 * account executed instructions. This is done by the TCG vCPU
 269 * thread so the main-loop can see time has moved forward.
 270 */
 271void cpu_update_icount(CPUState *cpu)
 272{
 273    seqlock_write_lock(&timers_state.vm_clock_seqlock,
 274                       &timers_state.vm_clock_lock);
 275    cpu_update_icount_locked(cpu);
 276    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 277                         &timers_state.vm_clock_lock);
 278}
 279
 280static int64_t cpu_get_icount_raw_locked(void)
 281{
 282    CPUState *cpu = current_cpu;
 283
 284    if (cpu && cpu->running) {
 285        if (!cpu->can_do_io) {
 286            error_report("Bad icount read");
 287            exit(1);
 288        }
 289        /* Take into account what has run */
 290        cpu_update_icount_locked(cpu);
 291    }
 292    /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
 293    return atomic_read_i64(&timers_state.qemu_icount);
 294}
 295
 296static int64_t cpu_get_icount_locked(void)
 297{
 298    int64_t icount = cpu_get_icount_raw_locked();
 299    return atomic_read_i64(&timers_state.qemu_icount_bias) +
 300        cpu_icount_to_ns(icount);
 301}
 302
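/*
 * Reader side of the vm_clock seqlock: retry the read whenever a
 * concurrent writer bumped the sequence count while we were reading.
 */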
 303int64_t cpu_get_icount_raw(void)
 304{
 305    int64_t icount;
 306    unsigned start;
 307
 308    do {
 309        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 310        icount = cpu_get_icount_raw_locked();
 311    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 312
 313    return icount;
 314}
 315
 316/* Return the virtual CPU time, based on the instruction counter.  */
 317int64_t cpu_get_icount(void)
 318{
 319    int64_t icount;
 320    unsigned start;
 321
 322    do {
 323        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 324        icount = cpu_get_icount_locked();
 325    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 326
 327    return icount;
 328}
 329
 330int64_t cpu_icount_to_ns(int64_t icount)
 331{
 332    return icount << atomic_read(&timers_state.icount_time_shift);
 333}
 334
 335static int64_t cpu_get_ticks_locked(void)
 336{
 337    int64_t ticks = timers_state.cpu_ticks_offset;
 338    if (timers_state.cpu_ticks_enabled) {
 339        ticks += cpu_get_host_ticks();
 340    }
 341
 342    if (timers_state.cpu_ticks_prev > ticks) {
  343        /* Non-increasing ticks may happen if the host uses software suspend.  */
 344        timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 345        ticks = timers_state.cpu_ticks_prev;
 346    }
 347
 348    timers_state.cpu_ticks_prev = ticks;
 349    return ticks;
 350}
 351
 352/* return the time elapsed in VM between vm_start and vm_stop.  Unless
 353 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 354 * counter.
 355 */
 356int64_t cpu_get_ticks(void)
 357{
 358    int64_t ticks;
 359
 360    if (use_icount) {
 361        return cpu_get_icount();
 362    }
 363
 364    qemu_spin_lock(&timers_state.vm_clock_lock);
 365    ticks = cpu_get_ticks_locked();
 366    qemu_spin_unlock(&timers_state.vm_clock_lock);
 367    return ticks;
 368}
 369
 370static int64_t cpu_get_clock_locked(void)
 371{
 372    int64_t time;
 373
 374    time = timers_state.cpu_clock_offset;
 375    if (timers_state.cpu_ticks_enabled) {
 376        time += get_clock();
 377    }
 378
 379    return time;
 380}
 381
 382/* Return the monotonic time elapsed in VM, i.e.,
 383 * the time between vm_start and vm_stop
 384 */
 385int64_t cpu_get_clock(void)
 386{
 387    int64_t ti;
 388    unsigned start;
 389
 390    do {
 391        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 392        ti = cpu_get_clock_locked();
 393    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 394
 395    return ti;
 396}
 397
 398/* enable cpu_get_ticks()
 399 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 400 */
 401void cpu_enable_ticks(void)
 402{
 403    seqlock_write_lock(&timers_state.vm_clock_seqlock,
 404                       &timers_state.vm_clock_lock);
 405    if (!timers_state.cpu_ticks_enabled) {
 406        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 407        timers_state.cpu_clock_offset -= get_clock();
 408        timers_state.cpu_ticks_enabled = 1;
 409    }
 410    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 411                       &timers_state.vm_clock_lock);
 412}
 413
  414/* disable cpu_get_ticks(): the clock is stopped. You must not call
 415 * cpu_get_ticks() after that.
 416 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 417 */
 418void cpu_disable_ticks(void)
 419{
 420    seqlock_write_lock(&timers_state.vm_clock_seqlock,
 421                       &timers_state.vm_clock_lock);
 422    if (timers_state.cpu_ticks_enabled) {
 423        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 424        timers_state.cpu_clock_offset = cpu_get_clock_locked();
 425        timers_state.cpu_ticks_enabled = 0;
 426    }
 427    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 428                         &timers_state.vm_clock_lock);
 429}
 430
 431/* Correlation between real and virtual time is always going to be
 432   fairly approximate, so ignore small variation.
 433   When the guest is idle real and virtual time will be aligned in
 434   the IO wait loop.  */
 435#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 436
 437static void icount_adjust(void)
 438{
 439    int64_t cur_time;
 440    int64_t cur_icount;
 441    int64_t delta;
 442
 443    /* Protected by TimersState mutex.  */
 444    static int64_t last_delta;
 445
 446    /* If the VM is not running, then do nothing.  */
 447    if (!runstate_is_running()) {
 448        return;
 449    }
 450
 451    seqlock_write_lock(&timers_state.vm_clock_seqlock,
 452                       &timers_state.vm_clock_lock);
 453    cur_time = cpu_get_clock_locked();
 454    cur_icount = cpu_get_icount_locked();
 455
 456    delta = cur_icount - cur_time;
 457    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 458    if (delta > 0
 459        && last_delta + ICOUNT_WOBBLE < delta * 2
 460        && timers_state.icount_time_shift > 0) {
 461        /* The guest is getting too far ahead.  Slow time down.  */
 462        atomic_set(&timers_state.icount_time_shift,
 463                   timers_state.icount_time_shift - 1);
 464    }
 465    if (delta < 0
 466        && last_delta - ICOUNT_WOBBLE > delta * 2
 467        && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
 468        /* The guest is getting too far behind.  Speed time up.  */
 469        atomic_set(&timers_state.icount_time_shift,
 470                   timers_state.icount_time_shift + 1);
 471    }
 472    last_delta = delta;
 473    atomic_set_i64(&timers_state.qemu_icount_bias,
 474                   cur_icount - (timers_state.qemu_icount
 475                                 << timers_state.icount_time_shift));
 476    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 477                         &timers_state.vm_clock_lock);
 478}
 479
 480static void icount_adjust_rt(void *opaque)
 481{
 482    timer_mod(timers_state.icount_rt_timer,
 483              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 484    icount_adjust();
 485}
 486
 487static void icount_adjust_vm(void *opaque)
 488{
 489    timer_mod(timers_state.icount_vm_timer,
 490                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 491                   NANOSECONDS_PER_SECOND / 10);
 492    icount_adjust();
 493}
 494
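/* Convert a nanosecond deadline into a whole number of instructions,
 * rounding up: e.g. with a shift of 3 (8 ns per instruction) a 20 ns
 * deadline rounds up to 3 instructions.
 */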
 495static int64_t qemu_icount_round(int64_t count)
 496{
 497    int shift = atomic_read(&timers_state.icount_time_shift);
 498    return (count + (1 << shift) - 1) >> shift;
 499}
 500
 501static void icount_warp_rt(void)
 502{
 503    unsigned seq;
 504    int64_t warp_start;
 505
 506    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 507     * changes from -1 to another value, so the race here is okay.
 508     */
 509    do {
 510        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 511        warp_start = timers_state.vm_clock_warp_start;
 512    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 513
 514    if (warp_start == -1) {
 515        return;
 516    }
 517
 518    seqlock_write_lock(&timers_state.vm_clock_seqlock,
 519                       &timers_state.vm_clock_lock);
 520    if (runstate_is_running()) {
 521        int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
 522                                            cpu_get_clock_locked());
 523        int64_t warp_delta;
 524
 525        warp_delta = clock - timers_state.vm_clock_warp_start;
 526        if (use_icount == 2) {
 527            /*
 528             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 529             * far ahead of real time.
 530             */
 531            int64_t cur_icount = cpu_get_icount_locked();
 532            int64_t delta = clock - cur_icount;
 533            warp_delta = MIN(warp_delta, delta);
 534        }
 535        atomic_set_i64(&timers_state.qemu_icount_bias,
 536                       timers_state.qemu_icount_bias + warp_delta);
 537    }
 538    timers_state.vm_clock_warp_start = -1;
 539    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 540                       &timers_state.vm_clock_lock);
 541
 542    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 543        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 544    }
 545}
 546
 547static void icount_timer_cb(void *opaque)
 548{
 549    /* No need for a checkpoint because the timer already synchronizes
 550     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 551     */
 552    icount_warp_rt();
 553}
 554
 555void qtest_clock_warp(int64_t dest)
 556{
 557    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 558    AioContext *aio_context;
 559    assert(qtest_enabled());
 560    aio_context = qemu_get_aio_context();
 561    while (clock < dest) {
 562        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 563                                                      QEMU_TIMER_ATTR_ALL);
 564        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 565
 566        seqlock_write_lock(&timers_state.vm_clock_seqlock,
 567                           &timers_state.vm_clock_lock);
 568        atomic_set_i64(&timers_state.qemu_icount_bias,
 569                       timers_state.qemu_icount_bias + warp);
 570        seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 571                             &timers_state.vm_clock_lock);
 572
 573        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 574        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 575        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 576    }
 577    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 578}
 579
 580void qemu_start_warp_timer(void)
 581{
 582    int64_t clock;
 583    int64_t deadline;
 584
 585    if (!use_icount) {
 586        return;
 587    }
 588
 589    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 590     * do not fire, so computing the deadline does not make sense.
 591     */
 592    if (!runstate_is_running()) {
 593        return;
 594    }
 595
 596    if (replay_mode != REPLAY_MODE_PLAY) {
 597        if (!all_cpu_threads_idle()) {
 598            return;
 599        }
 600
 601        if (qtest_enabled()) {
 602            /* When testing, qtest commands advance icount.  */
 603            return;
 604        }
 605
 606        replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
 607    } else {
 608        /* warp clock deterministically in record/replay mode */
 609        if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
  610            /* vCPU is sleeping and warp can't be started.
  611               It is probably a race condition: the notification sent
  612               to the vCPU was processed in advance and the vCPU went to sleep.
  613               Therefore we have to wake it up to do something. */
 614            if (replay_has_checkpoint()) {
 615                qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 616            }
 617            return;
 618        }
 619    }
 620
 621    /* We want to use the earliest deadline from ALL vm_clocks */
 622    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 623    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 624                                          ~QEMU_TIMER_ATTR_EXTERNAL);
 625    if (deadline < 0) {
 626        static bool notified;
 627        if (!icount_sleep && !notified) {
 628            warn_report("icount sleep disabled and no active timers");
 629            notified = true;
 630        }
 631        return;
 632    }
 633
 634    if (deadline > 0) {
 635        /*
 636         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 637         * sleep.  Otherwise, the CPU might be waiting for a future timer
 638         * interrupt to wake it up, but the interrupt never comes because
 639         * the vCPU isn't running any insns and thus doesn't advance the
 640         * QEMU_CLOCK_VIRTUAL.
 641         */
 642        if (!icount_sleep) {
 643            /*
 644             * We never let VCPUs sleep in no sleep icount mode.
 645             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 646             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 647             * It is useful when we want a deterministic execution time,
 648             * isolated from host latencies.
 649             */
 650            seqlock_write_lock(&timers_state.vm_clock_seqlock,
 651                               &timers_state.vm_clock_lock);
 652            atomic_set_i64(&timers_state.qemu_icount_bias,
 653                           timers_state.qemu_icount_bias + deadline);
 654            seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 655                                 &timers_state.vm_clock_lock);
 656            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 657        } else {
 658            /*
  659             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
  660             * "real" time (related to the time left until the next event) has
  661             * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
  662             * This keeps the warps from being visible externally; for example,
  663             * you will not be sending network packets continuously instead of
  664             * every 100ms.
 665             */
 666            seqlock_write_lock(&timers_state.vm_clock_seqlock,
 667                               &timers_state.vm_clock_lock);
 668            if (timers_state.vm_clock_warp_start == -1
 669                || timers_state.vm_clock_warp_start > clock) {
 670                timers_state.vm_clock_warp_start = clock;
 671            }
 672            seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 673                                 &timers_state.vm_clock_lock);
 674            timer_mod_anticipate(timers_state.icount_warp_timer,
 675                                 clock + deadline);
 676        }
 677    } else if (deadline == 0) {
 678        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 679    }
 680}
 681
 682static void qemu_account_warp_timer(void)
 683{
 684    if (!use_icount || !icount_sleep) {
 685        return;
 686    }
 687
 688    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 689     * do not fire, so computing the deadline does not make sense.
 690     */
 691    if (!runstate_is_running()) {
 692        return;
 693    }
 694
 695    /* warp clock deterministically in record/replay mode */
 696    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 697        return;
 698    }
 699
 700    timer_del(timers_state.icount_warp_timer);
 701    icount_warp_rt();
 702}
 703
 704static bool icount_state_needed(void *opaque)
 705{
 706    return use_icount;
 707}
 708
 709static bool warp_timer_state_needed(void *opaque)
 710{
 711    TimersState *s = opaque;
 712    return s->icount_warp_timer != NULL;
 713}
 714
 715static bool adjust_timers_state_needed(void *opaque)
 716{
 717    TimersState *s = opaque;
 718    return s->icount_rt_timer != NULL;
 719}
 720
 721/*
  722 * Subsection for warp timer migration is optional, because the timer may not be created
 723 */
 724static const VMStateDescription icount_vmstate_warp_timer = {
 725    .name = "timer/icount/warp_timer",
 726    .version_id = 1,
 727    .minimum_version_id = 1,
 728    .needed = warp_timer_state_needed,
 729    .fields = (VMStateField[]) {
 730        VMSTATE_INT64(vm_clock_warp_start, TimersState),
 731        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
 732        VMSTATE_END_OF_LIST()
 733    }
 734};
 735
 736static const VMStateDescription icount_vmstate_adjust_timers = {
 737    .name = "timer/icount/timers",
 738    .version_id = 1,
 739    .minimum_version_id = 1,
 740    .needed = adjust_timers_state_needed,
 741    .fields = (VMStateField[]) {
 742        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
 743        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
 744        VMSTATE_END_OF_LIST()
 745    }
 746};
 747
 748/*
 749 * This is a subsection for icount migration.
 750 */
 751static const VMStateDescription icount_vmstate_timers = {
 752    .name = "timer/icount",
 753    .version_id = 1,
 754    .minimum_version_id = 1,
 755    .needed = icount_state_needed,
 756    .fields = (VMStateField[]) {
 757        VMSTATE_INT64(qemu_icount_bias, TimersState),
 758        VMSTATE_INT64(qemu_icount, TimersState),
 759        VMSTATE_END_OF_LIST()
 760    },
 761    .subsections = (const VMStateDescription*[]) {
 762        &icount_vmstate_warp_timer,
 763        &icount_vmstate_adjust_timers,
 764        NULL
 765    }
 766};
 767
 768static const VMStateDescription vmstate_timers = {
 769    .name = "timer",
 770    .version_id = 2,
 771    .minimum_version_id = 1,
 772    .fields = (VMStateField[]) {
 773        VMSTATE_INT64(cpu_ticks_offset, TimersState),
 774        VMSTATE_UNUSED(8),
 775        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
 776        VMSTATE_END_OF_LIST()
 777    },
 778    .subsections = (const VMStateDescription*[]) {
 779        &icount_vmstate_timers,
 780        NULL
 781    }
 782};
 783
 784static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 785{
 786    double pct;
 787    double throttle_ratio;
 788    int64_t sleeptime_ns, endtime_ns;
 789
 790    if (!cpu_throttle_get_percentage()) {
 791        return;
 792    }
 793
 794    pct = (double)cpu_throttle_get_percentage()/100;
 795    throttle_ratio = pct / (1 - pct);
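    /* e.g. a 50% throttle gives a ratio of 1 (sleep as long as we ran),
     * while a 90% throttle gives a ratio of 9 (sleep nine timeslices for
     * every one executed).
     */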
 796    /* Add 1ns to fix double's rounding error (like 0.9999999...) */
 797    sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
 798    endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
 799    while (sleeptime_ns > 0 && !cpu->stop) {
 800        if (sleeptime_ns > SCALE_MS) {
 801            qemu_cond_timedwait(cpu->halt_cond, &qemu_global_mutex,
 802                                sleeptime_ns / SCALE_MS);
 803        } else {
 804            qemu_mutex_unlock_iothread();
 805            g_usleep(sleeptime_ns / SCALE_US);
 806            qemu_mutex_lock_iothread();
 807        }
 808        sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 809    }
 810    atomic_set(&cpu->throttle_thread_scheduled, 0);
 811}
 812
 813static void cpu_throttle_timer_tick(void *opaque)
 814{
 815    CPUState *cpu;
 816    double pct;
 817
 818    /* Stop the timer if needed */
 819    if (!cpu_throttle_get_percentage()) {
 820        return;
 821    }
 822    CPU_FOREACH(cpu) {
 823        if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 824            async_run_on_cpu(cpu, cpu_throttle_thread,
 825                             RUN_ON_CPU_NULL);
 826        }
 827    }
 828
 829    pct = (double)cpu_throttle_get_percentage()/100;
 830    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 831                                   CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 832}
 833
 834void cpu_throttle_set(int new_throttle_pct)
 835{
 836    /* Ensure throttle percentage is within valid range */
 837    new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 838    new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 839
 840    atomic_set(&throttle_percentage, new_throttle_pct);
 841
 842    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 843                                       CPU_THROTTLE_TIMESLICE_NS);
 844}
 845
 846void cpu_throttle_stop(void)
 847{
 848    atomic_set(&throttle_percentage, 0);
 849}
 850
 851bool cpu_throttle_active(void)
 852{
 853    return (cpu_throttle_get_percentage() != 0);
 854}
 855
 856int cpu_throttle_get_percentage(void)
 857{
 858    return atomic_read(&throttle_percentage);
 859}
 860
 861void cpu_ticks_init(void)
 862{
 863    seqlock_init(&timers_state.vm_clock_seqlock);
 864    qemu_spin_init(&timers_state.vm_clock_lock);
 865    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 866    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 867                                           cpu_throttle_timer_tick, NULL);
 868}
 869
 870void configure_icount(QemuOpts *opts, Error **errp)
 871{
 872    const char *option;
 873    char *rem_str = NULL;
 874
 875    option = qemu_opt_get(opts, "shift");
 876    if (!option) {
 877        if (qemu_opt_get(opts, "align") != NULL) {
 878            error_setg(errp, "Please specify shift option when using align");
 879        }
 880        return;
 881    }
 882
 883    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 884    if (icount_sleep) {
 885        timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 886                                         icount_timer_cb, NULL);
 887    }
 888
 889    icount_align_option = qemu_opt_get_bool(opts, "align", false);
 890
 891    if (icount_align_option && !icount_sleep) {
 892        error_setg(errp, "align=on and sleep=off are incompatible");
 893    }
 894    if (strcmp(option, "auto") != 0) {
 895        errno = 0;
 896        timers_state.icount_time_shift = strtol(option, &rem_str, 0);
 897        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 898            error_setg(errp, "icount: Invalid shift value");
 899        }
 900        use_icount = 1;
 901        return;
 902    } else if (icount_align_option) {
 903        error_setg(errp, "shift=auto and align=on are incompatible");
 904    } else if (!icount_sleep) {
 905        error_setg(errp, "shift=auto and sleep=off are incompatible");
 906    }
 907
 908    use_icount = 2;
 909
 910    /* 125MIPS seems a reasonable initial guess at the guest speed.
 911       It will be corrected fairly quickly anyway.  */
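    /* A shift of 3 makes each instruction account for 2^3 = 8 ns of
       virtual time, i.e. 125 million instructions per second. */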
 912    timers_state.icount_time_shift = 3;
 913
 914    /* Have both realtime and virtual time triggers for speed adjustment.
 915       The realtime trigger catches emulated time passing too slowly,
 916       the virtual time trigger catches emulated time passing too fast.
 917       Realtime triggers occur even when idle, so use them less frequently
 918       than VM triggers.  */
 919    timers_state.vm_clock_warp_start = -1;
 920    timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 921                                   icount_adjust_rt, NULL);
 922    timer_mod(timers_state.icount_rt_timer,
 923                   qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 924    timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 925                                        icount_adjust_vm, NULL);
 926    timer_mod(timers_state.icount_vm_timer,
 927                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 928                   NANOSECONDS_PER_SECOND / 10);
 929}
 930
 931/***********************************************************/
 932/* TCG vCPU kick timer
 933 *
  934 * The kick timer is responsible for moving single threaded vCPU
  935 * emulation on to the next vCPU. If more than one vCPU is running, a
  936 * timer event will force a cpu->exit so the next vCPU can get
  937 * scheduled.
  938 *
  939 * The timer is removed while all vCPUs are idle and restarted again
  940 * once any of them has work to do.
 941 */
 942
 943static QEMUTimer *tcg_kick_vcpu_timer;
 944static CPUState *tcg_current_rr_cpu;
 945
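/* Kick a round-robin vCPU at least every 100 ms (NANOSECONDS_PER_SECOND / 10)
 * so that one busy vCPU cannot monopolise the single TCG thread.
 */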
 946#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 947
 948static inline int64_t qemu_tcg_next_kick(void)
 949{
 950    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 951}
 952
 953/* Kick the currently round-robin scheduled vCPU to next */
 954static void qemu_cpu_kick_rr_next_cpu(void)
 955{
 956    CPUState *cpu;
 957    do {
 958        cpu = atomic_mb_read(&tcg_current_rr_cpu);
 959        if (cpu) {
 960            cpu_exit(cpu);
 961        }
 962    } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 963}
 964
 965/* Kick all RR vCPUs */
 966static void qemu_cpu_kick_rr_cpus(void)
 967{
 968    CPUState *cpu;
 969
 970    CPU_FOREACH(cpu) {
 971        cpu_exit(cpu);
 972    };
 973}
 974
 975static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
 976{
 977}
 978
 979void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 980{
 981    if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 982        qemu_notify_event();
 983        return;
 984    }
 985
 986    if (qemu_in_vcpu_thread()) {
 987        /* A CPU is currently running; kick it back out to the
 988         * tcg_cpu_exec() loop so it will recalculate its
 989         * icount deadline immediately.
 990         */
 991        qemu_cpu_kick(current_cpu);
 992    } else if (first_cpu) {
 993        /* qemu_cpu_kick is not enough to kick a halted CPU out of
 994         * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 995         * causes cpu_thread_is_idle to return false.  This way,
 996         * handle_icount_deadline can run.
 997         * If we have no CPUs at all for some reason, we don't
 998         * need to do anything.
 999         */
1000        async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
1001    }
1002}
1003
1004static void kick_tcg_thread(void *opaque)
1005{
1006    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
1007    qemu_cpu_kick_rr_next_cpu();
1008}
1009
1010static void start_tcg_kick_timer(void)
1011{
1012    assert(!mttcg_enabled);
1013    if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
1014        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
1015                                           kick_tcg_thread, NULL);
1016    }
1017    if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
1018        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
1019    }
1020}
1021
1022static void stop_tcg_kick_timer(void)
1023{
1024    assert(!mttcg_enabled);
1025    if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
1026        timer_del(tcg_kick_vcpu_timer);
1027    }
1028}
1029
1030/***********************************************************/
1031void hw_error(const char *fmt, ...)
1032{
1033    va_list ap;
1034    CPUState *cpu;
1035
1036    va_start(ap, fmt);
1037    fprintf(stderr, "qemu: hardware error: ");
1038    vfprintf(stderr, fmt, ap);
1039    fprintf(stderr, "\n");
1040    CPU_FOREACH(cpu) {
1041        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1042        cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1043    }
1044    va_end(ap);
1045    abort();
1046}
1047
1048void cpu_synchronize_all_states(void)
1049{
1050    CPUState *cpu;
1051
1052    CPU_FOREACH(cpu) {
1053        cpu_synchronize_state(cpu);
1054        /* TODO: move to cpu_synchronize_state() */
1055        if (hvf_enabled()) {
1056            hvf_cpu_synchronize_state(cpu);
1057        }
1058    }
1059}
1060
1061void cpu_synchronize_all_post_reset(void)
1062{
1063    CPUState *cpu;
1064
1065    CPU_FOREACH(cpu) {
1066        cpu_synchronize_post_reset(cpu);
1067        /* TODO: move to cpu_synchronize_post_reset() */
1068        if (hvf_enabled()) {
1069            hvf_cpu_synchronize_post_reset(cpu);
1070        }
1071    }
1072}
1073
1074void cpu_synchronize_all_post_init(void)
1075{
1076    CPUState *cpu;
1077
1078    CPU_FOREACH(cpu) {
1079        cpu_synchronize_post_init(cpu);
1080        /* TODO: move to cpu_synchronize_post_init() */
1081        if (hvf_enabled()) {
1082            hvf_cpu_synchronize_post_init(cpu);
1083        }
1084    }
1085}
1086
1087void cpu_synchronize_all_pre_loadvm(void)
1088{
1089    CPUState *cpu;
1090
1091    CPU_FOREACH(cpu) {
1092        cpu_synchronize_pre_loadvm(cpu);
1093    }
1094}
1095
1096static int do_vm_stop(RunState state, bool send_stop)
1097{
1098    int ret = 0;
1099
1100    if (runstate_is_running()) {
1101        cpu_disable_ticks();
1102        pause_all_vcpus();
1103        runstate_set(state);
1104        vm_state_notify(0, state);
1105        if (send_stop) {
1106            qapi_event_send_stop();
1107        }
1108    }
1109
1110    bdrv_drain_all();
1111    ret = bdrv_flush_all();
1112
1113    return ret;
1114}
1115
1116/* Special vm_stop() variant for terminating the process.  Historically clients
1117 * did not expect a QMP STOP event and so we need to retain compatibility.
1118 */
1119int vm_shutdown(void)
1120{
1121    return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1122}
1123
1124static bool cpu_can_run(CPUState *cpu)
1125{
1126    if (cpu->stop) {
1127        return false;
1128    }
1129    if (cpu_is_stopped(cpu)) {
1130        return false;
1131    }
1132    return true;
1133}
1134
1135static void cpu_handle_guest_debug(CPUState *cpu)
1136{
1137    gdb_set_stop_cpu(cpu);
1138    qemu_system_debug_request();
1139    cpu->stopped = true;
1140}
1141
1142#ifdef CONFIG_LINUX
1143static void sigbus_reraise(void)
1144{
1145    sigset_t set;
1146    struct sigaction action;
1147
1148    memset(&action, 0, sizeof(action));
1149    action.sa_handler = SIG_DFL;
1150    if (!sigaction(SIGBUS, &action, NULL)) {
1151        raise(SIGBUS);
1152        sigemptyset(&set);
1153        sigaddset(&set, SIGBUS);
1154        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1155    }
1156    perror("Failed to re-raise SIGBUS!\n");
1157    abort();
1158}
1159
1160static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1161{
1162    if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1163        sigbus_reraise();
1164    }
1165
1166    if (current_cpu) {
1167        /* Called asynchronously in VCPU thread.  */
1168        if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1169            sigbus_reraise();
1170        }
1171    } else {
1172        /* Called synchronously (via signalfd) in main thread.  */
1173        if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1174            sigbus_reraise();
1175        }
1176    }
1177}
1178
1179static void qemu_init_sigbus(void)
1180{
1181    struct sigaction action;
1182
1183    memset(&action, 0, sizeof(action));
1184    action.sa_flags = SA_SIGINFO;
1185    action.sa_sigaction = sigbus_handler;
1186    sigaction(SIGBUS, &action, NULL);
1187
1188    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1189}
1190#else /* !CONFIG_LINUX */
1191static void qemu_init_sigbus(void)
1192{
1193}
1194#endif /* !CONFIG_LINUX */
1195
1196static QemuThread io_thread;
1197
1198/* cpu creation */
1199static QemuCond qemu_cpu_cond;
1200/* system init */
1201static QemuCond qemu_pause_cond;
1202
1203void qemu_init_cpu_loop(void)
1204{
1205    qemu_init_sigbus();
1206    qemu_cond_init(&qemu_cpu_cond);
1207    qemu_cond_init(&qemu_pause_cond);
1208    qemu_mutex_init(&qemu_global_mutex);
1209
1210    qemu_thread_get_self(&io_thread);
1211}
1212
1213void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1214{
1215    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1216}
1217
1218static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1219{
1220    if (kvm_destroy_vcpu(cpu) < 0) {
1221        error_report("kvm_destroy_vcpu failed");
1222        exit(EXIT_FAILURE);
1223    }
1224}
1225
1226static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1227{
1228}
1229
1230static void qemu_cpu_stop(CPUState *cpu, bool exit)
1231{
1232    g_assert(qemu_cpu_is_self(cpu));
1233    cpu->stop = false;
1234    cpu->stopped = true;
1235    if (exit) {
1236        cpu_exit(cpu);
1237    }
1238    qemu_cond_broadcast(&qemu_pause_cond);
1239}
1240
1241static void qemu_wait_io_event_common(CPUState *cpu)
1242{
1243    atomic_mb_set(&cpu->thread_kicked, false);
1244    if (cpu->stop) {
1245        qemu_cpu_stop(cpu, false);
1246    }
1247    process_queued_cpu_work(cpu);
1248}
1249
1250static void qemu_tcg_rr_wait_io_event(void)
1251{
1252    CPUState *cpu;
1253
1254    while (all_cpu_threads_idle()) {
1255        stop_tcg_kick_timer();
1256        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1257    }
1258
1259    start_tcg_kick_timer();
1260
1261    CPU_FOREACH(cpu) {
1262        qemu_wait_io_event_common(cpu);
1263    }
1264}
1265
1266static void qemu_wait_io_event(CPUState *cpu)
1267{
1268    bool slept = false;
1269
1270    while (cpu_thread_is_idle(cpu)) {
1271        if (!slept) {
1272            slept = true;
1273            qemu_plugin_vcpu_idle_cb(cpu);
1274        }
1275        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1276    }
1277    if (slept) {
1278        qemu_plugin_vcpu_resume_cb(cpu);
1279    }
1280
1281#ifdef _WIN32
1282    /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1283    if (!tcg_enabled()) {
1284        SleepEx(0, TRUE);
1285    }
1286#endif
1287    qemu_wait_io_event_common(cpu);
1288}
1289
1290static void *qemu_kvm_cpu_thread_fn(void *arg)
1291{
1292    CPUState *cpu = arg;
1293    int r;
1294
1295    rcu_register_thread();
1296
1297    qemu_mutex_lock_iothread();
1298    qemu_thread_get_self(cpu->thread);
1299    cpu->thread_id = qemu_get_thread_id();
1300    cpu->can_do_io = 1;
1301    current_cpu = cpu;
1302
1303    r = kvm_init_vcpu(cpu);
1304    if (r < 0) {
1305        error_report("kvm_init_vcpu failed: %s", strerror(-r));
1306        exit(1);
1307    }
1308
1309    kvm_init_cpu_signals(cpu);
1310
1311    /* signal CPU creation */
1312    cpu->created = true;
1313    qemu_cond_signal(&qemu_cpu_cond);
1314    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1315
1316    do {
1317        if (cpu_can_run(cpu)) {
1318            r = kvm_cpu_exec(cpu);
1319            if (r == EXCP_DEBUG) {
1320                cpu_handle_guest_debug(cpu);
1321            }
1322        }
1323        qemu_wait_io_event(cpu);
1324    } while (!cpu->unplug || cpu_can_run(cpu));
1325
1326    qemu_kvm_destroy_vcpu(cpu);
1327    cpu->created = false;
1328    qemu_cond_signal(&qemu_cpu_cond);
1329    qemu_mutex_unlock_iothread();
1330    rcu_unregister_thread();
1331    return NULL;
1332}
1333
1334static void *qemu_dummy_cpu_thread_fn(void *arg)
1335{
1336#ifdef _WIN32
1337    error_report("qtest is not supported under Windows");
1338    exit(1);
1339#else
1340    CPUState *cpu = arg;
1341    sigset_t waitset;
1342    int r;
1343
1344    rcu_register_thread();
1345
1346    qemu_mutex_lock_iothread();
1347    qemu_thread_get_self(cpu->thread);
1348    cpu->thread_id = qemu_get_thread_id();
1349    cpu->can_do_io = 1;
1350    current_cpu = cpu;
1351
1352    sigemptyset(&waitset);
1353    sigaddset(&waitset, SIG_IPI);
1354
1355    /* signal CPU creation */
1356    cpu->created = true;
1357    qemu_cond_signal(&qemu_cpu_cond);
1358    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1359
1360    do {
1361        qemu_mutex_unlock_iothread();
1362        do {
1363            int sig;
1364            r = sigwait(&waitset, &sig);
1365        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1366        if (r == -1) {
1367            perror("sigwait");
1368            exit(1);
1369        }
1370        qemu_mutex_lock_iothread();
1371        qemu_wait_io_event(cpu);
1372    } while (!cpu->unplug);
1373
1374    qemu_mutex_unlock_iothread();
1375    rcu_unregister_thread();
1376    return NULL;
1377#endif
1378}
1379
1380static int64_t tcg_get_icount_limit(void)
1381{
1382    int64_t deadline;
1383
1384    if (replay_mode != REPLAY_MODE_PLAY) {
1385        /*
1386         * Include all the timers, because they may need attention.
1387         * Overly long CPU execution may cause unnecessary delay in the UI.
1388         */
1389        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1390                                              QEMU_TIMER_ATTR_ALL);
1391
1392        /* Maintain prior (possibly buggy) behaviour where if no deadline
1393         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1394         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1395         * nanoseconds.
1396         */
1397        if ((deadline < 0) || (deadline > INT32_MAX)) {
1398            deadline = INT32_MAX;
1399        }
1400
1401        return qemu_icount_round(deadline);
1402    } else {
1403        return replay_get_instructions();
1404    }
1405}
1406
1407static void handle_icount_deadline(void)
1408{
1409    assert(qemu_in_vcpu_thread());
1410    if (use_icount) {
1411        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1412                                                      QEMU_TIMER_ATTR_ALL);
1413
1414        if (deadline == 0) {
1415            /* Wake up other AioContexts.  */
1416            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1417            qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1418        }
1419    }
1420}
1421
1422static void prepare_icount_for_run(CPUState *cpu)
1423{
1424    if (use_icount) {
1425        int insns_left;
1426
1427        /* These should always be cleared by process_icount_data after
1428         * each vCPU execution. However u16.high can be raised
1429         * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1430         */
1431        g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1432        g_assert(cpu->icount_extra == 0);
1433
1434        cpu->icount_budget = tcg_get_icount_limit();
1435        insns_left = MIN(0xffff, cpu->icount_budget);
1436        cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1437        cpu->icount_extra = cpu->icount_budget - insns_left;
1438
1439        replay_mutex_lock();
1440    }
1441}
1442
1443static void process_icount_data(CPUState *cpu)
1444{
1445    if (use_icount) {
1446        /* Account for executed instructions */
1447        cpu_update_icount(cpu);
1448
1449        /* Reset the counters */
1450        cpu_neg(cpu)->icount_decr.u16.low = 0;
1451        cpu->icount_extra = 0;
1452        cpu->icount_budget = 0;
1453
1454        replay_account_executed_instructions();
1455
1456        replay_mutex_unlock();
1457    }
1458}
1459
1460
1461static int tcg_cpu_exec(CPUState *cpu)
1462{
1463    int ret;
1464#ifdef CONFIG_PROFILER
1465    int64_t ti;
1466#endif
1467
1468    assert(tcg_enabled());
1469#ifdef CONFIG_PROFILER
1470    ti = profile_getclock();
1471#endif
1472    cpu_exec_start(cpu);
1473    ret = cpu_exec(cpu);
1474    cpu_exec_end(cpu);
1475#ifdef CONFIG_PROFILER
1476    atomic_set(&tcg_ctx->prof.cpu_exec_time,
1477               tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1478#endif
1479    return ret;
1480}
1481
1482/* Destroy any remaining vCPUs which have been unplugged and have
1483 * finished running
1484 */
1485static void deal_with_unplugged_cpus(void)
1486{
1487    CPUState *cpu;
1488
1489    CPU_FOREACH(cpu) {
1490        if (cpu->unplug && !cpu_can_run(cpu)) {
1491            qemu_tcg_destroy_vcpu(cpu);
1492            cpu->created = false;
1493            qemu_cond_signal(&qemu_cpu_cond);
1494            break;
1495        }
1496    }
1497}
1498
1499/* Single-threaded TCG
1500 *
1501 * In the single-threaded case each vCPU is simulated in turn. If
1502 * there is more than a single vCPU we create a simple timer to kick
1503 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1504 * This is done explicitly rather than relying on side-effects
1505 * elsewhere.
1506 */
1507
1508static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1509{
1510    CPUState *cpu = arg;
1511
1512    assert(tcg_enabled());
1513    rcu_register_thread();
1514    tcg_register_thread();
1515
1516    qemu_mutex_lock_iothread();
1517    qemu_thread_get_self(cpu->thread);
1518
1519    cpu->thread_id = qemu_get_thread_id();
1520    cpu->created = true;
1521    cpu->can_do_io = 1;
1522    qemu_cond_signal(&qemu_cpu_cond);
1523    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1524
1525    /* wait for initial kick-off after machine start */
1526    while (first_cpu->stopped) {
1527        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1528
1529        /* process any pending work */
1530        CPU_FOREACH(cpu) {
1531            current_cpu = cpu;
1532            qemu_wait_io_event_common(cpu);
1533        }
1534    }
1535
1536    start_tcg_kick_timer();
1537
1538    cpu = first_cpu;
1539
1540    /* process any pending work */
1541    cpu->exit_request = 1;
1542
1543    while (1) {
1544        qemu_mutex_unlock_iothread();
1545        replay_mutex_lock();
1546        qemu_mutex_lock_iothread();
1547        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1548        qemu_account_warp_timer();
1549
1550        /* Run the timers here.  This is much more efficient than
1551         * waking up the I/O thread and waiting for completion.
1552         */
1553        handle_icount_deadline();
1554
1555        replay_mutex_unlock();
1556
1557        if (!cpu) {
1558            cpu = first_cpu;
1559        }
1560
1561        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1562
1563            atomic_mb_set(&tcg_current_rr_cpu, cpu);
1564            current_cpu = cpu;
1565
1566            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1567                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1568
1569            if (cpu_can_run(cpu)) {
1570                int r;
1571
1572                qemu_mutex_unlock_iothread();
1573                prepare_icount_for_run(cpu);
1574
1575                r = tcg_cpu_exec(cpu);
1576
1577                process_icount_data(cpu);
1578                qemu_mutex_lock_iothread();
1579
1580                if (r == EXCP_DEBUG) {
1581                    cpu_handle_guest_debug(cpu);
1582                    break;
1583                } else if (r == EXCP_ATOMIC) {
1584                    qemu_mutex_unlock_iothread();
1585                    cpu_exec_step_atomic(cpu);
1586                    qemu_mutex_lock_iothread();
1587                    break;
1588                }
1589            } else if (cpu->stop) {
1590                if (cpu->unplug) {
1591                    cpu = CPU_NEXT(cpu);
1592                }
1593                break;
1594            }
1595
1596            cpu = CPU_NEXT(cpu);
1597        } /* while (cpu && !cpu->exit_request).. */
1598
1599        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1600        atomic_set(&tcg_current_rr_cpu, NULL);
1601
1602        if (cpu && cpu->exit_request) {
1603            atomic_mb_set(&cpu->exit_request, 0);
1604        }
1605
1606        if (use_icount && all_cpu_threads_idle()) {
1607            /*
1608             * When all cpus are sleeping (e.g. in WFI), to avoid a deadlock
1609             * in the main_loop, wake it up in order to start the warp timer.
1610             */
1611            qemu_notify_event();
1612        }
1613
1614        qemu_tcg_rr_wait_io_event();
1615        deal_with_unplugged_cpus();
1616    }
1617
1618    rcu_unregister_thread();
1619    return NULL;
1620}
1621
1622static void *qemu_hax_cpu_thread_fn(void *arg)
1623{
1624    CPUState *cpu = arg;
1625    int r;
1626
1627    rcu_register_thread();
1628    qemu_mutex_lock_iothread();
1629    qemu_thread_get_self(cpu->thread);
1630
1631    cpu->thread_id = qemu_get_thread_id();
1632    cpu->created = true;
1633    current_cpu = cpu;
1634
1635    hax_init_vcpu(cpu);
1636    qemu_cond_signal(&qemu_cpu_cond);
1637    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1638
1639    do {
1640        if (cpu_can_run(cpu)) {
1641            r = hax_smp_cpu_exec(cpu);
1642            if (r == EXCP_DEBUG) {
1643                cpu_handle_guest_debug(cpu);
1644            }
1645        }
1646
1647        qemu_wait_io_event(cpu);
1648    } while (!cpu->unplug || cpu_can_run(cpu));
1649    rcu_unregister_thread();
1650    return NULL;
1651}
1652
1653/* The HVF-specific vCPU thread function. This one should only run when the host
1654 * CPU supports the VMX "unrestricted guest" feature. */
1655static void *qemu_hvf_cpu_thread_fn(void *arg)
1656{
1657    CPUState *cpu = arg;
1658
1659    int r;
1660
1661    assert(hvf_enabled());
1662
1663    rcu_register_thread();
1664
1665    qemu_mutex_lock_iothread();
1666    qemu_thread_get_self(cpu->thread);
1667
1668    cpu->thread_id = qemu_get_thread_id();
1669    cpu->can_do_io = 1;
1670    current_cpu = cpu;
1671
1672    hvf_init_vcpu(cpu);
1673
1674    /* signal CPU creation */
1675    cpu->created = true;
1676    qemu_cond_signal(&qemu_cpu_cond);
1677    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1678
1679    do {
1680        if (cpu_can_run(cpu)) {
1681            r = hvf_vcpu_exec(cpu);
1682            if (r == EXCP_DEBUG) {
1683                cpu_handle_guest_debug(cpu);
1684            }
1685        }
1686        qemu_wait_io_event(cpu);
1687    } while (!cpu->unplug || cpu_can_run(cpu));
1688
1689    hvf_vcpu_destroy(cpu);
1690    cpu->created = false;
1691    qemu_cond_signal(&qemu_cpu_cond);
1692    qemu_mutex_unlock_iothread();
1693    rcu_unregister_thread();
1694    return NULL;
1695}
1696
1697static void *qemu_whpx_cpu_thread_fn(void *arg)
1698{
1699    CPUState *cpu = arg;
1700    int r;
1701
1702    rcu_register_thread();
1703
1704    qemu_mutex_lock_iothread();
1705    qemu_thread_get_self(cpu->thread);
1706    cpu->thread_id = qemu_get_thread_id();
1707    current_cpu = cpu;
1708
1709    r = whpx_init_vcpu(cpu);
1710    if (r < 0) {
1711        fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1712        exit(1);
1713    }
1714
1715    /* signal CPU creation */
1716    cpu->created = true;
1717    qemu_cond_signal(&qemu_cpu_cond);
1718    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1719
1720    do {
1721        if (cpu_can_run(cpu)) {
1722            r = whpx_vcpu_exec(cpu);
1723            if (r == EXCP_DEBUG) {
1724                cpu_handle_guest_debug(cpu);
1725            }
1726        }
1727        while (cpu_thread_is_idle(cpu)) {
1728            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1729        }
1730        qemu_wait_io_event_common(cpu);
1731    } while (!cpu->unplug || cpu_can_run(cpu));
1732
1733    whpx_destroy_vcpu(cpu);
1734    cpu->created = false;
1735    qemu_cond_signal(&qemu_cpu_cond);
1736    qemu_mutex_unlock_iothread();
1737    rcu_unregister_thread();
1738    return NULL;
1739}
1740
1741#ifdef _WIN32
1742static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1743{
1744}
1745#endif
1746
1747/* Multi-threaded TCG
1748 *
1749 * In the multi-threaded case each vCPU has its own thread. The TLS
1750 * variable current_cpu can be used deep in the code to find the
1751 * current CPUState for a given thread.
1752 */
1753
1754static void *qemu_tcg_cpu_thread_fn(void *arg)
1755{
1756    CPUState *cpu = arg;
1757
1758    assert(tcg_enabled());
1759    g_assert(!use_icount);
1760
1761    rcu_register_thread();
1762    tcg_register_thread();
1763
1764    qemu_mutex_lock_iothread();
1765    qemu_thread_get_self(cpu->thread);
1766
1767    cpu->thread_id = qemu_get_thread_id();
1768    cpu->created = true;
1769    cpu->can_do_io = 1;
1770    current_cpu = cpu;
1771    qemu_cond_signal(&qemu_cpu_cond);
1772    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1773
1774    /* process any pending work */
1775    cpu->exit_request = 1;
1776
1777    do {
1778        if (cpu_can_run(cpu)) {
1779            int r;
1780            qemu_mutex_unlock_iothread();
1781            r = tcg_cpu_exec(cpu);
1782            qemu_mutex_lock_iothread();
1783            switch (r) {
1784            case EXCP_DEBUG:
1785                cpu_handle_guest_debug(cpu);
1786                break;
1787            case EXCP_HALTED:
1788                /* during start-up the vCPU is reset and the thread is
1789                 * kicked several times. If we don't ensure we go back
1790                 * to sleep in the halted state we won't cleanly
1791                 * start up when the vCPU is enabled.
1792                 *
1793                 * cpu->halted should ensure we sleep in wait_io_event
1794                 */
1795                g_assert(cpu->halted);
1796                break;
1797            case EXCP_ATOMIC:
1798                qemu_mutex_unlock_iothread();
1799                cpu_exec_step_atomic(cpu);
1800                qemu_mutex_lock_iothread();
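                    /* fall through */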
1801            default:
1802                /* Ignore everything else? */
1803                break;
1804            }
1805        }
1806
1807        atomic_mb_set(&cpu->exit_request, 0);
1808        qemu_wait_io_event(cpu);
1809    } while (!cpu->unplug || cpu_can_run(cpu));
1810
1811    qemu_tcg_destroy_vcpu(cpu);
1812    cpu->created = false;
1813    qemu_cond_signal(&qemu_cpu_cond);
1814    qemu_mutex_unlock_iothread();
1815    rcu_unregister_thread();
1816    return NULL;
1817}
1818
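    /* Wake a vCPU thread out of its wait: send SIG_IPI on POSIX hosts; on
     * Windows use the WHPX kick or queue a dummy APC to the thread. */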
1819static void qemu_cpu_kick_thread(CPUState *cpu)
1820{
1821#ifndef _WIN32
1822    int err;
1823
1824    if (cpu->thread_kicked) {
1825        return;
1826    }
1827    cpu->thread_kicked = true;
1828    err = pthread_kill(cpu->thread->thread, SIG_IPI);
1829    if (err && err != ESRCH) {
1830        fprintf(stderr, "qemu:%s: %s\n", __func__, strerror(err));
1831        exit(1);
1832    }
1833#else /* _WIN32 */
1834    if (!qemu_cpu_is_self(cpu)) {
1835        if (whpx_enabled()) {
1836            whpx_vcpu_kick(cpu);
1837        } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1838            fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1839                    __func__, GetLastError());
1840            exit(1);
1841        }
1842    }
1843#endif
1844}
1845
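    /* Kick @cpu so it leaves its halt wait and/or guest execution and
     * re-evaluates pending work (stop, unplug, queued work, ...). */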
1846void qemu_cpu_kick(CPUState *cpu)
1847{
1848    qemu_cond_broadcast(cpu->halt_cond);
1849    if (tcg_enabled()) {
1850        if (qemu_tcg_mttcg_enabled()) {
1851            cpu_exit(cpu);
1852        } else {
1853            qemu_cpu_kick_rr_cpus();
1854        }
1855    } else {
1856        if (hax_enabled()) {
1857            /*
1858             * FIXME: race condition with the exit_request check in
1859             * hax_vcpu_hax_exec
1860             */
1861            cpu->exit_request = 1;
1862        }
1863        qemu_cpu_kick_thread(cpu);
1864    }
1865}
1866
1867void qemu_cpu_kick_self(void)
1868{
1869    assert(current_cpu);
1870    qemu_cpu_kick_thread(current_cpu);
1871}
1872
1873bool qemu_cpu_is_self(CPUState *cpu)
1874{
1875    return qemu_thread_is_self(cpu->thread);
1876}
1877
1878bool qemu_in_vcpu_thread(void)
1879{
1880    return current_cpu && qemu_cpu_is_self(current_cpu);
1881}
1882
1883static __thread bool iothread_locked = false;
1884
1885bool qemu_mutex_iothread_locked(void)
1886{
1887    return iothread_locked;
1888}
1889
1890/*
1891 * The BQL is taken from so many places that it is worth profiling the
1892 * callers directly, instead of funneling them all through a single function.
1893 */
1894void qemu_mutex_lock_iothread_impl(const char *file, int line)
1895{
1896    QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1897
1898    g_assert(!qemu_mutex_iothread_locked());
1899    bql_lock(&qemu_global_mutex, file, line);
1900    iothread_locked = true;
1901}
1902
1903void qemu_mutex_unlock_iothread(void)
1904{
1905    g_assert(qemu_mutex_iothread_locked());
1906    iothread_locked = false;
1907    qemu_mutex_unlock(&qemu_global_mutex);
1908}
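    /*
     * Typical usage from code running outside the BQL, for example a vCPU
     * thread that needs to touch device or other global state (sketch):
     *
     *     qemu_mutex_lock_iothread();
     *     ... access device / global state ...
     *     qemu_mutex_unlock_iothread();
     */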
1909
1910static bool all_vcpus_paused(void)
1911{
1912    CPUState *cpu;
1913
1914    CPU_FOREACH(cpu) {
1915        if (!cpu->stopped) {
1916            return false;
1917        }
1918    }
1919
1920    return true;
1921}
1922
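    /* Ask every vCPU to stop and wait until all of them are stopped.  The
     * replay lock is dropped while waiting so that woken vCPU threads can
     * finish their replay work first. */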
1923void pause_all_vcpus(void)
1924{
1925    CPUState *cpu;
1926
1927    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1928    CPU_FOREACH(cpu) {
1929        if (qemu_cpu_is_self(cpu)) {
1930            qemu_cpu_stop(cpu, true);
1931        } else {
1932            cpu->stop = true;
1933            qemu_cpu_kick(cpu);
1934        }
1935    }
1936
1937    /* We need to drop the replay_lock so any vCPU threads woken up
1938     * can finish their replay tasks
1939     */
1940    replay_mutex_unlock();
1941
1942    while (!all_vcpus_paused()) {
1943        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1944        CPU_FOREACH(cpu) {
1945            qemu_cpu_kick(cpu);
1946        }
1947    }
1948
1949    qemu_mutex_unlock_iothread();
1950    replay_mutex_lock();
1951    qemu_mutex_lock_iothread();
1952}
1953
1954void cpu_resume(CPUState *cpu)
1955{
1956    cpu->stop = false;
1957    cpu->stopped = false;
1958    qemu_cpu_kick(cpu);
1959}
1960
1961void resume_all_vcpus(void)
1962{
1963    CPUState *cpu;
1964
1965    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1966    CPU_FOREACH(cpu) {
1967        cpu_resume(cpu);
1968    }
1969}
1970
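    /* Mark @cpu for unplug, kick it, and wait (with the BQL dropped) for its
     * thread to exit. */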
1971void cpu_remove_sync(CPUState *cpu)
1972{
1973    cpu->stop = true;
1974    cpu->unplug = true;
1975    qemu_cpu_kick(cpu);
1976    qemu_mutex_unlock_iothread();
1977    qemu_thread_join(cpu->thread);
1978    qemu_mutex_lock_iothread();
1979}
1980
1981/* For temporary buffers for forming a name */
1982#define VCPU_THREAD_NAME_SIZE 16
1983
1984static void qemu_tcg_init_vcpu(CPUState *cpu)
1985{
1986    char thread_name[VCPU_THREAD_NAME_SIZE];
1987    static QemuCond *single_tcg_halt_cond;
1988    static QemuThread *single_tcg_cpu_thread;
1989    static int tcg_region_inited;
1990
1991    assert(tcg_enabled());
1992    /*
1993     * Initialize TCG regions--once. Now is a good time, because:
1994     * (1) TCG's init context, prologue and target globals have been set up.
1995     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1996     *     -accel flag is processed, so the check doesn't work then).
1997     */
1998    if (!tcg_region_inited) {
1999        tcg_region_inited = 1;
2000        tcg_region_init();
2001    }
2002
2003    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
2004        cpu->thread = g_malloc0(sizeof(QemuThread));
2005        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2006        qemu_cond_init(cpu->halt_cond);
2007
2008        if (qemu_tcg_mttcg_enabled()) {
2009            /* create a thread per vCPU with TCG (MTTCG) */
2010            parallel_cpus = true;
2011            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
2012                     cpu->cpu_index);
2013
2014            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
2015                               cpu, QEMU_THREAD_JOINABLE);
2016
2017        } else {
2018            /* share a single thread for all cpus with TCG */
2019            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
2020            qemu_thread_create(cpu->thread, thread_name,
2021                               qemu_tcg_rr_cpu_thread_fn,
2022                               cpu, QEMU_THREAD_JOINABLE);
2023
2024            single_tcg_halt_cond = cpu->halt_cond;
2025            single_tcg_cpu_thread = cpu->thread;
2026        }
2027#ifdef _WIN32
2028        cpu->hThread = qemu_thread_get_handle(cpu->thread);
2029#endif
2030    } else {
2031        /* For non-MTTCG cases we share the thread */
2032        cpu->thread = single_tcg_cpu_thread;
2033        cpu->halt_cond = single_tcg_halt_cond;
2034        cpu->thread_id = first_cpu->thread_id;
2035        cpu->can_do_io = 1;
2036        cpu->created = true;
2037    }
2038}
2039
2040static void qemu_hax_start_vcpu(CPUState *cpu)
2041{
2042    char thread_name[VCPU_THREAD_NAME_SIZE];
2043
2044    cpu->thread = g_malloc0(sizeof(QemuThread));
2045    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2046    qemu_cond_init(cpu->halt_cond);
2047
2048    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2049             cpu->cpu_index);
2050    qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2051                       cpu, QEMU_THREAD_JOINABLE);
2052#ifdef _WIN32
2053    cpu->hThread = qemu_thread_get_handle(cpu->thread);
2054#endif
2055}
2056
2057static void qemu_kvm_start_vcpu(CPUState *cpu)
2058{
2059    char thread_name[VCPU_THREAD_NAME_SIZE];
2060
2061    cpu->thread = g_malloc0(sizeof(QemuThread));
2062    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2063    qemu_cond_init(cpu->halt_cond);
2064    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2065             cpu->cpu_index);
2066    qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2067                       cpu, QEMU_THREAD_JOINABLE);
2068}
2069
2070static void qemu_hvf_start_vcpu(CPUState *cpu)
2071{
2072    char thread_name[VCPU_THREAD_NAME_SIZE];
2073
2074    /* HVF currently does not support TCG, and only runs in
2075     * unrestricted-guest mode. */
2076    assert(hvf_enabled());
2077
2078    cpu->thread = g_malloc0(sizeof(QemuThread));
2079    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2080    qemu_cond_init(cpu->halt_cond);
2081
2082    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2083             cpu->cpu_index);
2084    qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2085                       cpu, QEMU_THREAD_JOINABLE);
2086}
2087
2088static void qemu_whpx_start_vcpu(CPUState *cpu)
2089{
2090    char thread_name[VCPU_THREAD_NAME_SIZE];
2091
2092    cpu->thread = g_malloc0(sizeof(QemuThread));
2093    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2094    qemu_cond_init(cpu->halt_cond);
2095    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2096             cpu->cpu_index);
2097    qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2098                       cpu, QEMU_THREAD_JOINABLE);
2099#ifdef _WIN32
2100    cpu->hThread = qemu_thread_get_handle(cpu->thread);
2101#endif
2102}
2103
2104static void qemu_dummy_start_vcpu(CPUState *cpu)
2105{
2106    char thread_name[VCPU_THREAD_NAME_SIZE];
2107
2108    cpu->thread = g_malloc0(sizeof(QemuThread));
2109    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2110    qemu_cond_init(cpu->halt_cond);
2111    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2112             cpu->cpu_index);
2113    qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2114                       QEMU_THREAD_JOINABLE);
2115}
2116
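    /* Create and start the vCPU thread for @cpu with the configured
     * accelerator, then wait until the thread has signalled cpu->created. */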
2117void qemu_init_vcpu(CPUState *cpu)
2118{
2119    MachineState *ms = MACHINE(qdev_get_machine());
2120
2121    cpu->nr_cores = ms->smp.cores;
2122    cpu->nr_threads = ms->smp.threads;
2123    cpu->stopped = true;
2124    cpu->random_seed = qemu_guest_random_seed_thread_part1();
2125
2126    if (!cpu->as) {
2127        /* If the target cpu hasn't set up any address spaces itself,
2128         * give it the default one.
2129         */
2130        cpu->num_ases = 1;
2131        cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2132    }
2133
2134    if (kvm_enabled()) {
2135        qemu_kvm_start_vcpu(cpu);
2136    } else if (hax_enabled()) {
2137        qemu_hax_start_vcpu(cpu);
2138    } else if (hvf_enabled()) {
2139        qemu_hvf_start_vcpu(cpu);
2140    } else if (tcg_enabled()) {
2141        qemu_tcg_init_vcpu(cpu);
2142    } else if (whpx_enabled()) {
2143        qemu_whpx_start_vcpu(cpu);
2144    } else {
2145        qemu_dummy_start_vcpu(cpu);
2146    }
2147
2148    while (!cpu->created) {
2149        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2150    }
2151}
2152
2153void cpu_stop_current(void)
2154{
2155    if (current_cpu) {
2156        current_cpu->stop = true;
2157        cpu_exit(current_cpu);
2158    }
2159}
2160
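    /* Stop the VM.  When called from a vCPU thread, this only files a stop
     * request; the actual stop is performed later from the main loop. */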
2161int vm_stop(RunState state)
2162{
2163    if (qemu_in_vcpu_thread()) {
2164        qemu_system_vmstop_request_prepare();
2165        qemu_system_vmstop_request(state);
2166        /*
2167         * FIXME: should not return to device code in case
2168         * vm_stop() has been requested.
2169         */
2170        cpu_stop_current();
2171        return 0;
2172    }
2173
2174    return do_vm_stop(state, true);
2175}
2176
2177/**
2178 * Prepare for (re)starting the VM.
2179 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2180 * running or in case of an error condition), 0 otherwise.
2181 */
2182int vm_prepare_start(void)
2183{
2184    RunState requested;
2185
2186    qemu_vmstop_requested(&requested);
2187    if (runstate_is_running() && requested == RUN_STATE__MAX) {
2188        return -1;
2189    }
2190
2191    /* Ensure that a STOP/RESUME pair of events is emitted if a
2192     * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2193     * example, is documented as always being followed by the
2194     * STOP event.
2195     */
2196    if (runstate_is_running()) {
2197        qapi_event_send_stop();
2198        qapi_event_send_resume();
2199        return -1;
2200    }
2201
2202    /* We are sending this now, but the CPUs will be resumed shortly afterwards */
2203    qapi_event_send_resume();
2204
2205    cpu_enable_ticks();
2206    runstate_set(RUN_STATE_RUNNING);
2207    vm_state_notify(1, RUN_STATE_RUNNING);
2208    return 0;
2209}
2210
2211void vm_start(void)
2212{
2213    if (!vm_prepare_start()) {
2214        resume_all_vcpus();
2215    }
2216}
2217
2218/* Does a state transition even if the VM is already stopped; the
2219   current state is forgotten forever. */
2220int vm_stop_force_state(RunState state)
2221{
2222    if (runstate_is_running()) {
2223        return vm_stop(state);
2224    } else {
2225        runstate_set(state);
2226
2227        bdrv_drain_all();
2228        /* Make sure to return an error if the flush in a previous vm_stop()
2229         * failed. */
2230        return bdrv_flush_all();
2231    }
2232}
2233
2234void list_cpus(const char *optarg)
2235{
2236    /* XXX: implement xxx_cpu_list for targets that still miss it */
2237#if defined(cpu_list)
2238    cpu_list();
2239#endif
2240}
2241
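    /* QMP 'memsave': write @size bytes of the given vCPU's virtual address
     * space, starting at @addr, to @filename. */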
2242void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2243                 bool has_cpu, int64_t cpu_index, Error **errp)
2244{
2245    FILE *f;
2246    uint32_t l;
2247    CPUState *cpu;
2248    uint8_t buf[1024];
2249    int64_t orig_addr = addr, orig_size = size;
2250
2251    if (!has_cpu) {
2252        cpu_index = 0;
2253    }
2254
2255    cpu = qemu_get_cpu(cpu_index);
2256    if (cpu == NULL) {
2257        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2258                   "a CPU number");
2259        return;
2260    }
2261
2262    f = fopen(filename, "wb");
2263    if (!f) {
2264        error_setg_file_open(errp, errno, filename);
2265        return;
2266    }
2267
2268    while (size != 0) {
2269        l = sizeof(buf);
2270        if (l > size)
2271            l = size;
2272        if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2273            error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2274                             " specified", orig_addr, orig_size);
2275            goto exit;
2276        }
2277        if (fwrite(buf, 1, l, f) != l) {
2278            error_setg(errp, QERR_IO_ERROR);
2279            goto exit;
2280        }
2281        addr += l;
2282        size -= l;
2283    }
2284
2285exit:
2286    fclose(f);
2287}
2288
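    /* QMP 'pmemsave': like memsave, but reads guest physical memory. */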
2289void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2290                  Error **errp)
2291{
2292    FILE *f;
2293    uint32_t l;
2294    uint8_t buf[1024];
2295
2296    f = fopen(filename, "wb");
2297    if (!f) {
2298        error_setg_file_open(errp, errno, filename);
2299        return;
2300    }
2301
2302    while (size != 0) {
2303        l = sizeof(buf);
2304        if (l > size)
2305            l = size;
2306        cpu_physical_memory_read(addr, buf, l);
2307        if (fwrite(buf, 1, l, f) != l) {
2308            error_setg(errp, QERR_IO_ERROR);
2309            goto exit;
2310        }
2311        addr += l;
2312        size -= l;
2313    }
2314
2315exit:
2316    fclose(f);
2317}
2318
2319void qmp_inject_nmi(Error **errp)
2320{
2321    nmi_monitor_handle(monitor_get_cpu_index(), errp);
2322}
2323
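    /* Print host/guest clock drift statistics; only meaningful with icount. */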
2324void dump_drift_info(void)
2325{
2326    if (!use_icount) {
2327        return;
2328    }
2329
2330    qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2331                (cpu_get_clock() - cpu_get_icount()) / SCALE_MS);
2332    if (icount_align_option) {
2333        qemu_printf("Max guest delay     %"PRIi64" ms\n",
2334                    -max_delay / SCALE_MS);
2335        qemu_printf("Max guest advance   %"PRIi64" ms\n",
2336                    max_advance / SCALE_MS);
2337    } else {
2338        qemu_printf("Max guest delay     NA\n");
2339        qemu_printf("Max guest advance   NA\n");
2340    }
2341}
2342