LXR qemu/cpus.c

   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "qemu-common.h"
  27#include "qemu/config-file.h"
  28#include "migration/vmstate.h"
  29#include "monitor/monitor.h"
  30#include "qapi/error.h"
  31#include "qapi/qapi-commands-misc.h"
  32#include "qapi/qapi-events-run-state.h"
  33#include "qapi/qmp/qerror.h"
  34#include "qemu/error-report.h"
  35#include "qemu/qemu-print.h"
  36#include "sysemu/tcg.h"
  37#include "sysemu/block-backend.h"
  38#include "exec/gdbstub.h"
  39#include "sysemu/dma.h"
  40#include "sysemu/hw_accel.h"
  41#include "sysemu/kvm.h"
  42#include "sysemu/hax.h"
  43#include "sysemu/hvf.h"
  44#include "sysemu/whpx.h"
  45#include "exec/exec-all.h"
  46
  47#include "qemu/thread.h"
  48#include "qemu/plugin.h"
  49#include "sysemu/cpus.h"
  50#include "sysemu/qtest.h"
  51#include "qemu/main-loop.h"
  52#include "qemu/option.h"
  53#include "qemu/bitmap.h"
  54#include "qemu/seqlock.h"
  55#include "qemu/guest-random.h"
  56#include "tcg.h"
  57#include "hw/nmi.h"
  58#include "sysemu/replay.h"
  59#include "sysemu/runstate.h"
  60#include "hw/boards.h"
  61#include "hw/hw.h"
  62
  63#ifdef CONFIG_LINUX
  64
  65#include <sys/prctl.h>
  66
  67#ifndef PR_MCE_KILL
  68#define PR_MCE_KILL 33
  69#endif
  70
  71#ifndef PR_MCE_KILL_SET
  72#define PR_MCE_KILL_SET 1
  73#endif
  74
  75#ifndef PR_MCE_KILL_EARLY
  76#define PR_MCE_KILL_EARLY 1
  77#endif
  78
  79#endif /* CONFIG_LINUX */
  80
  81static QemuMutex qemu_global_mutex;
  82
  83int64_t max_delay;
  84int64_t max_advance;
  85
/* vcpu throttling controls */
/* Timer driving cpu_throttle_timer_tick(); created in cpu_ticks_init(). */
static QEMUTimer *throttle_timer;
/*
 * Current throttle percentage, accessed with atomic_set()/atomic_read().
 * Zero means throttling is off (see cpu_throttle_stop()).
 */
static unsigned int throttle_percentage;

/* Bounds that cpu_throttle_set() clamps the requested percentage to. */
#define CPU_THROTTLE_PCT_MIN 1
#define CPU_THROTTLE_PCT_MAX 99
/* Base timeslice in nanoseconds (10 ms) used to size throttle sleeps. */
#define CPU_THROTTLE_TIMESLICE_NS 10000000
  93
  94bool cpu_is_stopped(CPUState *cpu)
  95{
  96    return cpu->stopped || !runstate_is_running();
  97}
  98
  99static bool cpu_thread_is_idle(CPUState *cpu)
 100{
 101    if (cpu->stop || cpu->queued_work_first) {
 102        return false;
 103    }
 104    if (cpu_is_stopped(cpu)) {
 105        return true;
 106    }
 107    if (!cpu->halted || cpu_has_work(cpu) ||
 108        kvm_halt_in_kernel()) {
 109        return false;
 110    }
 111    return true;
 112}
 113
 114static bool all_cpu_threads_idle(void)
 115{
 116    CPUState *cpu;
 117
 118    CPU_FOREACH(cpu) {
 119        if (!cpu_thread_is_idle(cpu)) {
 120            return false;
 121        }
 122    }
 123    return true;
 124}
 125
 126/***********************************************************/
 127/* guest cycle counter */
 128
 129/* Protected by TimersState seqlock */
 130
/*
 * Set from the "sleep" option in configure_icount().  When false,
 * qemu_start_warp_timer() advances qemu_icount_bias straight to the next
 * deadline instead of arming the warp timer.
 */
static bool icount_sleep = true;
/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
/* Upper bound icount_adjust() applies to icount_time_shift. */
#define MAX_ICOUNT_SHIFT 10
 134
/*
 * Timekeeping state for the guest: host tick/clock offsets plus the
 * instruction counter (icount) bookkeeping.  The single instance used by
 * this file is timers_state.
 */
typedef struct TimersState {
    /* Protected by BQL.  */
    /* Last value returned by cpu_get_ticks_locked(); keeps ticks monotonic. */
    int64_t cpu_ticks_prev;
    /* Offset added to the host tick counter in cpu_get_ticks_locked(). */
    int64_t cpu_ticks_offset;

    /* Protect fields that can be respectively read outside the
     * BQL, and written from multiple threads.
     */
    QemuSeqLock vm_clock_seqlock;
    QemuSpin vm_clock_lock;

    /* Non-zero between cpu_enable_ticks() and cpu_disable_ticks(). */
    int16_t cpu_ticks_enabled;

    /* Conversion factor from emulated instructions to virtual clock ticks.  */
    /* cpu_icount_to_ns() shifts the instruction count left by this amount. */
    int16_t icount_time_shift;

    /* Compensate for varying guest execution speed.  */
    int64_t qemu_icount_bias;

    /* Clock value at which a pending warp started, or -1 when none is pending. */
    int64_t vm_clock_warp_start;
    /* Offset added to get_clock() in cpu_get_clock_locked(). */
    int64_t cpu_clock_offset;

    /* Only written by TCG thread */
    int64_t qemu_icount;

    /* for adjusting icount */
    /* rt/vm timers are created by configure_icount() for shift=auto only. */
    QEMUTimer *icount_rt_timer;
    QEMUTimer *icount_vm_timer;
    /* Created by configure_icount() only when the sleep option is on. */
    QEMUTimer *icount_warp_timer;
} TimersState;
 165
 166static TimersState timers_state;
 167bool mttcg_enabled;
 168
 169/*
 170 * We default to false if we know other options have been enabled
 171 * which are currently incompatible with MTTCG. Otherwise when each
 172 * guest (target) has been updated to support:
 173 *   - atomic instructions
 174 *   - memory ordering primitives (barriers)
 175 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
 176 *
 177 * Once a guest architecture has been converted to the new primitives
 178 * there are two remaining limitations to check.
 179 *
 180 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
 181 * - The host must have a stronger memory order than the guest
 182 *
 183 * It may be possible in future to support strong guests on weak hosts
 184 * but that will require tagging all load/stores in a guest with their
 185 * implicit memory order requirements which would likely slow things
 186 * down a lot.
 187 */
 188
 189static bool check_tcg_memory_orders_compatible(void)
 190{
 191#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
 192    return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
 193#else
 194    return false;
 195#endif
 196}
 197
 198static bool default_mttcg_enabled(void)
 199{
 200    if (use_icount || TCG_OVERSIZED_GUEST) {
 201        return false;
 202    } else {
 203#ifdef TARGET_SUPPORTS_MTTCG
 204        return check_tcg_memory_orders_compatible();
 205#else
 206        return false;
 207#endif
 208    }
 209}
 210
 211void qemu_tcg_configure(QemuOpts *opts, Error **errp)
 212{
 213    const char *t = qemu_opt_get(opts, "thread");
 214    if (t) {
 215        if (strcmp(t, "multi") == 0) {
 216            if (TCG_OVERSIZED_GUEST) {
 217                error_setg(errp, "No MTTCG when guest word size > hosts");
 218            } else if (use_icount) {
 219                error_setg(errp, "No MTTCG when icount is enabled");
 220            } else {
 221#ifndef TARGET_SUPPORTS_MTTCG
 222                warn_report("Guest not yet converted to MTTCG - "
 223                            "you may get unexpected results");
 224#endif
 225                if (!check_tcg_memory_orders_compatible()) {
 226                    warn_report("Guest expects a stronger memory ordering "
 227                                "than the host provides");
 228                    error_printf("This may cause strange/hard to debug errors\n");
 229                }
 230                mttcg_enabled = true;
 231            }
 232        } else if (strcmp(t, "single") == 0) {
 233            mttcg_enabled = false;
 234        } else {
 235            error_setg(errp, "Invalid 'thread' setting %s", t);
 236        }
 237    } else {
 238        mttcg_enabled = default_mttcg_enabled();
 239    }
 240}
 241
 242/* The current number of executed instructions is based on what we
 243 * originally budgeted minus the current state of the decrementing
 244 * icount counters in extra/u16.low.
 245 */
 246static int64_t cpu_get_icount_executed(CPUState *cpu)
 247{
 248    return (cpu->icount_budget -
 249            (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
 250}
 251
 252/*
 253 * Update the global shared timer_state.qemu_icount to take into
 254 * account executed instructions. This is done by the TCG vCPU
 255 * thread so the main-loop can see time has moved forward.
 256 */
 257static void cpu_update_icount_locked(CPUState *cpu)
 258{
 259    int64_t executed = cpu_get_icount_executed(cpu);
 260    cpu->icount_budget -= executed;
 261
 262    atomic_set_i64(&timers_state.qemu_icount,
 263                   timers_state.qemu_icount + executed);
 264}
 265
 266/*
 267 * Update the global shared timer_state.qemu_icount to take into
 268 * account executed instructions. This is done by the TCG vCPU
 269 * thread so the main-loop can see time has moved forward.
 270 */
 271void cpu_update_icount(CPUState *cpu)
 272{
 273    seqlock_write_lock(&timers_state.vm_clock_seqlock,
 274                       &timers_state.vm_clock_lock);
 275    cpu_update_icount_locked(cpu);
 276    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 277                         &timers_state.vm_clock_lock);
 278}
 279
 280static int64_t cpu_get_icount_raw_locked(void)
 281{
 282    CPUState *cpu = current_cpu;
 283
 284    if (cpu && cpu->running) {
 285        if (!cpu->can_do_io) {
 286            qemu_log("Bad icount read\n");
 287        }
 288        /* Take into account what has run */
 289        cpu_update_icount_locked(cpu);
 290    }
 291    /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
 292    return atomic_read_i64(&timers_state.qemu_icount);
 293}
 294
 295static int64_t cpu_get_icount_locked(void)
 296{
 297    int64_t icount = cpu_get_icount_raw_locked();
 298    return atomic_read_i64(&timers_state.qemu_icount_bias) +
 299        cpu_icount_to_ns(icount);
 300}
 301
 302int64_t cpu_get_icount_raw(void)
 303{
 304    int64_t icount;
 305    unsigned start;
 306
 307    do {
 308        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 309        icount = cpu_get_icount_raw_locked();
 310    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 311
 312    return icount;
 313}
 314
 315/* Return the virtual CPU time, based on the instruction counter.  */
 316int64_t cpu_get_icount(void)
 317{
 318    int64_t icount;
 319    unsigned start;
 320
 321    do {
 322        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 323        icount = cpu_get_icount_locked();
 324    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 325
 326    return icount;
 327}
 328
 329int64_t cpu_icount_to_ns(int64_t icount)
 330{
 331    return icount << atomic_read(&timers_state.icount_time_shift);
 332}
 333
 334static int64_t cpu_get_ticks_locked(void)
 335{
 336    int64_t ticks = timers_state.cpu_ticks_offset;
 337    if (timers_state.cpu_ticks_enabled) {
 338        ticks += cpu_get_host_ticks();
 339    }
 340
 341    if (timers_state.cpu_ticks_prev > ticks) {
 342        /* Non increasing ticks may happen if the host uses software suspend.  */
 343        timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 344        ticks = timers_state.cpu_ticks_prev;
 345    }
 346
 347    timers_state.cpu_ticks_prev = ticks;
 348    return ticks;
 349}
 350
 351/* return the time elapsed in VM between vm_start and vm_stop.  Unless
 352 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 353 * counter.
 354 */
 355int64_t cpu_get_ticks(void)
 356{
 357    int64_t ticks;
 358
 359    if (use_icount) {
 360        return cpu_get_icount();
 361    }
 362
 363    qemu_spin_lock(&timers_state.vm_clock_lock);
 364    ticks = cpu_get_ticks_locked();
 365    qemu_spin_unlock(&timers_state.vm_clock_lock);
 366    return ticks;
 367}
 368
 369static int64_t cpu_get_clock_locked(void)
 370{
 371    int64_t time;
 372
 373    time = timers_state.cpu_clock_offset;
 374    if (timers_state.cpu_ticks_enabled) {
 375        time += get_clock();
 376    }
 377
 378    return time;
 379}
 380
 381/* Return the monotonic time elapsed in VM, i.e.,
 382 * the time between vm_start and vm_stop
 383 */
 384int64_t cpu_get_clock(void)
 385{
 386    int64_t ti;
 387    unsigned start;
 388
 389    do {
 390        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 391        ti = cpu_get_clock_locked();
 392    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 393
 394    return ti;
 395}
 396
 397/* enable cpu_get_ticks()
 398 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 399 */
 400void cpu_enable_ticks(void)
 401{
 402    seqlock_write_lock(&timers_state.vm_clock_seqlock,
 403                       &timers_state.vm_clock_lock);
 404    if (!timers_state.cpu_ticks_enabled) {
 405        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 406        timers_state.cpu_clock_offset -= get_clock();
 407        timers_state.cpu_ticks_enabled = 1;
 408    }
 409    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 410                       &timers_state.vm_clock_lock);
 411}
 412
 413/* disable cpu_get_ticks() : the clock is stopped. You must not call
 414 * cpu_get_ticks() after that.
 415 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 416 */
 417void cpu_disable_ticks(void)
 418{
 419    seqlock_write_lock(&timers_state.vm_clock_seqlock,
 420                       &timers_state.vm_clock_lock);
 421    if (timers_state.cpu_ticks_enabled) {
 422        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 423        timers_state.cpu_clock_offset = cpu_get_clock_locked();
 424        timers_state.cpu_ticks_enabled = 0;
 425    }
 426    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 427                         &timers_state.vm_clock_lock);
 428}
 429
 430/* Correlation between real and virtual time is always going to be
 431   fairly approximate, so ignore small variation.
 432   When the guest is idle real and virtual time will be aligned in
 433   the IO wait loop.  */
 434#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 435
 436static void icount_adjust(void)
 437{
 438    int64_t cur_time;
 439    int64_t cur_icount;
 440    int64_t delta;
 441
 442    /* Protected by TimersState mutex.  */
 443    static int64_t last_delta;
 444
 445    /* If the VM is not running, then do nothing.  */
 446    if (!runstate_is_running()) {
 447        return;
 448    }
 449
 450    seqlock_write_lock(&timers_state.vm_clock_seqlock,
 451                       &timers_state.vm_clock_lock);
 452    cur_time = cpu_get_clock_locked();
 453    cur_icount = cpu_get_icount_locked();
 454
 455    delta = cur_icount - cur_time;
 456    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 457    if (delta > 0
 458        && last_delta + ICOUNT_WOBBLE < delta * 2
 459        && timers_state.icount_time_shift > 0) {
 460        /* The guest is getting too far ahead.  Slow time down.  */
 461        atomic_set(&timers_state.icount_time_shift,
 462                   timers_state.icount_time_shift - 1);
 463    }
 464    if (delta < 0
 465        && last_delta - ICOUNT_WOBBLE > delta * 2
 466        && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
 467        /* The guest is getting too far behind.  Speed time up.  */
 468        atomic_set(&timers_state.icount_time_shift,
 469                   timers_state.icount_time_shift + 1);
 470    }
 471    last_delta = delta;
 472    atomic_set_i64(&timers_state.qemu_icount_bias,
 473                   cur_icount - (timers_state.qemu_icount
 474                                 << timers_state.icount_time_shift));
 475    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 476                         &timers_state.vm_clock_lock);
 477}
 478
 479static void icount_adjust_rt(void *opaque)
 480{
 481    timer_mod(timers_state.icount_rt_timer,
 482              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 483    icount_adjust();
 484}
 485
 486static void icount_adjust_vm(void *opaque)
 487{
 488    timer_mod(timers_state.icount_vm_timer,
 489                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 490                   NANOSECONDS_PER_SECOND / 10);
 491    icount_adjust();
 492}
 493
 494static int64_t qemu_icount_round(int64_t count)
 495{
 496    int shift = atomic_read(&timers_state.icount_time_shift);
 497    return (count + (1 << shift) - 1) >> shift;
 498}
 499
 500static void icount_warp_rt(void)
 501{
 502    unsigned seq;
 503    int64_t warp_start;
 504
 505    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 506     * changes from -1 to another value, so the race here is okay.
 507     */
 508    do {
 509        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 510        warp_start = timers_state.vm_clock_warp_start;
 511    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 512
 513    if (warp_start == -1) {
 514        return;
 515    }
 516
 517    seqlock_write_lock(&timers_state.vm_clock_seqlock,
 518                       &timers_state.vm_clock_lock);
 519    if (runstate_is_running()) {
 520        int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
 521                                            cpu_get_clock_locked());
 522        int64_t warp_delta;
 523
 524        warp_delta = clock - timers_state.vm_clock_warp_start;
 525        if (use_icount == 2) {
 526            /*
 527             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 528             * far ahead of real time.
 529             */
 530            int64_t cur_icount = cpu_get_icount_locked();
 531            int64_t delta = clock - cur_icount;
 532            warp_delta = MIN(warp_delta, delta);
 533        }
 534        atomic_set_i64(&timers_state.qemu_icount_bias,
 535                       timers_state.qemu_icount_bias + warp_delta);
 536    }
 537    timers_state.vm_clock_warp_start = -1;
 538    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 539                       &timers_state.vm_clock_lock);
 540
 541    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 542        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 543    }
 544}
 545
/*
 * Callback of icount_warp_timer (installed by configure_icount()): account
 * the pending warp via icount_warp_rt().  @opaque is unused.
 */
static void icount_timer_cb(void *opaque)
{
    /* No need for a checkpoint because the timer already synchronizes
     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
     */
    icount_warp_rt();
}
 553
 554void qtest_clock_warp(int64_t dest)
 555{
 556    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 557    AioContext *aio_context;
 558    assert(qtest_enabled());
 559    aio_context = qemu_get_aio_context();
 560    while (clock < dest) {
 561        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 562                                                      QEMU_TIMER_ATTR_ALL);
 563        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 564
 565        seqlock_write_lock(&timers_state.vm_clock_seqlock,
 566                           &timers_state.vm_clock_lock);
 567        atomic_set_i64(&timers_state.qemu_icount_bias,
 568                       timers_state.qemu_icount_bias + warp);
 569        seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 570                             &timers_state.vm_clock_lock);
 571
 572        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 573        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 574        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 575    }
 576    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 577}
 578
 579void qemu_start_warp_timer(void)
 580{
 581    int64_t clock;
 582    int64_t deadline;
 583
 584    if (!use_icount) {
 585        return;
 586    }
 587
 588    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 589     * do not fire, so computing the deadline does not make sense.
 590     */
 591    if (!runstate_is_running()) {
 592        return;
 593    }
 594
 595    if (replay_mode != REPLAY_MODE_PLAY) {
 596        if (!all_cpu_threads_idle()) {
 597            return;
 598        }
 599
 600        if (qtest_enabled()) {
 601            /* When testing, qtest commands advance icount.  */
 602            return;
 603        }
 604
 605        replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
 606    } else {
 607        /* warp clock deterministically in record/replay mode */
 608        if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
 609            /* vCPU is sleeping and warp can't be started.
 610               It is probably a race condition: notification sent
 611               to vCPU was processed in advance and vCPU went to sleep.
 612               Therefore we have to wake it up for doing someting. */
 613            if (replay_has_checkpoint()) {
 614                qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 615            }
 616            return;
 617        }
 618    }
 619
 620    /* We want to use the earliest deadline from ALL vm_clocks */
 621    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 622    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
 623                                          ~QEMU_TIMER_ATTR_EXTERNAL);
 624    if (deadline < 0) {
 625        static bool notified;
 626        if (!icount_sleep && !notified) {
 627            warn_report("icount sleep disabled and no active timers");
 628            notified = true;
 629        }
 630        return;
 631    }
 632
 633    if (deadline > 0) {
 634        /*
 635         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 636         * sleep.  Otherwise, the CPU might be waiting for a future timer
 637         * interrupt to wake it up, but the interrupt never comes because
 638         * the vCPU isn't running any insns and thus doesn't advance the
 639         * QEMU_CLOCK_VIRTUAL.
 640         */
 641        if (!icount_sleep) {
 642            /*
 643             * We never let VCPUs sleep in no sleep icount mode.
 644             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 645             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 646             * It is useful when we want a deterministic execution time,
 647             * isolated from host latencies.
 648             */
 649            seqlock_write_lock(&timers_state.vm_clock_seqlock,
 650                               &timers_state.vm_clock_lock);
 651            atomic_set_i64(&timers_state.qemu_icount_bias,
 652                           timers_state.qemu_icount_bias + deadline);
 653            seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 654                                 &timers_state.vm_clock_lock);
 655            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 656        } else {
 657            /*
 658             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
 659             * "real" time, (related to the time left until the next event) has
 660             * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
 661             * This avoids that the warps are visible externally; for example,
 662             * you will not be sending network packets continuously instead of
 663             * every 100ms.
 664             */
 665            seqlock_write_lock(&timers_state.vm_clock_seqlock,
 666                               &timers_state.vm_clock_lock);
 667            if (timers_state.vm_clock_warp_start == -1
 668                || timers_state.vm_clock_warp_start > clock) {
 669                timers_state.vm_clock_warp_start = clock;
 670            }
 671            seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 672                                 &timers_state.vm_clock_lock);
 673            timer_mod_anticipate(timers_state.icount_warp_timer,
 674                                 clock + deadline);
 675        }
 676    } else if (deadline == 0) {
 677        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 678    }
 679}
 680
 681static void qemu_account_warp_timer(void)
 682{
 683    if (!use_icount || !icount_sleep) {
 684        return;
 685    }
 686
 687    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 688     * do not fire, so computing the deadline does not make sense.
 689     */
 690    if (!runstate_is_running()) {
 691        return;
 692    }
 693
 694    /* warp clock deterministically in record/replay mode */
 695    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 696        return;
 697    }
 698
 699    timer_del(timers_state.icount_warp_timer);
 700    icount_warp_rt();
 701}
 702
 703static bool icount_state_needed(void *opaque)
 704{
 705    return use_icount;
 706}
 707
 708static bool warp_timer_state_needed(void *opaque)
 709{
 710    TimersState *s = opaque;
 711    return s->icount_warp_timer != NULL;
 712}
 713
 714static bool adjust_timers_state_needed(void *opaque)
 715{
 716    TimersState *s = opaque;
 717    return s->icount_rt_timer != NULL;
 718}
 719
/*
 * Subsection for warp timer migration.  It is optional because the warp
 * timer may not have been created; warp_timer_state_needed() decides
 * whether it is included.
 */
static const VMStateDescription icount_vmstate_warp_timer = {
    .name = "timer/icount/warp_timer",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = warp_timer_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(vm_clock_warp_start, TimersState),
        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 734
/*
 * Subsection for the two icount adjustment timers.  Included only when
 * adjust_timers_state_needed() reports that icount_rt_timer exists.
 */
static const VMStateDescription icount_vmstate_adjust_timers = {
    .name = "timer/icount/timers",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = adjust_timers_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
        VMSTATE_END_OF_LIST()
    }
};
 746
/*
 * This is a subsection for icount migration.  It carries the icount bias
 * and counter, is included only when icount_state_needed() says so, and
 * nests the warp timer and adjust timers subsections.
 */
static const VMStateDescription icount_vmstate_timers = {
    .name = "timer/icount",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = icount_state_needed,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(qemu_icount_bias, TimersState),
        VMSTATE_INT64(qemu_icount, TimersState),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_warp_timer,
        &icount_vmstate_adjust_timers,
        NULL
    }
};
 766
/*
 * Top-level migration description for TimersState, registered by
 * cpu_ticks_init().  cpu_clock_offset is only present from version 2 on;
 * the icount state travels in an optional subsection.
 */
static const VMStateDescription vmstate_timers = {
    .name = "timer",
    .version_id = 2,
    .minimum_version_id = 1,
    .fields = (VMStateField[]) {
        VMSTATE_INT64(cpu_ticks_offset, TimersState),
        /* 8 bytes of the stream that are no longer backed by a field. */
        VMSTATE_UNUSED(8),
        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &icount_vmstate_timers,
        NULL
    }
};
 782
 783static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 784{
 785    double pct;
 786    double throttle_ratio;
 787    int64_t sleeptime_ns, endtime_ns;
 788
 789    if (!cpu_throttle_get_percentage()) {
 790        return;
 791    }
 792
 793    pct = (double)cpu_throttle_get_percentage()/100;
 794    throttle_ratio = pct / (1 - pct);
 795    /* Add 1ns to fix double's rounding error (like 0.9999999...) */
 796    sleeptime_ns = (int64_t)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS + 1);
 797    endtime_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + sleeptime_ns;
 798    while (sleeptime_ns > 0 && !cpu->stop) {
 799        if (sleeptime_ns > SCALE_MS) {
 800            qemu_cond_timedwait(cpu->halt_cond, &qemu_global_mutex,
 801                                sleeptime_ns / SCALE_MS);
 802        } else {
 803            qemu_mutex_unlock_iothread();
 804            g_usleep(sleeptime_ns / SCALE_US);
 805            qemu_mutex_lock_iothread();
 806        }
 807        sleeptime_ns = endtime_ns - qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 808    }
 809    atomic_set(&cpu->throttle_thread_scheduled, 0);
 810}
 811
 812static void cpu_throttle_timer_tick(void *opaque)
 813{
 814    CPUState *cpu;
 815    double pct;
 816
 817    /* Stop the timer if needed */
 818    if (!cpu_throttle_get_percentage()) {
 819        return;
 820    }
 821    CPU_FOREACH(cpu) {
 822        if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 823            async_run_on_cpu(cpu, cpu_throttle_thread,
 824                             RUN_ON_CPU_NULL);
 825        }
 826    }
 827
 828    pct = (double)cpu_throttle_get_percentage()/100;
 829    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 830                                   CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 831}
 832
 833void cpu_throttle_set(int new_throttle_pct)
 834{
 835    /* Ensure throttle percentage is within valid range */
 836    new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 837    new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 838
 839    atomic_set(&throttle_percentage, new_throttle_pct);
 840
 841    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 842                                       CPU_THROTTLE_TIMESLICE_NS);
 843}
 844
/*
 * Turn throttling off by resetting the percentage to zero.  The timer is
 * not touched here; cpu_throttle_timer_tick() stops re-arming it once it
 * sees a zero percentage.
 */
void cpu_throttle_stop(void)
{
    atomic_set(&throttle_percentage, 0);
}
 849
 850bool cpu_throttle_active(void)
 851{
 852    return (cpu_throttle_get_percentage() != 0);
 853}
 854
 855int cpu_throttle_get_percentage(void)
 856{
 857    return atomic_read(&throttle_percentage);
 858}
 859
 860void cpu_ticks_init(void)
 861{
 862    seqlock_init(&timers_state.vm_clock_seqlock);
 863    qemu_spin_init(&timers_state.vm_clock_lock);
 864    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 865    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 866                                           cpu_throttle_timer_tick, NULL);
 867}
 868
 869void configure_icount(QemuOpts *opts, Error **errp)
 870{
 871    const char *option;
 872    char *rem_str = NULL;
 873
 874    option = qemu_opt_get(opts, "shift");
 875    if (!option) {
 876        if (qemu_opt_get(opts, "align") != NULL) {
 877            error_setg(errp, "Please specify shift option when using align");
 878        }
 879        return;
 880    }
 881
 882    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 883    if (icount_sleep) {
 884        timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 885                                         icount_timer_cb, NULL);
 886    }
 887
 888    icount_align_option = qemu_opt_get_bool(opts, "align", false);
 889
 890    if (icount_align_option && !icount_sleep) {
 891        error_setg(errp, "align=on and sleep=off are incompatible");
 892    }
 893    if (strcmp(option, "auto") != 0) {
 894        errno = 0;
 895        timers_state.icount_time_shift = strtol(option, &rem_str, 0);
 896        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 897            error_setg(errp, "icount: Invalid shift value");
 898        }
 899        use_icount = 1;
 900        return;
 901    } else if (icount_align_option) {
 902        error_setg(errp, "shift=auto and align=on are incompatible");
 903    } else if (!icount_sleep) {
 904        error_setg(errp, "shift=auto and sleep=off are incompatible");
 905    }
 906
 907    use_icount = 2;
 908
 909    /* 125MIPS seems a reasonable initial guess at the guest speed.
 910       It will be corrected fairly quickly anyway.  */
 911    timers_state.icount_time_shift = 3;
 912
 913    /* Have both realtime and virtual time triggers for speed adjustment.
 914       The realtime trigger catches emulated time passing too slowly,
 915       the virtual time trigger catches emulated time passing too fast.
 916       Realtime triggers occur even when idle, so use them less frequently
 917       than VM triggers.  */
 918    timers_state.vm_clock_warp_start = -1;
 919    timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 920                                   icount_adjust_rt, NULL);
 921    timer_mod(timers_state.icount_rt_timer,
 922                   qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 923    timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 924                                        icount_adjust_vm, NULL);
 925    timer_mod(timers_state.icount_vm_timer,
 926                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 927                   NANOSECONDS_PER_SECOND / 10);
 928}
 929
 930/***********************************************************/
 931/* TCG vCPU kick timer
 932 *
 933 * The kick timer is responsible for moving single threaded vCPU
 * emulation on to the next vCPU. If more than one vCPU is running, a
 * timer event will force a cpu->exit so the next vCPU can get
 936 * scheduled.
 937 *
 938 * The timer is removed if all vCPUs are idle and restarted again once
 939 * idleness is complete.
 940 */
 941
/* Kick timer for the round-robin TCG thread; NULL until
 * start_tcg_kick_timer() creates it. */
static QEMUTimer *tcg_kick_vcpu_timer;
/* vCPU the round-robin thread is currently executing, or NULL;
 * read and written with the atomic helpers. */
static CPUState *tcg_current_rr_cpu;

/* Interval between two kicks, in nanoseconds (a tenth of a second). */
#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 946
 947static inline int64_t qemu_tcg_next_kick(void)
 948{
 949    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 950}
 951
 952/* Kick the currently round-robin scheduled vCPU to next */
 953static void qemu_cpu_kick_rr_next_cpu(void)
 954{
 955    CPUState *cpu;
 956    do {
 957        cpu = atomic_mb_read(&tcg_current_rr_cpu);
 958        if (cpu) {
 959            cpu_exit(cpu);
 960        }
 961    } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 962}
 963
 964/* Kick all RR vCPUs */
 965static void qemu_cpu_kick_rr_cpus(void)
 966{
 967    CPUState *cpu;
 968
 969    CPU_FOREACH(cpu) {
 970        cpu_exit(cpu);
 971    };
 972}
 973
/*
 * Empty work item.  It is queued with async_run_on_cpu() purely for the
 * side effect of having work pending on the target vCPU.
 */
static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
{
}
 977
 978void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 979{
 980    if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 981        qemu_notify_event();
 982        return;
 983    }
 984
 985    if (qemu_in_vcpu_thread()) {
 986        /* A CPU is currently running; kick it back out to the
 987         * tcg_cpu_exec() loop so it will recalculate its
 988         * icount deadline immediately.
 989         */
 990        qemu_cpu_kick(current_cpu);
 991    } else if (first_cpu) {
 992        /* qemu_cpu_kick is not enough to kick a halted CPU out of
 993         * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 994         * causes cpu_thread_is_idle to return false.  This way,
 995         * handle_icount_deadline can run.
 996         * If we have no CPUs at all for some reason, we don't
 997         * need to do anything.
 998         */
 999        async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
1000    }

1001}
1002
/*
 * Kick timer callback: re-arm the timer for the next period, then kick
 * the vCPU that the round-robin thread is currently running.
 * @opaque is unused.
 */
static void kick_tcg_thread(void *opaque)
{
    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
    qemu_cpu_kick_rr_next_cpu();
}
1008
1009static void start_tcg_kick_timer(void)
1010{
1011    assert(!mttcg_enabled);
1012    if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
1013        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
1014                                           kick_tcg_thread, NULL);
1015    }
1016    if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
1017        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
1018    }
1019}
1020
1021static void stop_tcg_kick_timer(void)
1022{
1023    assert(!mttcg_enabled);
1024    if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
1025        timer_del(tcg_kick_vcpu_timer);
1026    }
1027}
1028
1029/***********************************************************/
1030void hw_error(const char *fmt, ...)
1031{
1032    va_list ap;
1033    CPUState *cpu;
1034
1035    va_start(ap, fmt);
1036    fprintf(stderr, "qemu: hardware error: ");
1037    vfprintf(stderr, fmt, ap);
1038    fprintf(stderr, "\n");
1039    CPU_FOREACH(cpu) {
1040        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1041        cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1042    }
1043    va_end(ap);
1044    abort();
1045}
1046
1047void cpu_synchronize_all_states(void)
1048{
1049    CPUState *cpu;
1050
1051    CPU_FOREACH(cpu) {
1052        cpu_synchronize_state(cpu);
1053        /* TODO: move to cpu_synchronize_state() */
1054        if (hvf_enabled()) {
1055            hvf_cpu_synchronize_state(cpu);
1056        }
1057    }
1058}
1059
1060void cpu_synchronize_all_post_reset(void)
1061{
1062    CPUState *cpu;
1063
1064    CPU_FOREACH(cpu) {
1065        cpu_synchronize_post_reset(cpu);
1066        /* TODO: move to cpu_synchronize_post_reset() */
1067        if (hvf_enabled()) {
1068            hvf_cpu_synchronize_post_reset(cpu);
1069        }
1070    }
1071}
1072
1073void cpu_synchronize_all_post_init(void)
1074{
1075    CPUState *cpu;
1076
1077    CPU_FOREACH(cpu) {
1078        cpu_synchronize_post_init(cpu);
1079        /* TODO: move to cpu_synchronize_post_init() */
1080        if (hvf_enabled()) {
1081            hvf_cpu_synchronize_post_init(cpu);
1082        }
1083    }
1084}
1085
1086void cpu_synchronize_all_pre_loadvm(void)
1087{
1088    CPUState *cpu;
1089
1090    CPU_FOREACH(cpu) {
1091        cpu_synchronize_pre_loadvm(cpu);
1092    }
1093}
1094
1095static int do_vm_stop(RunState state, bool send_stop)
1096{
1097    int ret = 0;
1098
1099    if (runstate_is_running()) {
1100        cpu_disable_ticks();
1101        pause_all_vcpus();
1102        runstate_set(state);
1103        vm_state_notify(0, state);
1104        if (send_stop) {
1105            qapi_event_send_stop();
1106        }
1107    }
1108
1109    bdrv_drain_all();
1110    ret = bdrv_flush_all();
1111
1112    return ret;
1113}
1114
1115/* Special vm_stop() variant for terminating the process.  Historically clients
1116 * did not expect a QMP STOP event and so we need to retain compatibility.
1117 */
1118int vm_shutdown(void)
1119{
1120    return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1121}
1122
1123static bool cpu_can_run(CPUState *cpu)
1124{
1125    if (cpu->stop) {
1126        return false;
1127    }
1128    if (cpu_is_stopped(cpu)) {
1129        return false;
1130    }
1131    return true;
1132}
1133
/*
 * Handle an EXCP_DEBUG exit from a vCPU: record @cpu as the CPU gdb
 * stopped on, raise a system debug request and mark the vCPU stopped.
 */
static void cpu_handle_guest_debug(CPUState *cpu)
{
    gdb_set_stop_cpu(cpu);
    qemu_system_debug_request();
    cpu->stopped = true;
}
1140
1141#ifdef CONFIG_LINUX
/*
 * Re-deliver SIGBUS with the default disposition.
 *
 * The default handler is restored, the signal is raised again and then
 * unblocked for the calling thread.  If execution continues past that
 * (or restoring the default handler failed), an error is printed and the
 * process aborts; this function is not expected to return.
 */
static void sigbus_reraise(void)
{
    sigset_t set;
    struct sigaction action;

    memset(&action, 0, sizeof(action));
    action.sa_handler = SIG_DFL;
    if (!sigaction(SIGBUS, &action, NULL)) {
        raise(SIGBUS);
        /* presumably SIGBUS is blocked here, so it only becomes
         * deliverable once unblocked -- TODO confirm */
        sigemptyset(&set);
        sigaddset(&set, SIGBUS);
        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
    }
    perror("Failed to re-raise SIGBUS!\n");
    abort();
}
1158
1159static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1160{
1161    if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1162        sigbus_reraise();
1163    }
1164
1165    if (current_cpu) {
1166        /* Called asynchronously in VCPU thread.  */
1167        if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1168            sigbus_reraise();
1169        }
1170    } else {
1171        /* Called synchronously (via signalfd) in main thread.  */
1172        if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1173            sigbus_reraise();
1174        }
1175    }
1176}
1177
1178static void qemu_init_sigbus(void)
1179{
1180    struct sigaction action;
1181
1182    memset(&action, 0, sizeof(action));
1183    action.sa_flags = SA_SIGINFO;
1184    action.sa_sigaction = sigbus_handler;
1185    sigaction(SIGBUS, &action, NULL);
1186
1187    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1188}
1189#else /* !CONFIG_LINUX */
/* Variant used when CONFIG_LINUX is not defined: no SIGBUS handling is
 * installed. */
static void qemu_init_sigbus(void)
{
}
1193#endif /* !CONFIG_LINUX */
1194
/* Identity of the thread that called qemu_init_cpu_loop(). */
static QemuThread io_thread;

/* cpu creation: signalled by vCPU threads when cpu->created changes */
static QemuCond qemu_cpu_cond;
/* system init: broadcast by qemu_cpu_stop(), waited on by
 * pause_all_vcpus() */
static QemuCond qemu_pause_cond;
1201
/*
 * One-time setup for the vCPU loop: install SIGBUS handling, initialise
 * the condition variables and the global mutex, and record the calling
 * thread in io_thread.
 */
void qemu_init_cpu_loop(void)
{
    qemu_init_sigbus();
    qemu_cond_init(&qemu_cpu_cond);
    qemu_cond_init(&qemu_pause_cond);
    qemu_mutex_init(&qemu_global_mutex);

    qemu_thread_get_self(&io_thread);
}
1211
/*
 * Run @func with @data on @cpu by delegating to do_run_on_cpu(), passing
 * the global mutex as the lock argument.
 */
void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
{
    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
}
1216
1217static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1218{
1219    if (kvm_destroy_vcpu(cpu) < 0) {
1220        error_report("kvm_destroy_vcpu failed");
1221        exit(EXIT_FAILURE);
1222    }
1223}
1224
/* TCG counterpart of the per-accelerator vCPU destroy hook; currently
 * there is nothing to tear down. */
static void qemu_tcg_destroy_vcpu(CPUState *cpu)
{
}
1228
/*
 * Move @cpu into the stopped state; must be called from @cpu's own thread.
 *
 * @cpu: vCPU to stop
 * @exit: when true, also call cpu_exit() on it
 *
 * The pending stop request is cleared, the vCPU is marked stopped, and
 * waiters on qemu_pause_cond are woken.
 */
static void qemu_cpu_stop(CPUState *cpu, bool exit)
{
    g_assert(qemu_cpu_is_self(cpu));
    cpu->stop = false;
    cpu->stopped = true;
    if (exit) {
        cpu_exit(cpu);
    }
    qemu_cond_broadcast(&qemu_pause_cond);
}
1239
/*
 * Housekeeping shared by the wait-for-event paths: clear the
 * "thread kicked" flag, honour a pending stop request, and run any work
 * queued for @cpu.
 */
static void qemu_wait_io_event_common(CPUState *cpu)
{
    atomic_mb_set(&cpu->thread_kicked, false);
    if (cpu->stop) {
        qemu_cpu_stop(cpu, false);
    }
    process_queued_cpu_work(cpu);
}
1248
/*
 * Wait step of the round-robin TCG thread.
 *
 * While every vCPU thread is idle, the kick timer is stopped and the
 * thread sleeps on first_cpu's halt condition (waiting on the condition
 * releases qemu_global_mutex).  Once there is something to do, the kick
 * timer is restarted and the common housekeeping runs for every vCPU.
 */
static void qemu_tcg_rr_wait_io_event(void)
{
    CPUState *cpu;

    while (all_cpu_threads_idle()) {
        stop_tcg_kick_timer();
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
    }

    start_tcg_kick_timer();

    CPU_FOREACH(cpu) {
        qemu_wait_io_event_common(cpu);
    }
}
1264
/*
 * Block the calling vCPU thread while @cpu is idle, then run the common
 * housekeeping.
 *
 * The plugin idle callback fires once before the first sleep and the
 * resume callback once afterwards, and only if the thread slept at all.
 */
static void qemu_wait_io_event(CPUState *cpu)
{
    bool slept = false;

    while (cpu_thread_is_idle(cpu)) {
        if (!slept) {
            slept = true;
            qemu_plugin_vcpu_idle_cb(cpu);
        }
        /* Waiting on the condition releases qemu_global_mutex. */
        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
    }
    if (slept) {
        qemu_plugin_vcpu_resume_cb(cpu);
    }

#ifdef _WIN32
    /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
    if (!tcg_enabled()) {
        SleepEx(0, TRUE);
    }
#endif
    qemu_wait_io_event_common(cpu);
}
1288
/*
 * Thread body for a KVM vCPU.
 *
 * @arg: the CPUState this thread drives
 *
 * Initialises the vCPU under the iothread lock, announces creation on
 * qemu_cpu_cond, then alternates between executing the guest and waiting
 * for events until the vCPU is unplugged and can no longer run.  On exit
 * the vCPU is destroyed and qemu_cpu_cond is signalled again.
 * Always returns NULL.
 */
static void *qemu_kvm_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    r = kvm_init_vcpu(cpu);
    if (r < 0) {
        /* r is a negative errno value; failure is fatal */
        error_report("kvm_init_vcpu failed: %s", strerror(-r));
        exit(1);
    }

    kvm_init_cpu_signals(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = kvm_cpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* teardown: the iothread lock is still held at this point */
    qemu_kvm_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1332
1333static void *qemu_dummy_cpu_thread_fn(void *arg)
1334{
1335#ifdef _WIN32
1336    error_report("qtest is not supported under Windows");
1337    exit(1);
1338#else
1339    CPUState *cpu = arg;
1340    sigset_t waitset;
1341    int r;
1342
1343    rcu_register_thread();
1344
1345    qemu_mutex_lock_iothread();
1346    qemu_thread_get_self(cpu->thread);
1347    cpu->thread_id = qemu_get_thread_id();
1348    cpu->can_do_io = 1;
1349    current_cpu = cpu;
1350
1351    sigemptyset(&waitset);
1352    sigaddset(&waitset, SIG_IPI);
1353
1354    /* signal CPU creation */
1355    cpu->created = true;
1356    qemu_cond_signal(&qemu_cpu_cond);
1357    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1358
1359    do {
1360        qemu_mutex_unlock_iothread();
1361        do {
1362            int sig;
1363            r = sigwait(&waitset, &sig);
1364        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1365        if (r == -1) {
1366            perror("sigwait");
1367            exit(1);
1368        }
1369        qemu_mutex_lock_iothread();
1370        qemu_wait_io_event(cpu);
1371    } while (!cpu->unplug);
1372
1373    qemu_mutex_unlock_iothread();
1374    rcu_unregister_thread();
1375    return NULL;
1376#endif
1377}
1378
1379static int64_t tcg_get_icount_limit(void)
1380{
1381    int64_t deadline;
1382
1383    if (replay_mode != REPLAY_MODE_PLAY) {
1384        /*
1385         * Include all the timers, because they may need an attention.
1386         * Too long CPU execution may create unnecessary delay in UI.
1387         */
1388        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1389                                              QEMU_TIMER_ATTR_ALL);
1390
1391        /* Maintain prior (possibly buggy) behaviour where if no deadline
1392         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1393         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1394         * nanoseconds.
1395         */
1396        if ((deadline < 0) || (deadline > INT32_MAX)) {
1397            deadline = INT32_MAX;
1398        }
1399
1400        return qemu_icount_round(deadline);
1401    } else {
1402        return replay_get_instructions();
1403    }
1404}
1405
1406static void handle_icount_deadline(void)
1407{
1408    assert(qemu_in_vcpu_thread());
1409    if (use_icount) {
1410        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL,
1411                                                      QEMU_TIMER_ATTR_ALL);
1412
1413        if (deadline == 0) {
1414            /* Wake up other AioContexts.  */
1415            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1416            qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1417        }
1418    }
1419}
1420
1421static void prepare_icount_for_run(CPUState *cpu)
1422{
1423    if (use_icount) {
1424        int insns_left;
1425
1426        /* These should always be cleared by process_icount_data after
1427         * each vCPU execution. However u16.high can be raised
1428         * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1429         */
1430        g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1431        g_assert(cpu->icount_extra == 0);
1432
1433        cpu->icount_budget = tcg_get_icount_limit();
1434        insns_left = MIN(0xffff, cpu->icount_budget);
1435        cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1436        cpu->icount_extra = cpu->icount_budget - insns_left;
1437
1438        replay_mutex_lock();
1439    }
1440}
1441
1442static void process_icount_data(CPUState *cpu)
1443{
1444    if (use_icount) {
1445        /* Account for executed instructions */
1446        cpu_update_icount(cpu);
1447
1448        /* Reset the counters */
1449        cpu_neg(cpu)->icount_decr.u16.low = 0;
1450        cpu->icount_extra = 0;
1451        cpu->icount_budget = 0;
1452
1453        replay_account_executed_instructions();
1454
1455        replay_mutex_unlock();
1456    }
1457}
1458
1459
1460static int tcg_cpu_exec(CPUState *cpu)
1461{
1462    int ret;
1463#ifdef CONFIG_PROFILER
1464    int64_t ti;
1465#endif
1466
1467    assert(tcg_enabled());
1468#ifdef CONFIG_PROFILER
1469    ti = profile_getclock();
1470#endif
1471    cpu_exec_start(cpu);
1472    ret = cpu_exec(cpu);
1473    cpu_exec_end(cpu);
1474#ifdef CONFIG_PROFILER
1475    atomic_set(&tcg_ctx->prof.cpu_exec_time,
1476               tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1477#endif
1478    return ret;
1479}
1480
1481/* Destroy any remaining vCPUs which have been unplugged and have
1482 * finished running
1483 */
1484static void deal_with_unplugged_cpus(void)
1485{
1486    CPUState *cpu;
1487
1488    CPU_FOREACH(cpu) {
1489        if (cpu->unplug && !cpu_can_run(cpu)) {
1490            qemu_tcg_destroy_vcpu(cpu);
1491            cpu->created = false;
1492            qemu_cond_signal(&qemu_cpu_cond);
1493            break;
1494        }
1495    }
1496}
1497
1498/* Single-threaded TCG
1499 *
1500 * In the single-threaded case each vCPU is simulated in turn. If
1501 * there is more than a single vCPU we create a simple timer to kick
1502 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1503 * This is done explicitly rather than relying on side-effects
1504 * elsewhere.
1505 */
1506
/*
 * Thread body of the single round-robin TCG thread.
 *
 * @arg: the CPUState the thread was created for
 *
 * After announcing creation, the thread waits until first_cpu is no
 * longer stopped, then loops forever: run the virtual-clock timers when
 * due, execute each runnable vCPU in turn until one has queued work or
 * an exit request, then wait for events and reap unplugged vCPUs.
 */
static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    assert(tcg_enabled());
    rcu_register_thread();
    tcg_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->created = true;
    cpu->can_do_io = 1;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    /* wait for initial kick-off after machine start */
    while (first_cpu->stopped) {
        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);

        /* process any pending work */
        CPU_FOREACH(cpu) {
            current_cpu = cpu;
            qemu_wait_io_event_common(cpu);
        }
    }

    start_tcg_kick_timer();

    cpu = first_cpu;

    /* process any pending work */
    cpu->exit_request = 1;

    while (1) {
        /* Drop the iothread lock before taking the replay mutex, then
         * re-take it, so the replay mutex is acquired first. */
        qemu_mutex_unlock_iothread();
        replay_mutex_lock();
        qemu_mutex_lock_iothread();
        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
        qemu_account_warp_timer();

        /* Run the timers here.  This is much more efficient than
         * waking up the I/O thread and waiting for completion.
         */
        handle_icount_deadline();

        replay_mutex_unlock();

        /* Wrap around to the head of the list after the last vCPU. */
        if (!cpu) {
            cpu = first_cpu;
        }

        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {

            atomic_mb_set(&tcg_current_rr_cpu, cpu);
            current_cpu = cpu;

            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);

            if (cpu_can_run(cpu)) {
                int r;

                /* Guest code runs without the iothread lock held. */
                qemu_mutex_unlock_iothread();
                prepare_icount_for_run(cpu);

                r = tcg_cpu_exec(cpu);

                process_icount_data(cpu);
                qemu_mutex_lock_iothread();

                if (r == EXCP_DEBUG) {
                    cpu_handle_guest_debug(cpu);
                    break;
                } else if (r == EXCP_ATOMIC) {
                    qemu_mutex_unlock_iothread();
                    cpu_exec_step_atomic(cpu);
                    qemu_mutex_lock_iothread();
                    break;
                }
            } else if (cpu->stop) {
                /* For an unplugged vCPU, step past it before leaving
                 * the inner loop. */
                if (cpu->unplug) {
                    cpu = CPU_NEXT(cpu);
                }
                break;
            }

            cpu = CPU_NEXT(cpu);
        } /* while (cpu && !cpu->exit_request).. */

        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
        atomic_set(&tcg_current_rr_cpu, NULL);

        if (cpu && cpu->exit_request) {
            atomic_mb_set(&cpu->exit_request, 0);
        }

        if (use_icount && all_cpu_threads_idle()) {
            /*
             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
             * in the main_loop, wake it up in order to start the warp timer.
             */
            qemu_notify_event();
        }

        qemu_tcg_rr_wait_io_event();
        deal_with_unplugged_cpus();
    }

    /* Not reached: the loop above has no exit. */
    rcu_unregister_thread();
    return NULL;
}
1620
1621static void *qemu_hax_cpu_thread_fn(void *arg)
1622{
1623    CPUState *cpu = arg;
1624    int r;
1625
1626    rcu_register_thread();
1627    qemu_mutex_lock_iothread();
1628    qemu_thread_get_self(cpu->thread);
1629
1630    cpu->thread_id = qemu_get_thread_id();
1631    cpu->created = true;
1632    current_cpu = cpu;
1633
1634    hax_init_vcpu(cpu);
1635    qemu_cond_signal(&qemu_cpu_cond);
1636    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1637
1638    do {
1639        if (cpu_can_run(cpu)) {
1640            r = hax_smp_cpu_exec(cpu);
1641            if (r == EXCP_DEBUG) {
1642                cpu_handle_guest_debug(cpu);
1643            }
1644        }
1645
1646        qemu_wait_io_event(cpu);
1647    } while (!cpu->unplug || cpu_can_run(cpu));
1648    rcu_unregister_thread();
1649    return NULL;
1650}
1651
1652/* The HVF-specific vCPU thread function. This one should only run when the host
1653 * CPU supports the VMX "unrestricted guest" feature. */
/*
 * @arg: the CPUState this thread drives
 *
 * Initialises the vCPU under the iothread lock, announces creation on
 * qemu_cpu_cond, then alternates between executing the guest and waiting
 * for events until the vCPU is unplugged and can no longer run.  On exit
 * the vCPU is destroyed and qemu_cpu_cond is signalled again.
 * Always returns NULL.
 */
static void *qemu_hvf_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;

    int r;

    assert(hvf_enabled());

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);

    cpu->thread_id = qemu_get_thread_id();
    cpu->can_do_io = 1;
    current_cpu = cpu;

    hvf_init_vcpu(cpu);

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = hvf_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        qemu_wait_io_event(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* teardown: the iothread lock is still held at this point */
    hvf_vcpu_destroy(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1695
/*
 * Thread body for a WHPX vCPU.
 *
 * @arg: the CPUState this thread drives
 *
 * Initialises the vCPU under the iothread lock (failure is fatal),
 * announces creation on qemu_cpu_cond, then alternates between executing
 * the guest and waiting for events until the vCPU is unplugged and can
 * no longer run.  On exit the vCPU is destroyed and qemu_cpu_cond is
 * signalled again.  Always returns NULL.
 */
static void *qemu_whpx_cpu_thread_fn(void *arg)
{
    CPUState *cpu = arg;
    int r;

    rcu_register_thread();

    qemu_mutex_lock_iothread();
    qemu_thread_get_self(cpu->thread);
    cpu->thread_id = qemu_get_thread_id();
    current_cpu = cpu;

    r = whpx_init_vcpu(cpu);
    if (r < 0) {
        /* r is a negative errno value */
        fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }

    /* signal CPU creation */
    cpu->created = true;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_guest_random_seed_thread_part2(cpu->random_seed);

    do {
        if (cpu_can_run(cpu)) {
            r = whpx_vcpu_exec(cpu);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(cpu);
            }
        }
        /* Open-coded idle wait followed by the common housekeeping,
         * rather than a call to qemu_wait_io_event(). */
        while (cpu_thread_is_idle(cpu)) {
            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
        }
        qemu_wait_io_event_common(cpu);
    } while (!cpu->unplug || cpu_can_run(cpu));

    /* teardown: the iothread lock is still held at this point */
    whpx_destroy_vcpu(cpu);
    cpu->created = false;
    qemu_cond_signal(&qemu_cpu_cond);
    qemu_mutex_unlock_iothread();
    rcu_unregister_thread();
    return NULL;
}
1739
1740#ifdef _WIN32
/* Empty APC routine; qemu_cpu_kick_thread() queues it to a vCPU thread
 * with QueueUserAPC(). */
static void CALLBACK dummy_apc_func(ULONG_PTR unused)
{
}
1744#endif
1745
1746/* Multi-threaded TCG
1747 *
1748 * In the multi-threaded case each vCPU has its own thread. The TLS
1749 * variable current_cpu can be used deep in the code to find the
1750 * current CPUState for a given thread.
1751 */
1752
1753static void *qemu_tcg_cpu_thread_fn(void *arg)
1754{
1755    CPUState *cpu = arg;
1756
1757    assert(tcg_enabled());
1758    g_assert(!use_icount);
1759
1760    rcu_register_thread();
1761    tcg_register_thread();
1762
1763    qemu_mutex_lock_iothread();
1764    qemu_thread_get_self(cpu->thread);
1765
1766    cpu->thread_id = qemu_get_thread_id();
1767    cpu->created = true;
1768    cpu->can_do_io = 1;
1769    current_cpu = cpu;
1770    qemu_cond_signal(&qemu_cpu_cond);
1771    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1772
1773    /* process any pending work */
1774    cpu->exit_request = 1;
1775
1776    do {
1777        if (cpu_can_run(cpu)) {
1778            int r;
1779            qemu_mutex_unlock_iothread();
1780            r = tcg_cpu_exec(cpu);
1781            qemu_mutex_lock_iothread();
1782            switch (r) {
1783            case EXCP_DEBUG:
1784                cpu_handle_guest_debug(cpu);
1785                break;
1786            case EXCP_HALTED:
1787                /* during start-up the vCPU is reset and the thread is
1788                 * kicked several times. If we don't ensure we go back
1789                 * to sleep in the halted state we won't cleanly
1790                 * start-up when the vCPU is enabled.
1791                 *
1792                 * cpu->halted should ensure we sleep in wait_io_event
1793                 */
1794                if (!cpu->halted) {
1795                    qemu_log_mask(LOG_PM, "CPU%d: EXCP_HALTED while halted=0\n",
1796                             cpu->halted);
1797                }
1798                break;
1799            case EXCP_ATOMIC:
1800                qemu_mutex_unlock_iothread();
1801                cpu_exec_step_atomic(cpu);
1802                qemu_mutex_lock_iothread();
1803            default:
1804                /* Ignore everything else? */
1805                break;
1806            }
1807        }
1808
1809        atomic_mb_set(&cpu->exit_request, 0);
1810        qemu_wait_io_event(cpu);
1811    } while (!cpu->unplug || cpu_can_run(cpu));
1812
1813    qemu_tcg_destroy_vcpu(cpu);
1814    cpu->created = false;
1815    qemu_cond_signal(&qemu_cpu_cond);
1816    qemu_mutex_unlock_iothread();
1817    rcu_unregister_thread();
1818    return NULL;
1819}
1820
1821static void qemu_cpu_kick_thread(CPUState *cpu)
1822{
1823#ifndef _WIN32
1824    int err;
1825
1826    if (cpu->thread_kicked) {
1827        return;
1828    }
1829    cpu->thread_kicked = true;
1830    err = pthread_kill(cpu->thread->thread, SIG_IPI);
1831    if (err && err != ESRCH) {
1832        fprintf(stderr, "qemu:%s: %s", __func__, strerror(err));
1833        exit(1);
1834    }
1835#else /* _WIN32 */
1836    if (!qemu_cpu_is_self(cpu)) {
1837        if (whpx_enabled()) {
1838            whpx_vcpu_kick(cpu);
1839        } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1840            fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1841                    __func__, GetLastError());
1842            exit(1);
1843        }
1844    }
1845#endif
1846}
1847
1848void qemu_cpu_kick(CPUState *cpu)
1849{
1850    qemu_cond_broadcast(cpu->halt_cond);
1851    if (tcg_enabled()) {
1852        if (qemu_tcg_mttcg_enabled()) {
1853            cpu_exit(cpu);
1854        } else {
1855            qemu_cpu_kick_rr_cpus();
1856        }
1857    } else {
1858        if (hax_enabled()) {
1859            /*
1860             * FIXME: race condition with the exit_request check in
1861             * hax_vcpu_hax_exec
1862             */
1863            cpu->exit_request = 1;
1864        }
1865        qemu_cpu_kick_thread(cpu);
1866    }
1867}
1868
/* Kick the host thread of the calling vCPU; current_cpu must be set. */
void qemu_cpu_kick_self(void)
{
    assert(current_cpu);
    qemu_cpu_kick_thread(current_cpu);
}
1874
/* Return true when the calling thread is the thread recorded in
 * @cpu->thread. */
bool qemu_cpu_is_self(CPUState *cpu)
{
    return qemu_thread_is_self(cpu->thread);
}
1879
1880bool qemu_in_vcpu_thread(void)
1881{
1882    return current_cpu && qemu_cpu_is_self(current_cpu);
1883}
1884
/* Per-thread flag: true while the calling thread holds the iothread lock
 * taken through qemu_mutex_lock_iothread_impl(). */
static __thread bool iothread_locked = false;
1886
/* Return the calling thread's iothread_locked flag. */
bool qemu_mutex_iothread_locked(void)
{
    return iothread_locked;
}
1891
1892/*
1893 * The BQL is taken from so many places that it is worth profiling the
1894 * callers directly, instead of funneling them all through a single function.
1895 */
/*
 * @file, @line: call site, forwarded to the lock function
 *
 * The calling thread must not already hold the lock; this is asserted.
 */
void qemu_mutex_lock_iothread_impl(const char *file, int line)
{
    QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);

    g_assert(!qemu_mutex_iothread_locked());
    bql_lock(&qemu_global_mutex, file, line);
    iothread_locked = true;
}
1904
/*
 * Release the iothread lock.  The calling thread must hold it; this is
 * asserted.  The per-thread flag is cleared before the mutex is unlocked.
 */
void qemu_mutex_unlock_iothread(void)
{
    g_assert(qemu_mutex_iothread_locked());
    iothread_locked = false;
    qemu_mutex_unlock(&qemu_global_mutex);
}
1911
1912static bool all_vcpus_paused(void)
1913{
1914    CPUState *cpu;
1915
1916    CPU_FOREACH(cpu) {
1917        if (!cpu->stopped) {
1918            return false;
1919        }
1920    }
1921
1922    return true;
1923}
1924
/*
 * Stop every vCPU and wait until all of them report stopped.
 *
 * The virtual clock is disabled first.  The calling vCPU, if any, is
 * stopped directly; every other vCPU gets a stop request and a kick.
 * The function then sleeps on qemu_pause_cond, re-kicking all vCPUs on
 * each wakeup, until all_vcpus_paused() holds.
 */
void pause_all_vcpus(void)
{
    CPUState *cpu;

    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
    CPU_FOREACH(cpu) {
        if (qemu_cpu_is_self(cpu)) {
            qemu_cpu_stop(cpu, true);
        } else {
            cpu->stop = true;
            qemu_cpu_kick(cpu);
        }
    }

    /* We need to drop the replay_lock so any vCPU threads woken up
     * can finish their replay tasks
     */
    replay_mutex_unlock();

    while (!all_vcpus_paused()) {
        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
        CPU_FOREACH(cpu) {
            qemu_cpu_kick(cpu);
        }
    }

    /* Re-take the replay mutex with the iothread lock dropped, so the
     * replay mutex is acquired before the iothread lock. */
    qemu_mutex_unlock_iothread();
    replay_mutex_lock();
    qemu_mutex_lock_iothread();
}
1955
/* Clear both the stop request and the stopped state of @cpu, then kick
 * it. */
void cpu_resume(CPUState *cpu)
{
    cpu->stop = false;
    cpu->stopped = false;
    qemu_cpu_kick(cpu);
}
1962
1963void resume_all_vcpus(void)
1964{
1965    CPUState *cpu;
1966
1967    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1968    CPU_FOREACH(cpu) {
1969        cpu_resume(cpu);
1970    }
1971}
1972
/*
 * Request removal of @cpu and wait for its thread to finish.
 *
 * The vCPU is flagged for stop and unplug and kicked.  The iothread lock
 * is dropped while joining the thread and re-taken afterwards.
 */
void cpu_remove_sync(CPUState *cpu)
{
    cpu->stop = true;
    cpu->unplug = true;
    qemu_cpu_kick(cpu);
    qemu_mutex_unlock_iothread();
    qemu_thread_join(cpu->thread);
    qemu_mutex_lock_iothread();
}
1982
/* Size, in bytes, of the temporary buffers used to form a vCPU thread
 * name (including the terminating NUL) */
#define VCPU_THREAD_NAME_SIZE 16
1985
/*
 * Set up the thread that will run @cpu under TCG.
 *
 * With MTTCG every vCPU gets its own thread and halt condition.
 * Otherwise the first call creates one shared round-robin thread and
 * later calls attach the vCPU to that thread and its halt condition.
 * TCG regions are initialised on the first call only.
 */
static void qemu_tcg_init_vcpu(CPUState *cpu)
{
    char thread_name[VCPU_THREAD_NAME_SIZE];
    /* Thread and halt condition shared by all vCPUs when MTTCG is off;
     * NULL until the first vCPU creates them. */
    static QemuCond *single_tcg_halt_cond;
    static QemuThread *single_tcg_cpu_thread;
    static int tcg_region_inited;

    assert(tcg_enabled());
    /*
     * Initialize TCG regions--once. Now is a good time, because:
     * (1) TCG's init context, prologue and target globals have been set up.
     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
     *     -accel flag is processed, so the check doesn't work then).
     */
    if (!tcg_region_inited) {

        tcg_region_inited = 1;
        tcg_region_init();
    }

    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
        cpu->thread = g_malloc0(sizeof(QemuThread));
        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
        qemu_cond_init(cpu->halt_cond);

        if (qemu_tcg_mttcg_enabled()) {
            /* create a thread per vCPU with TCG (MTTCG) */
            parallel_cpus = true;
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
                 cpu->cpu_index);

            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

        } else {
            /* share a single thread for all cpus with TCG */
            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
            qemu_thread_create(cpu->thread, thread_name,
                               qemu_tcg_rr_cpu_thread_fn,
                               cpu, QEMU_THREAD_JOINABLE);

            /* Remember them so that later vCPUs can attach. */
            single_tcg_halt_cond = cpu->halt_cond;
            single_tcg_cpu_thread = cpu->thread;
        }
#ifdef _WIN32
        cpu->hThread = qemu_thread_get_handle(cpu->thread);
#endif
    } else {
        /* For non-MTTCG cases we share the thread */
        cpu->thread = single_tcg_cpu_thread;
        cpu->halt_cond = single_tcg_halt_cond;
        cpu->thread_id = first_cpu->thread_id;
        cpu->can_do_io = 1;
        /* No new thread will announce creation, so mark it here. */
        cpu->created = true;
    }
}
2041
2042static void qemu_hax_start_vcpu(CPUState *cpu)
2043{
2044    char thread_name[VCPU_THREAD_NAME_SIZE];
2045
2046    cpu->thread = g_malloc0(sizeof(QemuThread));
2047    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2048    qemu_cond_init(cpu->halt_cond);
2049
2050    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2051             cpu->cpu_index);
2052    qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2053                       cpu, QEMU_THREAD_JOINABLE);
2054#ifdef _WIN32
2055    cpu->hThread = qemu_thread_get_handle(cpu->thread);
2056#endif
2057}
2058
2059static void qemu_kvm_start_vcpu(CPUState *cpu)
2060{
2061    char thread_name[VCPU_THREAD_NAME_SIZE];
2062
2063    cpu->thread = g_malloc0(sizeof(QemuThread));
2064    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2065    qemu_cond_init(cpu->halt_cond);
2066    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2067             cpu->cpu_index);
2068    qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2069                       cpu, QEMU_THREAD_JOINABLE);
2070}
2071
2072static void qemu_hvf_start_vcpu(CPUState *cpu)
2073{
2074    char thread_name[VCPU_THREAD_NAME_SIZE];
2075
2076    /* HVF currently does not support TCG, and only runs in
2077     * unrestricted-guest mode. */
2078    assert(hvf_enabled());
2079
2080    cpu->thread = g_malloc0(sizeof(QemuThread));
2081    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2082    qemu_cond_init(cpu->halt_cond);
2083
2084    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2085             cpu->cpu_index);
2086    qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2087                       cpu, QEMU_THREAD_JOINABLE);
2088}
2089
2090static void qemu_whpx_start_vcpu(CPUState *cpu)
2091{
2092    char thread_name[VCPU_THREAD_NAME_SIZE];
2093
2094    cpu->thread = g_malloc0(sizeof(QemuThread));
2095    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2096    qemu_cond_init(cpu->halt_cond);
2097    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2098             cpu->cpu_index);
2099    qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2100                       cpu, QEMU_THREAD_JOINABLE);
2101#ifdef _WIN32
2102    cpu->hThread = qemu_thread_get_handle(cpu->thread);
2103#endif
2104}
2105
2106static void qemu_dummy_start_vcpu(CPUState *cpu)
2107{
2108    char thread_name[VCPU_THREAD_NAME_SIZE];
2109
2110    cpu->thread = g_malloc0(sizeof(QemuThread));
2111    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2112    qemu_cond_init(cpu->halt_cond);
2113    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2114             cpu->cpu_index);
2115    qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2116                       QEMU_THREAD_JOINABLE);
2117}
2118
2119void qemu_init_vcpu(CPUState *cpu)
2120{
2121    MachineState *ms = MACHINE(qdev_get_machine());
2122
2123    cpu->nr_cores = ms->smp.cores;
2124    cpu->nr_threads =  ms->smp.threads;
2125    cpu->stopped = true;
2126    cpu->random_seed = qemu_guest_random_seed_thread_part1();
2127
2128    if (!cpu->as) {
2129        /* If the target cpu hasn't set up any address spaces itself,
2130         * give it the default one.
2131         */
2132        cpu->num_ases = 1;
2133        cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2134    }
2135
2136    if (kvm_enabled()) {
2137        qemu_kvm_start_vcpu(cpu);
2138    } else if (hax_enabled()) {
2139        qemu_hax_start_vcpu(cpu);
2140    } else if (hvf_enabled()) {
2141        qemu_hvf_start_vcpu(cpu);
2142    } else if (tcg_enabled()) {
2143        qemu_tcg_init_vcpu(cpu);
2144    } else if (whpx_enabled()) {
2145        qemu_whpx_start_vcpu(cpu);
2146    } else {
2147        qemu_dummy_start_vcpu(cpu);
2148    }
2149
2150    while (!cpu->created) {
2151        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2152    }
2153}
2154
2155void cpu_stop_current(void)
2156{
2157    if (current_cpu) {
2158        current_cpu->stop = true;
2159        cpu_exit(current_cpu);
2160    }
2161}
2162
2163void vm_stop_from_timer(RunState state)
2164{
2165    qemu_system_vmstop_request_prepare();
2166    qemu_system_vmstop_request(state);
2167    /*
2168     * FIXME: should not return to device code in case
2169     * vm_stop() has been requested.
2170     */
2171    cpu_stop_current();
2172}
2173
2174int vm_stop(RunState state)
2175{
2176    if (qemu_in_vcpu_thread()) {
2177        qemu_system_vmstop_request_prepare();
2178        qemu_system_vmstop_request(state);
2179        /*
2180         * FIXME: should not return to device code in case
2181         * vm_stop() has been requested.
2182         */
2183        cpu_stop_current();
2184        return 0;
2185    }
2186
2187    return do_vm_stop(state, true);
2188}
2189
2190/**
2191 * Prepare for (re)starting the VM.
2192 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2193 * running or in case of an error condition), 0 otherwise.
2194 */
2195int vm_prepare_start(void)
2196{
2197    RunState requested;
2198
2199    qemu_vmstop_requested(&requested);
2200    if (runstate_is_running() && requested == RUN_STATE__MAX) {
2201        return -1;
2202    }
2203
2204    /* Ensure that a STOP/RESUME pair of events is emitted if a
2205     * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2206     * example, according to documentation is always followed by
2207     * the STOP event.
2208     */
2209    if (runstate_is_running()) {
2210        qapi_event_send_stop();
2211        qapi_event_send_resume();
2212        return -1;
2213    }
2214
2215    /* We are sending this now, but the CPUs will be resumed shortly later */
2216    qapi_event_send_resume();
2217
2218    cpu_enable_ticks();
2219    runstate_set(RUN_STATE_RUNNING);
2220    vm_state_notify(1, RUN_STATE_RUNNING);
2221    return 0;
2222}
2223
/*
 * Start (or restart) the VM: resume all vCPUs, but only when
 * vm_prepare_start() returns 0.
 */
void vm_start(void)
{
    if (vm_prepare_start() == 0) {
        resume_all_vcpus();
    }
}
2230
2231/* does a state transition even if the VM is already stopped,
2232   current state is forgotten forever */
2233int vm_stop_force_state(RunState state)
2234{
2235    if (runstate_is_running()) {
2236        return vm_stop(state);
2237    } else {
2238        runstate_set(state);
2239
2240        bdrv_drain_all();
2241        /* Make sure to return an error if the flush in a previous vm_stop()
2242         * failed. */
2243        return bdrv_flush_all();
2244    }
2245}
2246
/*
 * Print the list of CPU models when the target provides a cpu_list macro;
 * otherwise do nothing. @optarg is not used.
 */
void list_cpus(const char *optarg)
{
#if defined(cpu_list)
    cpu_list();
#endif
    /* XXX: implement xxx_cpu_list for targets that still miss it */
}
2254
2255void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2256                 bool has_cpu, int64_t cpu_index, Error **errp)
2257{
2258    FILE *f;
2259    uint32_t l;
2260    CPUState *cpu;
2261    uint8_t buf[1024];
2262    int64_t orig_addr = addr, orig_size = size;
2263
2264    if (!has_cpu) {
2265        cpu_index = 0;
2266    }
2267
2268    cpu = qemu_get_cpu(cpu_index);
2269    if (cpu == NULL) {
2270        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2271                   "a CPU number");
2272        return;
2273    }
2274
2275    f = fopen(filename, "wb");
2276    if (!f) {
2277        error_setg_file_open(errp, errno, filename);
2278        return;
2279    }
2280
2281    while (size != 0) {
2282        l = sizeof(buf);
2283        if (l > size)
2284            l = size;
2285        if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2286            error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2287                             " specified", orig_addr, orig_size);
2288            goto exit;
2289        }
2290        if (fwrite(buf, 1, l, f) != l) {
2291            error_setg(errp, QERR_IO_ERROR);
2292            goto exit;
2293        }
2294        addr += l;
2295        size -= l;
2296    }
2297
2298exit:
2299    fclose(f);
2300}
2301
2302void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2303                  Error **errp)
2304{
2305    FILE *f;
2306    uint32_t l;
2307    uint8_t buf[1024];
2308
2309    f = fopen(filename, "wb");
2310    if (!f) {
2311        error_setg_file_open(errp, errno, filename);
2312        return;
2313    }
2314
2315    while (size != 0) {
2316        l = sizeof(buf);
2317        if (l > size)
2318            l = size;
2319        cpu_physical_memory_read(addr, buf, l);
2320        if (fwrite(buf, 1, l, f) != l) {
2321            error_setg(errp, QERR_IO_ERROR);
2322            goto exit;
2323        }
2324        addr += l;
2325        size -= l;
2326    }
2327
2328exit:
2329    fclose(f);
2330}
2331
2332void qmp_inject_nmi(Error **errp)
2333{
2334    nmi_monitor_handle(monitor_get_cpu_index(), errp);
2335}
2336
2337void dump_drift_info(void)
2338{
2339    if (!use_icount) {
2340        return;
2341    }
2342
2343    qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2344                (cpu_get_clock() - cpu_get_icount())/SCALE_MS);
2345    if (icount_align_option) {
2346        qemu_printf("Max guest delay     %"PRIi64" ms\n",
2347                    -max_delay / SCALE_MS);
2348        qemu_printf("Max guest advance   %"PRIi64" ms\n",
2349                    max_advance / SCALE_MS);
2350    } else {
2351        qemu_printf("Max guest delay     NA\n");
2352        qemu_printf("Max guest advance   NA\n");
2353    }
2354}
2355