qemu/cpus.c
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "qemu-common.h"
  27#include "qemu/config-file.h"
  28#include "monitor/monitor.h"
  29#include "qapi/error.h"
  30#include "qapi/qapi-commands-misc.h"
  31#include "qapi/qapi-events-run-state.h"
  32#include "qapi/qmp/qerror.h"
  33#include "qemu/error-report.h"
  34#include "qemu/qemu-print.h"
  35#include "sysemu/tcg.h"
  36#include "sysemu/block-backend.h"
  37#include "exec/gdbstub.h"
  38#include "sysemu/dma.h"
  39#include "sysemu/hw_accel.h"
  40#include "sysemu/kvm.h"
  41#include "sysemu/hax.h"
  42#include "sysemu/hvf.h"
  43#include "sysemu/whpx.h"
  44#include "exec/exec-all.h"
  45
  46#include "qemu/thread.h"
  47#include "sysemu/cpus.h"
  48#include "sysemu/qtest.h"
  49#include "qemu/main-loop.h"
  50#include "qemu/option.h"
  51#include "qemu/bitmap.h"
  52#include "qemu/seqlock.h"
  53#include "qemu/guest-random.h"
  54#include "tcg.h"
  55#include "hw/nmi.h"
  56#include "sysemu/replay.h"
  57#include "hw/boards.h"
  58
  59#ifdef CONFIG_LINUX
  60
  61#include <sys/prctl.h>
  62
  63#ifndef PR_MCE_KILL
  64#define PR_MCE_KILL 33
  65#endif
  66
  67#ifndef PR_MCE_KILL_SET
  68#define PR_MCE_KILL_SET 1
  69#endif
  70
  71#ifndef PR_MCE_KILL_EARLY
  72#define PR_MCE_KILL_EARLY 1
  73#endif
  74
  75#endif /* CONFIG_LINUX */
  76
  77int64_t max_delay;
  78int64_t max_advance;
  79
  80/* vcpu throttling controls */
  81static QEMUTimer *throttle_timer;
  82static unsigned int throttle_percentage;
  83
  84#define CPU_THROTTLE_PCT_MIN 1
  85#define CPU_THROTTLE_PCT_MAX 99
  86#define CPU_THROTTLE_TIMESLICE_NS 10000000
  87
  88bool cpu_is_stopped(CPUState *cpu)
  89{
  90    return cpu->stopped || !runstate_is_running();
  91}
  92
  93static bool cpu_thread_is_idle(CPUState *cpu)
  94{
  95    if (cpu->stop || cpu->queued_work_first) {
  96        return false;
  97    }
  98    if (cpu_is_stopped(cpu)) {
  99        return true;
 100    }
 101    if (!cpu->halted || cpu_has_work(cpu) ||
 102        kvm_halt_in_kernel()) {
 103        return false;
 104    }
 105    return true;
 106}
 107
 108static bool all_cpu_threads_idle(void)
 109{
 110    CPUState *cpu;
 111
 112    CPU_FOREACH(cpu) {
 113        if (!cpu_thread_is_idle(cpu)) {
 114            return false;
 115        }
 116    }
 117    return true;
 118}
 119
 120/***********************************************************/
 121/* guest cycle counter */
 122
 123/* Protected by TimersState seqlock */
 124
 125static bool icount_sleep = true;
 126/* Arbitrarily pick 1MIPS as the minimum allowable speed.  */
 127#define MAX_ICOUNT_SHIFT 10
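/*
 * With cpu_icount_to_ns() below, a shift of 10 credits each instruction
 * with 2^10 = 1024 ns of virtual time, i.e. roughly one instruction per
 * microsecond (~1 MIPS).
 */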
 128
 129typedef struct TimersState {
 130    /* Protected by BQL.  */
 131    int64_t cpu_ticks_prev;
 132    int64_t cpu_ticks_offset;
 133
  134    /* Protect fields that can be read outside the BQL (via the
  135     * seqlock) and written from multiple threads (via the spinlock).
 136     */
 137    QemuSeqLock vm_clock_seqlock;
 138    QemuSpin vm_clock_lock;
 139
 140    int16_t cpu_ticks_enabled;
 141
 142    /* Conversion factor from emulated instructions to virtual clock ticks.  */
 143    int16_t icount_time_shift;
 144
 145    /* Compensate for varying guest execution speed.  */
 146    int64_t qemu_icount_bias;
 147
 148    int64_t vm_clock_warp_start;
 149    int64_t cpu_clock_offset;
 150
 151    /* Only written by TCG thread */
 152    int64_t qemu_icount;
 153
 154    /* for adjusting icount */
 155    QEMUTimer *icount_rt_timer;
 156    QEMUTimer *icount_vm_timer;
 157    QEMUTimer *icount_warp_timer;
 158} TimersState;
 159
 160static TimersState timers_state;
 161bool mttcg_enabled;
 162
 163/*
 164 * We default to false if we know other options have been enabled
 165 * which are currently incompatible with MTTCG. Otherwise when each
 166 * guest (target) has been updated to support:
 167 *   - atomic instructions
 168 *   - memory ordering primitives (barriers)
 169 * they can set the appropriate CONFIG flags in ${target}-softmmu.mak
 170 *
 171 * Once a guest architecture has been converted to the new primitives
 172 * there are two remaining limitations to check.
 173 *
 174 * - The guest can't be oversized (e.g. 64 bit guest on 32 bit host)
 175 * - The host must have a stronger memory order than the guest
 176 *
 177 * It may be possible in future to support strong guests on weak hosts
 178 * but that will require tagging all load/stores in a guest with their
 179 * implicit memory order requirements which would likely slow things
 180 * down a lot.
 181 */
 182
 183static bool check_tcg_memory_orders_compatible(void)
 184{
 185#if defined(TCG_GUEST_DEFAULT_MO) && defined(TCG_TARGET_DEFAULT_MO)
 186    return (TCG_GUEST_DEFAULT_MO & ~TCG_TARGET_DEFAULT_MO) == 0;
 187#else
 188    return false;
 189#endif
 190}
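/*
 * Both macros are bitmasks of TCG_MO_* ordering guarantees: what the guest
 * architecture relies on and what the host TCG backend enforces.  The check
 * passes only when every ordering the guest needs is also provided by the
 * host, so a strongly ordered guest (e.g. x86) on a weakly ordered host
 * (e.g. AArch64) fails it and MTTCG stays off by default.
 */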
 191
 192static bool default_mttcg_enabled(void)
 193{
 194    if (use_icount || TCG_OVERSIZED_GUEST) {
 195        return false;
 196    } else {
 197#ifdef TARGET_SUPPORTS_MTTCG
 198        return check_tcg_memory_orders_compatible();
 199#else
 200        return false;
 201#endif
 202    }
 203}
 204
 205void qemu_tcg_configure(QemuOpts *opts, Error **errp)
 206{
 207    const char *t = qemu_opt_get(opts, "thread");
 208    if (t) {
 209        if (strcmp(t, "multi") == 0) {
 210            if (TCG_OVERSIZED_GUEST) {
 211                error_setg(errp, "No MTTCG when guest word size > hosts");
 212            } else if (use_icount) {
 213                error_setg(errp, "No MTTCG when icount is enabled");
 214            } else {
 215#ifndef TARGET_SUPPORTS_MTTCG
 216                warn_report("Guest not yet converted to MTTCG - "
 217                            "you may get unexpected results");
 218#endif
 219                if (!check_tcg_memory_orders_compatible()) {
 220                    warn_report("Guest expects a stronger memory ordering "
 221                                "than the host provides");
 222                    error_printf("This may cause strange/hard to debug errors\n");
 223                }
 224                mttcg_enabled = true;
 225            }
 226        } else if (strcmp(t, "single") == 0) {
 227            mttcg_enabled = false;
 228        } else {
 229            error_setg(errp, "Invalid 'thread' setting %s", t);
 230        }
 231    } else {
 232        mttcg_enabled = default_mttcg_enabled();
 233    }
 234}
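/*
 * The "thread" suboption above is normally supplied on the command line,
 * e.g. "-accel tcg,thread=multi" or "-accel tcg,thread=single"; when it is
 * omitted, default_mttcg_enabled() decides.
 */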
 235
 236/* The current number of executed instructions is based on what we
 237 * originally budgeted minus the current state of the decrementing
 238 * icount counters in extra/u16.low.
 239 */
 240static int64_t cpu_get_icount_executed(CPUState *cpu)
 241{
 242    return (cpu->icount_budget -
 243            (cpu_neg(cpu)->icount_decr.u16.low + cpu->icount_extra));
 244}
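/*
 * For example, if icount_budget was 10000 and the vCPU has counted
 * icount_decr.u16.low down to 100 while icount_extra still holds 400,
 * then 10000 - (100 + 400) = 9500 instructions have executed so far.
 */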
 245
 246/*
 247 * Update the global shared timer_state.qemu_icount to take into
 248 * account executed instructions. This is done by the TCG vCPU
 249 * thread so the main-loop can see time has moved forward.
 250 */
 251static void cpu_update_icount_locked(CPUState *cpu)
 252{
 253    int64_t executed = cpu_get_icount_executed(cpu);
 254    cpu->icount_budget -= executed;
 255
 256    atomic_set_i64(&timers_state.qemu_icount,
 257                   timers_state.qemu_icount + executed);
 258}
 259
 260/*
 261 * Update the global shared timer_state.qemu_icount to take into
 262 * account executed instructions. This is done by the TCG vCPU
 263 * thread so the main-loop can see time has moved forward.
 264 */
 265void cpu_update_icount(CPUState *cpu)
 266{
 267    seqlock_write_lock(&timers_state.vm_clock_seqlock,
 268                       &timers_state.vm_clock_lock);
 269    cpu_update_icount_locked(cpu);
 270    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 271                         &timers_state.vm_clock_lock);
 272}
 273
 274static int64_t cpu_get_icount_raw_locked(void)
 275{
 276    CPUState *cpu = current_cpu;
 277
 278    if (cpu && cpu->running) {
 279        if (!cpu->can_do_io) {
 280            error_report("Bad icount read");
 281            exit(1);
 282        }
 283        /* Take into account what has run */
 284        cpu_update_icount_locked(cpu);
 285    }
 286    /* The read is protected by the seqlock, but needs atomic64 to avoid UB */
 287    return atomic_read_i64(&timers_state.qemu_icount);
 288}
 289
 290static int64_t cpu_get_icount_locked(void)
 291{
 292    int64_t icount = cpu_get_icount_raw_locked();
 293    return atomic_read_i64(&timers_state.qemu_icount_bias) +
 294        cpu_icount_to_ns(icount);
 295}
 296
 297int64_t cpu_get_icount_raw(void)
 298{
 299    int64_t icount;
 300    unsigned start;
 301
 302    do {
 303        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 304        icount = cpu_get_icount_raw_locked();
 305    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 306
 307    return icount;
 308}
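/*
 * Readers here (and in cpu_get_icount()/cpu_get_clock() below) spin on the
 * seqlock: if a writer ran between seqlock_read_begin() and
 * seqlock_read_retry(), the read is repeated, giving a consistent snapshot
 * without taking the spinlock.
 */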
 309
 310/* Return the virtual CPU time, based on the instruction counter.  */
 311int64_t cpu_get_icount(void)
 312{
 313    int64_t icount;
 314    unsigned start;
 315
 316    do {
 317        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 318        icount = cpu_get_icount_locked();
 319    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 320
 321    return icount;
 322}
 323
 324int64_t cpu_icount_to_ns(int64_t icount)
 325{
 326    return icount << atomic_read(&timers_state.icount_time_shift);
 327}
 328
 329static int64_t cpu_get_ticks_locked(void)
 330{
 331    int64_t ticks = timers_state.cpu_ticks_offset;
 332    if (timers_state.cpu_ticks_enabled) {
 333        ticks += cpu_get_host_ticks();
 334    }
 335
 336    if (timers_state.cpu_ticks_prev > ticks) {
  337        /* Non-increasing ticks may happen if the host uses software suspend.  */
 338        timers_state.cpu_ticks_offset += timers_state.cpu_ticks_prev - ticks;
 339        ticks = timers_state.cpu_ticks_prev;
 340    }
 341
 342    timers_state.cpu_ticks_prev = ticks;
 343    return ticks;
 344}
 345
 346/* return the time elapsed in VM between vm_start and vm_stop.  Unless
 347 * icount is active, cpu_get_ticks() uses units of the host CPU cycle
 348 * counter.
 349 */
 350int64_t cpu_get_ticks(void)
 351{
 352    int64_t ticks;
 353
 354    if (use_icount) {
 355        return cpu_get_icount();
 356    }
 357
 358    qemu_spin_lock(&timers_state.vm_clock_lock);
 359    ticks = cpu_get_ticks_locked();
 360    qemu_spin_unlock(&timers_state.vm_clock_lock);
 361    return ticks;
 362}
 363
 364static int64_t cpu_get_clock_locked(void)
 365{
 366    int64_t time;
 367
 368    time = timers_state.cpu_clock_offset;
 369    if (timers_state.cpu_ticks_enabled) {
 370        time += get_clock();
 371    }
 372
 373    return time;
 374}
 375
 376/* Return the monotonic time elapsed in VM, i.e.,
 377 * the time between vm_start and vm_stop
 378 */
 379int64_t cpu_get_clock(void)
 380{
 381    int64_t ti;
 382    unsigned start;
 383
 384    do {
 385        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 386        ti = cpu_get_clock_locked();
 387    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
 388
 389    return ti;
 390}
 391
 392/* enable cpu_get_ticks()
 393 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 394 */
 395void cpu_enable_ticks(void)
 396{
 397    seqlock_write_lock(&timers_state.vm_clock_seqlock,
 398                       &timers_state.vm_clock_lock);
 399    if (!timers_state.cpu_ticks_enabled) {
 400        timers_state.cpu_ticks_offset -= cpu_get_host_ticks();
 401        timers_state.cpu_clock_offset -= get_clock();
 402        timers_state.cpu_ticks_enabled = 1;
 403    }
 404    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 405                       &timers_state.vm_clock_lock);
 406}
 407
  408/* disable cpu_get_ticks(): the clock is stopped. You must not call
 409 * cpu_get_ticks() after that.
 410 * Caller must hold BQL which serves as mutex for vm_clock_seqlock.
 411 */
 412void cpu_disable_ticks(void)
 413{
 414    seqlock_write_lock(&timers_state.vm_clock_seqlock,
 415                       &timers_state.vm_clock_lock);
 416    if (timers_state.cpu_ticks_enabled) {
 417        timers_state.cpu_ticks_offset += cpu_get_host_ticks();
 418        timers_state.cpu_clock_offset = cpu_get_clock_locked();
 419        timers_state.cpu_ticks_enabled = 0;
 420    }
 421    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 422                         &timers_state.vm_clock_lock);
 423}
 424
 425/* Correlation between real and virtual time is always going to be
 426   fairly approximate, so ignore small variation.
  427   When the guest is idle, real and virtual time will be aligned in
 428   the IO wait loop.  */
 429#define ICOUNT_WOBBLE (NANOSECONDS_PER_SECOND / 10)
 430
 431static void icount_adjust(void)
 432{
 433    int64_t cur_time;
 434    int64_t cur_icount;
 435    int64_t delta;
 436
 437    /* Protected by TimersState mutex.  */
 438    static int64_t last_delta;
 439
 440    /* If the VM is not running, then do nothing.  */
 441    if (!runstate_is_running()) {
 442        return;
 443    }
 444
 445    seqlock_write_lock(&timers_state.vm_clock_seqlock,
 446                       &timers_state.vm_clock_lock);
 447    cur_time = cpu_get_clock_locked();
 448    cur_icount = cpu_get_icount_locked();
 449
 450    delta = cur_icount - cur_time;
 451    /* FIXME: This is a very crude algorithm, somewhat prone to oscillation.  */
 452    if (delta > 0
 453        && last_delta + ICOUNT_WOBBLE < delta * 2
 454        && timers_state.icount_time_shift > 0) {
 455        /* The guest is getting too far ahead.  Slow time down.  */
 456        atomic_set(&timers_state.icount_time_shift,
 457                   timers_state.icount_time_shift - 1);
 458    }
 459    if (delta < 0
 460        && last_delta - ICOUNT_WOBBLE > delta * 2
 461        && timers_state.icount_time_shift < MAX_ICOUNT_SHIFT) {
 462        /* The guest is getting too far behind.  Speed time up.  */
 463        atomic_set(&timers_state.icount_time_shift,
 464                   timers_state.icount_time_shift + 1);
 465    }
 466    last_delta = delta;
 467    atomic_set_i64(&timers_state.qemu_icount_bias,
 468                   cur_icount - (timers_state.qemu_icount
 469                                 << timers_state.icount_time_shift));
 470    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 471                         &timers_state.vm_clock_lock);
 472}
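/*
 * cpu_get_icount_locked() returns qemu_icount_bias + (qemu_icount << shift),
 * so recomputing the bias as cur_icount - (qemu_icount << shift) above keeps
 * the icount-derived virtual clock continuous across a change of
 * icount_time_shift rather than letting it jump.
 */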
 473
 474static void icount_adjust_rt(void *opaque)
 475{
 476    timer_mod(timers_state.icount_rt_timer,
 477              qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 478    icount_adjust();
 479}
 480
 481static void icount_adjust_vm(void *opaque)
 482{
 483    timer_mod(timers_state.icount_vm_timer,
 484                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 485                   NANOSECONDS_PER_SECOND / 10);
 486    icount_adjust();
 487}
 488
 489static int64_t qemu_icount_round(int64_t count)
 490{
 491    int shift = atomic_read(&timers_state.icount_time_shift);
 492    return (count + (1 << shift) - 1) >> shift;
 493}
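/*
 * This converts a nanosecond deadline into a whole number of instructions,
 * rounding up: with shift = 3, a 100 ns deadline becomes
 * (100 + 7) >> 3 = 13 instructions.
 */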
 494
 495static void icount_warp_rt(void)
 496{
 497    unsigned seq;
 498    int64_t warp_start;
 499
 500    /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
 501     * changes from -1 to another value, so the race here is okay.
 502     */
 503    do {
 504        seq = seqlock_read_begin(&timers_state.vm_clock_seqlock);
 505        warp_start = timers_state.vm_clock_warp_start;
 506    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, seq));
 507
 508    if (warp_start == -1) {
 509        return;
 510    }
 511
 512    seqlock_write_lock(&timers_state.vm_clock_seqlock,
 513                       &timers_state.vm_clock_lock);
 514    if (runstate_is_running()) {
 515        int64_t clock = REPLAY_CLOCK_LOCKED(REPLAY_CLOCK_VIRTUAL_RT,
 516                                            cpu_get_clock_locked());
 517        int64_t warp_delta;
 518
 519        warp_delta = clock - timers_state.vm_clock_warp_start;
 520        if (use_icount == 2) {
 521            /*
 522             * In adaptive mode, do not let QEMU_CLOCK_VIRTUAL run too
 523             * far ahead of real time.
 524             */
 525            int64_t cur_icount = cpu_get_icount_locked();
 526            int64_t delta = clock - cur_icount;
 527            warp_delta = MIN(warp_delta, delta);
 528        }
 529        atomic_set_i64(&timers_state.qemu_icount_bias,
 530                       timers_state.qemu_icount_bias + warp_delta);
 531    }
 532    timers_state.vm_clock_warp_start = -1;
 533    seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 534                       &timers_state.vm_clock_lock);
 535
 536    if (qemu_clock_expired(QEMU_CLOCK_VIRTUAL)) {
 537        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 538    }
 539}
 540
 541static void icount_timer_cb(void *opaque)
 542{
 543    /* No need for a checkpoint because the timer already synchronizes
 544     * with CHECKPOINT_CLOCK_VIRTUAL_RT.
 545     */
 546    icount_warp_rt();
 547}
 548
 549void qtest_clock_warp(int64_t dest)
 550{
 551    int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 552    AioContext *aio_context;
 553    assert(qtest_enabled());
 554    aio_context = qemu_get_aio_context();
 555    while (clock < dest) {
 556        int64_t deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 557        int64_t warp = qemu_soonest_timeout(dest - clock, deadline);
 558
 559        seqlock_write_lock(&timers_state.vm_clock_seqlock,
 560                           &timers_state.vm_clock_lock);
 561        atomic_set_i64(&timers_state.qemu_icount_bias,
 562                       timers_state.qemu_icount_bias + warp);
 563        seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 564                             &timers_state.vm_clock_lock);
 565
 566        qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
 567        timerlist_run_timers(aio_context->tlg.tl[QEMU_CLOCK_VIRTUAL]);
 568        clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 569    }
 570    qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 571}
 572
 573void qemu_start_warp_timer(void)
 574{
 575    int64_t clock;
 576    int64_t deadline;
 577
 578    if (!use_icount) {
 579        return;
 580    }
 581
 582    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 583     * do not fire, so computing the deadline does not make sense.
 584     */
 585    if (!runstate_is_running()) {
 586        return;
 587    }
 588
 589    if (replay_mode != REPLAY_MODE_PLAY) {
 590        if (!all_cpu_threads_idle()) {
 591            return;
 592        }
 593
 594        if (qtest_enabled()) {
 595            /* When testing, qtest commands advance icount.  */
 596            return;
 597        }
 598
 599        replay_checkpoint(CHECKPOINT_CLOCK_WARP_START);
 600    } else {
 601        /* warp clock deterministically in record/replay mode */
 602        if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_START)) {
  603            /* vCPU is sleeping and warp can't be started.
  604               This is probably a race condition: the notification sent
  605               to the vCPU was processed early and the vCPU went to sleep.
  606               Therefore we have to wake it up so it can do something. */
 607            if (replay_has_checkpoint()) {
 608                qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 609            }
 610            return;
 611        }
 612    }
 613
 614    /* We want to use the earliest deadline from ALL vm_clocks */
 615    clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
 616    deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
 617    if (deadline < 0) {
 618        static bool notified;
 619        if (!icount_sleep && !notified) {
 620            warn_report("icount sleep disabled and no active timers");
 621            notified = true;
 622        }
 623        return;
 624    }
 625
 626    if (deadline > 0) {
 627        /*
 628         * Ensure QEMU_CLOCK_VIRTUAL proceeds even when the virtual CPU goes to
 629         * sleep.  Otherwise, the CPU might be waiting for a future timer
 630         * interrupt to wake it up, but the interrupt never comes because
 631         * the vCPU isn't running any insns and thus doesn't advance the
 632         * QEMU_CLOCK_VIRTUAL.
 633         */
 634        if (!icount_sleep) {
 635            /*
 636             * We never let VCPUs sleep in no sleep icount mode.
 637             * If there is a pending QEMU_CLOCK_VIRTUAL timer we just advance
 638             * to the next QEMU_CLOCK_VIRTUAL event and notify it.
 639             * It is useful when we want a deterministic execution time,
 640             * isolated from host latencies.
 641             */
 642            seqlock_write_lock(&timers_state.vm_clock_seqlock,
 643                               &timers_state.vm_clock_lock);
 644            atomic_set_i64(&timers_state.qemu_icount_bias,
 645                           timers_state.qemu_icount_bias + deadline);
 646            seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 647                                 &timers_state.vm_clock_lock);
 648            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 649        } else {
 650            /*
  651             * We do stop VCPUs and only advance QEMU_CLOCK_VIRTUAL after some
  652             * "real" time (related to the time left until the next event) has
  653             * passed. The QEMU_CLOCK_VIRTUAL_RT clock will do this.
  654             * This prevents the warps from being visible externally; for example,
  655             * you will not be sending network packets continuously instead of
  656             * every 100 ms.
 657             */
 658            seqlock_write_lock(&timers_state.vm_clock_seqlock,
 659                               &timers_state.vm_clock_lock);
 660            if (timers_state.vm_clock_warp_start == -1
 661                || timers_state.vm_clock_warp_start > clock) {
 662                timers_state.vm_clock_warp_start = clock;
 663            }
 664            seqlock_write_unlock(&timers_state.vm_clock_seqlock,
 665                                 &timers_state.vm_clock_lock);
 666            timer_mod_anticipate(timers_state.icount_warp_timer,
 667                                 clock + deadline);
 668        }
 669    } else if (deadline == 0) {
 670        qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
 671    }
 672}
 673
 674static void qemu_account_warp_timer(void)
 675{
 676    if (!use_icount || !icount_sleep) {
 677        return;
 678    }
 679
 680    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
 681     * do not fire, so computing the deadline does not make sense.
 682     */
 683    if (!runstate_is_running()) {
 684        return;
 685    }
 686
 687    /* warp clock deterministically in record/replay mode */
 688    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP_ACCOUNT)) {
 689        return;
 690    }
 691
 692    timer_del(timers_state.icount_warp_timer);
 693    icount_warp_rt();
 694}
 695
 696static bool icount_state_needed(void *opaque)
 697{
 698    return use_icount;
 699}
 700
 701static bool warp_timer_state_needed(void *opaque)
 702{
 703    TimersState *s = opaque;
 704    return s->icount_warp_timer != NULL;
 705}
 706
 707static bool adjust_timers_state_needed(void *opaque)
 708{
 709    TimersState *s = opaque;
 710    return s->icount_rt_timer != NULL;
 711}
 712
 713/*
  714 * The subsection for warp timer migration is optional, because the timer may not be created
 715 */
 716static const VMStateDescription icount_vmstate_warp_timer = {
 717    .name = "timer/icount/warp_timer",
 718    .version_id = 1,
 719    .minimum_version_id = 1,
 720    .needed = warp_timer_state_needed,
 721    .fields = (VMStateField[]) {
 722        VMSTATE_INT64(vm_clock_warp_start, TimersState),
 723        VMSTATE_TIMER_PTR(icount_warp_timer, TimersState),
 724        VMSTATE_END_OF_LIST()
 725    }
 726};
 727
 728static const VMStateDescription icount_vmstate_adjust_timers = {
 729    .name = "timer/icount/timers",
 730    .version_id = 1,
 731    .minimum_version_id = 1,
 732    .needed = adjust_timers_state_needed,
 733    .fields = (VMStateField[]) {
 734        VMSTATE_TIMER_PTR(icount_rt_timer, TimersState),
 735        VMSTATE_TIMER_PTR(icount_vm_timer, TimersState),
 736        VMSTATE_END_OF_LIST()
 737    }
 738};
 739
 740/*
 741 * This is a subsection for icount migration.
 742 */
 743static const VMStateDescription icount_vmstate_timers = {
 744    .name = "timer/icount",
 745    .version_id = 1,
 746    .minimum_version_id = 1,
 747    .needed = icount_state_needed,
 748    .fields = (VMStateField[]) {
 749        VMSTATE_INT64(qemu_icount_bias, TimersState),
 750        VMSTATE_INT64(qemu_icount, TimersState),
 751        VMSTATE_END_OF_LIST()
 752    },
 753    .subsections = (const VMStateDescription*[]) {
 754        &icount_vmstate_warp_timer,
 755        &icount_vmstate_adjust_timers,
 756        NULL
 757    }
 758};
 759
 760static const VMStateDescription vmstate_timers = {
 761    .name = "timer",
 762    .version_id = 2,
 763    .minimum_version_id = 1,
 764    .fields = (VMStateField[]) {
 765        VMSTATE_INT64(cpu_ticks_offset, TimersState),
 766        VMSTATE_UNUSED(8),
 767        VMSTATE_INT64_V(cpu_clock_offset, TimersState, 2),
 768        VMSTATE_END_OF_LIST()
 769    },
 770    .subsections = (const VMStateDescription*[]) {
 771        &icount_vmstate_timers,
 772        NULL
 773    }
 774};
 775
 776static void cpu_throttle_thread(CPUState *cpu, run_on_cpu_data opaque)
 777{
 778    double pct;
 779    double throttle_ratio;
 780    long sleeptime_ns;
 781
 782    if (!cpu_throttle_get_percentage()) {
 783        return;
 784    }
 785
 786    pct = (double)cpu_throttle_get_percentage()/100;
 787    throttle_ratio = pct / (1 - pct);
 788    sleeptime_ns = (long)(throttle_ratio * CPU_THROTTLE_TIMESLICE_NS);
 789
 790    qemu_mutex_unlock_iothread();
 791    g_usleep(sleeptime_ns / 1000); /* Convert ns to us for usleep call */
 792    qemu_mutex_lock_iothread();
 793    atomic_set(&cpu->throttle_thread_scheduled, 0);
 794}
 795
 796static void cpu_throttle_timer_tick(void *opaque)
 797{
 798    CPUState *cpu;
 799    double pct;
 800
 801    /* Stop the timer if needed */
 802    if (!cpu_throttle_get_percentage()) {
 803        return;
 804    }
 805    CPU_FOREACH(cpu) {
 806        if (!atomic_xchg(&cpu->throttle_thread_scheduled, 1)) {
 807            async_run_on_cpu(cpu, cpu_throttle_thread,
 808                             RUN_ON_CPU_NULL);
 809        }
 810    }
 811
 812    pct = (double)cpu_throttle_get_percentage()/100;
 813    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 814                                   CPU_THROTTLE_TIMESLICE_NS / (1-pct));
 815}
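/*
 * Worked example of the throttle maths above: at a 50% throttle, pct = 0.5,
 * so throttle_ratio = 1 and each worker sleeps one 10 ms timeslice, while
 * the timer re-arms every 10 ms / (1 - 0.5) = 20 ms; the vCPU therefore
 * spends roughly half of each 20 ms window asleep.
 */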
 816
 817void cpu_throttle_set(int new_throttle_pct)
 818{
 819    /* Ensure throttle percentage is within valid range */
 820    new_throttle_pct = MIN(new_throttle_pct, CPU_THROTTLE_PCT_MAX);
 821    new_throttle_pct = MAX(new_throttle_pct, CPU_THROTTLE_PCT_MIN);
 822
 823    atomic_set(&throttle_percentage, new_throttle_pct);
 824
 825    timer_mod(throttle_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT) +
 826                                       CPU_THROTTLE_TIMESLICE_NS);
 827}
 828
 829void cpu_throttle_stop(void)
 830{
 831    atomic_set(&throttle_percentage, 0);
 832}
 833
 834bool cpu_throttle_active(void)
 835{
 836    return (cpu_throttle_get_percentage() != 0);
 837}
 838
 839int cpu_throttle_get_percentage(void)
 840{
 841    return atomic_read(&throttle_percentage);
 842}
 843
 844void cpu_ticks_init(void)
 845{
 846    seqlock_init(&timers_state.vm_clock_seqlock);
 847    qemu_spin_init(&timers_state.vm_clock_lock);
 848    vmstate_register(NULL, 0, &vmstate_timers, &timers_state);
 849    throttle_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 850                                           cpu_throttle_timer_tick, NULL);
 851}
 852
 853void configure_icount(QemuOpts *opts, Error **errp)
 854{
 855    const char *option;
 856    char *rem_str = NULL;
 857
 858    option = qemu_opt_get(opts, "shift");
 859    if (!option) {
 860        if (qemu_opt_get(opts, "align") != NULL) {
 861            error_setg(errp, "Please specify shift option when using align");
 862        }
 863        return;
 864    }
 865
 866    icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
 867    if (icount_sleep) {
 868        timers_state.icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
 869                                         icount_timer_cb, NULL);
 870    }
 871
 872    icount_align_option = qemu_opt_get_bool(opts, "align", false);
 873
 874    if (icount_align_option && !icount_sleep) {
 875        error_setg(errp, "align=on and sleep=off are incompatible");
 876    }
 877    if (strcmp(option, "auto") != 0) {
 878        errno = 0;
 879        timers_state.icount_time_shift = strtol(option, &rem_str, 0);
 880        if (errno != 0 || *rem_str != '\0' || !strlen(option)) {
 881            error_setg(errp, "icount: Invalid shift value");
 882        }
 883        use_icount = 1;
 884        return;
 885    } else if (icount_align_option) {
 886        error_setg(errp, "shift=auto and align=on are incompatible");
 887    } else if (!icount_sleep) {
 888        error_setg(errp, "shift=auto and sleep=off are incompatible");
 889    }
 890
 891    use_icount = 2;
 892
 893    /* 125MIPS seems a reasonable initial guess at the guest speed.
 894       It will be corrected fairly quickly anyway.  */
 895    timers_state.icount_time_shift = 3;
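    /* A shift of 3 credits 2^3 = 8 ns of virtual time per instruction,
     * i.e. about 10^9 / 8 = 125 million guest instructions per emulated
     * second. */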
 896
 897    /* Have both realtime and virtual time triggers for speed adjustment.
 898       The realtime trigger catches emulated time passing too slowly,
 899       the virtual time trigger catches emulated time passing too fast.
 900       Realtime triggers occur even when idle, so use them less frequently
 901       than VM triggers.  */
 902    timers_state.vm_clock_warp_start = -1;
 903    timers_state.icount_rt_timer = timer_new_ms(QEMU_CLOCK_VIRTUAL_RT,
 904                                   icount_adjust_rt, NULL);
 905    timer_mod(timers_state.icount_rt_timer,
 906                   qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL_RT) + 1000);
 907    timers_state.icount_vm_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 908                                        icount_adjust_vm, NULL);
 909    timer_mod(timers_state.icount_vm_timer,
 910                   qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
 911                   NANOSECONDS_PER_SECOND / 10);
 912}
 913
 914/***********************************************************/
 915/* TCG vCPU kick timer
 916 *
  917 * The kick timer is responsible for moving single-threaded vCPU
  918 * emulation on to the next vCPU. If more than one vCPU is running, a
  919 * timer event will force a cpu->exit so the next vCPU can get
  920 * scheduled.
  921 *
  922 * The timer is removed while all vCPUs are idle and restarted again
  923 * once they are no longer idle.
 924 */
 925
 926static QEMUTimer *tcg_kick_vcpu_timer;
 927static CPUState *tcg_current_rr_cpu;
 928
 929#define TCG_KICK_PERIOD (NANOSECONDS_PER_SECOND / 10)
 930
 931static inline int64_t qemu_tcg_next_kick(void)
 932{
 933    return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + TCG_KICK_PERIOD;
 934}
 935
 936/* Kick the currently round-robin scheduled vCPU */
 937static void qemu_cpu_kick_rr_cpu(void)
 938{
 939    CPUState *cpu;
 940    do {
 941        cpu = atomic_mb_read(&tcg_current_rr_cpu);
 942        if (cpu) {
 943            cpu_exit(cpu);
 944        }
 945    } while (cpu != atomic_mb_read(&tcg_current_rr_cpu));
 946}
 947
 948static void do_nothing(CPUState *cpu, run_on_cpu_data unused)
 949{
 950}
 951
 952void qemu_timer_notify_cb(void *opaque, QEMUClockType type)
 953{
 954    if (!use_icount || type != QEMU_CLOCK_VIRTUAL) {
 955        qemu_notify_event();
 956        return;
 957    }
 958
 959    if (qemu_in_vcpu_thread()) {
 960        /* A CPU is currently running; kick it back out to the
 961         * tcg_cpu_exec() loop so it will recalculate its
 962         * icount deadline immediately.
 963         */
 964        qemu_cpu_kick(current_cpu);
 965    } else if (first_cpu) {
 966        /* qemu_cpu_kick is not enough to kick a halted CPU out of
 967         * qemu_tcg_wait_io_event.  async_run_on_cpu, instead,
 968         * causes cpu_thread_is_idle to return false.  This way,
 969         * handle_icount_deadline can run.
 970         * If we have no CPUs at all for some reason, we don't
 971         * need to do anything.
 972         */
 973        async_run_on_cpu(first_cpu, do_nothing, RUN_ON_CPU_NULL);
 974    }
 975}
 976
 977static void kick_tcg_thread(void *opaque)
 978{
 979    timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 980    qemu_cpu_kick_rr_cpu();
 981}
 982
 983static void start_tcg_kick_timer(void)
 984{
 985    assert(!mttcg_enabled);
 986    if (!tcg_kick_vcpu_timer && CPU_NEXT(first_cpu)) {
 987        tcg_kick_vcpu_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
 988                                           kick_tcg_thread, NULL);
 989    }
 990    if (tcg_kick_vcpu_timer && !timer_pending(tcg_kick_vcpu_timer)) {
 991        timer_mod(tcg_kick_vcpu_timer, qemu_tcg_next_kick());
 992    }
 993}
 994
 995static void stop_tcg_kick_timer(void)
 996{
 997    assert(!mttcg_enabled);
 998    if (tcg_kick_vcpu_timer && timer_pending(tcg_kick_vcpu_timer)) {
 999        timer_del(tcg_kick_vcpu_timer);
1000    }
1001}
1002
1003/***********************************************************/
1004void hw_error(const char *fmt, ...)
1005{
1006    va_list ap;
1007    CPUState *cpu;
1008
1009    va_start(ap, fmt);
1010    fprintf(stderr, "qemu: hardware error: ");
1011    vfprintf(stderr, fmt, ap);
1012    fprintf(stderr, "\n");
1013    CPU_FOREACH(cpu) {
1014        fprintf(stderr, "CPU #%d:\n", cpu->cpu_index);
1015        cpu_dump_state(cpu, stderr, CPU_DUMP_FPU);
1016    }
1017    va_end(ap);
1018    abort();
1019}
1020
1021void cpu_synchronize_all_states(void)
1022{
1023    CPUState *cpu;
1024
1025    CPU_FOREACH(cpu) {
1026        cpu_synchronize_state(cpu);
1027        /* TODO: move to cpu_synchronize_state() */
1028        if (hvf_enabled()) {
1029            hvf_cpu_synchronize_state(cpu);
1030        }
1031    }
1032}
1033
1034void cpu_synchronize_all_post_reset(void)
1035{
1036    CPUState *cpu;
1037
1038    CPU_FOREACH(cpu) {
1039        cpu_synchronize_post_reset(cpu);
1040        /* TODO: move to cpu_synchronize_post_reset() */
1041        if (hvf_enabled()) {
1042            hvf_cpu_synchronize_post_reset(cpu);
1043        }
1044    }
1045}
1046
1047void cpu_synchronize_all_post_init(void)
1048{
1049    CPUState *cpu;
1050
1051    CPU_FOREACH(cpu) {
1052        cpu_synchronize_post_init(cpu);
1053        /* TODO: move to cpu_synchronize_post_init() */
1054        if (hvf_enabled()) {
1055            hvf_cpu_synchronize_post_init(cpu);
1056        }
1057    }
1058}
1059
1060void cpu_synchronize_all_pre_loadvm(void)
1061{
1062    CPUState *cpu;
1063
1064    CPU_FOREACH(cpu) {
1065        cpu_synchronize_pre_loadvm(cpu);
1066    }
1067}
1068
1069static int do_vm_stop(RunState state, bool send_stop)
1070{
1071    int ret = 0;
1072
1073    if (runstate_is_running()) {
1074        cpu_disable_ticks();
1075        pause_all_vcpus();
1076        runstate_set(state);
1077        vm_state_notify(0, state);
1078        if (send_stop) {
1079            qapi_event_send_stop();
1080        }
1081    }
1082
1083    bdrv_drain_all();
1084    replay_disable_events();
1085    ret = bdrv_flush_all();
1086
1087    return ret;
1088}
1089
1090/* Special vm_stop() variant for terminating the process.  Historically clients
1091 * did not expect a QMP STOP event and so we need to retain compatibility.
1092 */
1093int vm_shutdown(void)
1094{
1095    return do_vm_stop(RUN_STATE_SHUTDOWN, false);
1096}
1097
1098static bool cpu_can_run(CPUState *cpu)
1099{
1100    if (cpu->stop) {
1101        return false;
1102    }
1103    if (cpu_is_stopped(cpu)) {
1104        return false;
1105    }
1106    return true;
1107}
1108
1109static void cpu_handle_guest_debug(CPUState *cpu)
1110{
1111    gdb_set_stop_cpu(cpu);
1112    qemu_system_debug_request();
1113    cpu->stopped = true;
1114}
1115
1116#ifdef CONFIG_LINUX
1117static void sigbus_reraise(void)
1118{
1119    sigset_t set;
1120    struct sigaction action;
1121
1122    memset(&action, 0, sizeof(action));
1123    action.sa_handler = SIG_DFL;
1124    if (!sigaction(SIGBUS, &action, NULL)) {
1125        raise(SIGBUS);
1126        sigemptyset(&set);
1127        sigaddset(&set, SIGBUS);
1128        pthread_sigmask(SIG_UNBLOCK, &set, NULL);
1129    }
1130    perror("Failed to re-raise SIGBUS!\n");
1131    abort();
1132}
1133
1134static void sigbus_handler(int n, siginfo_t *siginfo, void *ctx)
1135{
1136    if (siginfo->si_code != BUS_MCEERR_AO && siginfo->si_code != BUS_MCEERR_AR) {
1137        sigbus_reraise();
1138    }
1139
1140    if (current_cpu) {
1141        /* Called asynchronously in VCPU thread.  */
1142        if (kvm_on_sigbus_vcpu(current_cpu, siginfo->si_code, siginfo->si_addr)) {
1143            sigbus_reraise();
1144        }
1145    } else {
1146        /* Called synchronously (via signalfd) in main thread.  */
1147        if (kvm_on_sigbus(siginfo->si_code, siginfo->si_addr)) {
1148            sigbus_reraise();
1149        }
1150    }
1151}
1152
1153static void qemu_init_sigbus(void)
1154{
1155    struct sigaction action;
1156
1157    memset(&action, 0, sizeof(action));
1158    action.sa_flags = SA_SIGINFO;
1159    action.sa_sigaction = sigbus_handler;
1160    sigaction(SIGBUS, &action, NULL);
1161
1162    prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
1163}
1164#else /* !CONFIG_LINUX */
1165static void qemu_init_sigbus(void)
1166{
1167}
1168#endif /* !CONFIG_LINUX */
1169
1170static QemuMutex qemu_global_mutex;
1171
1172static QemuThread io_thread;
1173
1174/* cpu creation */
1175static QemuCond qemu_cpu_cond;
1176/* system init */
1177static QemuCond qemu_pause_cond;
1178
1179void qemu_init_cpu_loop(void)
1180{
1181    qemu_init_sigbus();
1182    qemu_cond_init(&qemu_cpu_cond);
1183    qemu_cond_init(&qemu_pause_cond);
1184    qemu_mutex_init(&qemu_global_mutex);
1185
1186    qemu_thread_get_self(&io_thread);
1187}
1188
1189void run_on_cpu(CPUState *cpu, run_on_cpu_func func, run_on_cpu_data data)
1190{
1191    do_run_on_cpu(cpu, func, data, &qemu_global_mutex);
1192}
1193
1194static void qemu_kvm_destroy_vcpu(CPUState *cpu)
1195{
1196    if (kvm_destroy_vcpu(cpu) < 0) {
1197        error_report("kvm_destroy_vcpu failed");
1198        exit(EXIT_FAILURE);
1199    }
1200}
1201
1202static void qemu_tcg_destroy_vcpu(CPUState *cpu)
1203{
1204}
1205
1206static void qemu_cpu_stop(CPUState *cpu, bool exit)
1207{
1208    g_assert(qemu_cpu_is_self(cpu));
1209    cpu->stop = false;
1210    cpu->stopped = true;
1211    if (exit) {
1212        cpu_exit(cpu);
1213    }
1214    qemu_cond_broadcast(&qemu_pause_cond);
1215}
1216
1217static void qemu_wait_io_event_common(CPUState *cpu)
1218{
1219    atomic_mb_set(&cpu->thread_kicked, false);
1220    if (cpu->stop) {
1221        qemu_cpu_stop(cpu, false);
1222    }
1223    process_queued_cpu_work(cpu);
1224}
1225
1226static void qemu_tcg_rr_wait_io_event(void)
1227{
1228    CPUState *cpu;
1229
1230    while (all_cpu_threads_idle()) {
1231        stop_tcg_kick_timer();
1232        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1233    }
1234
1235    start_tcg_kick_timer();
1236
1237    CPU_FOREACH(cpu) {
1238        qemu_wait_io_event_common(cpu);
1239    }
1240}
1241
1242static void qemu_wait_io_event(CPUState *cpu)
1243{
1244    while (cpu_thread_is_idle(cpu)) {
1245        qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1246    }
1247
1248#ifdef _WIN32
1249    /* Eat dummy APC queued by qemu_cpu_kick_thread.  */
1250    if (!tcg_enabled()) {
1251        SleepEx(0, TRUE);
1252    }
1253#endif
1254    qemu_wait_io_event_common(cpu);
1255}
1256
1257static void *qemu_kvm_cpu_thread_fn(void *arg)
1258{
1259    CPUState *cpu = arg;
1260    int r;
1261
1262    rcu_register_thread();
1263
1264    qemu_mutex_lock_iothread();
1265    qemu_thread_get_self(cpu->thread);
1266    cpu->thread_id = qemu_get_thread_id();
1267    cpu->can_do_io = 1;
1268    current_cpu = cpu;
1269
1270    r = kvm_init_vcpu(cpu);
1271    if (r < 0) {
1272        error_report("kvm_init_vcpu failed: %s", strerror(-r));
1273        exit(1);
1274    }
1275
1276    kvm_init_cpu_signals(cpu);
1277
1278    /* signal CPU creation */
1279    cpu->created = true;
1280    qemu_cond_signal(&qemu_cpu_cond);
1281    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1282
1283    do {
1284        if (cpu_can_run(cpu)) {
1285            r = kvm_cpu_exec(cpu);
1286            if (r == EXCP_DEBUG) {
1287                cpu_handle_guest_debug(cpu);
1288            }
1289        }
1290        qemu_wait_io_event(cpu);
1291    } while (!cpu->unplug || cpu_can_run(cpu));
1292
1293    qemu_kvm_destroy_vcpu(cpu);
1294    cpu->created = false;
1295    qemu_cond_signal(&qemu_cpu_cond);
1296    qemu_mutex_unlock_iothread();
1297    rcu_unregister_thread();
1298    return NULL;
1299}
1300
1301static void *qemu_dummy_cpu_thread_fn(void *arg)
1302{
1303#ifdef _WIN32
1304    error_report("qtest is not supported under Windows");
1305    exit(1);
1306#else
1307    CPUState *cpu = arg;
1308    sigset_t waitset;
1309    int r;
1310
1311    rcu_register_thread();
1312
1313    qemu_mutex_lock_iothread();
1314    qemu_thread_get_self(cpu->thread);
1315    cpu->thread_id = qemu_get_thread_id();
1316    cpu->can_do_io = 1;
1317    current_cpu = cpu;
1318
1319    sigemptyset(&waitset);
1320    sigaddset(&waitset, SIG_IPI);
1321
1322    /* signal CPU creation */
1323    cpu->created = true;
1324    qemu_cond_signal(&qemu_cpu_cond);
1325    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1326
1327    do {
1328        qemu_mutex_unlock_iothread();
1329        do {
1330            int sig;
1331            r = sigwait(&waitset, &sig);
1332        } while (r == -1 && (errno == EAGAIN || errno == EINTR));
1333        if (r == -1) {
1334            perror("sigwait");
1335            exit(1);
1336        }
1337        qemu_mutex_lock_iothread();
1338        qemu_wait_io_event(cpu);
1339    } while (!cpu->unplug);
1340
1341    qemu_mutex_unlock_iothread();
1342    rcu_unregister_thread();
1343    return NULL;
1344#endif
1345}
1346
1347static int64_t tcg_get_icount_limit(void)
1348{
1349    int64_t deadline;
1350
1351    if (replay_mode != REPLAY_MODE_PLAY) {
1352        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1353
1354        /* Maintain prior (possibly buggy) behaviour where if no deadline
1355         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
1356         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
1357         * nanoseconds.
1358         */
1359        if ((deadline < 0) || (deadline > INT32_MAX)) {
1360            deadline = INT32_MAX;
1361        }
1362
1363        return qemu_icount_round(deadline);
1364    } else {
1365        return replay_get_instructions();
1366    }
1367}
1368
1369static void handle_icount_deadline(void)
1370{
1371    assert(qemu_in_vcpu_thread());
1372    if (use_icount) {
1373        int64_t deadline =
1374            qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
1375
1376        if (deadline == 0) {
1377            /* Wake up other AioContexts.  */
1378            qemu_clock_notify(QEMU_CLOCK_VIRTUAL);
1379            qemu_clock_run_timers(QEMU_CLOCK_VIRTUAL);
1380        }
1381    }
1382}
1383
1384static void prepare_icount_for_run(CPUState *cpu)
1385{
1386    if (use_icount) {
1387        int insns_left;
1388
1389        /* These should always be cleared by process_icount_data after
1390         * each vCPU execution. However u16.high can be raised
1391         * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
1392         */
1393        g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
1394        g_assert(cpu->icount_extra == 0);
1395
1396        cpu->icount_budget = tcg_get_icount_limit();
1397        insns_left = MIN(0xffff, cpu->icount_budget);
1398        cpu_neg(cpu)->icount_decr.u16.low = insns_left;
1399        cpu->icount_extra = cpu->icount_budget - insns_left;
1400
1401        replay_mutex_lock();
1402    }
1403}
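/*
 * The split above keeps the fast-path decrementer within 16 bits: a budget
 * of 100000 instructions becomes u16.low = 65535 and icount_extra = 34465,
 * and cpu_exec() refills u16.low from icount_extra as instructions are
 * consumed.
 */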
1404
1405static void process_icount_data(CPUState *cpu)
1406{
1407    if (use_icount) {
1408        /* Account for executed instructions */
1409        cpu_update_icount(cpu);
1410
1411        /* Reset the counters */
1412        cpu_neg(cpu)->icount_decr.u16.low = 0;
1413        cpu->icount_extra = 0;
1414        cpu->icount_budget = 0;
1415
1416        replay_account_executed_instructions();
1417
1418        replay_mutex_unlock();
1419    }
1420}
1421
1422
1423static int tcg_cpu_exec(CPUState *cpu)
1424{
1425    int ret;
1426#ifdef CONFIG_PROFILER
1427    int64_t ti;
1428#endif
1429
1430    assert(tcg_enabled());
1431#ifdef CONFIG_PROFILER
1432    ti = profile_getclock();
1433#endif
1434    cpu_exec_start(cpu);
1435    ret = cpu_exec(cpu);
1436    cpu_exec_end(cpu);
1437#ifdef CONFIG_PROFILER
1438    atomic_set(&tcg_ctx->prof.cpu_exec_time,
1439               tcg_ctx->prof.cpu_exec_time + profile_getclock() - ti);
1440#endif
1441    return ret;
1442}
1443
1444/* Destroy any remaining vCPUs which have been unplugged and have
1445 * finished running
1446 */
1447static void deal_with_unplugged_cpus(void)
1448{
1449    CPUState *cpu;
1450
1451    CPU_FOREACH(cpu) {
1452        if (cpu->unplug && !cpu_can_run(cpu)) {
1453            qemu_tcg_destroy_vcpu(cpu);
1454            cpu->created = false;
1455            qemu_cond_signal(&qemu_cpu_cond);
1456            break;
1457        }
1458    }
1459}
1460
1461/* Single-threaded TCG
1462 *
1463 * In the single-threaded case each vCPU is simulated in turn. If
1464 * there is more than a single vCPU we create a simple timer to kick
1465 * the vCPU and ensure we don't get stuck in a tight loop in one vCPU.
1466 * This is done explicitly rather than relying on side-effects
1467 * elsewhere.
1468 */
1469
1470static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
1471{
1472    CPUState *cpu = arg;
1473
1474    assert(tcg_enabled());
1475    rcu_register_thread();
1476    tcg_register_thread();
1477
1478    qemu_mutex_lock_iothread();
1479    qemu_thread_get_self(cpu->thread);
1480
1481    cpu->thread_id = qemu_get_thread_id();
1482    cpu->created = true;
1483    cpu->can_do_io = 1;
1484    qemu_cond_signal(&qemu_cpu_cond);
1485    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1486
1487    /* wait for initial kick-off after machine start */
1488    while (first_cpu->stopped) {
1489        qemu_cond_wait(first_cpu->halt_cond, &qemu_global_mutex);
1490
1491        /* process any pending work */
1492        CPU_FOREACH(cpu) {
1493            current_cpu = cpu;
1494            qemu_wait_io_event_common(cpu);
1495        }
1496    }
1497
1498    start_tcg_kick_timer();
1499
1500    cpu = first_cpu;
1501
1502    /* process any pending work */
1503    cpu->exit_request = 1;
1504
1505    while (1) {
1506        qemu_mutex_unlock_iothread();
1507        replay_mutex_lock();
1508        qemu_mutex_lock_iothread();
1509        /* Account partial waits to QEMU_CLOCK_VIRTUAL.  */
1510        qemu_account_warp_timer();
1511
1512        /* Run the timers here.  This is much more efficient than
1513         * waking up the I/O thread and waiting for completion.
1514         */
1515        handle_icount_deadline();
1516
1517        replay_mutex_unlock();
1518
1519        if (!cpu) {
1520            cpu = first_cpu;
1521        }
1522
1523        while (cpu && !cpu->queued_work_first && !cpu->exit_request) {
1524
1525            atomic_mb_set(&tcg_current_rr_cpu, cpu);
1526            current_cpu = cpu;
1527
1528            qemu_clock_enable(QEMU_CLOCK_VIRTUAL,
1529                              (cpu->singlestep_enabled & SSTEP_NOTIMER) == 0);
1530
1531            if (cpu_can_run(cpu)) {
1532                int r;
1533
1534                qemu_mutex_unlock_iothread();
1535                prepare_icount_for_run(cpu);
1536
1537                r = tcg_cpu_exec(cpu);
1538
1539                process_icount_data(cpu);
1540                qemu_mutex_lock_iothread();
1541
1542                if (r == EXCP_DEBUG) {
1543                    cpu_handle_guest_debug(cpu);
1544                    break;
1545                } else if (r == EXCP_ATOMIC) {
1546                    qemu_mutex_unlock_iothread();
1547                    cpu_exec_step_atomic(cpu);
1548                    qemu_mutex_lock_iothread();
1549                    break;
1550                }
1551            } else if (cpu->stop) {
1552                if (cpu->unplug) {
1553                    cpu = CPU_NEXT(cpu);
1554                }
1555                break;
1556            }
1557
1558            cpu = CPU_NEXT(cpu);
1559        } /* while (cpu && !cpu->exit_request).. */
1560
1561        /* Does not need atomic_mb_set because a spurious wakeup is okay.  */
1562        atomic_set(&tcg_current_rr_cpu, NULL);
1563
1564        if (cpu && cpu->exit_request) {
1565            atomic_mb_set(&cpu->exit_request, 0);
1566        }
1567
1568        if (use_icount && all_cpu_threads_idle()) {
1569            /*
1570             * When all cpus are sleeping (e.g in WFI), to avoid a deadlock
1571             * in the main_loop, wake it up in order to start the warp timer.
1572             */
1573            qemu_notify_event();
1574        }
1575
1576        qemu_tcg_rr_wait_io_event();
1577        deal_with_unplugged_cpus();
1578    }
1579
1580    rcu_unregister_thread();
1581    return NULL;
1582}
1583
1584static void *qemu_hax_cpu_thread_fn(void *arg)
1585{
1586    CPUState *cpu = arg;
1587    int r;
1588
1589    rcu_register_thread();
1590    qemu_mutex_lock_iothread();
1591    qemu_thread_get_self(cpu->thread);
1592
1593    cpu->thread_id = qemu_get_thread_id();
1594    cpu->created = true;
1595    current_cpu = cpu;
1596
1597    hax_init_vcpu(cpu);
1598    qemu_cond_signal(&qemu_cpu_cond);
1599    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1600
1601    do {
1602        if (cpu_can_run(cpu)) {
1603            r = hax_smp_cpu_exec(cpu);
1604            if (r == EXCP_DEBUG) {
1605                cpu_handle_guest_debug(cpu);
1606            }
1607        }
1608
1609        qemu_wait_io_event(cpu);
1610    } while (!cpu->unplug || cpu_can_run(cpu));
1611    rcu_unregister_thread();
1612    return NULL;
1613}
1614
1615/* The HVF-specific vCPU thread function. This one should only run when the host
1616 * CPU supports the VMX "unrestricted guest" feature. */
1617static void *qemu_hvf_cpu_thread_fn(void *arg)
1618{
1619    CPUState *cpu = arg;
1620
1621    int r;
1622
1623    assert(hvf_enabled());
1624
1625    rcu_register_thread();
1626
1627    qemu_mutex_lock_iothread();
1628    qemu_thread_get_self(cpu->thread);
1629
1630    cpu->thread_id = qemu_get_thread_id();
1631    cpu->can_do_io = 1;
1632    current_cpu = cpu;
1633
1634    hvf_init_vcpu(cpu);
1635
1636    /* signal CPU creation */
1637    cpu->created = true;
1638    qemu_cond_signal(&qemu_cpu_cond);
1639    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1640
1641    do {
1642        if (cpu_can_run(cpu)) {
1643            r = hvf_vcpu_exec(cpu);
1644            if (r == EXCP_DEBUG) {
1645                cpu_handle_guest_debug(cpu);
1646            }
1647        }
1648        qemu_wait_io_event(cpu);
1649    } while (!cpu->unplug || cpu_can_run(cpu));
1650
1651    hvf_vcpu_destroy(cpu);
1652    cpu->created = false;
1653    qemu_cond_signal(&qemu_cpu_cond);
1654    qemu_mutex_unlock_iothread();
1655    rcu_unregister_thread();
1656    return NULL;
1657}
1658
1659static void *qemu_whpx_cpu_thread_fn(void *arg)
1660{
1661    CPUState *cpu = arg;
1662    int r;
1663
1664    rcu_register_thread();
1665
1666    qemu_mutex_lock_iothread();
1667    qemu_thread_get_self(cpu->thread);
1668    cpu->thread_id = qemu_get_thread_id();
1669    current_cpu = cpu;
1670
1671    r = whpx_init_vcpu(cpu);
1672    if (r < 0) {
1673        fprintf(stderr, "whpx_init_vcpu failed: %s\n", strerror(-r));
1674        exit(1);
1675    }
1676
1677    /* signal CPU creation */
1678    cpu->created = true;
1679    qemu_cond_signal(&qemu_cpu_cond);
1680    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1681
1682    do {
1683        if (cpu_can_run(cpu)) {
1684            r = whpx_vcpu_exec(cpu);
1685            if (r == EXCP_DEBUG) {
1686                cpu_handle_guest_debug(cpu);
1687            }
1688        }
1689        while (cpu_thread_is_idle(cpu)) {
1690            qemu_cond_wait(cpu->halt_cond, &qemu_global_mutex);
1691        }
1692        qemu_wait_io_event_common(cpu);
1693    } while (!cpu->unplug || cpu_can_run(cpu));
1694
1695    whpx_destroy_vcpu(cpu);
1696    cpu->created = false;
1697    qemu_cond_signal(&qemu_cpu_cond);
1698    qemu_mutex_unlock_iothread();
1699    rcu_unregister_thread();
1700    return NULL;
1701}
1702
1703#ifdef _WIN32
1704static void CALLBACK dummy_apc_func(ULONG_PTR unused)
1705{
1706}
1707#endif
1708
1709/* Multi-threaded TCG
1710 *
1711 * In the multi-threaded case each vCPU has its own thread. The TLS
1712 * variable current_cpu can be used deep in the code to find the
1713 * current CPUState for a given thread.
1714 */
1715
1716static void *qemu_tcg_cpu_thread_fn(void *arg)
1717{
1718    CPUState *cpu = arg;
1719
1720    assert(tcg_enabled());
1721    g_assert(!use_icount);
1722
1723    rcu_register_thread();
1724    tcg_register_thread();
1725
1726    qemu_mutex_lock_iothread();
1727    qemu_thread_get_self(cpu->thread);
1728
1729    cpu->thread_id = qemu_get_thread_id();
1730    cpu->created = true;
1731    cpu->can_do_io = 1;
1732    current_cpu = cpu;
1733    qemu_cond_signal(&qemu_cpu_cond);
1734    qemu_guest_random_seed_thread_part2(cpu->random_seed);
1735
1736    /* process any pending work */
1737    cpu->exit_request = 1;
1738
1739    do {
1740        if (cpu_can_run(cpu)) {
1741            int r;
1742            qemu_mutex_unlock_iothread();
1743            r = tcg_cpu_exec(cpu);
1744            qemu_mutex_lock_iothread();
1745            switch (r) {
1746            case EXCP_DEBUG:
1747                cpu_handle_guest_debug(cpu);
1748                break;
1749            case EXCP_HALTED:
1750                /* during start-up the vCPU is reset and the thread is
1751                 * kicked several times. If we don't ensure we go back
1752                 * to sleep in the halted state, we won't cleanly
1753                 * start up when the vCPU is enabled.
1754                 *
1755                 * cpu->halted should ensure we sleep in wait_io_event
1756                 */
1757                g_assert(cpu->halted);
1758                break;
1759            case EXCP_ATOMIC:
1760                qemu_mutex_unlock_iothread();
1761                cpu_exec_step_atomic(cpu);
1762                qemu_mutex_lock_iothread();
1763            default:
1764                /* Ignore everything else? */
1765                break;
1766            }
1767        }
1768
1769        atomic_mb_set(&cpu->exit_request, 0);
1770        qemu_wait_io_event(cpu);
1771    } while (!cpu->unplug || cpu_can_run(cpu));
1772
1773    qemu_tcg_destroy_vcpu(cpu);
1774    cpu->created = false;
1775    qemu_cond_signal(&qemu_cpu_cond);
1776    qemu_mutex_unlock_iothread();
1777    rcu_unregister_thread();
1778    return NULL;
1779}
1780
1781static void qemu_cpu_kick_thread(CPUState *cpu)
1782{
1783#ifndef _WIN32
1784    int err;
1785
1786    if (cpu->thread_kicked) {
1787        return;
1788    }
1789    cpu->thread_kicked = true;
1790    err = pthread_kill(cpu->thread->thread, SIG_IPI);
1791    if (err && err != ESRCH) {
1792        fprintf(stderr, "qemu:%s: %s\n", __func__, strerror(err));
1793        exit(1);
1794    }
1795#else /* _WIN32 */
1796    if (!qemu_cpu_is_self(cpu)) {
1797        if (whpx_enabled()) {
1798            whpx_vcpu_kick(cpu);
1799        } else if (!QueueUserAPC(dummy_apc_func, cpu->hThread, 0)) {
1800            fprintf(stderr, "%s: QueueUserAPC failed with error %lu\n",
1801                    __func__, GetLastError());
1802            exit(1);
1803        }
1804    }
1805#endif
1806}
1807
1808void qemu_cpu_kick(CPUState *cpu)
1809{
1810    qemu_cond_broadcast(cpu->halt_cond);
1811    if (tcg_enabled()) {
1812        cpu_exit(cpu);
1813        /* NOP unless doing single-thread RR */
1814        qemu_cpu_kick_rr_cpu();
1815    } else {
1816        if (hax_enabled()) {
1817            /*
1818             * FIXME: race condition with the exit_request check in
1819             * hax_vcpu_hax_exec
1820             */
1821            cpu->exit_request = 1;
1822        }
1823        qemu_cpu_kick_thread(cpu);
1824    }
1825}
1826
1827void qemu_cpu_kick_self(void)
1828{
1829    assert(current_cpu);
1830    qemu_cpu_kick_thread(current_cpu);
1831}
1832
1833bool qemu_cpu_is_self(CPUState *cpu)
1834{
1835    return qemu_thread_is_self(cpu->thread);
1836}
1837
1838bool qemu_in_vcpu_thread(void)
1839{
1840    return current_cpu && qemu_cpu_is_self(current_cpu);
1841}
1842
1843static __thread bool iothread_locked = false;
1844
1845bool qemu_mutex_iothread_locked(void)
1846{
1847    return iothread_locked;
1848}
1849
1850/*
1851 * The BQL is taken from so many places that it is worth profiling the
1852 * callers directly, instead of funneling them all through a single function.
1853 */
1854void qemu_mutex_lock_iothread_impl(const char *file, int line)
1855{
1856    QemuMutexLockFunc bql_lock = atomic_read(&qemu_bql_mutex_lock_func);
1857
1858    g_assert(!qemu_mutex_iothread_locked());
1859    bql_lock(&qemu_global_mutex, file, line);
1860    iothread_locked = true;
1861}
1862
1863void qemu_mutex_unlock_iothread(void)
1864{
1865    g_assert(qemu_mutex_iothread_locked());
1866    iothread_locked = false;
1867    qemu_mutex_unlock(&qemu_global_mutex);
1868}
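    /*
     * Typical usage pattern, as in qemu_tcg_cpu_thread_fn() above: a vCPU
     * thread drops the BQL around guest execution and re-takes it before
     * touching device or run-state data:
     *
     *     qemu_mutex_unlock_iothread();
     *     r = tcg_cpu_exec(cpu);
     *     qemu_mutex_lock_iothread();
     *
     * qemu_mutex_lock_iothread() is expected to be a macro that passes
     * __FILE__/__LINE__ to qemu_mutex_lock_iothread_impl() so the pluggable
     * lock function can attribute contention to its caller.
     */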
1869
1870static bool all_vcpus_paused(void)
1871{
1872    CPUState *cpu;
1873
1874    CPU_FOREACH(cpu) {
1875        if (!cpu->stopped) {
1876            return false;
1877        }
1878    }
1879
1880    return true;
1881}
1882
1883void pause_all_vcpus(void)
1884{
1885    CPUState *cpu;
1886
1887    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, false);
1888    CPU_FOREACH(cpu) {
1889        if (qemu_cpu_is_self(cpu)) {
1890            qemu_cpu_stop(cpu, true);
1891        } else {
1892            cpu->stop = true;
1893            qemu_cpu_kick(cpu);
1894        }
1895    }
1896
1897    /* We need to drop the replay_lock so any vCPU threads woken up
1898     * can finish their replay tasks
1899     */
1900    replay_mutex_unlock();
1901
1902    while (!all_vcpus_paused()) {
1903        qemu_cond_wait(&qemu_pause_cond, &qemu_global_mutex);
1904        CPU_FOREACH(cpu) {
1905            qemu_cpu_kick(cpu);
1906        }
1907    }
1908
1909    qemu_mutex_unlock_iothread();
1910    replay_mutex_lock();
1911    qemu_mutex_lock_iothread();
1912}
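    /*
     * The pause handshake above: each vCPU gets cpu->stop set and a kick,
     * then we sleep on qemu_pause_cond until qemu_cpu_stop() (reached via
     * qemu_wait_io_event_common() on the vCPU threads) has marked every CPU
     * as stopped.  Re-kicking on each wakeup helps catch CPUs that had not
     * yet observed cpu->stop when the first kick arrived.
     */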
1913
1914void cpu_resume(CPUState *cpu)
1915{
1916    cpu->stop = false;
1917    cpu->stopped = false;
1918    qemu_cpu_kick(cpu);
1919}
1920
1921void resume_all_vcpus(void)
1922{
1923    CPUState *cpu;
1924
1925    qemu_clock_enable(QEMU_CLOCK_VIRTUAL, true);
1926    CPU_FOREACH(cpu) {
1927        cpu_resume(cpu);
1928    }
1929}
1930
1931void cpu_remove_sync(CPUState *cpu)
1932{
1933    cpu->stop = true;
1934    cpu->unplug = true;
1935    qemu_cpu_kick(cpu);
1936    qemu_mutex_unlock_iothread();
1937    qemu_thread_join(cpu->thread);
1938    qemu_mutex_lock_iothread();
1939}
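    /*
     * The BQL is dropped around qemu_thread_join() above because the exiting
     * vCPU thread still needs it to finish its own teardown (the thread
     * functions clear cpu->created and signal qemu_cpu_cond while holding
     * the lock); joining with the BQL held would deadlock.
     */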
1940
1941/* Size of temporary buffers used for forming a vCPU thread name */
1942#define VCPU_THREAD_NAME_SIZE 16
1943
1944static void qemu_tcg_init_vcpu(CPUState *cpu)
1945{
1946    char thread_name[VCPU_THREAD_NAME_SIZE];
1947    static QemuCond *single_tcg_halt_cond;
1948    static QemuThread *single_tcg_cpu_thread;
1949    static int tcg_region_inited;
1950
1951    assert(tcg_enabled());
1952    /*
1953     * Initialize TCG regions--once. Now is a good time, because:
1954     * (1) TCG's init context, prologue and target globals have been set up.
1955     * (2) qemu_tcg_mttcg_enabled() works now (TCG init code runs before the
1956     *     -accel flag is processed, so the check doesn't work then).
1957     */
1958    if (!tcg_region_inited) {
1959        tcg_region_inited = 1;
1960        tcg_region_init();
1961    }
1962
1963    if (qemu_tcg_mttcg_enabled() || !single_tcg_cpu_thread) {
1964        cpu->thread = g_malloc0(sizeof(QemuThread));
1965        cpu->halt_cond = g_malloc0(sizeof(QemuCond));
1966        qemu_cond_init(cpu->halt_cond);
1967
1968        if (qemu_tcg_mttcg_enabled()) {
1969            /* create a thread per vCPU with TCG (MTTCG) */
1970            parallel_cpus = true;
1971            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/TCG",
1972                 cpu->cpu_index);
1973
1974            qemu_thread_create(cpu->thread, thread_name, qemu_tcg_cpu_thread_fn,
1975                               cpu, QEMU_THREAD_JOINABLE);
1976
1977        } else {
1978            /* share a single thread for all cpus with TCG */
1979            snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "ALL CPUs/TCG");
1980            qemu_thread_create(cpu->thread, thread_name,
1981                               qemu_tcg_rr_cpu_thread_fn,
1982                               cpu, QEMU_THREAD_JOINABLE);
1983
1984            single_tcg_halt_cond = cpu->halt_cond;
1985            single_tcg_cpu_thread = cpu->thread;
1986        }
1987#ifdef _WIN32
1988        cpu->hThread = qemu_thread_get_handle(cpu->thread);
1989#endif
1990    } else {
1991        /* For non-MTTCG cases we share the thread */
1992        cpu->thread = single_tcg_cpu_thread;
1993        cpu->halt_cond = single_tcg_halt_cond;
1994        cpu->thread_id = first_cpu->thread_id;
1995        cpu->can_do_io = 1;
1996        cpu->created = true;
1997    }
1998}
1999
2000static void qemu_hax_start_vcpu(CPUState *cpu)
2001{
2002    char thread_name[VCPU_THREAD_NAME_SIZE];
2003
2004    cpu->thread = g_malloc0(sizeof(QemuThread));
2005    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2006    qemu_cond_init(cpu->halt_cond);
2007
2008    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HAX",
2009             cpu->cpu_index);
2010    qemu_thread_create(cpu->thread, thread_name, qemu_hax_cpu_thread_fn,
2011                       cpu, QEMU_THREAD_JOINABLE);
2012#ifdef _WIN32
2013    cpu->hThread = qemu_thread_get_handle(cpu->thread);
2014#endif
2015}
2016
2017static void qemu_kvm_start_vcpu(CPUState *cpu)
2018{
2019    char thread_name[VCPU_THREAD_NAME_SIZE];
2020
2021    cpu->thread = g_malloc0(sizeof(QemuThread));
2022    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2023    qemu_cond_init(cpu->halt_cond);
2024    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/KVM",
2025             cpu->cpu_index);
2026    qemu_thread_create(cpu->thread, thread_name, qemu_kvm_cpu_thread_fn,
2027                       cpu, QEMU_THREAD_JOINABLE);
2028}
2029
2030static void qemu_hvf_start_vcpu(CPUState *cpu)
2031{
2032    char thread_name[VCPU_THREAD_NAME_SIZE];
2033
2034    /* HVF currently does not support TCG, and only runs in
2035     * unrestricted-guest mode. */
2036    assert(hvf_enabled());
2037
2038    cpu->thread = g_malloc0(sizeof(QemuThread));
2039    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2040    qemu_cond_init(cpu->halt_cond);
2041
2042    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/HVF",
2043             cpu->cpu_index);
2044    qemu_thread_create(cpu->thread, thread_name, qemu_hvf_cpu_thread_fn,
2045                       cpu, QEMU_THREAD_JOINABLE);
2046}
2047
2048static void qemu_whpx_start_vcpu(CPUState *cpu)
2049{
2050    char thread_name[VCPU_THREAD_NAME_SIZE];
2051
2052    cpu->thread = g_malloc0(sizeof(QemuThread));
2053    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2054    qemu_cond_init(cpu->halt_cond);
2055    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/WHPX",
2056             cpu->cpu_index);
2057    qemu_thread_create(cpu->thread, thread_name, qemu_whpx_cpu_thread_fn,
2058                       cpu, QEMU_THREAD_JOINABLE);
2059#ifdef _WIN32
2060    cpu->hThread = qemu_thread_get_handle(cpu->thread);
2061#endif
2062}
2063
2064static void qemu_dummy_start_vcpu(CPUState *cpu)
2065{
2066    char thread_name[VCPU_THREAD_NAME_SIZE];
2067
2068    cpu->thread = g_malloc0(sizeof(QemuThread));
2069    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
2070    qemu_cond_init(cpu->halt_cond);
2071    snprintf(thread_name, VCPU_THREAD_NAME_SIZE, "CPU %d/DUMMY",
2072             cpu->cpu_index);
2073    qemu_thread_create(cpu->thread, thread_name, qemu_dummy_cpu_thread_fn, cpu,
2074                       QEMU_THREAD_JOINABLE);
2075}
2076
2077void qemu_init_vcpu(CPUState *cpu)
2078{
2079    MachineState *ms = MACHINE(qdev_get_machine());
2080
2081    cpu->nr_cores = ms->smp.cores;
2082    cpu->nr_threads = ms->smp.threads;
2083    cpu->stopped = true;
2084    cpu->random_seed = qemu_guest_random_seed_thread_part1();
2085
2086    if (!cpu->as) {
2087        /* If the target cpu hasn't set up any address spaces itself,
2088         * give it the default one.
2089         */
2090        cpu->num_ases = 1;
2091        cpu_address_space_init(cpu, 0, "cpu-memory", cpu->memory);
2092    }
2093
2094    if (kvm_enabled()) {
2095        qemu_kvm_start_vcpu(cpu);
2096    } else if (hax_enabled()) {
2097        qemu_hax_start_vcpu(cpu);
2098    } else if (hvf_enabled()) {
2099        qemu_hvf_start_vcpu(cpu);
2100    } else if (tcg_enabled()) {
2101        qemu_tcg_init_vcpu(cpu);
2102    } else if (whpx_enabled()) {
2103        qemu_whpx_start_vcpu(cpu);
2104    } else {
2105        qemu_dummy_start_vcpu(cpu);
2106    }
2107
2108    while (!cpu->created) {
2109        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
2110    }
2111}
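    /*
     * Note: the wait above relies on the accelerator-specific thread function
     * setting cpu->created and signalling qemu_cpu_cond (see the
     * *_cpu_thread_fn routines earlier in this file).
     */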
2112
2113void cpu_stop_current(void)
2114{
2115    if (current_cpu) {
2116        current_cpu->stop = true;
2117        cpu_exit(current_cpu);
2118    }
2119}
2120
2121int vm_stop(RunState state)
2122{
2123    if (qemu_in_vcpu_thread()) {
2124        qemu_system_vmstop_request_prepare();
2125        qemu_system_vmstop_request(state);
2126        /*
2127         * FIXME: should not return to device code in case
2128         * vm_stop() has been requested.
2129         */
2130        cpu_stop_current();
2131        return 0;
2132    }
2133
2134    return do_vm_stop(state, true);
2135}
2136
2137/**
2138 * Prepare for (re)starting the VM.
2139 * Returns -1 if the vCPUs are not to be restarted (e.g. if they are already
2140 * running or in case of an error condition), 0 otherwise.
2141 */
2142int vm_prepare_start(void)
2143{
2144    RunState requested;
2145
2146    qemu_vmstop_requested(&requested);
2147    if (runstate_is_running() && requested == RUN_STATE__MAX) {
2148        return -1;
2149    }
2150
2151    /* Ensure that a STOP/RESUME pair of events is emitted if a
2152     * vmstop request was pending.  The BLOCK_IO_ERROR event, for
2153     * example, according to documentation is always followed by
2154     * the STOP event.
2155     */
2156    if (runstate_is_running()) {
2157        qapi_event_send_stop();
2158        qapi_event_send_resume();
2159        return -1;
2160    }
2161
2162    /* We are sending this now, but the CPUs will be resumed shortly afterwards */
2163    qapi_event_send_resume();
2164
2165    replay_enable_events();
2166    cpu_enable_ticks();
2167    runstate_set(RUN_STATE_RUNNING);
2168    vm_state_notify(1, RUN_STATE_RUNNING);
2169    return 0;
2170}
2171
2172void vm_start(void)
2173{
2174    if (!vm_prepare_start()) {
2175        resume_all_vcpus();
2176    }
2177}
2178
2179/* Does a state transition even if the VM is already stopped;
2180   the current state is forgotten forever. */
2181int vm_stop_force_state(RunState state)
2182{
2183    if (runstate_is_running()) {
2184        return vm_stop(state);
2185    } else {
2186        runstate_set(state);
2187
2188        bdrv_drain_all();
2189        /* Make sure to return an error if the flush in a previous vm_stop()
2190         * failed. */
2191        return bdrv_flush_all();
2192    }
2193}
2194
2195void list_cpus(const char *optarg)
2196{
2197    /* XXX: implement xxx_cpu_list for targets that still lack it */
2198#if defined(cpu_list)
2199    cpu_list();
2200#endif
2201}
2202
2203void qmp_memsave(int64_t addr, int64_t size, const char *filename,
2204                 bool has_cpu, int64_t cpu_index, Error **errp)
2205{
2206    FILE *f;
2207    uint32_t l;
2208    CPUState *cpu;
2209    uint8_t buf[1024];
2210    int64_t orig_addr = addr, orig_size = size;
2211
2212    if (!has_cpu) {
2213        cpu_index = 0;
2214    }
2215
2216    cpu = qemu_get_cpu(cpu_index);
2217    if (cpu == NULL) {
2218        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cpu-index",
2219                   "a CPU number");
2220        return;
2221    }
2222
2223    f = fopen(filename, "wb");
2224    if (!f) {
2225        error_setg_file_open(errp, errno, filename);
2226        return;
2227    }
2228
2229    while (size != 0) {
2230        l = sizeof(buf);
2231        if (l > size) {
2232            l = size;
            }
2233        if (cpu_memory_rw_debug(cpu, addr, buf, l, 0) != 0) {
2234            error_setg(errp, "Invalid addr 0x%016" PRIx64 "/size %" PRId64
2235                             " specified", orig_addr, orig_size);
2236            goto exit;
2237        }
2238        if (fwrite(buf, 1, l, f) != l) {
2239            error_setg(errp, QERR_IO_ERROR);
2240            goto exit;
2241        }
2242        addr += l;
2243        size -= l;
2244    }
2245
2246exit:
2247    fclose(f);
2248}
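    /*
     * Example QMP usage (illustrative values; argument names as defined in
     * the QAPI schema, where the start address is passed as "val"):
     *
     *   -> { "execute": "memsave",
     *        "arguments": { "val": 4096, "size": 100,
     *                       "filename": "/tmp/virtual-mem-dump" } }
     *   <- { "return": {} }
     */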
2249
2250void qmp_pmemsave(int64_t addr, int64_t size, const char *filename,
2251                  Error **errp)
2252{
2253    FILE *f;
2254    uint32_t l;
2255    uint8_t buf[1024];
2256
2257    f = fopen(filename, "wb");
2258    if (!f) {
2259        error_setg_file_open(errp, errno, filename);
2260        return;
2261    }
2262
2263    while (size != 0) {
2264        l = sizeof(buf);
2265        if (l > size) {
2266            l = size;
            }
2267        cpu_physical_memory_read(addr, buf, l);
2268        if (fwrite(buf, 1, l, f) != l) {
2269            error_setg(errp, QERR_IO_ERROR);
2270            goto exit;
2271        }
2272        addr += l;
2273        size -= l;
2274    }
2275
2276exit:
2277    fclose(f);
2278}
2279
2280void qmp_inject_nmi(Error **errp)
2281{
2282    nmi_monitor_handle(monitor_get_cpu_index(), errp);
2283}
2284
2285void dump_drift_info(void)
2286{
2287    if (!use_icount) {
2288        return;
2289    }
2290
2291    qemu_printf("Host - Guest clock  %"PRIi64" ms\n",
2292                (cpu_get_clock() - cpu_get_icount()) / SCALE_MS);
2293    if (icount_align_option) {
2294        qemu_printf("Max guest delay     %"PRIi64" ms\n",
2295                    -max_delay / SCALE_MS);
2296        qemu_printf("Max guest advance   %"PRIi64" ms\n",
2297                    max_advance / SCALE_MS);
2298    } else {
2299        qemu_printf("Max guest delay     NA\n");
2300        qemu_printf("Max guest advance   NA\n");
2301    }
2302}
2303