qemu/migration/migration.c
/*
 * QEMU live migration
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "migration/blocker.h"
#include "exec.h"
#include "fd.h"
#include "socket.h"
#include "sysemu/runstate.h"
#include "sysemu/sysemu.h"
#include "sysemu/cpu-throttle.h"
#include "rdma.h"
#include "ram.h"
#include "migration/global_state.h"
#include "migration/misc.h"
#include "migration.h"
#include "savevm.h"
#include "qemu-file.h"
#include "migration/vmstate.h"
#include "block/block.h"
#include "qapi/error.h"
#include "qapi/clone-visitor.h"
#include "qapi/qapi-visit-migration.h"
#include "qapi/qapi-visit-sockets.h"
#include "qapi/qapi-commands-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "qapi/qmp/qnull.h"
#include "qemu/rcu.h"
#include "block.h"
#include "postcopy-ram.h"
#include "qemu/thread.h"
#include "trace.h"
#include "exec/target_page.h"
#include "io/channel-buffer.h"
#include "io/channel-tls.h"
#include "migration/colo.h"
#include "hw/boards.h"
#include "hw/qdev-properties.h"
#include "hw/qdev-properties-system.h"
#include "monitor/monitor.h"
#include "net/announce.h"
#include "qemu/queue.h"
#include "multifd.h"
#include "qemu/yank.h"
#include "sysemu/cpus.h"
#include "yank_functions.h"
#include "sysemu/qtest.h"

#define MAX_THROTTLE  (128 << 20)      /* Migration transfer speed throttling */

/* Amount of time to allocate to each "chunk" of bandwidth-throttled
 * data. */
#define BUFFER_DELAY     100
#define XFER_LIMIT_RATIO (1000 / BUFFER_DELAY)
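
/*
 * For example: with BUFFER_DELAY = 100ms, XFER_LIMIT_RATIO is 10, so a
 * bandwidth limit of B bytes/sec translates to roughly B / 10 bytes per
 * 100ms chunk (a 128MiB/s limit allows about 12.8MiB per chunk).
 */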

/* Time in milliseconds we are allowed to stop the source,
 * for sending the last part */
#define DEFAULT_MIGRATE_SET_DOWNTIME 300

/* Maximum migrate downtime set to 2000 seconds */
#define MAX_MIGRATE_DOWNTIME_SECONDS 2000
#define MAX_MIGRATE_DOWNTIME (MAX_MIGRATE_DOWNTIME_SECONDS * 1000)

/* Default compression thread count */
#define DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT 8
/* Default decompression thread count, usually decompression is at
 * least 4 times as fast as compression.*/
#define DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT 2
/* 0: means nocompress, 1: best speed, ... 9: best compress ratio */
#define DEFAULT_MIGRATE_COMPRESS_LEVEL 1
/* Define default autoconverge cpu throttle migration parameters */
#define DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD 50
#define DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL 20
#define DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT 10
#define DEFAULT_MIGRATE_MAX_CPU_THROTTLE 99

/* Migration XBZRLE default cache size */
#define DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE (64 * 1024 * 1024)

/* The delay time (in ms) between two COLO checkpoints */
#define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY (200 * 100)
#define DEFAULT_MIGRATE_MULTIFD_CHANNELS 2
#define DEFAULT_MIGRATE_MULTIFD_COMPRESSION MULTIFD_COMPRESSION_NONE
/* 0: means nocompress, 1: best speed, ... 9: best compress ratio */
#define DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL 1
/* 0: means nocompress, 1: best speed, ... 20: best compress ratio */
#define DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL 1

/* Background transfer rate for postcopy, 0 means unlimited, note
 * that page requests can still exceed this limit.
 */
#define DEFAULT_MIGRATE_MAX_POSTCOPY_BANDWIDTH 0

/*
 * Parameters for self_announce_delay giving a stream of RARP/ARP
 * packets after migration.
 */
#define DEFAULT_MIGRATE_ANNOUNCE_INITIAL  50
#define DEFAULT_MIGRATE_ANNOUNCE_MAX     550
#define DEFAULT_MIGRATE_ANNOUNCE_ROUNDS    5
#define DEFAULT_MIGRATE_ANNOUNCE_STEP    100

static NotifierList migration_state_notifiers =
    NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);

/* Messages sent on the return path from destination to source */
enum mig_rp_message_type {
    MIG_RP_MSG_INVALID = 0,  /* Must be 0 */
    MIG_RP_MSG_SHUT,         /* sibling will not send any more RP messages */
    MIG_RP_MSG_PONG,         /* Response to a PING; data (seq: be32 ) */

    MIG_RP_MSG_REQ_PAGES_ID, /* data (start: be64, len: be32, id: string) */
    MIG_RP_MSG_REQ_PAGES,    /* data (start: be64, len: be32) */
    MIG_RP_MSG_RECV_BITMAP,  /* send recved_bitmap back to source */
    MIG_RP_MSG_RESUME_ACK,   /* tell source that we are ready to resume */

    MIG_RP_MSG_MAX
};

/* Migration capabilities set */
struct MigrateCapsSet {
    int size;                       /* Capability set size */
    MigrationCapability caps[];     /* Variadic array of capabilities */
};
typedef struct MigrateCapsSet MigrateCapsSet;

/* Define and initialize MigrateCapsSet */
#define INITIALIZE_MIGRATE_CAPS_SET(_name, ...)   \
    MigrateCapsSet _name = {    \
        .size = sizeof((int []) { __VA_ARGS__ }) / sizeof(int), \
        .caps = { __VA_ARGS__ } \
    }
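
/*
 * For example, INITIALIZE_MIGRATE_CAPS_SET(set, MIGRATION_CAPABILITY_XBZRLE,
 * MIGRATION_CAPABILITY_X_COLO) expands to:
 *   MigrateCapsSet set = { .size = 2, .caps = { ... } };
 * The compound literal (int []){ __VA_ARGS__ } exists only so that sizeof
 * can count the variadic arguments.
 */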

/* Background-snapshot compatibility check list */
static const
INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot,
    MIGRATION_CAPABILITY_POSTCOPY_RAM,
    MIGRATION_CAPABILITY_DIRTY_BITMAPS,
    MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME,
    MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE,
    MIGRATION_CAPABILITY_RETURN_PATH,
    MIGRATION_CAPABILITY_MULTIFD,
    MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER,
    MIGRATION_CAPABILITY_AUTO_CONVERGE,
    MIGRATION_CAPABILITY_RELEASE_RAM,
    MIGRATION_CAPABILITY_RDMA_PIN_ALL,
    MIGRATION_CAPABILITY_COMPRESS,
    MIGRATION_CAPABILITY_XBZRLE,
    MIGRATION_CAPABILITY_X_COLO,
    MIGRATION_CAPABILITY_VALIDATE_UUID,
    MIGRATION_CAPABILITY_ZERO_COPY_SEND);

/* When we add fault tolerance, we could have several
   migrations at once.  For now we don't need dynamic
   creation of migration objects. */

static MigrationState *current_migration;
static MigrationIncomingState *current_incoming;

static GSList *migration_blockers;

static bool migration_object_check(MigrationState *ms, Error **errp);
static int migration_maybe_pause(MigrationState *s,
                                 int *current_active_state,
                                 int new_state);
static void migrate_fd_cancel(MigrationState *s);

static bool migrate_allow_multi_channels = true;

void migrate_protocol_allow_multi_channels(bool allow)
{
    migrate_allow_multi_channels = allow;
}

bool migrate_multi_channels_is_allowed(void)
{
    return migrate_allow_multi_channels;
}

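/*
 * Three-way compare for the page_requested GTree: returns -1, 0 or +1.
 * The (a > b) - (a < b) idiom avoids the overflow that a plain
 * subtraction could produce on pointer-sized values.
 */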
static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp)
{
    uintptr_t a = (uintptr_t) ap, b = (uintptr_t) bp;

    return (a > b) - (a < b);
}

void migration_object_init(void)
{
    /* This can only be called once. */
    assert(!current_migration);
    current_migration = MIGRATION_OBJ(object_new(TYPE_MIGRATION));

    /*
     * Initialize the incoming migration object as well, whether or
     * not we end up using it.
     */
    assert(!current_incoming);
    current_incoming = g_new0(MigrationIncomingState, 1);
    current_incoming->state = MIGRATION_STATUS_NONE;
    current_incoming->postcopy_remote_fds =
        g_array_new(FALSE, TRUE, sizeof(struct PostCopyFD));
    qemu_mutex_init(&current_incoming->rp_mutex);
    qemu_mutex_init(&current_incoming->postcopy_prio_thread_mutex);
    qemu_event_init(&current_incoming->main_thread_load_event, false);
    qemu_sem_init(&current_incoming->postcopy_pause_sem_dst, 0);
    qemu_sem_init(&current_incoming->postcopy_pause_sem_fault, 0);
    qemu_sem_init(&current_incoming->postcopy_pause_sem_fast_load, 0);
    qemu_mutex_init(&current_incoming->page_request_mutex);
    current_incoming->page_requested = g_tree_new(page_request_addr_cmp);

    migration_object_check(current_migration, &error_fatal);

    blk_mig_init();
    ram_mig_init();
    dirty_bitmap_mig_init();
}

void migration_cancel(const Error *error)
{
    if (error) {
        migrate_set_error(current_migration, error);
    }
    migrate_fd_cancel(current_migration);
}

void migration_shutdown(void)
{
    /*
     * When the QEMU main thread exits, the COLO thread may be
     * waiting on a semaphore, so wake it up before shutting
     * migration down.
     */
    colo_shutdown();
    /*
     * Cancel the current migration - that will (eventually)
     * stop the migration using this structure
     */
    migration_cancel(NULL);
    object_unref(OBJECT(current_migration));

    /*
     * Cancel outgoing migration of dirty bitmaps. It should
     * at least unref used block nodes.
     */
    dirty_bitmap_mig_cancel_outgoing();

    /*
     * Cancel incoming migration of dirty bitmaps. Dirty bitmaps
     * are non-critical data, and their loss is never considered
     * serious.
     */
    dirty_bitmap_mig_cancel_incoming();
}

/* For outgoing */
MigrationState *migrate_get_current(void)
{
    /* This can only be called after the object has been created. */
    assert(current_migration);
    return current_migration;
}

MigrationIncomingState *migration_incoming_get_current(void)
{
    assert(current_incoming);
    return current_incoming;
}

void migration_incoming_transport_cleanup(MigrationIncomingState *mis)
{
    if (mis->socket_address_list) {
        qapi_free_SocketAddressList(mis->socket_address_list);
        mis->socket_address_list = NULL;
    }

    if (mis->transport_cleanup) {
        mis->transport_cleanup(mis->transport_data);
        mis->transport_data = mis->transport_cleanup = NULL;
    }
}

void migration_incoming_state_destroy(void)
{
    struct MigrationIncomingState *mis = migration_incoming_get_current();

    if (mis->to_src_file) {
        /* Tell source that we are done */
        migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);
        qemu_fclose(mis->to_src_file);
        mis->to_src_file = NULL;
    }

    if (mis->from_src_file) {
        migration_ioc_unregister_yank_from_file(mis->from_src_file);
        qemu_fclose(mis->from_src_file);
        mis->from_src_file = NULL;
    }
    if (mis->postcopy_remote_fds) {
        g_array_free(mis->postcopy_remote_fds, TRUE);
        mis->postcopy_remote_fds = NULL;
    }

    migration_incoming_transport_cleanup(mis);
    qemu_event_reset(&mis->main_thread_load_event);

    if (mis->page_requested) {
        g_tree_destroy(mis->page_requested);
        mis->page_requested = NULL;
    }

    if (mis->postcopy_qemufile_dst) {
        migration_ioc_unregister_yank_from_file(mis->postcopy_qemufile_dst);
        qemu_fclose(mis->postcopy_qemufile_dst);
        mis->postcopy_qemufile_dst = NULL;
    }

    yank_unregister_instance(MIGRATION_YANK_INSTANCE);
}

static void migrate_generate_event(int new_state)
{
    if (migrate_use_events()) {
        qapi_event_send_migration(new_state);
    }
}

static bool migrate_late_block_activate(void)
{
    MigrationState *s;

    s = migrate_get_current();

    return s->enabled_capabilities[
        MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE];
}

/*
 * Send a message on the return channel back to the source
 * of the migration.
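 *
 * Wire format (all big-endian): a be16 message type, a be16 payload
 * length, then @len bytes of payload.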
 */
static int migrate_send_rp_message(MigrationIncomingState *mis,
                                   enum mig_rp_message_type message_type,
                                   uint16_t len, void *data)
{
    int ret = 0;

    trace_migrate_send_rp_message((int)message_type, len);
    QEMU_LOCK_GUARD(&mis->rp_mutex);

    /*
     * It's possible that the file handle got lost due to network
     * failures.
     */
    if (!mis->to_src_file) {
        ret = -EIO;
        return ret;
    }

    qemu_put_be16(mis->to_src_file, (unsigned int)message_type);
    qemu_put_be16(mis->to_src_file, len);
    qemu_put_buffer(mis->to_src_file, data, len);
    qemu_fflush(mis->to_src_file);

    /* The QEMU file may have hit an error during the send */
    ret = qemu_file_get_error(mis->to_src_file);

    return ret;
}

/* Request one page from the source VM at the given start address.
 *   rb: the RAMBlock to request the page in
 *   start: Address offset within the RB
 *   len: Length in bytes required - must be a multiple of pagesize
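 *
 * The request goes out as MIG_RP_MSG_REQ_PAGES_ID (with the RAMBlock id
 * appended) whenever @rb differs from the previously requested block,
 * otherwise as the shorter MIG_RP_MSG_REQ_PAGES.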
 */
int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
                                      RAMBlock *rb, ram_addr_t start)
{
    uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname up to 256 */
    size_t msglen = 12; /* start + len */
    size_t len = qemu_ram_pagesize(rb);
    enum mig_rp_message_type msg_type;
    const char *rbname;
    int rbname_len;

    *(uint64_t *)bufc = cpu_to_be64((uint64_t)start);
    *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len);

    /*
     * We track the last RAMBlock that we sent a page request for.  Note that
     * we don't need locking because this function is only called from the
     * postcopy ram fault thread.
     */
    if (rb != mis->last_rb) {
        mis->last_rb = rb;

        rbname = qemu_ram_get_idstr(rb);
        rbname_len = strlen(rbname);

        assert(rbname_len < 256);

        bufc[msglen++] = rbname_len;
        memcpy(bufc + msglen, rbname, rbname_len);
        msglen += rbname_len;
        msg_type = MIG_RP_MSG_REQ_PAGES_ID;
    } else {
        msg_type = MIG_RP_MSG_REQ_PAGES;
    }

    return migrate_send_rp_message(mis, msg_type, msglen, bufc);
}

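/*
 * Like migrate_send_rp_message_req_pages(), but deduplicated: pages that
 * have already been received, or that are already queued in the
 * page_requested tree, are not requested again.  @haddr is the faulting
 * host virtual address; it is rounded down to the RAMBlock page size.
 */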
int migrate_send_rp_req_pages(MigrationIncomingState *mis,
                              RAMBlock *rb, ram_addr_t start, uint64_t haddr)
{
    void *aligned = (void *)(uintptr_t)ROUND_DOWN(haddr, qemu_ram_pagesize(rb));
    bool received = false;

    WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
        received = ramblock_recv_bitmap_test_byte_offset(rb, start);
        if (!received && !g_tree_lookup(mis->page_requested, aligned)) {
            /*
             * The page has not been received, and it's not yet in the page
             * request list.  Queue it.  Set the value of element to 1, so that
             * things like g_tree_lookup() will return TRUE (1) when found.
             */
            g_tree_insert(mis->page_requested, aligned, (gpointer)1);
            mis->page_requested_count++;
            trace_postcopy_page_req_add(aligned, mis->page_requested_count);
        }
    }

    /*
     * If the page is there, skip sending the message.  We don't even need the
     * lock because as long as the page arrived, it'll be there forever.
     */
    if (received) {
        return 0;
    }

    return migrate_send_rp_message_req_pages(mis, rb, start);
}

static bool migration_colo_enabled;
bool migration_incoming_colo_enabled(void)
{
    return migration_colo_enabled;
}

void migration_incoming_disable_colo(void)
{
    ram_block_discard_disable(false);
    migration_colo_enabled = false;
}

int migration_incoming_enable_colo(void)
{
    if (ram_block_discard_disable(true)) {
        error_report("COLO: cannot disable RAM discard");
        return -EBUSY;
    }
    migration_colo_enabled = true;
    return 0;
}

void migrate_add_address(SocketAddress *address)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    QAPI_LIST_PREPEND(mis->socket_address_list,
                      QAPI_CLONE(SocketAddress, address));
}

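/*
 * Incoming URIs use the same schemes as the outgoing side, e.g.
 * "tcp:0.0.0.0:4444", "unix:/some/socket", "vsock:3:4444",
 * "rdma:host:port", "exec:cmd" or "fd:name".  Only the socket-like
 * transports (tcp/unix/vsock) may carry multiple channels.
 */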
static void qemu_start_incoming_migration(const char *uri, Error **errp)
{
    const char *p = NULL;

    migrate_protocol_allow_multi_channels(false); /* reset it anyway */
    qapi_event_send_migration(MIGRATION_STATUS_SETUP);
    if (strstart(uri, "tcp:", &p) ||
        strstart(uri, "unix:", NULL) ||
        strstart(uri, "vsock:", NULL)) {
        migrate_protocol_allow_multi_channels(true);
        socket_start_incoming_migration(p ? p : uri, errp);
#ifdef CONFIG_RDMA
    } else if (strstart(uri, "rdma:", &p)) {
        rdma_start_incoming_migration(p, errp);
#endif
    } else if (strstart(uri, "exec:", &p)) {
        exec_start_incoming_migration(p, errp);
    } else if (strstart(uri, "fd:", &p)) {
        fd_start_incoming_migration(p, errp);
    } else {
        error_setg(errp, "unknown migration protocol: %s", uri);
    }
}

static void process_incoming_migration_bh(void *opaque)
{
    Error *local_err = NULL;
    MigrationIncomingState *mis = opaque;

    /* If capability late_block_activate is set:
     * Only fire up the block code now if we're going to restart the
     * VM, else 'cont' will do it.
     * This causes file locking to happen; so we don't want it to happen
     * unless we really are starting the VM.
     */
    if (!migrate_late_block_activate() ||
         (autostart && (!global_state_received() ||
            global_state_get_runstate() == RUN_STATE_RUNNING))) {
        /* Make sure all file formats throw away their mutable metadata.
         * If we get an error here, just don't restart the VM yet. */
        bdrv_activate_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
            local_err = NULL;
            autostart = false;
        }
    }

    /*
     * This must happen after all error conditions are dealt with and
     * we're sure the VM is going to be running on this host.
     */
    qemu_announce_self(&mis->announce_timer, migrate_announce_params());

    if (multifd_load_cleanup(&local_err) != 0) {
        error_report_err(local_err);
        autostart = false;
    }
    /* If global state section was not received or we are in running
       state, we need to obey autostart. Any other state is set with
       runstate_set. */

    dirty_bitmap_mig_before_vm_start();

    if (!global_state_received() ||
        global_state_get_runstate() == RUN_STATE_RUNNING) {
        if (autostart) {
            vm_start();
        } else {
            runstate_set(RUN_STATE_PAUSED);
        }
    } else if (migration_incoming_colo_enabled()) {
        migration_incoming_disable_colo();
        vm_start();
    } else {
        runstate_set(global_state_get_runstate());
    }
    /*
     * This must happen after any state changes since as soon as an external
     * observer sees this event they might start to prod at the VM assuming
     * it's ready to use.
     */
    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                      MIGRATION_STATUS_COMPLETED);
    qemu_bh_delete(mis->bh);
    migration_incoming_state_destroy();
}

static void coroutine_fn
process_incoming_migration_co(void *opaque)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyState ps;
    int ret;
    Error *local_err = NULL;

    assert(mis->from_src_file);
    mis->migration_incoming_co = qemu_coroutine_self();
    mis->largest_page_size = qemu_ram_pagesize_largest();
    postcopy_state_set(POSTCOPY_INCOMING_NONE);
    migrate_set_state(&mis->state, MIGRATION_STATUS_NONE,
                      MIGRATION_STATUS_ACTIVE);
    ret = qemu_loadvm_state(mis->from_src_file);

    ps = postcopy_state_get();
    trace_process_incoming_migration_co_end(ret, ps);
    if (ps != POSTCOPY_INCOMING_NONE) {
        if (ps == POSTCOPY_INCOMING_ADVISE) {
            /*
             * Where a migration had postcopy enabled (and thus went to advise)
             * but managed to complete within the precopy period, we can use
             * the normal exit.
             */
            postcopy_ram_incoming_cleanup(mis);
        } else if (ret >= 0) {
            /*
             * Postcopy was started, cleanup should happen at the end of the
             * postcopy thread.
             */
            trace_process_incoming_migration_co_postcopy_end_main();
            return;
        }
        /* Else if something went wrong then just fall out of the normal exit */
    }

    /* We've received the COLO info and know whether we are in COLO mode */
    if (!ret && migration_incoming_colo_enabled()) {
        /* Make sure all file formats throw away their mutable metadata */
        bdrv_activate_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
            goto fail;
        }

        qemu_thread_create(&mis->colo_incoming_thread, "COLO incoming",
             colo_process_incoming_thread, mis, QEMU_THREAD_JOINABLE);
        mis->have_colo_incoming_thread = true;
        qemu_coroutine_yield();

        qemu_mutex_unlock_iothread();
        /* Wait for the COLO incoming thread to exit before freeing resources */
        qemu_thread_join(&mis->colo_incoming_thread);
        qemu_mutex_lock_iothread();
        /* We hold the global iothread lock, so it is safe here */
        colo_release_ram_cache();
    }

    if (ret < 0) {
        error_report("load of migration failed: %s", strerror(-ret));
        goto fail;
    }
    mis->bh = qemu_bh_new(process_incoming_migration_bh, mis);
    qemu_bh_schedule(mis->bh);
    mis->migration_incoming_co = NULL;
    return;
fail:
    local_err = NULL;
    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                      MIGRATION_STATUS_FAILED);
    qemu_fclose(mis->from_src_file);
    if (multifd_load_cleanup(&local_err) != 0) {
        error_report_err(local_err);
    }
    exit(EXIT_FAILURE);
}

/**
 * migration_incoming_setup: Setup incoming migration
 * @f: file for main migration channel
 * @errp: where to put errors
 *
 * Returns: %true on success, %false on error.
 */
static bool migration_incoming_setup(QEMUFile *f, Error **errp)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (multifd_load_setup(errp) != 0) {
        return false;
    }

    if (!mis->from_src_file) {
        mis->from_src_file = f;
    }
    qemu_file_set_blocking(f, false);
    return true;
}

void migration_incoming_process(void)
{
    Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL);
    qemu_coroutine_enter(co);
}

/* Returns true if recovered from a paused migration, otherwise false */
static bool postcopy_try_recover(void)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
        /* Resumed from a paused postcopy migration */

        /* This should be set already in migration_incoming_setup() */
        assert(mis->from_src_file);
        /* Postcopy uses a standalone thread to do the VM load */
        qemu_file_set_blocking(mis->from_src_file, true);

        /* Re-configure the return path */
        mis->to_src_file = qemu_file_get_return_path(mis->from_src_file);

        migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
                          MIGRATION_STATUS_POSTCOPY_RECOVER);

        /*
         * Here we only wake up the main loading thread (the other
         * threads keep waiting), so that we can start receiving
         * commands from the source and answer them if needed.  The
         * remaining threads are woken up later, once we are sure the
         * source is ready to reply to page requests.
         */
        qemu_sem_post(&mis->postcopy_pause_sem_dst);
        return true;
    }

    return false;
}

void migration_fd_process_incoming(QEMUFile *f, Error **errp)
{
    if (!migration_incoming_setup(f, errp)) {
        return;
    }
    if (postcopy_try_recover()) {
        return;
    }
    migration_incoming_process();
}

static bool migration_needs_multiple_sockets(void)
{
    return migrate_use_multifd() || migrate_postcopy_preempt();
}

void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    Error *local_err = NULL;
    bool start_migration;
    QEMUFile *f;

    if (!mis->from_src_file) {
        /* The first connection (multifd may have multiple) */
        f = qemu_file_new_input(ioc);

        if (!migration_incoming_setup(f, errp)) {
            return;
        }

        /*
         * A common migration only needs one channel, so we can start
         * right away.  Features that need more than one channel have
         * to wait.
         */
        start_migration = !migration_needs_multiple_sockets();
    } else {
        /* Multiple connections */
        assert(migration_needs_multiple_sockets());
        if (migrate_use_multifd()) {
            start_migration = multifd_recv_new_channel(ioc, &local_err);
        } else {
            assert(migrate_postcopy_preempt());
            f = qemu_file_new_input(ioc);
            start_migration = postcopy_preempt_new_channel(mis, f);
        }
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
    }

    if (start_migration) {
        /* If it's a recovery, we're done */
        if (postcopy_try_recover()) {
            return;
        }
        migration_incoming_process();
    }
}

/**
 * @migration_has_all_channels: We have received all channels that we need
 *
 * Returns true when we have got connections to all the channels that
 * we need for migration.
 */
bool migration_has_all_channels(void)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (!mis->from_src_file) {
        return false;
    }

    if (migrate_use_multifd()) {
        return multifd_recv_all_channels_created();
    }

    if (migrate_postcopy_preempt()) {
        return mis->postcopy_qemufile_dst != NULL;
    }

    return true;
}

/*
 * Send a 'SHUT' message on the return channel with the given value
 * to indicate that we've finished with the RP.  Non-0 value indicates
 * error.
 */
void migrate_send_rp_shut(MigrationIncomingState *mis,
                          uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf);
}

/*
 * Send a 'PONG' message on the return channel with the given value
 * (normally in response to a 'PING')
 */
void migrate_send_rp_pong(MigrationIncomingState *mis,
                          uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf);
}

void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
                                 char *block_name)
{
    char buf[512];
    int len;
    int64_t res;

    /*
     * First, we send the header part. It contains only the len of
     * idstr, and the idstr itself.
     */
    len = strlen(block_name);
    buf[0] = len;
    memcpy(buf + 1, block_name, len);

    if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: MSG_RP_RECV_BITMAP only used for recovery",
                     __func__);
        return;
    }

    migrate_send_rp_message(mis, MIG_RP_MSG_RECV_BITMAP, len + 1, buf);

    /*
     * Next, we dump the received bitmap to the stream.
     *
     * TODO: currently we are safe because we are the only user of the
     * to_src_file handle (the fault thread is still paused), so it
     * would be OK even without taking the mutex.  Still, the cleaner
     * approach is to take the lock before sending the message header
     * and release it after sending the bitmap.
     */
    qemu_mutex_lock(&mis->rp_mutex);
    res = ramblock_recv_bitmap_send(mis->to_src_file, block_name);
    qemu_mutex_unlock(&mis->rp_mutex);

    trace_migrate_send_rp_recv_bitmap(block_name, res);
}

void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(buf), &buf);
}

MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp)
{
    MigrationCapabilityStatusList *head = NULL, **tail = &head;
    MigrationCapabilityStatus *caps;
    MigrationState *s = migrate_get_current();
    int i;

    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
#ifndef CONFIG_LIVE_BLOCK_MIGRATION
        if (i == MIGRATION_CAPABILITY_BLOCK) {
            continue;
        }
#endif
        caps = g_malloc0(sizeof(*caps));
        caps->capability = i;
        caps->state = s->enabled_capabilities[i];
        QAPI_LIST_APPEND(tail, caps);
    }

    return head;
}

MigrationParameters *qmp_query_migrate_parameters(Error **errp)
{
    MigrationParameters *params;
    MigrationState *s = migrate_get_current();

    /* TODO use QAPI_CLONE() instead of duplicating it inline */
    params = g_malloc0(sizeof(*params));
    params->has_compress_level = true;
    params->compress_level = s->parameters.compress_level;
    params->has_compress_threads = true;
    params->compress_threads = s->parameters.compress_threads;
    params->has_compress_wait_thread = true;
    params->compress_wait_thread = s->parameters.compress_wait_thread;
    params->has_decompress_threads = true;
    params->decompress_threads = s->parameters.decompress_threads;
    params->has_throttle_trigger_threshold = true;
    params->throttle_trigger_threshold = s->parameters.throttle_trigger_threshold;
    params->has_cpu_throttle_initial = true;
    params->cpu_throttle_initial = s->parameters.cpu_throttle_initial;
    params->has_cpu_throttle_increment = true;
    params->cpu_throttle_increment = s->parameters.cpu_throttle_increment;
    params->has_cpu_throttle_tailslow = true;
    params->cpu_throttle_tailslow = s->parameters.cpu_throttle_tailslow;
    params->has_tls_creds = true;
    params->tls_creds = g_strdup(s->parameters.tls_creds);
    params->has_tls_hostname = true;
    params->tls_hostname = g_strdup(s->parameters.tls_hostname);
    params->has_tls_authz = true;
    params->tls_authz = g_strdup(s->parameters.tls_authz ?
                                 s->parameters.tls_authz : "");
    params->has_max_bandwidth = true;
    params->max_bandwidth = s->parameters.max_bandwidth;
    params->has_downtime_limit = true;
    params->downtime_limit = s->parameters.downtime_limit;
    params->has_x_checkpoint_delay = true;
    params->x_checkpoint_delay = s->parameters.x_checkpoint_delay;
    params->has_block_incremental = true;
    params->block_incremental = s->parameters.block_incremental;
    params->has_multifd_channels = true;
    params->multifd_channels = s->parameters.multifd_channels;
    params->has_multifd_compression = true;
    params->multifd_compression = s->parameters.multifd_compression;
    params->has_multifd_zlib_level = true;
    params->multifd_zlib_level = s->parameters.multifd_zlib_level;
    params->has_multifd_zstd_level = true;
    params->multifd_zstd_level = s->parameters.multifd_zstd_level;
    params->has_xbzrle_cache_size = true;
    params->xbzrle_cache_size = s->parameters.xbzrle_cache_size;
    params->has_max_postcopy_bandwidth = true;
    params->max_postcopy_bandwidth = s->parameters.max_postcopy_bandwidth;
    params->has_max_cpu_throttle = true;
    params->max_cpu_throttle = s->parameters.max_cpu_throttle;
    params->has_announce_initial = true;
    params->announce_initial = s->parameters.announce_initial;
    params->has_announce_max = true;
    params->announce_max = s->parameters.announce_max;
    params->has_announce_rounds = true;
    params->announce_rounds = s->parameters.announce_rounds;
    params->has_announce_step = true;
    params->announce_step = s->parameters.announce_step;

    if (s->parameters.has_block_bitmap_mapping) {
        params->has_block_bitmap_mapping = true;
        params->block_bitmap_mapping =
            QAPI_CLONE(BitmapMigrationNodeAliasList,
                       s->parameters.block_bitmap_mapping);
    }

    return params;
}

AnnounceParameters *migrate_announce_params(void)
{
    static AnnounceParameters ap;

    MigrationState *s = migrate_get_current();

    ap.initial = s->parameters.announce_initial;
    ap.max = s->parameters.announce_max;
    ap.rounds = s->parameters.announce_rounds;
    ap.step = s->parameters.announce_step;

    return &ap;
}

/*
 * Return true if we're already in the middle of a migration
 * (i.e. any of the active or setup states)
 */
bool migration_is_setup_or_active(int state)
{
    switch (state) {
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
    case MIGRATION_STATUS_WAIT_UNPLUG:
    case MIGRATION_STATUS_COLO:
        return true;

    default:
        return false;

    }
}

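/*
 * Like migration_is_setup_or_active(), but additionally treats
 * CANCELLING as "running" (a cancelling migration still owns the
 * migration resources), while COLO is not considered running here.
 */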
bool migration_is_running(int state)
{
    switch (state) {
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
    case MIGRATION_STATUS_WAIT_UNPLUG:
    case MIGRATION_STATUS_CANCELLING:
        return true;

    default:
        return false;

    }
}

static void populate_time_info(MigrationInfo *info, MigrationState *s)
{
    info->has_status = true;
    info->has_setup_time = true;
    info->setup_time = s->setup_time;
    if (s->state == MIGRATION_STATUS_COMPLETED) {
        info->has_total_time = true;
        info->total_time = s->total_time;
        info->has_downtime = true;
        info->downtime = s->downtime;
    } else {
        info->has_total_time = true;
        info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
                           s->start_time;
        info->has_expected_downtime = true;
        info->expected_downtime = s->expected_downtime;
    }
}

static void populate_ram_info(MigrationInfo *info, MigrationState *s)
{
    size_t page_size = qemu_target_page_size();

    info->has_ram = true;
    info->ram = g_malloc0(sizeof(*info->ram));
    info->ram->transferred = ram_counters.transferred;
    info->ram->total = ram_bytes_total();
    info->ram->duplicate = ram_counters.duplicate;
    /* legacy value.  It is not used anymore */
    info->ram->skipped = 0;
    info->ram->normal = ram_counters.normal;
    info->ram->normal_bytes = ram_counters.normal * page_size;
    info->ram->mbps = s->mbps;
    info->ram->dirty_sync_count = ram_counters.dirty_sync_count;
    info->ram->dirty_sync_missed_zero_copy =
            ram_counters.dirty_sync_missed_zero_copy;
    info->ram->postcopy_requests = ram_counters.postcopy_requests;
    info->ram->page_size = page_size;
    info->ram->multifd_bytes = ram_counters.multifd_bytes;
    info->ram->pages_per_second = s->pages_per_second;
    info->ram->precopy_bytes = ram_counters.precopy_bytes;
    info->ram->downtime_bytes = ram_counters.downtime_bytes;
    info->ram->postcopy_bytes = ram_counters.postcopy_bytes;

    if (migrate_use_xbzrle()) {
        info->has_xbzrle_cache = true;
        info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache));
        info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size();
        info->xbzrle_cache->bytes = xbzrle_counters.bytes;
        info->xbzrle_cache->pages = xbzrle_counters.pages;
        info->xbzrle_cache->cache_miss = xbzrle_counters.cache_miss;
        info->xbzrle_cache->cache_miss_rate = xbzrle_counters.cache_miss_rate;
        info->xbzrle_cache->encoding_rate = xbzrle_counters.encoding_rate;
        info->xbzrle_cache->overflow = xbzrle_counters.overflow;
    }

    if (migrate_use_compression()) {
        info->has_compression = true;
        info->compression = g_malloc0(sizeof(*info->compression));
        info->compression->pages = compression_counters.pages;
        info->compression->busy = compression_counters.busy;
        info->compression->busy_rate = compression_counters.busy_rate;
        info->compression->compressed_size =
                                    compression_counters.compressed_size;
        info->compression->compression_rate =
                                    compression_counters.compression_rate;
    }

    if (cpu_throttle_active()) {
        info->has_cpu_throttle_percentage = true;
        info->cpu_throttle_percentage = cpu_throttle_get_percentage();
    }

    if (s->state != MIGRATION_STATUS_COMPLETED) {
        info->ram->remaining = ram_bytes_remaining();
        info->ram->dirty_pages_rate = ram_counters.dirty_pages_rate;
    }
}

static void populate_disk_info(MigrationInfo *info)
{
    if (blk_mig_active()) {
        info->has_disk = true;
        info->disk = g_malloc0(sizeof(*info->disk));
        info->disk->transferred = blk_mig_bytes_transferred();
        info->disk->remaining = blk_mig_bytes_remaining();
        info->disk->total = blk_mig_bytes_total();
    }
}

static void fill_source_migration_info(MigrationInfo *info)
{
    MigrationState *s = migrate_get_current();
    int state = qatomic_read(&s->state);
    GSList *cur_blocker = migration_blockers;

    info->blocked_reasons = NULL;

    /*
     * There are two types of reasons a migration might be blocked;
     * a) devices marked in VMState as non-migratable, and
     * b) Explicit migration blockers
     * We need to add both of them here.
     */
    qemu_savevm_non_migratable_list(&info->blocked_reasons);

    while (cur_blocker) {
        QAPI_LIST_PREPEND(info->blocked_reasons,
                          g_strdup(error_get_pretty(cur_blocker->data)));
        cur_blocker = g_slist_next(cur_blocker);
    }
    info->has_blocked_reasons = info->blocked_reasons != NULL;

    switch (state) {
    case MIGRATION_STATUS_NONE:
        /* no migration has happened ever */
        /* do not overwrite destination migration status */
        return;
    case MIGRATION_STATUS_SETUP:
        info->has_status = true;
        info->has_total_time = false;
        break;
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_CANCELLING:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
        /* TODO add some postcopy stats */
        populate_time_info(info, s);
        populate_ram_info(info, s);
        populate_disk_info(info);
        populate_vfio_info(info);
        break;
    case MIGRATION_STATUS_COLO:
        info->has_status = true;
        /* TODO: display COLO specific information (checkpoint info etc.) */
        break;
    case MIGRATION_STATUS_COMPLETED:
        populate_time_info(info, s);
        populate_ram_info(info, s);
        populate_vfio_info(info);
        break;
    case MIGRATION_STATUS_FAILED:
        info->has_status = true;
        if (s->error) {
            info->has_error_desc = true;
            info->error_desc = g_strdup(error_get_pretty(s->error));
        }
        break;
    case MIGRATION_STATUS_CANCELLED:
        info->has_status = true;
        break;
    case MIGRATION_STATUS_WAIT_UNPLUG:
        info->has_status = true;
        break;
    }
    info->status = state;
}

typedef enum WriteTrackingSupport {
    WT_SUPPORT_UNKNOWN = 0,
    WT_SUPPORT_ABSENT,
    WT_SUPPORT_AVAILABLE,
    WT_SUPPORT_COMPATIBLE
} WriteTrackingSupport;

static
WriteTrackingSupport migrate_query_write_tracking(void)
{
    /* Check if kernel supports required UFFD features */
    if (!ram_write_tracking_available()) {
        return WT_SUPPORT_ABSENT;
    }
    /*
     * Check if current memory configuration is
     * compatible with required UFFD features.
     */
    if (!ram_write_tracking_compatible()) {
        return WT_SUPPORT_AVAILABLE;
    }

    return WT_SUPPORT_COMPATIBLE;
}

/**
 * @migrate_caps_check - check capability validity
 *
 * @cap_list: old capability list, array of bool
 * @params: new capabilities to be applied soon
 * @errp: set *errp if the check failed, with reason
 *
 * Returns true if check passed, otherwise false.
 */
static bool migrate_caps_check(bool *cap_list,
                               MigrationCapabilityStatusList *params,
                               Error **errp)
{
    MigrationCapabilityStatusList *cap;
    bool old_postcopy_cap;
    MigrationIncomingState *mis = migration_incoming_get_current();

    old_postcopy_cap = cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM];

    for (cap = params; cap; cap = cap->next) {
        cap_list[cap->value->capability] = cap->value->state;
    }

#ifndef CONFIG_LIVE_BLOCK_MIGRATION
    if (cap_list[MIGRATION_CAPABILITY_BLOCK]) {
        error_setg(errp, "QEMU compiled without old-style (blk/-b, inc/-i) "
                   "block migration");
        error_append_hint(errp, "Use drive_mirror+NBD instead.\n");
        return false;
    }
#endif

#ifndef CONFIG_REPLICATION
    if (cap_list[MIGRATION_CAPABILITY_X_COLO]) {
        error_setg(errp, "QEMU compiled without replication module"
                   " can't enable COLO");
        error_append_hint(errp, "Please enable replication before COLO.\n");
        return false;
    }
#endif

    if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) {
        /* This check is reasonably expensive, so only run it when the
         * capability is being set for the first time; also, only the
         * destination needs the special support.
         */
        if (!old_postcopy_cap && runstate_check(RUN_STATE_INMIGRATE) &&
            !postcopy_ram_supported_by_host(mis)) {
            /* postcopy_ram_supported_by_host will have emitted a more
             * detailed message
             */
            error_setg(errp, "Postcopy is not supported");
            return false;
        }

        if (cap_list[MIGRATION_CAPABILITY_X_IGNORE_SHARED]) {
            error_setg(errp, "Postcopy is not compatible with ignore-shared");
            return false;
        }
    }

    if (cap_list[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) {
        WriteTrackingSupport wt_support;
        int idx;
        /*
         * Check if 'background-snapshot' capability is supported by
         * host kernel and compatible with guest memory configuration.
         */
        wt_support = migrate_query_write_tracking();
        if (wt_support < WT_SUPPORT_AVAILABLE) {
            error_setg(errp, "Background-snapshot is not supported by host kernel");
            return false;
        }
        if (wt_support < WT_SUPPORT_COMPATIBLE) {
            error_setg(errp, "Background-snapshot is not compatible "
                    "with guest memory configuration");
            return false;
        }

        /*
         * Check if there are any migration capabilities
         * incompatible with 'background-snapshot'.
         */
        for (idx = 0; idx < check_caps_background_snapshot.size; idx++) {
            int incomp_cap = check_caps_background_snapshot.caps[idx];
            if (cap_list[incomp_cap]) {
                error_setg(errp,
                        "Background-snapshot is not compatible with %s",
                        MigrationCapability_str(incomp_cap));
                return false;
            }
        }
    }

#ifdef CONFIG_LINUX
    if (cap_list[MIGRATION_CAPABILITY_ZERO_COPY_SEND] &&
        (!cap_list[MIGRATION_CAPABILITY_MULTIFD] ||
         cap_list[MIGRATION_CAPABILITY_COMPRESS] ||
         cap_list[MIGRATION_CAPABILITY_XBZRLE] ||
         migrate_multifd_compression() ||
         migrate_use_tls())) {
        error_setg(errp,
                   "Zero copy only available for non-compressed non-TLS multifd migration");
        return false;
    }
#else
    if (cap_list[MIGRATION_CAPABILITY_ZERO_COPY_SEND]) {
        error_setg(errp,
                   "Zero copy currently only available on Linux");
        return false;
    }
#endif

    /* incoming side only */
    if (runstate_check(RUN_STATE_INMIGRATE) &&
        !migrate_multi_channels_is_allowed() &&
        cap_list[MIGRATION_CAPABILITY_MULTIFD]) {
        error_setg(errp, "multifd is not supported by current protocol");
        return false;
    }

    if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_PREEMPT]) {
        if (!cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) {
            error_setg(errp, "Postcopy preempt requires postcopy-ram");
            return false;
        }

        /*
         * Preempt mode requires urgent pages to be sent on a separate
         * channel; compression, on the other hand, scatters pages across
         * the compression threads, which is not compatible with the
         * preempt assumptions on channel assignment.
         */
1347        if (cap_list[MIGRATION_CAPABILITY_COMPRESS]) {
1348            error_setg(errp, "Postcopy preempt not compatible with compress");
1349            return false;
1350        }
1351    }
1352
1353    if (cap_list[MIGRATION_CAPABILITY_MULTIFD]) {
1354        if (cap_list[MIGRATION_CAPABILITY_COMPRESS]) {
1355            error_setg(errp, "Multifd is not compatible with compress");
1356            return false;
1357        }
1358    }
1359
1360    return true;
1361}
1362
1363static void fill_destination_migration_info(MigrationInfo *info)
1364{
1365    MigrationIncomingState *mis = migration_incoming_get_current();
1366
1367    if (mis->socket_address_list) {
1368        info->has_socket_address = true;
1369        info->socket_address =
1370            QAPI_CLONE(SocketAddressList, mis->socket_address_list);
1371    }
1372
1373    switch (mis->state) {
1374    case MIGRATION_STATUS_NONE:
1375        return;
1376    case MIGRATION_STATUS_SETUP:
1377    case MIGRATION_STATUS_CANCELLING:
1378    case MIGRATION_STATUS_CANCELLED:
1379    case MIGRATION_STATUS_ACTIVE:
1380    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1381    case MIGRATION_STATUS_POSTCOPY_PAUSED:
1382    case MIGRATION_STATUS_POSTCOPY_RECOVER:
1383    case MIGRATION_STATUS_FAILED:
1384    case MIGRATION_STATUS_COLO:
1385        info->has_status = true;
1386        break;
1387    case MIGRATION_STATUS_COMPLETED:
1388        info->has_status = true;
1389        fill_destination_postcopy_migration_info(info);
1390        break;
1391    }
1392    info->status = mis->state;
1393}
1394
1395MigrationInfo *qmp_query_migrate(Error **errp)
1396{
1397    MigrationInfo *info = g_malloc0(sizeof(*info));
1398
1399    fill_destination_migration_info(info);
1400    fill_source_migration_info(info);
1401
1402    return info;
1403}
1404
1405void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
1406                                  Error **errp)
1407{
1408    MigrationState *s = migrate_get_current();
1409    MigrationCapabilityStatusList *cap;
1410    bool cap_list[MIGRATION_CAPABILITY__MAX];
1411
1412    if (migration_is_running(s->state)) {
1413        error_setg(errp, QERR_MIGRATION_ACTIVE);
1414        return;
1415    }
1416
1417    memcpy(cap_list, s->enabled_capabilities, sizeof(cap_list));
1418    if (!migrate_caps_check(cap_list, params, errp)) {
1419        return;
1420    }
1421
1422    for (cap = params; cap; cap = cap->next) {
1423        s->enabled_capabilities[cap->value->capability] = cap->value->state;
1424    }
1425}
1426
1427/*
1428 * Check whether the parameters are valid. Error will be put into errp
1429 * (if provided). Return true if valid, otherwise false.
1430 */
1431static bool migrate_params_check(MigrationParameters *params, Error **errp)
1432{
1433    if (params->has_compress_level &&
1434        (params->compress_level > 9)) {
1435        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level",
1436                   "a value between 0 and 9");
1437        return false;
1438    }
1439
1440    if (params->has_compress_threads && (params->compress_threads < 1)) {
1441        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1442                   "compress_threads",
1443                   "a value between 1 and 255");
1444        return false;
1445    }
1446
1447    if (params->has_decompress_threads && (params->decompress_threads < 1)) {
1448        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1449                   "decompress_threads",
1450                   "a value between 1 and 255");
1451        return false;
1452    }
1453
1454    if (params->has_throttle_trigger_threshold &&
1455        (params->throttle_trigger_threshold < 1 ||
1456         params->throttle_trigger_threshold > 100)) {
1457        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1458                   "throttle_trigger_threshold",
1459                   "an integer in the range of 1 to 100");
1460        return false;
1461    }
1462
1463    if (params->has_cpu_throttle_initial &&
1464        (params->cpu_throttle_initial < 1 ||
1465         params->cpu_throttle_initial > 99)) {
1466        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1467                   "cpu_throttle_initial",
1468                   "an integer in the range of 1 to 99");
1469        return false;
1470    }
1471
1472    if (params->has_cpu_throttle_increment &&
1473        (params->cpu_throttle_increment < 1 ||
1474         params->cpu_throttle_increment > 99)) {
1475        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1476                   "cpu_throttle_increment",
1477                   "an integer in the range of 1 to 99");
1478        return false;
1479    }
1480
1481    if (params->has_max_bandwidth && (params->max_bandwidth > SIZE_MAX)) {
1482        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1483                   "max_bandwidth",
1484                   "an integer in the range of 0 to "stringify(SIZE_MAX)
1485                   " bytes/second");
1486        return false;
1487    }
1488
1489    if (params->has_downtime_limit &&
1490        (params->downtime_limit > MAX_MIGRATE_DOWNTIME)) {
1491        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1492                   "downtime_limit",
1493                   "an integer in the range of 0 to "
1494                    stringify(MAX_MIGRATE_DOWNTIME)" ms");
1495        return false;
1496    }
1497
1498    /* x_checkpoint_delay is now always positive */
1499
1500    if (params->has_multifd_channels && (params->multifd_channels < 1)) {
1501        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1502                   "multifd_channels",
1503                   "a value between 1 and 255");
1504        return false;
1505    }
1506
1507    if (params->has_multifd_zlib_level &&
1508        (params->multifd_zlib_level > 9)) {
1509        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zlib_level",
1510                   "a value between 0 and 9");
1511        return false;
1512    }
1513
1514    if (params->has_multifd_zstd_level &&
1515        (params->multifd_zstd_level > 20)) {
1516        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level",
1517                   "a value between 0 and 20");
1518        return false;
1519    }
1520
1521    if (params->has_xbzrle_cache_size &&
1522        (params->xbzrle_cache_size < qemu_target_page_size() ||
1523         !is_power_of_2(params->xbzrle_cache_size))) {
1524        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1525                   "xbzrle_cache_size",
1526                   "a power of two no less than the target page size");
1527        return false;
1528    }
1529
1530    if (params->has_max_cpu_throttle &&
1531        (params->max_cpu_throttle < params->cpu_throttle_initial ||
1532         params->max_cpu_throttle > 99)) {
1533        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1534                   "max_cpu_throttle",
1535                   "an integer in the range of cpu_throttle_initial to 99");
1536        return false;
1537    }
1538
1539    if (params->has_announce_initial &&
1540        params->announce_initial > 100000) {
1541        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1542                   "announce_initial",
1543                   "a value between 0 and 100000");
1544        return false;
1545    }
1546    if (params->has_announce_max &&
1547        params->announce_max > 100000) {
1548        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1549                   "announce_max",
1550                   "a value between 0 and 100000");
1551        return false;
1552    }
1553    if (params->has_announce_rounds &&
1554        params->announce_rounds > 1000) {
1555        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1556                   "announce_rounds",
1557                   "a value between 0 and 1000");
1558        return false;
1559    }
1560    if (params->has_announce_step &&
1561        (params->announce_step < 1 ||
1562        params->announce_step > 10000)) {
1563        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
1564                   "announce_step",
1565                   "a value between 0 and 10000");
1566       return false;
1567    }
1568
1569    if (params->has_block_bitmap_mapping &&
1570        !check_dirty_bitmap_mig_alias_map(params->block_bitmap_mapping, errp)) {
1571        error_prepend(errp, "Invalid mapping given for block-bitmap-mapping: ");
1572        return false;
1573    }
1574
1575#ifdef CONFIG_LINUX
1576    if (migrate_use_zero_copy_send() &&
1577        ((params->has_multifd_compression && params->multifd_compression) ||
1578         (params->has_tls_creds && params->tls_creds && *params->tls_creds))) {
1579        error_setg(errp,
1580                   "Zero copy only available for non-compressed non-TLS multifd migration");
1581        return false;
1582    }
1583#endif
1584
1585    return true;
1586}
1587
1588static void migrate_params_test_apply(MigrateSetParameters *params,
1589                                      MigrationParameters *dest)
1590{
1591    *dest = migrate_get_current()->parameters;
1592
1593    /* TODO use QAPI_CLONE() instead of duplicating it inline */
1594
1595    if (params->has_compress_level) {
1596        dest->compress_level = params->compress_level;
1597    }
1598
1599    if (params->has_compress_threads) {
1600        dest->compress_threads = params->compress_threads;
1601    }
1602
1603    if (params->has_compress_wait_thread) {
1604        dest->compress_wait_thread = params->compress_wait_thread;
1605    }
1606
1607    if (params->has_decompress_threads) {
1608        dest->decompress_threads = params->decompress_threads;
1609    }
1610
1611    if (params->has_throttle_trigger_threshold) {
1612        dest->throttle_trigger_threshold = params->throttle_trigger_threshold;
1613    }
1614
1615    if (params->has_cpu_throttle_initial) {
1616        dest->cpu_throttle_initial = params->cpu_throttle_initial;
1617    }
1618
1619    if (params->has_cpu_throttle_increment) {
1620        dest->cpu_throttle_increment = params->cpu_throttle_increment;
1621    }
1622
1623    if (params->has_cpu_throttle_tailslow) {
1624        dest->cpu_throttle_tailslow = params->cpu_throttle_tailslow;
1625    }
1626
1627    if (params->has_tls_creds) {
1628        assert(params->tls_creds->type == QTYPE_QSTRING);
1629        dest->tls_creds = params->tls_creds->u.s;
1630    }
1631
1632    if (params->has_tls_hostname) {
1633        assert(params->tls_hostname->type == QTYPE_QSTRING);
1634        dest->tls_hostname = params->tls_hostname->u.s;
1635    }
1636
1637    if (params->has_max_bandwidth) {
1638        dest->max_bandwidth = params->max_bandwidth;
1639    }
1640
1641    if (params->has_downtime_limit) {
1642        dest->downtime_limit = params->downtime_limit;
1643    }
1644
1645    if (params->has_x_checkpoint_delay) {
1646        dest->x_checkpoint_delay = params->x_checkpoint_delay;
1647    }
1648
1649    if (params->has_block_incremental) {
1650        dest->block_incremental = params->block_incremental;
1651    }
1652    if (params->has_multifd_channels) {
1653        dest->multifd_channels = params->multifd_channels;
1654    }
1655    if (params->has_multifd_compression) {
1656        dest->multifd_compression = params->multifd_compression;
1657    }
1658    if (params->has_xbzrle_cache_size) {
1659        dest->xbzrle_cache_size = params->xbzrle_cache_size;
1660    }
1661    if (params->has_max_postcopy_bandwidth) {
1662        dest->max_postcopy_bandwidth = params->max_postcopy_bandwidth;
1663    }
1664    if (params->has_max_cpu_throttle) {
1665        dest->max_cpu_throttle = params->max_cpu_throttle;
1666    }
1667    if (params->has_announce_initial) {
1668        dest->announce_initial = params->announce_initial;
1669    }
1670    if (params->has_announce_max) {
1671        dest->announce_max = params->announce_max;
1672    }
1673    if (params->has_announce_rounds) {
1674        dest->announce_rounds = params->announce_rounds;
1675    }
1676    if (params->has_announce_step) {
1677        dest->announce_step = params->announce_step;
1678    }
1679
1680    if (params->has_block_bitmap_mapping) {
1681        dest->has_block_bitmap_mapping = true;
1682        dest->block_bitmap_mapping = params->block_bitmap_mapping;
1683    }
1684}
1685
1686static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
1687{
1688    MigrationState *s = migrate_get_current();
1689
1690    /* TODO use QAPI_CLONE() instead of duplicating it inline */
1691
1692    if (params->has_compress_level) {
1693        s->parameters.compress_level = params->compress_level;
1694    }
1695
1696    if (params->has_compress_threads) {
1697        s->parameters.compress_threads = params->compress_threads;
1698    }
1699
1700    if (params->has_compress_wait_thread) {
1701        s->parameters.compress_wait_thread = params->compress_wait_thread;
1702    }
1703
1704    if (params->has_decompress_threads) {
1705        s->parameters.decompress_threads = params->decompress_threads;
1706    }
1707
1708    if (params->has_throttle_trigger_threshold) {
1709        s->parameters.throttle_trigger_threshold = params->throttle_trigger_threshold;
1710    }
1711
1712    if (params->has_cpu_throttle_initial) {
1713        s->parameters.cpu_throttle_initial = params->cpu_throttle_initial;
1714    }
1715
1716    if (params->has_cpu_throttle_increment) {
1717        s->parameters.cpu_throttle_increment = params->cpu_throttle_increment;
1718    }
1719
1720    if (params->has_cpu_throttle_tailslow) {
1721        s->parameters.cpu_throttle_tailslow = params->cpu_throttle_tailslow;
1722    }
1723
1724    if (params->has_tls_creds) {
1725        g_free(s->parameters.tls_creds);
1726        assert(params->tls_creds->type == QTYPE_QSTRING);
1727        s->parameters.tls_creds = g_strdup(params->tls_creds->u.s);
1728    }
1729
1730    if (params->has_tls_hostname) {
1731        g_free(s->parameters.tls_hostname);
1732        assert(params->tls_hostname->type == QTYPE_QSTRING);
1733        s->parameters.tls_hostname = g_strdup(params->tls_hostname->u.s);
1734    }
1735
1736    if (params->has_tls_authz) {
1737        g_free(s->parameters.tls_authz);
1738        assert(params->tls_authz->type == QTYPE_QSTRING);
1739        s->parameters.tls_authz = g_strdup(params->tls_authz->u.s);
1740    }
1741
1742    if (params->has_max_bandwidth) {
1743        s->parameters.max_bandwidth = params->max_bandwidth;
1744        if (s->to_dst_file && !migration_in_postcopy()) {
1745            qemu_file_set_rate_limit(s->to_dst_file,
1746                                s->parameters.max_bandwidth / XFER_LIMIT_RATIO);
1747        }
1748    }
1749
1750    if (params->has_downtime_limit) {
1751        s->parameters.downtime_limit = params->downtime_limit;
1752    }
1753
1754    if (params->has_x_checkpoint_delay) {
1755        s->parameters.x_checkpoint_delay = params->x_checkpoint_delay;
1756        if (migration_in_colo_state()) {
1757            colo_checkpoint_notify(s);
1758        }
1759    }
1760
1761    if (params->has_block_incremental) {
1762        s->parameters.block_incremental = params->block_incremental;
1763    }
1764    if (params->has_multifd_channels) {
1765        s->parameters.multifd_channels = params->multifd_channels;
1766    }
1767    if (params->has_multifd_compression) {
1768        s->parameters.multifd_compression = params->multifd_compression;
1769    }
1770    if (params->has_xbzrle_cache_size) {
1771        s->parameters.xbzrle_cache_size = params->xbzrle_cache_size;
1772        xbzrle_cache_resize(params->xbzrle_cache_size, errp);
1773    }
1774    if (params->has_max_postcopy_bandwidth) {
1775        s->parameters.max_postcopy_bandwidth = params->max_postcopy_bandwidth;
1776        if (s->to_dst_file && migration_in_postcopy()) {
1777            qemu_file_set_rate_limit(s->to_dst_file,
1778                    s->parameters.max_postcopy_bandwidth / XFER_LIMIT_RATIO);
1779        }
1780    }
1781    if (params->has_max_cpu_throttle) {
1782        s->parameters.max_cpu_throttle = params->max_cpu_throttle;
1783    }
1784    if (params->has_announce_initial) {
1785        s->parameters.announce_initial = params->announce_initial;
1786    }
1787    if (params->has_announce_max) {
1788        s->parameters.announce_max = params->announce_max;
1789    }
1790    if (params->has_announce_rounds) {
1791        s->parameters.announce_rounds = params->announce_rounds;
1792    }
1793    if (params->has_announce_step) {
1794        s->parameters.announce_step = params->announce_step;
1795    }
1796
1797    if (params->has_block_bitmap_mapping) {
1798        qapi_free_BitmapMigrationNodeAliasList(
1799            s->parameters.block_bitmap_mapping);
1800
1801        s->parameters.has_block_bitmap_mapping = true;
1802        s->parameters.block_bitmap_mapping =
1803            QAPI_CLONE(BitmapMigrationNodeAliasList,
1804                       params->block_bitmap_mapping);
1805    }
1806}
1807
1808void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp)
1809{
1810    MigrationParameters tmp;
1811
1812    /* TODO Rewrite "" to null instead */
1813    if (params->has_tls_creds
1814        && params->tls_creds->type == QTYPE_QNULL) {
1815        qobject_unref(params->tls_creds->u.n);
1816        params->tls_creds->type = QTYPE_QSTRING;
1817        params->tls_creds->u.s = g_strdup("");
1818    }
1819    /* TODO Rewrite "" to null instead */
1820    if (params->has_tls_hostname
1821        && params->tls_hostname->type == QTYPE_QNULL) {
1822        qobject_unref(params->tls_hostname->u.n);
1823        params->tls_hostname->type = QTYPE_QSTRING;
1824        params->tls_hostname->u.s = g_strdup("");
1825    }
1826
1827    migrate_params_test_apply(params, &tmp);
1828
1829    if (!migrate_params_check(&tmp, errp)) {
1830        /* Invalid parameter */
1831        return;
1832    }
1833
1834    migrate_params_apply(params, errp);
1835}
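
    /*
     * A minimal QMP usage sketch for qmp_migrate_set_parameters() (values
     * hypothetical, shown for illustration only):
     *
     *   -> { "execute": "migrate-set-parameters",
     *        "arguments": { "max-bandwidth": 33554432,
     *                       "downtime-limit": 500 } }
     *   <- { "return": {} }
     *
     * QMP spells the members with dashes ("max-bandwidth"); the generated C
     * struct members use underscores (max_bandwidth).
     */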
1836
1837
1838void qmp_migrate_start_postcopy(Error **errp)
1839{
1840    MigrationState *s = migrate_get_current();
1841
1842    if (!migrate_postcopy()) {
1843        error_setg(errp, "Enable postcopy with migrate_set_capability before"
1844                         " the start of migration");
1845        return;
1846    }
1847
1848    if (s->state == MIGRATION_STATUS_NONE) {
1849        error_setg(errp, "Postcopy must be started after migration has been"
1850                         " started");
1851        return;
1852    }
1853    /*
1854     * We don't error if migration has finished, since that would be racy
1855     * with issuing this command.
1856     */
1857    qatomic_set(&s->start_postcopy, true);
1858}
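
    /*
     * A hypothetical QMP flow for switching to postcopy (a sketch, not taken
     * from this file): enable the capability before starting, then trigger
     * the switch once precopy is underway:
     *
     *   -> { "execute": "migrate-set-capabilities",
     *        "arguments": { "capabilities": [
     *            { "capability": "postcopy-ram", "state": true } ] } }
     *   -> { "execute": "migrate", "arguments": { "uri": "tcp:dst:4444" } }
     *   -> { "execute": "migrate-start-postcopy" }
     */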
1859
1860/* shared migration helpers */
1861
1862void migrate_set_state(int *state, int old_state, int new_state)
1863{
1864    assert(new_state < MIGRATION_STATUS__MAX);
1865    if (qatomic_cmpxchg(state, old_state, new_state) == old_state) {
1866        trace_migrate_set_state(MigrationStatus_str(new_state));
1867        migrate_generate_event(new_state);
1868    }
1869}
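
    /*
     * Usage sketch for migrate_set_state(): the cmpxchg above means the
     * transition only happens if the state is still what the caller last
     * observed, so a concurrent transition (e.g. to CANCELLING) is never
     * overwritten.  For example:
     *
     *   migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
     *                     MIGRATION_STATUS_FAILED);
     *
     * silently does nothing if another thread already moved the state away
     * from SETUP.
     */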
1870
1871static MigrationCapabilityStatus *migrate_cap_add(MigrationCapability index,
1872                                                  bool state)
1873{
1874    MigrationCapabilityStatus *cap;
1875
1876    cap = g_new0(MigrationCapabilityStatus, 1);
1877    cap->capability = index;
1878    cap->state = state;
1879
1880    return cap;
1881}
1882
1883void migrate_set_block_enabled(bool value, Error **errp)
1884{
1885    MigrationCapabilityStatusList *cap = NULL;
1886
1887    QAPI_LIST_PREPEND(cap, migrate_cap_add(MIGRATION_CAPABILITY_BLOCK, value));
1888    qmp_migrate_set_capabilities(cap, errp);
1889    qapi_free_MigrationCapabilityStatusList(cap);
1890}
1891
1892static void migrate_set_block_incremental(MigrationState *s, bool value)
1893{
1894    s->parameters.block_incremental = value;
1895}
1896
1897static void block_cleanup_parameters(MigrationState *s)
1898{
1899    if (s->must_remove_block_options) {
1900        /* setting to false can never fail */
1901        migrate_set_block_enabled(false, &error_abort);
1902        migrate_set_block_incremental(s, false);
1903        s->must_remove_block_options = false;
1904    }
1905}
1906
1907static void migrate_fd_cleanup(MigrationState *s)
1908{
1909    qemu_bh_delete(s->cleanup_bh);
1910    s->cleanup_bh = NULL;
1911
1912    g_free(s->hostname);
1913    s->hostname = NULL;
1914
1915    qemu_savevm_state_cleanup();
1916
1917    if (s->to_dst_file) {
1918        QEMUFile *tmp;
1919
1920        trace_migrate_fd_cleanup();
1921        qemu_mutex_unlock_iothread();
1922        if (s->migration_thread_running) {
1923            qemu_thread_join(&s->thread);
1924            s->migration_thread_running = false;
1925        }
1926        qemu_mutex_lock_iothread();
1927
1928        multifd_save_cleanup();
1929        qemu_mutex_lock(&s->qemu_file_lock);
1930        tmp = s->to_dst_file;
1931        s->to_dst_file = NULL;
1932        qemu_mutex_unlock(&s->qemu_file_lock);
1933        /*
1934         * Close the file handle without the lock to make sure the
1935         * critical section won't block for long.
1936         */
1937        migration_ioc_unregister_yank_from_file(tmp);
1938        qemu_fclose(tmp);
1939    }
1940
1941    if (s->postcopy_qemufile_src) {
1942        migration_ioc_unregister_yank_from_file(s->postcopy_qemufile_src);
1943        qemu_fclose(s->postcopy_qemufile_src);
1944        s->postcopy_qemufile_src = NULL;
1945    }
1946
1947    assert(!migration_is_active(s));
1948
1949    if (s->state == MIGRATION_STATUS_CANCELLING) {
1950        migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING,
1951                          MIGRATION_STATUS_CANCELLED);
1952    }
1953
1954    if (s->error) {
1955        /* It is used on info migrate.  We can't free it */
1956        error_report_err(error_copy(s->error));
1957    }
1958    notifier_list_notify(&migration_state_notifiers, s);
1959    block_cleanup_parameters(s);
1960    yank_unregister_instance(MIGRATION_YANK_INSTANCE);
1961}
1962
1963static void migrate_fd_cleanup_schedule(MigrationState *s)
1964{
1965    /*
1966     * Take a ref on the state for the bh, because it may be called
1967     * when there are already no other refs left
1968     */
1969    object_ref(OBJECT(s));
1970    qemu_bh_schedule(s->cleanup_bh);
1971}
1972
1973static void migrate_fd_cleanup_bh(void *opaque)
1974{
1975    MigrationState *s = opaque;
1976    migrate_fd_cleanup(s);
1977    object_unref(OBJECT(s));
1978}
1979
1980void migrate_set_error(MigrationState *s, const Error *error)
1981{
1982    QEMU_LOCK_GUARD(&s->error_mutex);
1983    if (!s->error) {
1984        s->error = error_copy(error);
1985    }
1986}
1987
1988static void migrate_error_free(MigrationState *s)
1989{
1990    QEMU_LOCK_GUARD(&s->error_mutex);
1991    if (s->error) {
1992        error_free(s->error);
1993        s->error = NULL;
1994    }
1995}
1996
1997void migrate_fd_error(MigrationState *s, const Error *error)
1998{
1999    trace_migrate_fd_error(error_get_pretty(error));
2000    assert(s->to_dst_file == NULL);
2001    migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
2002                      MIGRATION_STATUS_FAILED);
2003    migrate_set_error(s, error);
2004}
2005
2006static void migrate_fd_cancel(MigrationState *s)
2007{
2008    int old_state;
2009    QEMUFile *f = migrate_get_current()->to_dst_file;
2010    trace_migrate_fd_cancel();
2011
2012    WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
2013        if (s->rp_state.from_dst_file) {
2014            /* shut down the rp socket, causing the rp thread to shut down */
2015            qemu_file_shutdown(s->rp_state.from_dst_file);
2016        }
2017    }
2018
2019    do {
2020        old_state = s->state;
2021        if (!migration_is_running(old_state)) {
2022            break;
2023        }
2024        /* If the migration is paused, kick it out of the pause */
2025        if (old_state == MIGRATION_STATUS_PRE_SWITCHOVER) {
2026            qemu_sem_post(&s->pause_sem);
2027        }
2028        migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING);
2029    } while (s->state != MIGRATION_STATUS_CANCELLING);
2030
2031    /*
2032     * If we're unlucky the migration code might be stuck somewhere in a
2033     * send/write while the network has failed and is waiting to timeout;
2034     * if we've got shutdown(2) available then we can force it to quit.
2035     * The outgoing qemu file gets closed in migrate_fd_cleanup that is
2036     * called in a bh, so there is no race against this cancel.
2037     */
2038    if (s->state == MIGRATION_STATUS_CANCELLING && f) {
2039        qemu_file_shutdown(f);
2040    }
2041    if (s->state == MIGRATION_STATUS_CANCELLING && s->block_inactive) {
2042        Error *local_err = NULL;
2043
2044        bdrv_activate_all(&local_err);
2045        if (local_err) {
2046            error_report_err(local_err);
2047        } else {
2048            s->block_inactive = false;
2049        }
2050    }
2051}
2052
2053void add_migration_state_change_notifier(Notifier *notify)
2054{
2055    notifier_list_add(&migration_state_notifiers, notify);
2056}
2057
2058void remove_migration_state_change_notifier(Notifier *notify)
2059{
2060    notifier_remove(notify);
2061}
2062
2063bool migration_in_setup(MigrationState *s)
2064{
2065    return s->state == MIGRATION_STATUS_SETUP;
2066}
2067
2068bool migration_has_finished(MigrationState *s)
2069{
2070    return s->state == MIGRATION_STATUS_COMPLETED;
2071}
2072
2073bool migration_has_failed(MigrationState *s)
2074{
2075    return (s->state == MIGRATION_STATUS_CANCELLED ||
2076            s->state == MIGRATION_STATUS_FAILED);
2077}
2078
2079bool migration_in_postcopy(void)
2080{
2081    MigrationState *s = migrate_get_current();
2082
2083    switch (s->state) {
2084    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
2085    case MIGRATION_STATUS_POSTCOPY_PAUSED:
2086    case MIGRATION_STATUS_POSTCOPY_RECOVER:
2087        return true;
2088    default:
2089        return false;
2090    }
2091}
2092
2093bool migration_in_postcopy_after_devices(MigrationState *s)
2094{
2095    return migration_in_postcopy() && s->postcopy_after_devices;
2096}
2097
2098bool migration_in_incoming_postcopy(void)
2099{
2100    PostcopyState ps = postcopy_state_get();
2101
2102    return ps >= POSTCOPY_INCOMING_DISCARD && ps < POSTCOPY_INCOMING_END;
2103}
2104
2105bool migration_in_bg_snapshot(void)
2106{
2107    MigrationState *s = migrate_get_current();
2108
2109    return migrate_background_snapshot() &&
2110            migration_is_setup_or_active(s->state);
2111}
2112
2113bool migration_is_idle(void)
2114{
2115    MigrationState *s = current_migration;
2116
2117    if (!s) {
2118        return true;
2119    }
2120
2121    switch (s->state) {
2122    case MIGRATION_STATUS_NONE:
2123    case MIGRATION_STATUS_CANCELLED:
2124    case MIGRATION_STATUS_COMPLETED:
2125    case MIGRATION_STATUS_FAILED:
2126        return true;
2127    case MIGRATION_STATUS_SETUP:
2128    case MIGRATION_STATUS_CANCELLING:
2129    case MIGRATION_STATUS_ACTIVE:
2130    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
2131    case MIGRATION_STATUS_COLO:
2132    case MIGRATION_STATUS_PRE_SWITCHOVER:
2133    case MIGRATION_STATUS_DEVICE:
2134    case MIGRATION_STATUS_WAIT_UNPLUG:
2135        return false;
2136    case MIGRATION_STATUS__MAX:
2137        g_assert_not_reached();
2138    }
2139
2140    return false;
2141}
2142
2143bool migration_is_active(MigrationState *s)
2144{
2145    return (s->state == MIGRATION_STATUS_ACTIVE ||
2146            s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
2147}
2148
2149void migrate_init(MigrationState *s)
2150{
2151    /*
2152     * Reinitialise all migration state, except
2153     * parameters/capabilities that the user set, and
2154     * locks.
2155     */
2156    s->cleanup_bh = 0;
2157    s->vm_start_bh = 0;
2158    s->to_dst_file = NULL;
2159    s->state = MIGRATION_STATUS_NONE;
2160    s->rp_state.from_dst_file = NULL;
2161    s->rp_state.error = false;
2162    s->mbps = 0.0;
2163    s->pages_per_second = 0.0;
2164    s->downtime = 0;
2165    s->expected_downtime = 0;
2166    s->setup_time = 0;
2167    s->start_postcopy = false;
2168    s->postcopy_after_devices = false;
2169    s->migration_thread_running = false;
2170    error_free(s->error);
2171    s->error = NULL;
2172    s->hostname = NULL;
2173
2174    migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);
2175
2176    s->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
2177    s->total_time = 0;
2178    s->vm_was_running = false;
2179    s->iteration_initial_bytes = 0;
2180    s->threshold_size = 0;
2181}
2182
2183int migrate_add_blocker_internal(Error *reason, Error **errp)
2184{
2185    /* Snapshots are similar to migrations, so check RUN_STATE_SAVE_VM too. */
2186    if (runstate_check(RUN_STATE_SAVE_VM) || !migration_is_idle()) {
2187        error_propagate_prepend(errp, error_copy(reason),
2188                                "disallowing migration blocker "
2189                                "(migration/snapshot in progress) for: ");
2190        return -EBUSY;
2191    }
2192
2193    migration_blockers = g_slist_prepend(migration_blockers, reason);
2194    return 0;
2195}
2196
2197int migrate_add_blocker(Error *reason, Error **errp)
2198{
2199    if (only_migratable) {
2200        error_propagate_prepend(errp, error_copy(reason),
2201                                "disallowing migration blocker "
2202                                "(--only-migratable) for: ");
2203        return -EACCES;
2204    }
2205
2206    return migrate_add_blocker_internal(reason, errp);
2207}
2208
2209void migrate_del_blocker(Error *reason)
2210{
2211    migration_blockers = g_slist_remove(migration_blockers, reason);
2212}
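
    /*
     * Typical blocker usage sketch (device name and error text hypothetical;
     * the pattern mirrors callers elsewhere in the tree):
     *
     *   Error *blocker = NULL;
     *
     *   error_setg(&blocker, "Device 'foo' does not support migration");
     *   if (migrate_add_blocker(blocker, errp) < 0) {
     *       error_free(blocker);        // rejected: we still own it
     *       blocker = NULL;
     *   }
     *   ...
     *   migrate_del_blocker(blocker);   // on device teardown
     *   error_free(blocker);
     */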
2213
2214void qmp_migrate_incoming(const char *uri, Error **errp)
2215{
2216    Error *local_err = NULL;
2217    static bool once = true;
2218
2219    if (!once) {
2220        error_setg(errp, "The incoming migration has already been started");
2221        return;
2222    }
2223    if (!runstate_check(RUN_STATE_INMIGRATE)) {
2224        error_setg(errp, "'-incoming' was not specified on the command line");
2225        return;
2226    }
2227
2228    if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
2229        return;
2230    }
2231
2232    qemu_start_incoming_migration(uri, &local_err);
2233
2234    if (local_err) {
2235        yank_unregister_instance(MIGRATION_YANK_INSTANCE);
2236        error_propagate(errp, local_err);
2237        return;
2238    }
2239
2240    once = false;
2241}
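
    /*
     * Typical usage sketch (host/port hypothetical): start the destination
     * with "-incoming defer", then supply the listen URI at runtime:
     *
     *   -> { "execute": "migrate-incoming", "arguments": { "uri": "tcp:0:4446" } }
     *   <- { "return": {} }
     *
     * Because "once" is only cleared on a successful setup, a second
     * invocation after success is rejected.
     */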
2242
2243void qmp_migrate_recover(const char *uri, Error **errp)
2244{
2245    MigrationIncomingState *mis = migration_incoming_get_current();
2246
2247    /*
2248     * Don't even bother to use ERRP_GUARD() as it _must_ always be set by
2249     * callers (no one should ignore a recover failure); if a caller does,
2250     * it's a programming error.
2251     */
2252    assert(errp);
2253
2254    if (mis->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
2255        error_setg(errp, "Migrate recover can only be run "
2256                   "when postcopy is paused.");
2257        return;
2258    }
2259
2260    /* If there's an existing transport, release it */
2261    migration_incoming_transport_cleanup(mis);
2262
2263    /*
2264     * Note that this call will never start a real migration; it will
2265     * only re-setup the migration stream and poke existing migration
2266     * to continue using that newly established channel.
2267     */
2268    qemu_start_incoming_migration(uri, errp);
2269}
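
    /*
     * Hypothetical recovery sketch: once a network failure has paused a
     * postcopy migration, the destination re-arms its listening side with a
     * fresh URI:
     *
     *   -> { "execute": "migrate-recover", "arguments": { "uri": "tcp:0:5556" } }
     *
     * and the source then reconnects via "migrate" with "resume": true.
     */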
2270
2271void qmp_migrate_pause(Error **errp)
2272{
2273    MigrationState *ms = migrate_get_current();
2274    MigrationIncomingState *mis = migration_incoming_get_current();
2275    int ret;
2276
2277    if (ms->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2278        /* Source side, during postcopy */
2279        qemu_mutex_lock(&ms->qemu_file_lock);
2280        ret = qemu_file_shutdown(ms->to_dst_file);
2281        qemu_mutex_unlock(&ms->qemu_file_lock);
2282        if (ret) {
2283            error_setg(errp, "Failed to pause source migration");
2284        }
2285        return;
2286    }
2287
2288    if (mis->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2289        ret = qemu_file_shutdown(mis->from_src_file);
2290        if (ret) {
2291            error_setg(errp, "Failed to pause destination migration");
2292        }
2293        return;
2294    }
2295
2296    error_setg(errp, "migrate-pause is currently only supported "
2297               "during postcopy-active state");
2298}
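
    /*
     * Usage sketch: migrate-pause takes no arguments; it shuts down the
     * in-use channel, which the postcopy code turns into POSTCOPY_PAUSED
     * rather than a hard failure:
     *
     *   -> { "execute": "migrate-pause" }
     *   <- { "return": {} }
     */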
2299
2300bool migration_is_blocked(Error **errp)
2301{
2302    if (qemu_savevm_state_blocked(errp)) {
2303        return true;
2304    }
2305
2306    if (migration_blockers) {
2307        error_propagate(errp, error_copy(migration_blockers->data));
2308        return true;
2309    }
2310
2311    return false;
2312}
2313
2314/* Returns true to continue the migration, or false if an error was detected */
2315static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc,
2316                            bool resume, Error **errp)
2317{
2318    Error *local_err = NULL;
2319
2320    if (resume) {
2321        if (s->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
2322            error_setg(errp, "Cannot resume if there is no "
2323                       "paused migration");
2324            return false;
2325        }
2326
2327        /*
2328         * Postcopy recovery won't work well with release-ram
2329         * capability, since release-ram drops a page buffer as soon
2330         * as the page is put into the send buffer.  So if a network
2331         * failure happens, any page buffers that have not yet reached
2332         * the destination VM but have already been sent from the
2333         * source VM will be lost forever.  Refuse to let the client
2334         * resume such a postcopy migration.
2335         * Luckily release-ram was designed to only be used when src
2336         * and destination VMs are on the same host, so it should be
2337         * fine.
2338         */
2339        if (migrate_release_ram()) {
2340            error_setg(errp, "Postcopy recovery cannot work "
2341                       "when release-ram capability is set");
2342            return false;
2343        }
2344
2345        /* This is a resume, skip init status */
2346        return true;
2347    }
2348
2349    if (migration_is_running(s->state)) {
2350        error_setg(errp, QERR_MIGRATION_ACTIVE);
2351        return false;
2352    }
2353
2354    if (runstate_check(RUN_STATE_INMIGRATE)) {
2355        error_setg(errp, "Guest is waiting for an incoming migration");
2356        return false;
2357    }
2358
2359    if (runstate_check(RUN_STATE_POSTMIGRATE)) {
2360        error_setg(errp, "Can't migrate the vm that was paused due to "
2361                   "previous migration");
2362        return false;
2363    }
2364
2365    if (migration_is_blocked(errp)) {
2366        return false;
2367    }
2368
2369    if (blk || blk_inc) {
2370        if (migrate_colo_enabled()) {
2371            error_setg(errp, "No disk migration is required in COLO mode");
2372            return false;
2373        }
2374        if (migrate_use_block() || migrate_use_block_incremental()) {
2375            error_setg(errp, "Command options are incompatible with "
2376                       "current migration capabilities");
2377            return false;
2378        }
2379        migrate_set_block_enabled(true, &local_err);
2380        if (local_err) {
2381            error_propagate(errp, local_err);
2382            return false;
2383        }
2384        s->must_remove_block_options = true;
2385    }
2386
2387    if (blk_inc) {
2388        migrate_set_block_incremental(s, true);
2389    }
2390
2391    migrate_init(s);
2392    /*
2393     * Zero the ram_counters and compression_counters memory for a
2394     * new migration.
2395     */
2396    memset(&ram_counters, 0, sizeof(ram_counters));
2397    memset(&compression_counters, 0, sizeof(compression_counters));
2398
2399    return true;
2400}
2401
2402void qmp_migrate(const char *uri, bool has_blk, bool blk,
2403                 bool has_inc, bool inc, bool has_detach, bool detach,
2404                 bool has_resume, bool resume, Error **errp)
2405{
2406    Error *local_err = NULL;
2407    MigrationState *s = migrate_get_current();
2408    const char *p = NULL;
2409
2410    if (!migrate_prepare(s, has_blk && blk, has_inc && inc,
2411                         has_resume && resume, errp)) {
2412        /* Error detected, put into errp */
2413        return;
2414    }
2415
2416    if (!(has_resume && resume)) {
2417        if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
2418            return;
2419        }
2420    }
2421
2422    migrate_protocol_allow_multi_channels(false);
2423    if (strstart(uri, "tcp:", &p) ||
2424        strstart(uri, "unix:", NULL) ||
2425        strstart(uri, "vsock:", NULL)) {
2426        migrate_protocol_allow_multi_channels(true);
2427        socket_start_outgoing_migration(s, p ? p : uri, &local_err);
2428#ifdef CONFIG_RDMA
2429    } else if (strstart(uri, "rdma:", &p)) {
2430        rdma_start_outgoing_migration(s, p, &local_err);
2431#endif
2432    } else if (strstart(uri, "exec:", &p)) {
2433        exec_start_outgoing_migration(s, p, &local_err);
2434    } else if (strstart(uri, "fd:", &p)) {
2435        fd_start_outgoing_migration(s, p, &local_err);
2436    } else {
2437        if (!(has_resume && resume)) {
2438            yank_unregister_instance(MIGRATION_YANK_INSTANCE);
2439        }
2440        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri",
2441                   "a valid migration protocol");
2442        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
2443                          MIGRATION_STATUS_FAILED);
2444        block_cleanup_parameters(s);
2445        return;
2446    }
2447
2448    if (local_err) {
2449        if (!(has_resume && resume)) {
2450            yank_unregister_instance(MIGRATION_YANK_INSTANCE);
2451        }
2452        migrate_fd_error(s, local_err);
2453        error_propagate(errp, local_err);
2454        return;
2455    }
2456}
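
    /*
     * URI forms accepted by the dispatch above (hosts, ports and paths
     * hypothetical):
     *
     *   tcp:dst.example.com:4444     TCP socket migration
     *   unix:/tmp/migrate.sock       Unix domain socket migration
     *   vsock:3:4444                 AF_VSOCK migration
     *   rdma:dst.example.com:4444    RDMA migration (CONFIG_RDMA only)
     *   exec:cat > /tmp/vm.state     pipe the stream through a command
     *   fd:migfd                     fd previously added with 'getfd'
     */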
2457
2458void qmp_migrate_cancel(Error **errp)
2459{
2460    migration_cancel(NULL);
2461}
2462
2463void qmp_migrate_continue(MigrationStatus state, Error **errp)
2464{
2465    MigrationState *s = migrate_get_current();
2466    if (s->state != state) {
2467        error_setg(errp,  "Migration not in expected state: %s",
2468                   MigrationStatus_str(s->state));
2469        return;
2470    }
2471    qemu_sem_post(&s->pause_sem);
2472}
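
    /*
     * Usage sketch: with the pause-before-switchover capability enabled,
     * the migration sits in the "pre-switchover" state until the management
     * layer acknowledges it:
     *
     *   -> { "execute": "migrate-continue",
     *        "arguments": { "state": "pre-switchover" } }
     *   <- { "return": {} }
     */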
2473
2474bool migrate_release_ram(void)
2475{
2476    MigrationState *s;
2477
2478    s = migrate_get_current();
2479
2480    return s->enabled_capabilities[MIGRATION_CAPABILITY_RELEASE_RAM];
2481}
2482
2483bool migrate_postcopy_ram(void)
2484{
2485    MigrationState *s;
2486
2487    s = migrate_get_current();
2488
2489    return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM];
2490}
2491
2492bool migrate_postcopy(void)
2493{
2494    return migrate_postcopy_ram() || migrate_dirty_bitmaps();
2495}
2496
2497bool migrate_auto_converge(void)
2498{
2499    MigrationState *s;
2500
2501    s = migrate_get_current();
2502
2503    return s->enabled_capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE];
2504}
2505
2506bool migrate_zero_blocks(void)
2507{
2508    MigrationState *s;
2509
2510    s = migrate_get_current();
2511
2512    return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS];
2513}
2514
2515bool migrate_postcopy_blocktime(void)
2516{
2517    MigrationState *s;
2518
2519    s = migrate_get_current();
2520
2521    return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME];
2522}
2523
2524bool migrate_use_compression(void)
2525{
2526    MigrationState *s;
2527
2528    s = migrate_get_current();
2529
2530    return s->enabled_capabilities[MIGRATION_CAPABILITY_COMPRESS];
2531}
2532
2533int migrate_compress_level(void)
2534{
2535    MigrationState *s;
2536
2537    s = migrate_get_current();
2538
2539    return s->parameters.compress_level;
2540}
2541
2542int migrate_compress_threads(void)
2543{
2544    MigrationState *s;
2545
2546    s = migrate_get_current();
2547
2548    return s->parameters.compress_threads;
2549}
2550
2551int migrate_compress_wait_thread(void)
2552{
2553    MigrationState *s;
2554
2555    s = migrate_get_current();
2556
2557    return s->parameters.compress_wait_thread;
2558}
2559
2560int migrate_decompress_threads(void)
2561{
2562    MigrationState *s;
2563
2564    s = migrate_get_current();
2565
2566    return s->parameters.decompress_threads;
2567}
2568
2569bool migrate_dirty_bitmaps(void)
2570{
2571    MigrationState *s;
2572
2573    s = migrate_get_current();
2574
2575    return s->enabled_capabilities[MIGRATION_CAPABILITY_DIRTY_BITMAPS];
2576}
2577
2578bool migrate_ignore_shared(void)
2579{
2580    MigrationState *s;
2581
2582    s = migrate_get_current();
2583
2584    return s->enabled_capabilities[MIGRATION_CAPABILITY_X_IGNORE_SHARED];
2585}
2586
2587bool migrate_validate_uuid(void)
2588{
2589    MigrationState *s;
2590
2591    s = migrate_get_current();
2592
2593    return s->enabled_capabilities[MIGRATION_CAPABILITY_VALIDATE_UUID];
2594}
2595
2596bool migrate_use_events(void)
2597{
2598    MigrationState *s;
2599
2600    s = migrate_get_current();
2601
2602    return s->enabled_capabilities[MIGRATION_CAPABILITY_EVENTS];
2603}
2604
2605bool migrate_use_multifd(void)
2606{
2607    MigrationState *s;
2608
2609    s = migrate_get_current();
2610
2611    return s->enabled_capabilities[MIGRATION_CAPABILITY_MULTIFD];
2612}
2613
2614bool migrate_pause_before_switchover(void)
2615{
2616    MigrationState *s;
2617
2618    s = migrate_get_current();
2619
2620    return s->enabled_capabilities[
2621        MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER];
2622}
2623
2624int migrate_multifd_channels(void)
2625{
2626    MigrationState *s;
2627
2628    s = migrate_get_current();
2629
2630    return s->parameters.multifd_channels;
2631}
2632
2633MultiFDCompression migrate_multifd_compression(void)
2634{
2635    MigrationState *s;
2636
2637    s = migrate_get_current();
2638
2639    assert(s->parameters.multifd_compression < MULTIFD_COMPRESSION__MAX);
2640    return s->parameters.multifd_compression;
2641}
2642
2643int migrate_multifd_zlib_level(void)
2644{
2645    MigrationState *s;
2646
2647    s = migrate_get_current();
2648
2649    return s->parameters.multifd_zlib_level;
2650}
2651
2652int migrate_multifd_zstd_level(void)
2653{
2654    MigrationState *s;
2655
2656    s = migrate_get_current();
2657
2658    return s->parameters.multifd_zstd_level;
2659}
2660
2661#ifdef CONFIG_LINUX
2662bool migrate_use_zero_copy_send(void)
2663{
2664    MigrationState *s;
2665
2666    s = migrate_get_current();
2667
2668    return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_COPY_SEND];
2669}
2670#endif
2671
2672int migrate_use_tls(void)
2673{
2674    MigrationState *s;
2675
2676    s = migrate_get_current();
2677
2678    return s->parameters.tls_creds && *s->parameters.tls_creds;
2679}
2680
2681int migrate_use_xbzrle(void)
2682{
2683    MigrationState *s;
2684
2685    s = migrate_get_current();
2686
2687    return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE];
2688}
2689
2690uint64_t migrate_xbzrle_cache_size(void)
2691{
2692    MigrationState *s;
2693
2694    s = migrate_get_current();
2695
2696    return s->parameters.xbzrle_cache_size;
2697}
2698
2699static int64_t migrate_max_postcopy_bandwidth(void)
2700{
2701    MigrationState *s;
2702
2703    s = migrate_get_current();
2704
2705    return s->parameters.max_postcopy_bandwidth;
2706}
2707
2708bool migrate_use_block(void)
2709{
2710    MigrationState *s;
2711
2712    s = migrate_get_current();
2713
2714    return s->enabled_capabilities[MIGRATION_CAPABILITY_BLOCK];
2715}
2716
2717bool migrate_use_return_path(void)
2718{
2719    MigrationState *s;
2720
2721    s = migrate_get_current();
2722
2723    return s->enabled_capabilities[MIGRATION_CAPABILITY_RETURN_PATH];
2724}
2725
2726bool migrate_use_block_incremental(void)
2727{
2728    MigrationState *s;
2729
2730    s = migrate_get_current();
2731
2732    return s->parameters.block_incremental;
2733}
2734
2735bool migrate_background_snapshot(void)
2736{
2737    MigrationState *s;
2738
2739    s = migrate_get_current();
2740
2741    return s->enabled_capabilities[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT];
2742}
2743
2744bool migrate_postcopy_preempt(void)
2745{
2746    MigrationState *s;
2747
2748    s = migrate_get_current();
2749
2750    return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_PREEMPT];
2751}
2752
2753/* migration thread support */
2754/*
2755 * Something bad happened to the RP stream; mark an error.
2756 * The caller shall print or trace something to indicate why.
2757 */
2758static void mark_source_rp_bad(MigrationState *s)
2759{
2760    s->rp_state.error = true;
2761}
2762
2763static struct rp_cmd_args {
2764    ssize_t     len; /* -1 = variable */
2765    const char *name;
2766} rp_cmd_args[] = {
2767    [MIG_RP_MSG_INVALID]        = { .len = -1, .name = "INVALID" },
2768    [MIG_RP_MSG_SHUT]           = { .len =  4, .name = "SHUT" },
2769    [MIG_RP_MSG_PONG]           = { .len =  4, .name = "PONG" },
2770    [MIG_RP_MSG_REQ_PAGES]      = { .len = 12, .name = "REQ_PAGES" },
2771    [MIG_RP_MSG_REQ_PAGES_ID]   = { .len = -1, .name = "REQ_PAGES_ID" },
2772    [MIG_RP_MSG_RECV_BITMAP]    = { .len = -1, .name = "RECV_BITMAP" },
2773    [MIG_RP_MSG_RESUME_ACK]     = { .len =  4, .name = "RESUME_ACK" },
2774    [MIG_RP_MSG_MAX]            = { .len = -1, .name = "MAX" },
2775};
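
    /*
     * Sketch of the return-path wire framing, as consumed by
     * source_return_path_thread() below (byte counts inferred from the
     * reader, not from a separate spec):
     *
     *   +-------------+------------+---------------------+
     *   | type (be16) | len (be16) | payload (len bytes) |
     *   +-------------+------------+---------------------+
     *
     * e.g. MIG_RP_MSG_REQ_PAGES carries a 12-byte payload: start (be64)
     * followed by len (be32), exactly as decoded in the REQ_PAGES case.
     */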
2776
2777/*
2778 * Process a request for pages received on the return path.
2779 * We're allowed to send more than requested (e.g. to round to our page size)
2780 * and we don't need to send pages that have already been sent.
2781 */
2782static void migrate_handle_rp_req_pages(MigrationState *ms, const char *rbname,
2783                                        ram_addr_t start, size_t len)
2784{
2785    long our_host_ps = qemu_real_host_page_size();
2786
2787    trace_migrate_handle_rp_req_pages(rbname, start, len);
2788
2789    /*
2790     * Since we currently insist on matching page sizes, just sanity check
2791     * we're being asked for whole host pages.
2792     */
2793    if (!QEMU_IS_ALIGNED(start, our_host_ps) ||
2794        !QEMU_IS_ALIGNED(len, our_host_ps)) {
2795        error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
2796                     " len: %zd", __func__, start, len);
2797        mark_source_rp_bad(ms);
2798        return;
2799    }
2800
2801    if (ram_save_queue_pages(rbname, start, len)) {
2802        mark_source_rp_bad(ms);
2803    }
2804}
2805
2806/* Return true to retry, false to quit */
2807static bool postcopy_pause_return_path_thread(MigrationState *s)
2808{
2809    trace_postcopy_pause_return_path();
2810
2811    qemu_sem_wait(&s->postcopy_pause_rp_sem);
2812
2813    trace_postcopy_pause_return_path_continued();
2814
2815    return true;
2816}
2817
2818static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name)
2819{
2820    RAMBlock *block = qemu_ram_block_by_name(block_name);
2821
2822    if (!block) {
2823        error_report("%s: invalid block name '%s'", __func__, block_name);
2824        return -EINVAL;
2825    }
2826
2827    /* Fetch the received bitmap and refresh the dirty bitmap */
2828    return ram_dirty_bitmap_reload(s, block);
2829}
2830
2831static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value)
2832{
2833    trace_source_return_path_thread_resume_ack(value);
2834
2835    if (value != MIGRATION_RESUME_ACK_VALUE) {
2836        error_report("%s: illegal resume_ack value %"PRIu32,
2837                     __func__, value);
2838        return -1;
2839    }
2840
2841    /* Now both sides are active. */
2842    migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
2843                      MIGRATION_STATUS_POSTCOPY_ACTIVE);
2844
2845    /* Notify the send thread that it is time to continue sending pages */
2846    qemu_sem_post(&s->rp_state.rp_sem);
2847
2848    return 0;
2849}
2850
2851/* Release ms->rp_state.from_dst_file in a safe way */
2852static void migration_release_from_dst_file(MigrationState *ms)
2853{
2854    QEMUFile *file;
2855
2856    WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
2857        /*
2858         * Reset the from_dst_file pointer before releasing it, as we
2859         * can't block within the lock section.
2860         */
2861        file = ms->rp_state.from_dst_file;
2862        ms->rp_state.from_dst_file = NULL;
2863    }
2864
2865    qemu_fclose(file);
2866}
2867
2868/*
2869 * Handles messages sent on the return path towards the source VM.
2870 */
2872static void *source_return_path_thread(void *opaque)
2873{
2874    MigrationState *ms = opaque;
2875    QEMUFile *rp = ms->rp_state.from_dst_file;
2876    uint16_t header_len, header_type;
2877    uint8_t buf[512];
2878    uint32_t tmp32, sibling_error;
2879    ram_addr_t start = 0; /* =0 to silence warning */
2880    size_t len = 0, expected_len;
2881    int res;
2882
2883    trace_source_return_path_thread_entry();
2884    rcu_register_thread();
2885
2886retry:
2887    while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
2888           migration_is_setup_or_active(ms->state)) {
2889        trace_source_return_path_thread_loop_top();
2890        header_type = qemu_get_be16(rp);
2891        header_len = qemu_get_be16(rp);
2892
2893        if (qemu_file_get_error(rp)) {
2894            mark_source_rp_bad(ms);
2895            goto out;
2896        }
2897
2898        if (header_type >= MIG_RP_MSG_MAX ||
2899            header_type == MIG_RP_MSG_INVALID) {
2900            error_report("RP: Received invalid message 0x%04x length 0x%04x",
2901                         header_type, header_len);
2902            mark_source_rp_bad(ms);
2903            goto out;
2904        }
2905
2906        if ((rp_cmd_args[header_type].len != -1 &&
2907            header_len != rp_cmd_args[header_type].len) ||
2908            header_len > sizeof(buf)) {
2909            error_report("RP: Received '%s' message (0x%04x) with"
2910                         "incorrect length %d expecting %zu",
2911                         rp_cmd_args[header_type].name, header_type, header_len,
2912                         (size_t)rp_cmd_args[header_type].len);
2913            mark_source_rp_bad(ms);
2914            goto out;
2915        }
2916
2917        /* We know we've got a valid header by this point */
2918        res = qemu_get_buffer(rp, buf, header_len);
2919        if (res != header_len) {
2920            error_report("RP: Failed reading data for message 0x%04x"
2921                         " read %d expected %d",
2922                         header_type, res, header_len);
2923            mark_source_rp_bad(ms);
2924            goto out;
2925        }
2926
2927        /* OK, we have the message and the data */
2928        switch (header_type) {
2929        case MIG_RP_MSG_SHUT:
2930            sibling_error = ldl_be_p(buf);
2931            trace_source_return_path_thread_shut(sibling_error);
2932            if (sibling_error) {
2933                error_report("RP: Sibling indicated error %d", sibling_error);
2934                mark_source_rp_bad(ms);
2935            }
2936            /*
2937             * We'll let the main thread deal with closing the RP;
2938             * we could do a shutdown(2) on it, but we're the only user
2939             * anyway, so there's nothing gained.
2940             */
2941            goto out;
2942
2943        case MIG_RP_MSG_PONG:
2944            tmp32 = ldl_be_p(buf);
2945            trace_source_return_path_thread_pong(tmp32);
2946            break;
2947
2948        case MIG_RP_MSG_REQ_PAGES:
2949            start = ldq_be_p(buf);
2950            len = ldl_be_p(buf + 8);
2951            migrate_handle_rp_req_pages(ms, NULL, start, len);
2952            break;
2953
2954        case MIG_RP_MSG_REQ_PAGES_ID:
2955            expected_len = 12 + 1; /* header + termination */
2956
2957            if (header_len >= expected_len) {
2958                start = ldq_be_p(buf);
2959                len = ldl_be_p(buf + 8);
2960                /* Now we expect an idstr */
2961                tmp32 = buf[12]; /* Length of the following idstr */
2962                buf[13 + tmp32] = '\0';
2963                expected_len += tmp32;
2964            }
2965            if (header_len != expected_len) {
2966                error_report("RP: Req_Page_id with length %d expecting %zd",
2967                             header_len, expected_len);
2968                mark_source_rp_bad(ms);
2969                goto out;
2970            }
2971            migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len);
2972            break;
2973
2974        case MIG_RP_MSG_RECV_BITMAP:
2975            if (header_len < 1) {
2976                error_report("%s: missing block name", __func__);
2977                mark_source_rp_bad(ms);
2978                goto out;
2979            }
2980            /* Format: len (1B) + idstr (<255B). This ends the idstr. */
2981            buf[buf[0] + 1] = '\0';
2982            if (migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1))) {
2983                mark_source_rp_bad(ms);
2984                goto out;
2985            }
2986            break;
2987
2988        case MIG_RP_MSG_RESUME_ACK:
2989            tmp32 = ldl_be_p(buf);
2990            if (migrate_handle_rp_resume_ack(ms, tmp32)) {
2991                mark_source_rp_bad(ms);
2992                goto out;
2993            }
2994            break;
2995
2996        default:
2997            break;
2998        }
2999    }
3000
3001out:
3002    res = qemu_file_get_error(rp);
3003    if (res) {
3004        if (migration_in_postcopy()) {
3005            /*
3006             * Maybe there is something we can do: it looks like a
3007             * network down issue, and we pause for a recovery.
3008             */
3009            migration_release_from_dst_file(ms);
3010            rp = NULL;
3011            if (postcopy_pause_return_path_thread(ms)) {
3012                /*
3013                 * Reload rp, reset the rest.  Referencing it is safe since
3014                 * it's reset only by us above, or when migration completes
3015                 */
3016                rp = ms->rp_state.from_dst_file;
3017                ms->rp_state.error = false;
3018                goto retry;
3019            }
3020        }
3021
3022        trace_source_return_path_thread_bad_end();
3023        mark_source_rp_bad(ms);
3024    }
3025
3026    trace_source_return_path_thread_end();
3027    migration_release_from_dst_file(ms);
3028    rcu_unregister_thread();
3029    return NULL;
3030}
3031
3032static int open_return_path_on_source(MigrationState *ms,
3033                                      bool create_thread)
3034{
3035    ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
3036    if (!ms->rp_state.from_dst_file) {
3037        return -1;
3038    }
3039
3040    trace_open_return_path_on_source();
3041
3042    if (!create_thread) {
3043        /* We're done */
3044        return 0;
3045    }
3046
3047    qemu_thread_create(&ms->rp_state.rp_thread, "return path",
3048                       source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
3049    ms->rp_state.rp_thread_created = true;
3050
3051    trace_open_return_path_on_source_continue();
3052
3053    return 0;
3054}
3055
3056/* Returns 0 if the RP was ok, otherwise there was an error on the RP */
3057static int await_return_path_close_on_source(MigrationState *ms)
3058{
3059    /*
3060     * If this is a normal exit then the destination will send a SHUT and the
3061     * rp_thread will exit; however, if there's an error we need to cause
3062     * it to exit.
3063     */
3064    if (qemu_file_get_error(ms->to_dst_file) && ms->rp_state.from_dst_file) {
3065        /*
3066         * shutdown(2), if we have it, will cause it to unblock if it's stuck
3067         * waiting for the destination.
3068         */
3069        qemu_file_shutdown(ms->rp_state.from_dst_file);
3070        mark_source_rp_bad(ms);
3071    }
3072    trace_await_return_path_close_on_source_joining();
3073    qemu_thread_join(&ms->rp_state.rp_thread);
3074    ms->rp_state.rp_thread_created = false;
3075    trace_await_return_path_close_on_source_close();
3076    return ms->rp_state.error;
3077}
3078
3079/*
3080 * Switch from normal iteration to postcopy
3081 * Returns non-0 on error
3082 */
3083static int postcopy_start(MigrationState *ms)
3084{
3085    int ret;
3086    QIOChannelBuffer *bioc;
3087    QEMUFile *fb;
3088    int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3089    int64_t bandwidth = migrate_max_postcopy_bandwidth();
3090    bool restart_block = false;
3091    int cur_state = MIGRATION_STATUS_ACTIVE;
3092
3093    if (postcopy_preempt_wait_channel(ms)) {
3094        migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILED);
3095        return -1;
3096    }
3097
3098    if (!migrate_pause_before_switchover()) {
3099        migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE,
3100                          MIGRATION_STATUS_POSTCOPY_ACTIVE);
3101    }
3102
3103    trace_postcopy_start();
3104    qemu_mutex_lock_iothread();
3105    trace_postcopy_start_set_run();
3106
3107    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
3108    global_state_store();
3109    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
3110    if (ret < 0) {
3111        goto fail;
3112    }
3113
3114    ret = migration_maybe_pause(ms, &cur_state,
3115                                MIGRATION_STATUS_POSTCOPY_ACTIVE);
3116    if (ret < 0) {
3117        goto fail;
3118    }
3119
3120    ret = bdrv_inactivate_all();
3121    if (ret < 0) {
3122        goto fail;
3123    }
3124    restart_block = true;
3125
3126    /*
3127     * Cause any non-postcopiable, but iterative devices to
3128     * send out their final data.
3129     */
3130    qemu_savevm_state_complete_precopy(ms->to_dst_file, true, false);
3131
3132    /*
3133     * In 'finish migrate' state, with the io-lock held, everything
3134     * should be quiet, but we potentially still have dirty pages, so
3135     * we need to tell the destination to throw away any pages it has
3136     * already received that are now dirty.
3137     */
3138    if (migrate_postcopy_ram()) {
3139        ram_postcopy_send_discard_bitmap(ms);
3140    }
3141
3142    /*
3143     * Send the rest of the state; note that devices which do postcopy
3144     * will notice we're in POSTCOPY_ACTIVE and not actually
3145     * wrap their state up here.
3146     */
3147    /* 0 max-postcopy-bandwidth means unlimited */
3148    if (!bandwidth) {
3149        qemu_file_set_rate_limit(ms->to_dst_file, INT64_MAX);
3150    } else {
3151        qemu_file_set_rate_limit(ms->to_dst_file, bandwidth / XFER_LIMIT_RATIO);
3152    }
3153    if (migrate_postcopy_ram()) {
3154        /* Ping just for debugging, helps line traces up */
3155        qemu_savevm_send_ping(ms->to_dst_file, 2);
3156    }
3157
3158    /*
3159     * While loading the device state we may trigger page transfer
3160     * requests and the fd must be free to process those, and thus
3161     * the destination must read the whole device state off the fd before
3162     * it starts processing it.  Unfortunately the ad-hoc migration format
3163     * doesn't allow the destination to know the size to read without fully
3164     * parsing it through each device's load-state code (especially the open
3165     * coded devices that use get/put).
3166     * So we wrap the device state up in a package with a length at the start;
3167     * to do this we buffer the whole of the device state in a QIOChannelBuffer.
3168     */
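        /*
         * Sketch of the resulting on-the-wire layout (inferred from
         * qemu_savevm_send_packaged(); the exact constants live in
         * savevm.c):
         *
         *   QEMU_VM_COMMAND, MIG_CMD_PACKAGED, len = 4,
         *   package-size (be32), then package-size bytes of buffered
         *   device state.
         */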
3169    bioc = qio_channel_buffer_new(4096);
3170    qio_channel_set_name(QIO_CHANNEL(bioc), "migration-postcopy-buffer");
3171    fb = qemu_file_new_output(QIO_CHANNEL(bioc));
3172    object_unref(OBJECT(bioc));
3173
3174    /*
3175     * Make sure the receiver can get incoming pages before we send the rest
3176     * of the state
3177     */
3178    qemu_savevm_send_postcopy_listen(fb);
3179
3180    qemu_savevm_state_complete_precopy(fb, false, false);
3181    if (migrate_postcopy_ram()) {
3182        qemu_savevm_send_ping(fb, 3);
3183    }
3184
3185    qemu_savevm_send_postcopy_run(fb);
3186
3187    /* <><> end of stuff going into the package */
3188
3189    /* Last point of recovery; as soon as we send the package the destination
3190     * can open devices and potentially start running.
3191     * Let's just check again that we've not got any errors.
3192     */
3193    ret = qemu_file_get_error(ms->to_dst_file);
3194    if (ret) {
3195        error_report("postcopy_start: Migration stream errored (pre package)");
3196        goto fail_closefb;
3197    }
3198
3199    restart_block = false;
3200
3201    /* Now send that blob */
3202    if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) {
3203        goto fail_closefb;
3204    }
3205    qemu_fclose(fb);
3206
3207    /* Send a notify to give a chance for anything that needs to happen
3208     * at the transition to postcopy and after the device state; in particular
3209     * spice needs to trigger a transition now
3210     */
3211    ms->postcopy_after_devices = true;
3212    notifier_list_notify(&migration_state_notifiers, ms);
3213
3214    ms->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
3215
3216    qemu_mutex_unlock_iothread();
3217
3218    if (migrate_postcopy_ram()) {
3219        /*
3220         * Although this ping is just for debug, it could potentially be
3221         * used for getting a better measurement of downtime at the source.
3222         */
3223        qemu_savevm_send_ping(ms->to_dst_file, 4);
3224    }
3225
3226    if (migrate_release_ram()) {
3227        ram_postcopy_migrated_memory_release(ms);
3228    }
3229
3230    ret = qemu_file_get_error(ms->to_dst_file);
3231    if (ret) {
3232        error_report("postcopy_start: Migration stream errored");
3233        migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
3234                              MIGRATION_STATUS_FAILED);
3235    }
3236
3237    trace_postcopy_preempt_enabled(migrate_postcopy_preempt());
3238
3239    return ret;
3240
3241fail_closefb:
3242    qemu_fclose(fb);
3243fail:
3244    migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
3245                          MIGRATION_STATUS_FAILED);
3246    if (restart_block) {
3247        /* A failure happened early enough that we know the destination hasn't
3248         * accessed block devices, so we're safe to recover.
3249         */
3250        Error *local_err = NULL;
3251
3252        bdrv_activate_all(&local_err);
3253        if (local_err) {
3254            error_report_err(local_err);
3255        }
3256    }
3257    qemu_mutex_unlock_iothread();
3258    return -1;
3259}
3260
3261/**
3262 * migration_maybe_pause: Pause if required by the
3263 * pause-before-switchover capability; called with the iothread locked.
3264 * Returns: 0 on success
3265 */
3266static int migration_maybe_pause(MigrationState *s,
3267                                 int *current_active_state,
3268                                 int new_state)
3269{
3270    if (!migrate_pause_before_switchover()) {
3271        return 0;
3272    }
3273
3274    /* Since leaving this state is not atomic with posting the semaphore
3275     * it's possible that someone could have issued multiple migrate_continue
3276     * and the semaphore is incorrectly positive at this point;
3277     * the docs say it's undefined to reinit a semaphore that's already
3278     * init'd, so use timedwait to eat up any existing posts.
3279     */
3280    while (qemu_sem_timedwait(&s->pause_sem, 1) == 0) {
3281        /* This block intentionally left blank */
3282    }
3283
3284    /*
3285     * If the migration is cancelled when it is in the completion phase,
3286     * the migration state is set to MIGRATION_STATUS_CANCELLING.
3287     * In that case we must not wait on the semaphore, otherwise we
3288     * would block forever on 'pause_sem'.
3289     */
3290    if (s->state != MIGRATION_STATUS_CANCELLING) {
3291        qemu_mutex_unlock_iothread();
3292        migrate_set_state(&s->state, *current_active_state,
3293                          MIGRATION_STATUS_PRE_SWITCHOVER);
3294        qemu_sem_wait(&s->pause_sem);
3295        migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER,
3296                          new_state);
3297        *current_active_state = new_state;
3298        qemu_mutex_lock_iothread();
3299    }
3300
3301    return s->state == new_state ? 0 : -EINVAL;
3302}
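
/*
 * Illustrative sketch (not part of QEMU; compiled out): the semaphore
 * drain pattern used above in migration_maybe_pause().  Re-initialising
 * a semaphore that is already init'd is undefined, so stale posts from
 * earlier migrate_continue commands are consumed with a short timedwait
 * loop before the real wait.  The helper name is hypothetical.
 */
#if 0
static void sem_drain_then_wait(QemuSemaphore *sem)
{
    /* Eat any leftover posts; timedwait returns 0 while posts remain */
    while (qemu_sem_timedwait(sem, 1) == 0) {
        /* This block intentionally left blank */
    }
    /* Now this wait blocks until a fresh post arrives */
    qemu_sem_wait(sem);
}
#endif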
3303
3304/**
3305 * migration_completion: Used by migration_thread when there's not much left.
3306 *   The caller 'breaks' the loop when this returns.
3307 *
3308 * @s: Current migration state
3309 */
3310static void migration_completion(MigrationState *s)
3311{
3312    int ret;
3313    int current_active_state = s->state;
3314
3315    if (s->state == MIGRATION_STATUS_ACTIVE) {
3316        qemu_mutex_lock_iothread();
3317        s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3318        qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
3319        s->vm_was_running = runstate_is_running();
3320        ret = global_state_store();
3321
3322        if (!ret) {
3323            bool inactivate = !migrate_colo_enabled();
3324            ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
3325            trace_migration_completion_vm_stop(ret);
3326            if (ret >= 0) {
3327                ret = migration_maybe_pause(s, &current_active_state,
3328                                            MIGRATION_STATUS_DEVICE);
3329            }
3330            if (ret >= 0) {
3331                qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
3332                ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
3333                                                         inactivate);
3334            }
3335            if (inactivate && ret >= 0) {
3336                s->block_inactive = true;
3337            }
3338        }
3339        qemu_mutex_unlock_iothread();
3340
3341        if (ret < 0) {
3342            goto fail;
3343        }
3344    } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
3345        trace_migration_completion_postcopy_end();
3346
3347        qemu_mutex_lock_iothread();
3348        qemu_savevm_state_complete_postcopy(s->to_dst_file);
3349        qemu_mutex_unlock_iothread();
3350
3351        /* Shutdown the postcopy fast path thread */
3352        if (migrate_postcopy_preempt()) {
3353            postcopy_preempt_shutdown_file(s);
3354        }
3355
3356        trace_migration_completion_postcopy_end_after_complete();
3357    } else {
3358        goto fail;
3359    }
3360
3361    /*
3362     * If the return path was opened we must clean up its thread before
3363     * cleaning everything else up (since if there are no failures
3364     * it will wait for the destination to send its status in
3365     * a SHUT command).
3366     */
3367    if (s->rp_state.rp_thread_created) {
3368        int rp_error;
3369        trace_migration_return_path_end_before();
3370        rp_error = await_return_path_close_on_source(s);
3371        trace_migration_return_path_end_after(rp_error);
3372        if (rp_error) {
3373            goto fail_invalidate;
3374        }
3375    }
3376
3377    if (qemu_file_get_error(s->to_dst_file)) {
3378        trace_migration_completion_file_err();
3379        goto fail_invalidate;
3380    }
3381
3382    if (migrate_colo_enabled() && s->state == MIGRATION_STATUS_ACTIVE) {
3383        /* COLO does not support postcopy */
3384        migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
3385                          MIGRATION_STATUS_COLO);
3386    } else {
3387        migrate_set_state(&s->state, current_active_state,
3388                          MIGRATION_STATUS_COMPLETED);
3389    }
3390
3391    return;
3392
3393fail_invalidate:
3394    /* If not doing postcopy, vm_start() will be called: let's regain
3395     * control of the images.
3396     */
3397    if (s->state == MIGRATION_STATUS_ACTIVE ||
3398        s->state == MIGRATION_STATUS_DEVICE) {
3399        Error *local_err = NULL;
3400
3401        qemu_mutex_lock_iothread();
3402        bdrv_activate_all(&local_err);
3403        if (local_err) {
3404            error_report_err(local_err);
3405        } else {
3406            s->block_inactive = false;
3407        }
3408        qemu_mutex_unlock_iothread();
3409    }
3410
3411fail:
3412    migrate_set_state(&s->state, current_active_state,
3413                      MIGRATION_STATUS_FAILED);
3414}
3415
3416/**
3417 * bg_migration_completion: Used by bg_migration_thread after all the
3418 *   RAM has been saved. The caller 'breaks' the loop when this returns.
3419 *
3420 * @s: Current migration state
3421 */
3422static void bg_migration_completion(MigrationState *s)
3423{
3424    int current_active_state = s->state;
3425
3426    /*
3427     * Stop tracking RAM writes - un-protect memory, un-register UFFD
3428     * memory ranges, flush kernel wait queues and wake up threads
3429     * waiting for write fault to be resolved.
3430     */
3431    ram_write_tracking_stop();
3432
3433    if (s->state == MIGRATION_STATUS_ACTIVE) {
3434        /*
3435         * By this moment we have RAM content saved into the migration stream.
3436         * The next step is to flush the non-RAM content (device state)
3437         * right after the ram content. The device state has been stored into
3438         * the temporary buffer before RAM saving started.
3439         */
3440        qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage);
3441        qemu_fflush(s->to_dst_file);
3442    } else if (s->state == MIGRATION_STATUS_CANCELLING) {
3443        goto fail;
3444    }
3445
3446    if (qemu_file_get_error(s->to_dst_file)) {
3447        trace_migration_completion_file_err();
3448        goto fail;
3449    }
3450
3451    migrate_set_state(&s->state, current_active_state,
3452                      MIGRATION_STATUS_COMPLETED);
3453    return;
3454
3455fail:
3456    migrate_set_state(&s->state, current_active_state,
3457                      MIGRATION_STATUS_FAILED);
3458}
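
/*
 * Illustrative sketch (not part of QEMU; compiled out): the
 * buffer-then-stream ordering used by bg_migration_completion() above.
 * Device state is captured into a QIOChannelBuffer before RAM saving
 * starts; once all RAM has been written to the real stream, the
 * buffered bytes are appended verbatim and flushed.  The helper name
 * is hypothetical.
 */
#if 0
static void append_buffered_vmstate(QEMUFile *to_dst, QIOChannelBuffer *bioc)
{
    /* bioc->data/bioc->usage hold the device state captured earlier */
    qemu_put_buffer(to_dst, bioc->data, bioc->usage);
    qemu_fflush(to_dst);
}
#endif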
3459
3460bool migrate_colo_enabled(void)
3461{
3462    MigrationState *s = migrate_get_current();
3463    return s->enabled_capabilities[MIGRATION_CAPABILITY_X_COLO];
3464}
3465
3466typedef enum MigThrError {
3467    /* No error detected */
3468    MIG_THR_ERR_NONE = 0,
3469    /* Detected error, but resumed successfully */
3470    MIG_THR_ERR_RECOVERED = 1,
3471    /* Detected fatal error, need to exit */
3472    MIG_THR_ERR_FATAL = 2,
3473} MigThrError;
3474
3475static int postcopy_resume_handshake(MigrationState *s)
3476{
3477    qemu_savevm_send_postcopy_resume(s->to_dst_file);
3478
3479    while (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
3480        qemu_sem_wait(&s->rp_state.rp_sem);
3481    }
3482
3483    if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
3484        return 0;
3485    }
3486
3487    return -1;
3488}
3489
3490/* Return zero if success, or <0 for error */
3491static int postcopy_do_resume(MigrationState *s)
3492{
3493    int ret;
3494
3495    /*
3496     * Call all the resume_prepare() hooks, so that modules can be
3497     * ready for the migration resume.
3498     */
3499    ret = qemu_savevm_state_resume_prepare(s);
3500    if (ret) {
3501        error_report("%s: resume_prepare() failure detected: %d",
3502                     __func__, ret);
3503        return ret;
3504    }
3505
3506    /*
3507     * Last handshake with destination on the resume (destination will
3508     * switch to postcopy-active afterwards)
3509     */
3510    ret = postcopy_resume_handshake(s);
3511    if (ret) {
3512        error_report("%s: handshake failed: %d", __func__, ret);
3513        return ret;
3514    }
3515
3516    return 0;
3517}
3518
3519/*
3520 * We don't return until we are in a safe state to continue the current
3521 * postcopy migration.  Returns MIG_THR_ERR_RECOVERED if recovered, or
3522 * MIG_THR_ERR_FATAL if an unrecoverable failure happened.
3523 */
3524static MigThrError postcopy_pause(MigrationState *s)
3525{
3526    assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
3527
3528    while (true) {
3529        QEMUFile *file;
3530
3531        /*
3532         * Current channel is possibly broken. Release it.  Note that this is
3533         * guaranteed even without the lock, because to_dst_file should only be
3534         * modified by the migration thread.  That also guarantees that the
3535         * unregister of yank is safe too without the lock.  It should be safe
3536         * even to be within the qemu_file_lock, but we didn't do that to avoid
3537         * taking more mutex (yank_lock) within qemu_file_lock.  TL;DR: we make
3538         * the qemu_file_lock critical section as small as possible.
3539         */
3540        assert(s->to_dst_file);
3541        migration_ioc_unregister_yank_from_file(s->to_dst_file);
3542        qemu_mutex_lock(&s->qemu_file_lock);
3543        file = s->to_dst_file;
3544        s->to_dst_file = NULL;
3545        qemu_mutex_unlock(&s->qemu_file_lock);
3546
3547        qemu_file_shutdown(file);
3548        qemu_fclose(file);
3549
3550        /*
3551         * Do the same to the postcopy fast path socket too, if there is one.
3552         * No locking is needed because there is no racer as long as we do
3553         * this before setting the status to paused.
3554         */
3555        if (s->postcopy_qemufile_src) {
3556            migration_ioc_unregister_yank_from_file(s->postcopy_qemufile_src);
3557            qemu_file_shutdown(s->postcopy_qemufile_src);
3558            qemu_fclose(s->postcopy_qemufile_src);
3559            s->postcopy_qemufile_src = NULL;
3560        }
3561
3562        migrate_set_state(&s->state, s->state,
3563                          MIGRATION_STATUS_POSTCOPY_PAUSED);
3564
3565        error_report("Detected IO failure for postcopy. "
3566                     "Migration paused.");
3567
3568        /*
3569         * We wait here until things are fixed up. Then someone will set
3570         * the status back for us.
3571         */
3572        while (s->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
3573            qemu_sem_wait(&s->postcopy_pause_sem);
3574        }
3575
3576        if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
3577            /* Woken up by a recover procedure. Give it a shot */
3578
3579            if (postcopy_preempt_wait_channel(s)) {
3580                /*
3581                 * Preempt enabled, and new channel create failed; loop
3582                 * back to wait for another recovery.
3583                 */
3584                continue;
3585            }
3586
3587            /*
3588             * Firstly, let's wake up the return path now, with a new
3589             * return path channel.
3590             */
3591            qemu_sem_post(&s->postcopy_pause_rp_sem);
3592
3593            /* Do the resume logic */
3594            if (postcopy_do_resume(s) == 0) {
3595                /* Let's continue! */
3596                trace_postcopy_pause_continued();
3597                return MIG_THR_ERR_RECOVERED;
3598            } else {
3599                /*
3600                 * Something went wrong during the recovery, so let's
3601                 * pause again. Pause is always better than throwing
3602                 * data away.
3603                 */
3604                continue;
3605            }
3606        } else {
3607            /* This is not right... Time to quit. */
3608            return MIG_THR_ERR_FATAL;
3609        }
3610    }
3611}
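
/*
 * Illustrative sketch (not part of QEMU; compiled out): the
 * detach-under-lock pattern used by postcopy_pause() above.  Only the
 * pointer swap happens inside qemu_file_lock, keeping the critical
 * section minimal; the potentially slow shutdown/close run on the
 * detached local pointer, outside the lock.  The helper name is
 * hypothetical.
 */
#if 0
static void detach_and_close(MigrationState *s)
{
    QEMUFile *file;

    qemu_mutex_lock(&s->qemu_file_lock);
    file = s->to_dst_file;              /* detach under the lock */
    s->to_dst_file = NULL;
    qemu_mutex_unlock(&s->qemu_file_lock);

    qemu_file_shutdown(file);           /* slow work outside the lock */
    qemu_fclose(file);
}
#endif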
3612
3613static MigThrError migration_detect_error(MigrationState *s)
3614{
3615    int ret;
3616    int state = s->state;
3617    Error *local_error = NULL;
3618
3619    if (state == MIGRATION_STATUS_CANCELLING ||
3620        state == MIGRATION_STATUS_CANCELLED) {
3621        /* End the migration, but don't set the state to failed */
3622        return MIG_THR_ERR_FATAL;
3623    }
3624
3625    /*
3626     * Try to detect any file errors.  Note that postcopy_qemufile_src will
3627     * be NULL when postcopy preempt is not enabled.
3628     */
3629    ret = qemu_file_get_error_obj_any(s->to_dst_file,
3630                                      s->postcopy_qemufile_src,
3631                                      &local_error);
3632    if (!ret) {
3633        /* Everything is fine */
3634        assert(!local_error);
3635        return MIG_THR_ERR_NONE;
3636    }
3637
3638    if (local_error) {
3639        migrate_set_error(s, local_error);
3640        error_free(local_error);
3641    }
3642
3643    if (state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret) {
3644        /*
3645         * For postcopy, we allow the network to be down for a
3646         * while. After that, it can be continued by a
3647         * recovery phase.
3648         */
3649        return postcopy_pause(s);
3650    } else {
3651        /*
3652         * For precopy (or postcopy with an error outside IO), we fail
3653         * immediately.
3654         */
3655        migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED);
3656        trace_migration_thread_file_err();
3657
3658        /* Time to stop the migration, now. */
3659        return MIG_THR_ERR_FATAL;
3660    }
3661}
3662
3663/* How many bytes have we transferred since the beginning of the migration */
3664static uint64_t migration_total_bytes(MigrationState *s)
3665{
3666    return qemu_file_total_transferred(s->to_dst_file) +
3667        ram_counters.multifd_bytes;
3668}
3669
3670static void migration_calculate_complete(MigrationState *s)
3671{
3672    uint64_t bytes = migration_total_bytes(s);
3673    int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3674    int64_t transfer_time;
3675
3676    s->total_time = end_time - s->start_time;
3677    if (!s->downtime) {
3678        /*
3679         * It's still not set, so this is a precopy migration.  For
3680         * postcopy, downtime is calculated during postcopy_start().
3681         */
3682        s->downtime = end_time - s->downtime_start;
3683    }
3684
3685    transfer_time = s->total_time - s->setup_time;
3686    if (transfer_time) {
3687        s->mbps = ((double) bytes * 8.0) / transfer_time / 1000;
3688    }
3689}
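
/*
 * Illustrative sketch (not part of QEMU; compiled out): the unit
 * conversion behind s->mbps above.  With bytes transferred and a
 * transfer time in milliseconds, mbps = bytes * 8 / ms / 1000; e.g.
 * 1 GiB in 10000 ms gives (1073741824 * 8) / 10000 / 1000 ~= 859 Mbps.
 * The helper name is hypothetical.
 */
#if 0
static double calc_mbps(uint64_t bytes, int64_t transfer_time_ms)
{
    /* Caller must ensure transfer_time_ms != 0, as above */
    return ((double)bytes * 8.0) / transfer_time_ms / 1000;
}
#endif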
3690
3691static void update_iteration_initial_status(MigrationState *s)
3692{
3693    /*
3694     * Update these three fields at the same time so that mismatched
3695     * values don't lead to a wrong speed calculation.
3696     */
3697    s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3698    s->iteration_initial_bytes = migration_total_bytes(s);
3699    s->iteration_initial_pages = ram_get_total_transferred_pages();
3700}
3701
3702static void migration_update_counters(MigrationState *s,
3703                                      int64_t current_time)
3704{
3705    uint64_t transferred, transferred_pages, time_spent;
3706    uint64_t current_bytes; /* bytes transferred since the beginning */
3707    double bandwidth;
3708
3709    if (current_time < s->iteration_start_time + BUFFER_DELAY) {
3710        return;
3711    }
3712
3713    current_bytes = migration_total_bytes(s);
3714    transferred = current_bytes - s->iteration_initial_bytes;
3715    time_spent = current_time - s->iteration_start_time;
3716    bandwidth = (double)transferred / time_spent;
3717    s->threshold_size = bandwidth * s->parameters.downtime_limit;
3718
3719    s->mbps = (((double) transferred * 8.0) /
3720               ((double) time_spent / 1000.0)) / 1000.0 / 1000.0;
3721
3722    transferred_pages = ram_get_total_transferred_pages() -
3723                            s->iteration_initial_pages;
3724    s->pages_per_second = (double) transferred_pages /
3725                             (((double) time_spent / 1000.0));
3726
3727    /*
3728     * If we haven't sent anything, we don't want to
3729     * recalculate. 10000 is a small enough number for our purposes.
3730     */
3731    if (ram_counters.dirty_pages_rate && transferred > 10000) {
3732        s->expected_downtime = ram_counters.remaining / bandwidth;
3733    }
3734
3735    qemu_file_reset_rate_limit(s->to_dst_file);
3736
3737    update_iteration_initial_status(s);
3738
3739    trace_migrate_transferred(transferred, time_spent,
3740                              bandwidth, s->threshold_size);
3741}
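
/*
 * Illustrative sketch (not part of QEMU; compiled out): the two derived
 * quantities computed above.  Bandwidth is in bytes/ms, so multiplying
 * by the downtime limit (ms) yields the most we could still send within
 * the allowed downtime (threshold_size), while dividing the remaining
 * bytes by the bandwidth estimates the final stop-and-copy duration.
 * Names are hypothetical.
 */
#if 0
static void calc_thresholds(double bandwidth_bytes_per_ms,
                            uint64_t remaining_bytes,
                            int64_t downtime_limit_ms,
                            uint64_t *threshold_size,
                            int64_t *expected_downtime_ms)
{
    /* Caller must ensure bandwidth_bytes_per_ms > 0 */
    *threshold_size = bandwidth_bytes_per_ms * downtime_limit_ms;
    *expected_downtime_ms = remaining_bytes / bandwidth_bytes_per_ms;
}
#endif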
3742
3743/* Migration thread iteration status */
3744typedef enum {
3745    MIG_ITERATE_RESUME,         /* Resume current iteration */
3746    MIG_ITERATE_SKIP,           /* Skip current iteration */
3747    MIG_ITERATE_BREAK,          /* Break the loop */
3748} MigIterateState;
3749
3750/*
3751 * Return the iteration state: MIG_ITERATE_RESUME to continue with the
3752 * next iteration, MIG_ITERATE_SKIP or MIG_ITERATE_BREAK otherwise.
3753 */
3754static MigIterateState migration_iteration_run(MigrationState *s)
3755{
3756    uint64_t pending_size, pend_pre, pend_compat, pend_post;
3757    bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
3758
3759    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
3760                              &pend_compat, &pend_post);
3761    pending_size = pend_pre + pend_compat + pend_post;
3762
3763    trace_migrate_pending(pending_size, s->threshold_size,
3764                          pend_pre, pend_compat, pend_post);
3765
3766    if (pending_size && pending_size >= s->threshold_size) {
3767        /* Still a significant amount to transfer */
3768        if (!in_postcopy && pend_pre <= s->threshold_size &&
3769            qatomic_read(&s->start_postcopy)) {
3770            if (postcopy_start(s)) {
3771                error_report("%s: postcopy failed to start", __func__);
3772            }
3773            return MIG_ITERATE_SKIP;
3774        }
3775        /* Just another iteration step */
3776        qemu_savevm_state_iterate(s->to_dst_file, in_postcopy);
3777    } else {
3778        trace_migration_thread_low_pending(pending_size);
3779        migration_completion(s);
3780        return MIG_ITERATE_BREAK;
3781    }
3782
3783    return MIG_ITERATE_RESUME;
3784}
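
/*
 * Illustrative sketch (not part of QEMU; compiled out): the decision
 * tree implemented by migration_iteration_run() above, pulled out in
 * isolation.  The enum and helper are hypothetical.
 */
#if 0
typedef enum { DECIDE_COMPLETE, DECIDE_POSTCOPY, DECIDE_ITERATE } MigDecision;

static MigDecision decide(uint64_t pending, uint64_t pend_pre,
                          uint64_t threshold, bool in_postcopy,
                          bool start_postcopy_requested)
{
    if (!pending || pending < threshold) {
        return DECIDE_COMPLETE;     /* little left: do the final switchover */
    }
    if (!in_postcopy && pend_pre <= threshold && start_postcopy_requested) {
        return DECIDE_POSTCOPY;     /* precopy-only remainder fits: switch */
    }
    return DECIDE_ITERATE;          /* keep sending dirty pages */
}
#endif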
3785
3786static void migration_iteration_finish(MigrationState *s)
3787{
3788    /* If we enabled cpu throttling for auto-converge, turn it off. */
3789    cpu_throttle_stop();
3790
3791    qemu_mutex_lock_iothread();
3792    switch (s->state) {
3793    case MIGRATION_STATUS_COMPLETED:
3794        migration_calculate_complete(s);
3795        runstate_set(RUN_STATE_POSTMIGRATE);
3796        break;
3797    case MIGRATION_STATUS_COLO:
3798        if (!migrate_colo_enabled()) {
3799            error_report("%s: critical error: calling COLO code without "
3800                         "COLO enabled", __func__);
3801        }
3802        migrate_start_colo_process(s);
3803        s->vm_was_running = true;
3804        /* Fallthrough */
3805    case MIGRATION_STATUS_FAILED:
3806    case MIGRATION_STATUS_CANCELLED:
3807    case MIGRATION_STATUS_CANCELLING:
3808        if (s->vm_was_running) {
3809            if (!runstate_check(RUN_STATE_SHUTDOWN)) {
3810                vm_start();
3811            }
3812        } else {
3813            if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
3814                runstate_set(RUN_STATE_POSTMIGRATE);
3815            }
3816        }
3817        break;
3818
3819    default:
3820        /* Should not reach here, but if so, forgive the VM. */
3821        error_report("%s: Unknown ending state %d", __func__, s->state);
3822        break;
3823    }
3824    migrate_fd_cleanup_schedule(s);
3825    qemu_mutex_unlock_iothread();
3826}
3827
3828static void bg_migration_iteration_finish(MigrationState *s)
3829{
3830    qemu_mutex_lock_iothread();
3831    switch (s->state) {
3832    case MIGRATION_STATUS_COMPLETED:
3833        migration_calculate_complete(s);
3834        break;
3835
3836    case MIGRATION_STATUS_ACTIVE:
3837    case MIGRATION_STATUS_FAILED:
3838    case MIGRATION_STATUS_CANCELLED:
3839    case MIGRATION_STATUS_CANCELLING:
3840        break;
3841
3842    default:
3843        /* Should not reach here, but if so, forgive the VM. */
3844        error_report("%s: Unknown ending state %d", __func__, s->state);
3845        break;
3846    }
3847
3848    migrate_fd_cleanup_schedule(s);
3849    qemu_mutex_unlock_iothread();
3850}
3851
3852/*
3853 * Return the iteration state: MIG_ITERATE_RESUME to continue with the
3854 * next iteration, MIG_ITERATE_BREAK otherwise.
3855 */
3856static MigIterateState bg_migration_iteration_run(MigrationState *s)
3857{
3858    int res;
3859
3860    res = qemu_savevm_state_iterate(s->to_dst_file, false);
3861    if (res > 0) {
3862        bg_migration_completion(s);
3863        return MIG_ITERATE_BREAK;
3864    }
3865
3866    return MIG_ITERATE_RESUME;
3867}
3868
3869void migration_make_urgent_request(void)
3870{
3871    qemu_sem_post(&migrate_get_current()->rate_limit_sem);
3872}
3873
3874void migration_consume_urgent_request(void)
3875{
3876    qemu_sem_wait(&migrate_get_current()->rate_limit_sem);
3877}
3878
3879/* Returns true if the rate limiting was broken by an urgent request */
3880bool migration_rate_limit(void)
3881{
3882    int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3883    MigrationState *s = migrate_get_current();
3884
3885    bool urgent = false;
3886    migration_update_counters(s, now);
3887    if (qemu_file_rate_limit(s->to_dst_file)) {
3888
3889        if (qemu_file_get_error(s->to_dst_file)) {
3890            return false;
3891        }
3892        /*
3893         * Wait for the rate-limiting delay, OR for something
3894         * urgent to post the semaphore.
3895         */
3896        int ms = s->iteration_start_time + BUFFER_DELAY - now;
3897        trace_migration_rate_limit_pre(ms);
3898        if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) {
3899            /*
3900             * We were woken by one or more urgent things but
3901             * the timedwait will have consumed one of them.
3902             * The service routine for the urgent wake will decrement
3903             * the semaphore itself for each item it consumes,
3904             * so add back the one we just consumed.
3905             */
3906            qemu_sem_post(&s->rate_limit_sem);
3907            urgent = true;
3908        }
3909        trace_migration_rate_limit_post(urgent);
3910    }
3911    return urgent;
3912}
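
/*
 * Illustrative sketch (not part of QEMU; compiled out): the core of the
 * rate limiter above.  Instead of sleeping unconditionally for the rest
 * of the BUFFER_DELAY window, the thread waits on a semaphore with that
 * remainder as the timeout, so an urgent request (a post) can cut the
 * sleep short.  The helper name is hypothetical.
 */
#if 0
static bool wait_window_or_urgent(QemuSemaphore *sem,
                                  int64_t window_start_ms, int64_t now_ms)
{
    int ms = window_start_ms + BUFFER_DELAY - now_ms;

    if (qemu_sem_timedwait(sem, ms) == 0) {
        /* Woken early: re-post the token we consumed and report urgency */
        qemu_sem_post(sem);
        return true;
    }
    return false;   /* timed out: the rate-limit window simply elapsed */
}
#endif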
3913
3914/*
3915 * If failover devices are present, wait until they are completely
3916 * unplugged.
3917 */
3918
3919static void qemu_savevm_wait_unplug(MigrationState *s, int old_state,
3920                                    int new_state)
3921{
3922    if (qemu_savevm_state_guest_unplug_pending()) {
3923        migrate_set_state(&s->state, old_state, MIGRATION_STATUS_WAIT_UNPLUG);
3924
3925        while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
3926               qemu_savevm_state_guest_unplug_pending()) {
3927            qemu_sem_timedwait(&s->wait_unplug_sem, 250);
3928        }
3929        if (s->state != MIGRATION_STATUS_WAIT_UNPLUG) {
3930            int timeout = 120; /* 120 * 250 ms = 30 seconds */
3931            /*
3932             * Migration has been cancelled, but as we have started an
3933             * unplug we must wait for it to finish in order to be able
3934             * to plug the card back in.
3935             */
3936            while (timeout-- && qemu_savevm_state_guest_unplug_pending()) {
3937                qemu_sem_timedwait(&s->wait_unplug_sem, 250);
3938            }
3939            if (qemu_savevm_state_guest_unplug_pending() &&
3940                !qtest_enabled()) {
3941                warn_report("migration: partially unplugged device on "
3942                            "failure");
3943            }
3944        }
3945
3946        migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, new_state);
3947    } else {
3948        migrate_set_state(&s->state, old_state, new_state);
3949    }
3950}
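
/*
 * Illustrative sketch (not part of QEMU; compiled out): the bounded
 * polling used above once migration has been cancelled mid-unplug.
 * 120 iterations of a 250 ms timedwait bound the wait at 30 seconds.
 * The helper name is hypothetical.
 */
#if 0
static bool wait_unplug_bounded(MigrationState *s)
{
    int timeout = 120;  /* 120 * 250 ms = 30 seconds */

    while (timeout-- && qemu_savevm_state_guest_unplug_pending()) {
        qemu_sem_timedwait(&s->wait_unplug_sem, 250);
    }
    /* true if every device finished unplugging in time */
    return !qemu_savevm_state_guest_unplug_pending();
}
#endif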
3951
3952/*
3953 * Master migration thread on the source VM.
3954 * It drives the migration and pumps the data down the outgoing channel.
3955 */
3956static void *migration_thread(void *opaque)
3957{
3958    MigrationState *s = opaque;
3959    int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
3960    MigThrError thr_error;
3961    bool urgent = false;
3962
3963    rcu_register_thread();
3964
3965    object_ref(OBJECT(s));
3966    update_iteration_initial_status(s);
3967
3968    qemu_savevm_state_header(s->to_dst_file);
3969
3970    /*
3971     * If we opened the return path, we need to make sure dst has it
3972     * opened as well.
3973     */
3974    if (s->rp_state.rp_thread_created) {
3975        /* Now tell the dest that it should open its end so it can reply */
3976        qemu_savevm_send_open_return_path(s->to_dst_file);
3977
3978        /* And do a ping that will make stuff easier to debug */
3979        qemu_savevm_send_ping(s->to_dst_file, 1);
3980    }
3981
3982    if (migrate_postcopy()) {
3983        /*
3984         * Tell the destination that we *might* want to do postcopy later;
3985         * if the other end can't do postcopy it should fail now, nice and
3986         * early.
3987         */
3988        qemu_savevm_send_postcopy_advise(s->to_dst_file);
3989    }
3990
3991    if (migrate_colo_enabled()) {
3992        /* Notify migration destination that we enable COLO */
3993        qemu_savevm_send_colo_enable(s->to_dst_file);
3994    }
3995
3996    qemu_savevm_state_setup(s->to_dst_file);
3997
3998    qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
3999                               MIGRATION_STATUS_ACTIVE);
4000
4001    s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
4002
4003    trace_migration_thread_setup_complete();
4004
4005    while (migration_is_active(s)) {
4006        if (urgent || !qemu_file_rate_limit(s->to_dst_file)) {
4007            MigIterateState iter_state = migration_iteration_run(s);
4008            if (iter_state == MIG_ITERATE_SKIP) {
4009                continue;
4010            } else if (iter_state == MIG_ITERATE_BREAK) {
4011                break;
4012            }
4013        }
4014
4015        /*
4016         * Try to detect any kind of failure, and see whether we
4017         * should stop the migration now.
4018         */
4019        thr_error = migration_detect_error(s);
4020        if (thr_error == MIG_THR_ERR_FATAL) {
4021            /* Stop migration */
4022            break;
4023        } else if (thr_error == MIG_THR_ERR_RECOVERED) {
4024            /*
4025             * Just recovered from, e.g., a network failure; reset all
4026             * the local variables. This is important to avoid
4027             * breaking the transferred_bytes and bandwidth calculations.
4028             */
4029            update_iteration_initial_status(s);
4030        }
4031
4032        urgent = migration_rate_limit();
4033    }
4034
4035    trace_migration_thread_after_loop();
4036    migration_iteration_finish(s);
4037    object_unref(OBJECT(s));
4038    rcu_unregister_thread();
4039    return NULL;
4040}
4041
4042static void bg_migration_vm_start_bh(void *opaque)
4043{
4044    MigrationState *s = opaque;
4045
4046    qemu_bh_delete(s->vm_start_bh);
4047    s->vm_start_bh = NULL;
4048
4049    vm_start();
4050    s->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - s->downtime_start;
4051}
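
/*
 * Illustrative sketch (not part of QEMU; compiled out): the one-shot
 * bottom-half pattern used by bg_migration_vm_start_bh() above.  The
 * handler deletes its own BH first, guaranteeing it runs exactly once.
 * Names are hypothetical.
 */
#if 0
static void one_shot_bh(void *opaque)
{
    MigrationState *s = opaque;

    qemu_bh_delete(s->vm_start_bh);     /* ensure single execution */
    s->vm_start_bh = NULL;
    /* ... deferred work that must run in the main loop goes here ... */
}

/* Scheduling side:
 *   s->vm_start_bh = qemu_bh_new(one_shot_bh, s);
 *   qemu_bh_schedule(s->vm_start_bh);
 */
#endif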
4052
4053/**
4054 * Background snapshot thread, based on live migration code.
4055 * This is an alternative implementation of the live migration mechanism,
4056 * introduced specifically to support background snapshots.
4057 *
4058 * It takes advantage of the userfault_fd write protection mechanism
4059 * introduced in the v5.7 kernel. Compared to existing dirty page logging
4060 * migration, much less stream traffic is produced, resulting in smaller
4061 * snapshot images, simply because no page duplicates can get into the stream.
4062 *
4063 * Another key point is that the generated vmstate stream reflects the
4064 * machine state 'frozen' at the beginning of snapshot creation, whereas
4065 * with dirty page logging the saved snapshot is effectively the state of
4066 * the VM at the end of the process.
4067 */
4068static void *bg_migration_thread(void *opaque)
4069{
4070    MigrationState *s = opaque;
4071    int64_t setup_start;
4072    MigThrError thr_error;
4073    QEMUFile *fb;
4074    bool early_fail = true;
4075
4076    rcu_register_thread();
4077    object_ref(OBJECT(s));
4078
4079    qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
4080
4081    setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
4082    /*
4083     * We want to save the vmstate for the moment when migration has been
4084     * initiated, but we also want to save RAM content while the VM is running.
4085     * The RAM content should appear first in the vmstate. So, we first
4086     * stash the non-RAM part of the vmstate to the temporary buffer,
4087     * then write RAM part of the vmstate to the migration stream
4088     * with vCPUs running and, finally, write stashed non-RAM part of
4089     * the vmstate from the buffer to the migration stream.
4090     */
4091    s->bioc = qio_channel_buffer_new(512 * 1024);
4092    qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer");
4093    fb = qemu_file_new_output(QIO_CHANNEL(s->bioc));
4094    object_unref(OBJECT(s->bioc));
4095
4096    update_iteration_initial_status(s);
4097
4098    /*
4099     * Prepare for tracking memory writes with UFFD-WP - populate
4100     * RAM pages before protecting.
4101     */
4102#ifdef __linux__
4103    ram_write_tracking_prepare();
4104#endif
4105
4106    qemu_savevm_state_header(s->to_dst_file);
4107    qemu_savevm_state_setup(s->to_dst_file);
4108
4109    qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
4110                               MIGRATION_STATUS_ACTIVE);
4111
4112    s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
4113
4114    trace_migration_thread_setup_complete();
4115    s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
4116
4117    qemu_mutex_lock_iothread();
4118
4119    /*
4120     * If the VM is currently in a suspended state, we need to wake it up
4121     * so that vm_stop_force_state() can make a valid runstate transition.
4122     */
4123    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
4124    s->vm_was_running = runstate_is_running();
4125
4126    if (global_state_store()) {
4127        goto fail;
4128    }
4129    /* Forcibly stop VM before saving state of vCPUs and devices */
4130    if (vm_stop_force_state(RUN_STATE_PAUSED)) {
4131        goto fail;
4132    }
4133    /*
4134     * Put vCPUs in sync with shadow context structures, then
4135     * save their state to channel-buffer along with devices.
4136     */
4137    cpu_synchronize_all_states();
4138    if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) {
4139        goto fail;
4140    }
4141    /*
4142     * Since we are going to get non-iterable state data directly
4143     * from s->bioc->data, an explicit flush is needed here.
4144     */
4145    qemu_fflush(fb);
4146
4147    /* Now initialize UFFD context and start tracking RAM writes */
4148    if (ram_write_tracking_start()) {
4149        goto fail;
4150    }
4151    early_fail = false;
4152
4153    /*
4154     * Start VM from BH handler to avoid write-fault lock here.
4155     * UFFD-WP protection for the whole RAM is already enabled so
4156     * calling VM state change notifiers from vm_start() would initiate
4157     * writes to virtio VQ memory, which is in a write-protected region.
4158     */
4159    s->vm_start_bh = qemu_bh_new(bg_migration_vm_start_bh, s);
4160    qemu_bh_schedule(s->vm_start_bh);
4161
4162    qemu_mutex_unlock_iothread();
4163
4164    while (migration_is_active(s)) {
4165        MigIterateState iter_state = bg_migration_iteration_run(s);
4166        if (iter_state == MIG_ITERATE_SKIP) {
4167            continue;
4168        } else if (iter_state == MIG_ITERATE_BREAK) {
4169            break;
4170        }
4171
4172        /*
4173         * Try to detect any kind of failure, and see whether we
4174         * should stop the migration now.
4175         */
4176        thr_error = migration_detect_error(s);
4177        if (thr_error == MIG_THR_ERR_FATAL) {
4178            /* Stop migration */
4179            break;
4180        }
4181
4182        migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
4183    }
4184
4185    trace_migration_thread_after_loop();
4186
4187fail:
4188    if (early_fail) {
4189        migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
4190                MIGRATION_STATUS_FAILED);
4191        qemu_mutex_unlock_iothread();
4192    }
4193
4194    bg_migration_iteration_finish(s);
4195
4196    qemu_fclose(fb);
4197    object_unref(OBJECT(s));
4198    rcu_unregister_thread();
4199
4200    return NULL;
4201}
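
/*
 * Illustrative sketch (not part of QEMU; compiled out): building the
 * in-memory QEMUFile that bg_migration_thread() above stashes device
 * state into.  The buffer channel keeps the bytes in bioc->data until
 * they are appended to the real stream in bg_migration_completion().
 * (The real code also drops its own reference to the channel once the
 * QEMUFile holds one.)  The helper name is hypothetical.
 */
#if 0
static QEMUFile *new_vmstate_buffer_file(QIOChannelBuffer **bioc_out)
{
    QIOChannelBuffer *bioc = qio_channel_buffer_new(512 * 1024);

    qio_channel_set_name(QIO_CHANNEL(bioc), "vmstate-buffer");
    *bioc_out = bioc;
    return qemu_file_new_output(QIO_CHANNEL(bioc));
}
#endif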
4202
4203void migrate_fd_connect(MigrationState *s, Error *error_in)
4204{
4205    Error *local_err = NULL;
4206    int64_t rate_limit;
4207    bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
4208
4209    /*
4210     * If there's a previous error, free it and prepare for another one.
4211     * Meanwhile, if migration completes successfully, no stale error will
4212     * be dumped when migrate_fd_cleanup() is called.
4213     */
4214    migrate_error_free(s);
4215
4216    s->expected_downtime = s->parameters.downtime_limit;
4217    if (resume) {
4218        assert(s->cleanup_bh);
4219    } else {
4220        assert(!s->cleanup_bh);
4221        s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup_bh, s);
4222    }
4223    if (error_in) {
4224        migrate_fd_error(s, error_in);
4225        if (resume) {
4226            /*
4227             * Don't do cleanup for resume if channel is invalid, but only dump
4228             * the error.  We wait for another channel connect from the user.
4229             * The error_report still gives HMP user a hint on what failed.
4230             * It's normally done in migrate_fd_cleanup(), but call it here
4231             * explicitly.
4232             */
4233            error_report_err(error_copy(s->error));
4234        } else {
4235            migrate_fd_cleanup(s);
4236        }
4237        return;
4238    }
4239
4240    if (resume) {
4241        /* This is a resumed migration */
4242        rate_limit = s->parameters.max_postcopy_bandwidth /
4243            XFER_LIMIT_RATIO;
4244    } else {
4245        /* This is a fresh new migration */
4246        rate_limit = s->parameters.max_bandwidth / XFER_LIMIT_RATIO;
4247
4248        /* Notify before starting migration thread */
4249        notifier_list_notify(&migration_state_notifiers, s);
4250    }
4251
4252    qemu_file_set_rate_limit(s->to_dst_file, rate_limit);
4253    qemu_file_set_blocking(s->to_dst_file, true);
4254
4255    /*
4256     * Open the return path. For postcopy, it is used exclusively. For
4257     * precopy, QEMU uses the return path only if the user enabled the
4258     * "return-path" capability.
4259     */
4260    if (migrate_postcopy_ram() || migrate_use_return_path()) {
4261        if (open_return_path_on_source(s, !resume)) {
4262            error_report("Unable to open return-path for postcopy");
4263            migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
4264            migrate_fd_cleanup(s);
4265            return;
4266        }
4267    }
4268
4269    /* This needs to be done before resuming a postcopy */
4270    if (postcopy_preempt_setup(s, &local_err)) {
4271        error_report_err(local_err);
4272        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
4273                          MIGRATION_STATUS_FAILED);
4274        migrate_fd_cleanup(s);
4275        return;
4276    }
4277
4278    if (resume) {
4279        /* Wakeup the main migration thread to do the recovery */
4280        migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
4281                          MIGRATION_STATUS_POSTCOPY_RECOVER);
4282        qemu_sem_post(&s->postcopy_pause_sem);
4283        return;
4284    }
4285
4286    if (multifd_save_setup(&local_err) != 0) {
4287        error_report_err(local_err);
4288        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
4289                          MIGRATION_STATUS_FAILED);
4290        migrate_fd_cleanup(s);
4291        return;
4292    }
4293
4294    if (migrate_background_snapshot()) {
4295        qemu_thread_create(&s->thread, "bg_snapshot",
4296                bg_migration_thread, s, QEMU_THREAD_JOINABLE);
4297    } else {
4298        qemu_thread_create(&s->thread, "live_migration",
4299                migration_thread, s, QEMU_THREAD_JOINABLE);
4300    }
4301    s->migration_thread_running = true;
4302}
4303
4304void migration_global_dump(Monitor *mon)
4305{
4306    MigrationState *ms = migrate_get_current();
4307
4308    monitor_printf(mon, "globals:\n");
4309    monitor_printf(mon, "store-global-state: %s\n",
4310                   ms->store_global_state ? "on" : "off");
4311    monitor_printf(mon, "only-migratable: %s\n",
4312                   only_migratable ? "on" : "off");
4313    monitor_printf(mon, "send-configuration: %s\n",
4314                   ms->send_configuration ? "on" : "off");
4315    monitor_printf(mon, "send-section-footer: %s\n",
4316                   ms->send_section_footer ? "on" : "off");
4317    monitor_printf(mon, "decompress-error-check: %s\n",
4318                   ms->decompress_error_check ? "on" : "off");
4319    monitor_printf(mon, "clear-bitmap-shift: %u\n",
4320                   ms->clear_bitmap_shift);
4321}
4322
4323#define DEFINE_PROP_MIG_CAP(name, x)             \
4324    DEFINE_PROP_BOOL(name, MigrationState, enabled_capabilities[x], false)
4325
4326static Property migration_properties[] = {
4327    DEFINE_PROP_BOOL("store-global-state", MigrationState,
4328                     store_global_state, true),
4329    DEFINE_PROP_BOOL("send-configuration", MigrationState,
4330                     send_configuration, true),
4331    DEFINE_PROP_BOOL("send-section-footer", MigrationState,
4332                     send_section_footer, true),
4333    DEFINE_PROP_BOOL("decompress-error-check", MigrationState,
4334                      decompress_error_check, true),
4335    DEFINE_PROP_UINT8("x-clear-bitmap-shift", MigrationState,
4336                      clear_bitmap_shift, CLEAR_BITMAP_SHIFT_DEFAULT),
4337
4338    /* Migration parameters */
4339    DEFINE_PROP_UINT8("x-compress-level", MigrationState,
4340                      parameters.compress_level,
4341                      DEFAULT_MIGRATE_COMPRESS_LEVEL),
4342    DEFINE_PROP_UINT8("x-compress-threads", MigrationState,
4343                      parameters.compress_threads,
4344                      DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT),
4345    DEFINE_PROP_BOOL("x-compress-wait-thread", MigrationState,
4346                      parameters.compress_wait_thread, true),
4347    DEFINE_PROP_UINT8("x-decompress-threads", MigrationState,
4348                      parameters.decompress_threads,
4349                      DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT),
4350    DEFINE_PROP_UINT8("x-throttle-trigger-threshold", MigrationState,
4351                      parameters.throttle_trigger_threshold,
4352                      DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD),
4353    DEFINE_PROP_UINT8("x-cpu-throttle-initial", MigrationState,
4354                      parameters.cpu_throttle_initial,
4355                      DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL),
4356    DEFINE_PROP_UINT8("x-cpu-throttle-increment", MigrationState,
4357                      parameters.cpu_throttle_increment,
4358                      DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT),
4359    DEFINE_PROP_BOOL("x-cpu-throttle-tailslow", MigrationState,
4360                      parameters.cpu_throttle_tailslow, false),
4361    DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState,
4362                      parameters.max_bandwidth, MAX_THROTTLE),
4363    DEFINE_PROP_UINT64("x-downtime-limit", MigrationState,
4364                      parameters.downtime_limit,
4365                      DEFAULT_MIGRATE_SET_DOWNTIME),
4366    DEFINE_PROP_UINT32("x-checkpoint-delay", MigrationState,
4367                      parameters.x_checkpoint_delay,
4368                      DEFAULT_MIGRATE_X_CHECKPOINT_DELAY),
4369    DEFINE_PROP_UINT8("multifd-channels", MigrationState,
4370                      parameters.multifd_channels,
4371                      DEFAULT_MIGRATE_MULTIFD_CHANNELS),
4372    DEFINE_PROP_MULTIFD_COMPRESSION("multifd-compression", MigrationState,
4373                      parameters.multifd_compression,
4374                      DEFAULT_MIGRATE_MULTIFD_COMPRESSION),
4375    DEFINE_PROP_UINT8("multifd-zlib-level", MigrationState,
4376                      parameters.multifd_zlib_level,
4377                      DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL),
4378    DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState,
4379                      parameters.multifd_zstd_level,
4380                      DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL),
4381    DEFINE_PROP_SIZE("xbzrle-cache-size", MigrationState,
4382                      parameters.xbzrle_cache_size,
4383                      DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE),
4384    DEFINE_PROP_SIZE("max-postcopy-bandwidth", MigrationState,
4385                      parameters.max_postcopy_bandwidth,
4386                      DEFAULT_MIGRATE_MAX_POSTCOPY_BANDWIDTH),
4387    DEFINE_PROP_UINT8("max-cpu-throttle", MigrationState,
4388                      parameters.max_cpu_throttle,
4389                      DEFAULT_MIGRATE_MAX_CPU_THROTTLE),
4390    DEFINE_PROP_SIZE("announce-initial", MigrationState,
4391                      parameters.announce_initial,
4392                      DEFAULT_MIGRATE_ANNOUNCE_INITIAL),
4393    DEFINE_PROP_SIZE("announce-max", MigrationState,
4394                      parameters.announce_max,
4395                      DEFAULT_MIGRATE_ANNOUNCE_MAX),
4396    DEFINE_PROP_SIZE("announce-rounds", MigrationState,
4397                      parameters.announce_rounds,
4398                      DEFAULT_MIGRATE_ANNOUNCE_ROUNDS),
4399    DEFINE_PROP_SIZE("announce-step", MigrationState,
4400                      parameters.announce_step,
4401                      DEFAULT_MIGRATE_ANNOUNCE_STEP),
4402    DEFINE_PROP_BOOL("x-postcopy-preempt-break-huge", MigrationState,
4403                      postcopy_preempt_break_huge, true),
4404    DEFINE_PROP_STRING("tls-creds", MigrationState, parameters.tls_creds),
4405    DEFINE_PROP_STRING("tls-hostname", MigrationState, parameters.tls_hostname),
4406    DEFINE_PROP_STRING("tls-authz", MigrationState, parameters.tls_authz),
4407
4408    /* Migration capabilities */
4409    DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
4410    DEFINE_PROP_MIG_CAP("x-rdma-pin-all", MIGRATION_CAPABILITY_RDMA_PIN_ALL),
4411    DEFINE_PROP_MIG_CAP("x-auto-converge", MIGRATION_CAPABILITY_AUTO_CONVERGE),
4412    DEFINE_PROP_MIG_CAP("x-zero-blocks", MIGRATION_CAPABILITY_ZERO_BLOCKS),
4413    DEFINE_PROP_MIG_CAP("x-compress", MIGRATION_CAPABILITY_COMPRESS),
4414    DEFINE_PROP_MIG_CAP("x-events", MIGRATION_CAPABILITY_EVENTS),
4415    DEFINE_PROP_MIG_CAP("x-postcopy-ram", MIGRATION_CAPABILITY_POSTCOPY_RAM),
4416    DEFINE_PROP_MIG_CAP("x-postcopy-preempt",
4417                        MIGRATION_CAPABILITY_POSTCOPY_PREEMPT),
4418    DEFINE_PROP_MIG_CAP("x-colo", MIGRATION_CAPABILITY_X_COLO),
4419    DEFINE_PROP_MIG_CAP("x-release-ram", MIGRATION_CAPABILITY_RELEASE_RAM),
4420    DEFINE_PROP_MIG_CAP("x-block", MIGRATION_CAPABILITY_BLOCK),
4421    DEFINE_PROP_MIG_CAP("x-return-path", MIGRATION_CAPABILITY_RETURN_PATH),
4422    DEFINE_PROP_MIG_CAP("x-multifd", MIGRATION_CAPABILITY_MULTIFD),
4423    DEFINE_PROP_MIG_CAP("x-background-snapshot",
4424            MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT),
4425#ifdef CONFIG_LINUX
4426    DEFINE_PROP_MIG_CAP("x-zero-copy-send",
4427            MIGRATION_CAPABILITY_ZERO_COPY_SEND),
4428#endif
4429
4430    DEFINE_PROP_END_OF_LIST(),
4431};
4432
4433static void migration_class_init(ObjectClass *klass, void *data)
4434{
4435    DeviceClass *dc = DEVICE_CLASS(klass);
4436
4437    dc->user_creatable = false;
4438    device_class_set_props(dc, migration_properties);
4439}
4440
4441static void migration_instance_finalize(Object *obj)
4442{
4443    MigrationState *ms = MIGRATION_OBJ(obj);
4444
4445    qemu_mutex_destroy(&ms->error_mutex);
4446    qemu_mutex_destroy(&ms->qemu_file_lock);
4447    qemu_sem_destroy(&ms->wait_unplug_sem);
4448    qemu_sem_destroy(&ms->rate_limit_sem);
4449    qemu_sem_destroy(&ms->pause_sem);
4450    qemu_sem_destroy(&ms->postcopy_pause_sem);
4451    qemu_sem_destroy(&ms->postcopy_pause_rp_sem);
4452    qemu_sem_destroy(&ms->rp_state.rp_sem);
4453    qemu_sem_destroy(&ms->postcopy_qemufile_src_sem);
4454    error_free(ms->error);
4455}
4456
4457static void migration_instance_init(Object *obj)
4458{
4459    MigrationState *ms = MIGRATION_OBJ(obj);
4460    MigrationParameters *params = &ms->parameters;
4461
4462    ms->state = MIGRATION_STATUS_NONE;
4463    ms->mbps = -1;
4464    ms->pages_per_second = -1;
4465    qemu_sem_init(&ms->pause_sem, 0);
4466    qemu_mutex_init(&ms->error_mutex);
4467
4468    params->tls_hostname = g_strdup("");
4469    params->tls_creds = g_strdup("");
4470
4471    /* Set has_* up only for parameter checks */
4472    params->has_compress_level = true;
4473    params->has_compress_threads = true;
4474    params->has_compress_wait_thread = true;
4475    params->has_decompress_threads = true;
4476    params->has_throttle_trigger_threshold = true;
4477    params->has_cpu_throttle_initial = true;
4478    params->has_cpu_throttle_increment = true;
4479    params->has_cpu_throttle_tailslow = true;
4480    params->has_max_bandwidth = true;
4481    params->has_downtime_limit = true;
4482    params->has_x_checkpoint_delay = true;
4483    params->has_block_incremental = true;
4484    params->has_multifd_channels = true;
4485    params->has_multifd_compression = true;
4486    params->has_multifd_zlib_level = true;
4487    params->has_multifd_zstd_level = true;
4488    params->has_xbzrle_cache_size = true;
4489    params->has_max_postcopy_bandwidth = true;
4490    params->has_max_cpu_throttle = true;
4491    params->has_announce_initial = true;
4492    params->has_announce_max = true;
4493    params->has_announce_rounds = true;
4494    params->has_announce_step = true;
4495    params->has_tls_creds = true;
4496    params->has_tls_hostname = true;
4497    params->has_tls_authz = true;
4498
4499    qemu_sem_init(&ms->postcopy_pause_sem, 0);
4500    qemu_sem_init(&ms->postcopy_pause_rp_sem, 0);
4501    qemu_sem_init(&ms->rp_state.rp_sem, 0);
4502    qemu_sem_init(&ms->rate_limit_sem, 0);
4503    qemu_sem_init(&ms->wait_unplug_sem, 0);
4504    qemu_sem_init(&ms->postcopy_qemufile_src_sem, 0);
4505    qemu_mutex_init(&ms->qemu_file_lock);
4506}
4507
4508/*
4509 * Return true if the checks pass, false otherwise. An error will be put
4510 * inside errp if provided.
4511 */
4512static bool migration_object_check(MigrationState *ms, Error **errp)
4513{
4514    MigrationCapabilityStatusList *head = NULL;
4515    /* Assuming all off */
4516    bool cap_list[MIGRATION_CAPABILITY__MAX] = { 0 }, ret;
4517    int i;
4518
4519    if (!migrate_params_check(&ms->parameters, errp)) {
4520        return false;
4521    }
4522
4523    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
4524        if (ms->enabled_capabilities[i]) {
4525            QAPI_LIST_PREPEND(head, migrate_cap_add(i, true));
4526        }
4527    }
4528
4529    ret = migrate_caps_check(cap_list, head, errp);
4530
4531    /* It works with head == NULL */
4532    qapi_free_MigrationCapabilityStatusList(head);
4533
4534    return ret;
4535}
4536
4537static const TypeInfo migration_type = {
4538    .name = TYPE_MIGRATION,
4539    /*
4540     * NOTE: TYPE_MIGRATION is not really a device, as the object is
4541     * not created using qdev_new(), it is not attached to the qdev
4542     * device tree, and it is never realized.
4543     *
4544     * TODO: Make this TYPE_OBJECT once QOM provides something like
4545     * TYPE_DEVICE's "-global" properties.
4546     */
4547    .parent = TYPE_DEVICE,
4548    .class_init = migration_class_init,
4549    .class_size = sizeof(MigrationClass),
4550    .instance_size = sizeof(MigrationState),
4551    .instance_init = migration_instance_init,
4552    .instance_finalize = migration_instance_finalize,
4553};
4554
4555static void register_migration_types(void)
4556{
4557    type_register_static(&migration_type);
4558}
4559
4560type_init(register_migration_types);
4561