qemu/migration/migration.c
/*
 * QEMU live migration
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "migration/blocker.h"
#include "exec.h"
#include "fd.h"
#include "socket.h"
#include "sysemu/runstate.h"
#include "sysemu/sysemu.h"
#include "sysemu/cpu-throttle.h"
#include "rdma.h"
#include "ram.h"
#include "migration/global_state.h"
#include "migration/misc.h"
#include "migration.h"
#include "savevm.h"
#include "qemu-file-channel.h"
#include "qemu-file.h"
#include "migration/vmstate.h"
#include "block/block.h"
#include "qapi/error.h"
#include "qapi/clone-visitor.h"
#include "qapi/qapi-visit-migration.h"
#include "qapi/qapi-visit-sockets.h"
#include "qapi/qapi-commands-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "qapi/qmp/qnull.h"
#include "qemu/rcu.h"
#include "block.h"
#include "postcopy-ram.h"
#include "qemu/thread.h"
#include "trace.h"
#include "exec/target_page.h"
#include "io/channel-buffer.h"
#include "migration/colo.h"
#include "hw/boards.h"
#include "hw/qdev-properties.h"
#include "hw/qdev-properties-system.h"
#include "monitor/monitor.h"
#include "net/announce.h"
#include "qemu/queue.h"
#include "multifd.h"
#include "qemu/yank.h"
#include "sysemu/cpus.h"
#include "yank_functions.h"

#define MAX_THROTTLE  (128 << 20)      /* Migration transfer speed throttling: 128 MiB/s */

/* Amount of time to allocate to each "chunk" of bandwidth-throttled
 * data. */
#define BUFFER_DELAY     100
#define XFER_LIMIT_RATIO (1000 / BUFFER_DELAY)
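/*
 * Worked example: with BUFFER_DELAY = 100 ms, XFER_LIMIT_RATIO is 10,
 * i.e. the rate limiter hands out max_bandwidth / 10 bytes per 100 ms
 * chunk (see migrate_params_apply() below), so ten chunks per second
 * add up to the configured bandwidth.
 */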
  70
/* Time in milliseconds for which we are allowed to stop the source,
 * in order to send the last part */
#define DEFAULT_MIGRATE_SET_DOWNTIME 300

/* Maximum migrate downtime set to 2000 seconds */
#define MAX_MIGRATE_DOWNTIME_SECONDS 2000
#define MAX_MIGRATE_DOWNTIME (MAX_MIGRATE_DOWNTIME_SECONDS * 1000)

/* Default compression thread count */
#define DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT 8
/* Default decompression thread count; usually decompression is at
 * least 4 times as fast as compression. */
#define DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT 2
/* 0: no compression, 1: best speed, ... 9: best compression ratio */
#define DEFAULT_MIGRATE_COMPRESS_LEVEL 1
/* Define default autoconverge cpu throttle migration parameters */
#define DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD 50
#define DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL 20
#define DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT 10
#define DEFAULT_MIGRATE_MAX_CPU_THROTTLE 99

/* Migration XBZRLE default cache size */
#define DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE (64 * 1024 * 1024)

/* The delay time (in ms) between two COLO checkpoints: 200 * 100 = 20 s */
#define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY (200 * 100)
#define DEFAULT_MIGRATE_MULTIFD_CHANNELS 2
#define DEFAULT_MIGRATE_MULTIFD_COMPRESSION MULTIFD_COMPRESSION_NONE
/* 0: no compression, 1: best speed, ... 9: best compression ratio */
#define DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL 1
/* 0: no compression, 1: best speed, ... 20: best compression ratio */
#define DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL 1

/* Background transfer rate for postcopy, 0 means unlimited, note
 * that page requests can still exceed this limit.
 */
#define DEFAULT_MIGRATE_MAX_POSTCOPY_BANDWIDTH 0

/*
 * Parameters for self_announce_delay giving a stream of RARP/ARP
 * packets after migration.
 */
#define DEFAULT_MIGRATE_ANNOUNCE_INITIAL  50
#define DEFAULT_MIGRATE_ANNOUNCE_MAX     550
#define DEFAULT_MIGRATE_ANNOUNCE_ROUNDS    5
#define DEFAULT_MIGRATE_ANNOUNCE_STEP    100
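/*
 * In other words, the defaults give an initial delay of 50 ms, a
 * 100 ms step between rounds, a 550 ms cap, and 5 announcement
 * rounds; the exact schedule is computed by the announce timer in
 * net/announce.c.
 */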

static NotifierList migration_state_notifiers =
    NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);

/* Messages sent on the return path from destination to source */
enum mig_rp_message_type {
    MIG_RP_MSG_INVALID = 0,  /* Must be 0 */
    MIG_RP_MSG_SHUT,         /* sibling will not send any more RP messages */
    MIG_RP_MSG_PONG,         /* Response to a PING; data (seq: be32) */

    MIG_RP_MSG_REQ_PAGES_ID, /* data (start: be64, len: be32, id: string) */
    MIG_RP_MSG_REQ_PAGES,    /* data (start: be64, len: be32) */
    MIG_RP_MSG_RECV_BITMAP,  /* send recved_bitmap back to source */
    MIG_RP_MSG_RESUME_ACK,   /* tell source that we are ready to resume */

    MIG_RP_MSG_MAX
};
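/*
 * On the wire, every return-path message is framed by
 * migrate_send_rp_message() below as:
 *
 *   [type: be16][len: be16][data: len bytes]
 *
 * where "data" uses the per-message layouts noted in the enum above.
 */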

/* Migration capabilities set */
struct MigrateCapsSet {
    int size;                       /* Capability set size */
    MigrationCapability caps[];     /* Flexible array of capabilities */
};
typedef struct MigrateCapsSet MigrateCapsSet;

/* Define and initialize MigrateCapsSet */
#define INITIALIZE_MIGRATE_CAPS_SET(_name, ...)   \
    MigrateCapsSet _name = {    \
        .size = sizeof((int []) { __VA_ARGS__ }) / sizeof(int), \
        .caps = { __VA_ARGS__ } \
    }
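/*
 * Illustration (hypothetical use): INITIALIZE_MIGRATE_CAPS_SET(demo_caps,
 * MIGRATION_CAPABILITY_XBZRLE, MIGRATION_CAPABILITY_AUTO_CONVERGE)
 * expands to a MigrateCapsSet whose .size is 2; the count is derived
 * from the number of __VA_ARGS__ via the compound-literal sizeof trick.
 */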

/* Background-snapshot compatibility check list */
static const
INITIALIZE_MIGRATE_CAPS_SET(check_caps_background_snapshot,
    MIGRATION_CAPABILITY_POSTCOPY_RAM,
    MIGRATION_CAPABILITY_DIRTY_BITMAPS,
    MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME,
    MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE,
    MIGRATION_CAPABILITY_RETURN_PATH,
    MIGRATION_CAPABILITY_MULTIFD,
    MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER,
    MIGRATION_CAPABILITY_AUTO_CONVERGE,
    MIGRATION_CAPABILITY_RELEASE_RAM,
    MIGRATION_CAPABILITY_RDMA_PIN_ALL,
    MIGRATION_CAPABILITY_COMPRESS,
    MIGRATION_CAPABILITY_XBZRLE,
    MIGRATION_CAPABILITY_X_COLO,
    MIGRATION_CAPABILITY_VALIDATE_UUID);

/* When we add fault tolerance, we could have several
   migrations at once.  For now we don't need to add
   dynamic creation of migration objects. */

static MigrationState *current_migration;
static MigrationIncomingState *current_incoming;

static GSList *migration_blockers;

static bool migration_object_check(MigrationState *ms, Error **errp);
static int migration_maybe_pause(MigrationState *s,
                                 int *current_active_state,
                                 int new_state);
static void migrate_fd_cancel(MigrationState *s);

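/*
 * GCompareFunc for the mis->page_requested GTree: it orders requested
 * host page addresses; "(a > b) - (a < b)" is the branchless three-way
 * comparison idiom, yielding -1, 0 or 1 without the overflow risk of
 * plain subtraction.
 */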
static gint page_request_addr_cmp(gconstpointer ap, gconstpointer bp)
{
    uintptr_t a = (uintptr_t) ap, b = (uintptr_t) bp;

    return (a > b) - (a < b);
}

void migration_object_init(void)
{
    Error *err = NULL;

    /* This can only be called once. */
    assert(!current_migration);
    current_migration = MIGRATION_OBJ(object_new(TYPE_MIGRATION));

    /*
     * Init the migrate incoming object as well, no matter whether
     * we'll use it or not.
     */
    assert(!current_incoming);
    current_incoming = g_new0(MigrationIncomingState, 1);
    current_incoming->state = MIGRATION_STATUS_NONE;
    current_incoming->postcopy_remote_fds =
        g_array_new(FALSE, TRUE, sizeof(struct PostCopyFD));
    qemu_mutex_init(&current_incoming->rp_mutex);
    qemu_event_init(&current_incoming->main_thread_load_event, false);
    qemu_sem_init(&current_incoming->postcopy_pause_sem_dst, 0);
    qemu_sem_init(&current_incoming->postcopy_pause_sem_fault, 0);
    qemu_mutex_init(&current_incoming->page_request_mutex);
    current_incoming->page_requested = g_tree_new(page_request_addr_cmp);

    if (!migration_object_check(current_migration, &err)) {
        error_report_err(err);
        exit(1);
    }

    blk_mig_init();
    ram_mig_init();
    dirty_bitmap_mig_init();
}

void migration_cancel(void)
{
    migrate_fd_cancel(current_migration);
}

void migration_shutdown(void)
{
    /*
     * Cancel the current migration - that will (eventually)
     * stop the migration using this structure
     */
    migration_cancel();
    object_unref(OBJECT(current_migration));

    /*
     * Cancel outgoing migration of dirty bitmaps. It should
     * at least unref used block nodes.
     */
    dirty_bitmap_mig_cancel_outgoing();

    /*
     * Cancel incoming migration of dirty bitmaps. Dirty bitmaps are
     * non-critical data, and their loss is never considered serious.
     */
    dirty_bitmap_mig_cancel_incoming();
}

/* For outgoing */
MigrationState *migrate_get_current(void)
{
    /* This can only be called after the object is created. */
    assert(current_migration);
    return current_migration;
}

MigrationIncomingState *migration_incoming_get_current(void)
{
    assert(current_incoming);
    return current_incoming;
}

void migration_incoming_state_destroy(void)
{
    struct MigrationIncomingState *mis = migration_incoming_get_current();

    if (mis->to_src_file) {
        /* Tell source that we are done */
        migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);
        qemu_fclose(mis->to_src_file);
        mis->to_src_file = NULL;
    }

    if (mis->from_src_file) {
        migration_ioc_unregister_yank_from_file(mis->from_src_file);
        qemu_fclose(mis->from_src_file);
        mis->from_src_file = NULL;
    }
    if (mis->postcopy_remote_fds) {
        g_array_free(mis->postcopy_remote_fds, TRUE);
        mis->postcopy_remote_fds = NULL;
    }
    if (mis->transport_cleanup) {
        mis->transport_cleanup(mis->transport_data);
    }

    qemu_event_reset(&mis->main_thread_load_event);

    if (mis->page_requested) {
        g_tree_destroy(mis->page_requested);
        mis->page_requested = NULL;
    }

    if (mis->socket_address_list) {
        qapi_free_SocketAddressList(mis->socket_address_list);
        mis->socket_address_list = NULL;
    }

    yank_unregister_instance(MIGRATION_YANK_INSTANCE);
}

static void migrate_generate_event(int new_state)
{
    if (migrate_use_events()) {
        qapi_event_send_migration(new_state);
    }
}

static bool migrate_late_block_activate(void)
{
    MigrationState *s;

    s = migrate_get_current();

    return s->enabled_capabilities[
        MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE];
}

/*
 * Send a message on the return channel back to the source
 * of the migration.
 */
static int migrate_send_rp_message(MigrationIncomingState *mis,
                                   enum mig_rp_message_type message_type,
                                   uint16_t len, void *data)
{
    int ret = 0;

    trace_migrate_send_rp_message((int)message_type, len);
    QEMU_LOCK_GUARD(&mis->rp_mutex);

    /*
     * It's possible that the file handle got lost due to network
     * failures.
     */
    if (!mis->to_src_file) {
        ret = -EIO;
        return ret;
    }

    qemu_put_be16(mis->to_src_file, (unsigned int)message_type);
    qemu_put_be16(mis->to_src_file, len);
    qemu_put_buffer(mis->to_src_file, data, len);
    qemu_fflush(mis->to_src_file);

    /* It's possible that the QEMU file got an error during sending */
    ret = qemu_file_get_error(mis->to_src_file);

    return ret;
}

/* Request one page from the source VM at the given start address.
 *   rb: the RAMBlock to request the page in
 *   start: address offset within the RB
 *   len: length in bytes required - must be a multiple of pagesize
 */
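/*
 * The request payload built below is laid out as:
 *
 *   [start: be64][len: be32]                      for MIG_RP_MSG_REQ_PAGES
 *   [start: be64][len: be32][id len: u8][idstr]   for MIG_RP_MSG_REQ_PAGES_ID
 *
 * The ID form is only needed when the target RAMBlock changes, since
 * the source remembers the last block we requested from.
 */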
int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
                                      RAMBlock *rb, ram_addr_t start)
{
    uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname up to 256 */
    size_t msglen = 12; /* start + len */
    size_t len = qemu_ram_pagesize(rb);
    enum mig_rp_message_type msg_type;
    const char *rbname;
    int rbname_len;

    *(uint64_t *)bufc = cpu_to_be64((uint64_t)start);
    *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len);

    /*
     * We track the last RAMBlock that we sent a page request for.  Note
     * that we don't need locking because this function will only be
     * called within the postcopy ram fault thread.
     */
    if (rb != mis->last_rb) {
        mis->last_rb = rb;

        rbname = qemu_ram_get_idstr(rb);
        rbname_len = strlen(rbname);

        assert(rbname_len < 256);

        bufc[msglen++] = rbname_len;
        memcpy(bufc + msglen, rbname, rbname_len);
        msglen += rbname_len;
        msg_type = MIG_RP_MSG_REQ_PAGES_ID;
    } else {
        msg_type = MIG_RP_MSG_REQ_PAGES;
    }

    return migrate_send_rp_message(mis, msg_type, msglen, bufc);
}

int migrate_send_rp_req_pages(MigrationIncomingState *mis,
                              RAMBlock *rb, ram_addr_t start, uint64_t haddr)
{
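    /*
     * Align haddr down to the start of its host page: the page size is
     * a power of two, so -pagesize is a mask with the low bits clear.
     */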
    void *aligned = (void *)(uintptr_t)(haddr & (-qemu_ram_pagesize(rb)));
    bool received = false;

    WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
        received = ramblock_recv_bitmap_test_byte_offset(rb, start);
        if (!received && !g_tree_lookup(mis->page_requested, aligned)) {
            /*
             * The page has not been received, and it's not yet in the page
             * request list.  Queue it.  Set the value of element to 1, so that
             * things like g_tree_lookup() will return TRUE (1) when found.
             */
            g_tree_insert(mis->page_requested, aligned, (gpointer)1);
            mis->page_requested_count++;
            trace_postcopy_page_req_add(aligned, mis->page_requested_count);
        }
    }

    /*
     * If the page is there, skip sending the message.  We don't even need the
     * lock because as long as the page arrived, it'll be there forever.
     */
    if (received) {
        return 0;
    }

    return migrate_send_rp_message_req_pages(mis, rb, start);
}

static bool migration_colo_enabled;
bool migration_incoming_colo_enabled(void)
{
    return migration_colo_enabled;
}

void migration_incoming_disable_colo(void)
{
    ram_block_discard_disable(false);
    migration_colo_enabled = false;
}

int migration_incoming_enable_colo(void)
{
    if (ram_block_discard_disable(true)) {
        error_report("COLO: cannot disable RAM discard");
        return -EBUSY;
    }
    migration_colo_enabled = true;
    return 0;
}

void migrate_add_address(SocketAddress *address)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    QAPI_LIST_PREPEND(mis->socket_address_list,
                      QAPI_CLONE(SocketAddress, address));
}

static void qemu_start_incoming_migration(const char *uri, Error **errp)
{
    const char *p = NULL;

    qapi_event_send_migration(MIGRATION_STATUS_SETUP);
    if (strstart(uri, "tcp:", &p) ||
        strstart(uri, "unix:", NULL) ||
        strstart(uri, "vsock:", NULL)) {
        socket_start_incoming_migration(p ? p : uri, errp);
#ifdef CONFIG_RDMA
    } else if (strstart(uri, "rdma:", &p)) {
        rdma_start_incoming_migration(p, errp);
#endif
    } else if (strstart(uri, "exec:", &p)) {
        exec_start_incoming_migration(p, errp);
    } else if (strstart(uri, "fd:", &p)) {
        fd_start_incoming_migration(p, errp);
    } else {
        error_setg(errp, "unknown migration protocol: %s", uri);
    }
}
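/*
 * Typical -incoming URIs matched above (illustrative values):
 *   tcp:0.0.0.0:4444      unix:/tmp/mig.sock      vsock:3:4444
 *   rdma:host:port        exec:cat /tmp/stream    fd:42
 * Note that "unix:" and "vsock:" URIs are passed through whole to
 * socket_start_incoming_migration(), while the other schemes strip
 * the prefix first.
 */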

static void process_incoming_migration_bh(void *opaque)
{
    Error *local_err = NULL;
    MigrationIncomingState *mis = opaque;

    /* If capability late_block_activate is set:
     * Only fire up the block code now if we're going to restart the
     * VM, else 'cont' will do it.
     * This causes file locking to happen, so we don't want it to happen
     * unless we really are starting the VM.
     */
    if (!migrate_late_block_activate() ||
         (autostart && (!global_state_received() ||
            global_state_get_runstate() == RUN_STATE_RUNNING))) {
        /* Make sure all file formats flush their mutable metadata.
         * If we get an error here, just don't restart the VM yet. */
        bdrv_invalidate_cache_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
            local_err = NULL;
            autostart = false;
        }
    }

    /*
     * This must happen after all error conditions are dealt with and
     * we're sure the VM is going to be running on this host.
     */
    qemu_announce_self(&mis->announce_timer, migrate_announce_params());

    if (multifd_load_cleanup(&local_err) != 0) {
        error_report_err(local_err);
        autostart = false;
    }
    /* If global state section was not received or we are in running
       state, we need to obey autostart. Any other state is set with
       runstate_set. */

    dirty_bitmap_mig_before_vm_start();

    if (!global_state_received() ||
        global_state_get_runstate() == RUN_STATE_RUNNING) {
        if (autostart) {
            vm_start();
        } else {
            runstate_set(RUN_STATE_PAUSED);
        }
    } else if (migration_incoming_colo_enabled()) {
        migration_incoming_disable_colo();
        vm_start();
    } else {
        runstate_set(global_state_get_runstate());
    }
    /*
     * This must happen after any state changes since as soon as an external
     * observer sees this event they might start to prod at the VM assuming
     * it's ready to use.
     */
    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                      MIGRATION_STATUS_COMPLETED);
    qemu_bh_delete(mis->bh);
    migration_incoming_state_destroy();
}

static void process_incoming_migration_co(void *opaque)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyState ps;
    int ret;
    Error *local_err = NULL;

    assert(mis->from_src_file);
    mis->migration_incoming_co = qemu_coroutine_self();
    mis->largest_page_size = qemu_ram_pagesize_largest();
    postcopy_state_set(POSTCOPY_INCOMING_NONE);
    migrate_set_state(&mis->state, MIGRATION_STATUS_NONE,
                      MIGRATION_STATUS_ACTIVE);
    ret = qemu_loadvm_state(mis->from_src_file);

    ps = postcopy_state_get();
    trace_process_incoming_migration_co_end(ret, ps);
    if (ps != POSTCOPY_INCOMING_NONE) {
        if (ps == POSTCOPY_INCOMING_ADVISE) {
            /*
             * Where a migration had postcopy enabled (and thus went to advise)
             * but managed to complete within the precopy period, we can use
             * the normal exit.
             */
            postcopy_ram_incoming_cleanup(mis);
        } else if (ret >= 0) {
            /*
             * Postcopy was started, cleanup should happen at the end of the
             * postcopy thread.
             */
            trace_process_incoming_migration_co_postcopy_end_main();
            return;
        }
        /* Else if something went wrong, just fall through to the normal exit */
    }

    /* We got the COLO info, so now we know whether we are in COLO mode */
    if (!ret && migration_incoming_colo_enabled()) {
        /* Make sure all file formats flush their mutable metadata */
        bdrv_invalidate_cache_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
            goto fail;
        }

        qemu_thread_create(&mis->colo_incoming_thread, "COLO incoming",
             colo_process_incoming_thread, mis, QEMU_THREAD_JOINABLE);
        mis->have_colo_incoming_thread = true;
        qemu_coroutine_yield();

        /* Wait for the COLO incoming thread to exit before freeing resources */
        qemu_thread_join(&mis->colo_incoming_thread);
        /* We hold the global iothread lock, so it is safe here */
        colo_release_ram_cache();
    }

    if (ret < 0) {
        error_report("load of migration failed: %s", strerror(-ret));
        goto fail;
    }
    mis->bh = qemu_bh_new(process_incoming_migration_bh, mis);
    qemu_bh_schedule(mis->bh);
    mis->migration_incoming_co = NULL;
    return;
fail:
    local_err = NULL;
    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                      MIGRATION_STATUS_FAILED);
    qemu_fclose(mis->from_src_file);
    if (multifd_load_cleanup(&local_err) != 0) {
        error_report_err(local_err);
    }
    exit(EXIT_FAILURE);
}

/**
 * @migration_incoming_setup: Set up incoming migration
 *
 * Returns 0 for no error or 1 for error
 *
 * @f: file for main migration channel
 * @errp: where to put errors
 */
static int migration_incoming_setup(QEMUFile *f, Error **errp)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    Error *local_err = NULL;

    if (multifd_load_setup(&local_err) != 0) {
        /* We haven't been able to create multifd threads;
           there is nothing better to do */
        error_report_err(local_err);
        exit(EXIT_FAILURE);
    }

    if (!mis->from_src_file) {
        mis->from_src_file = f;
    }
    qemu_file_set_blocking(f, false);
    return 0;
}

void migration_incoming_process(void)
{
    Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL);
    qemu_coroutine_enter(co);
}

/* Returns true if recovered from a paused migration, otherwise false */
static bool postcopy_try_recover(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
        /* Resumed from a paused postcopy migration */

        mis->from_src_file = f;
        /* Postcopy has a standalone thread to do vm load */
        qemu_file_set_blocking(f, true);

        /* Re-configure the return path */
        mis->to_src_file = qemu_file_get_return_path(f);

        migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
                          MIGRATION_STATUS_POSTCOPY_RECOVER);

        /*
         * Here, we only wake up the main loading thread (while the
         * fault thread will still be waiting), so that we can receive
         * commands from source now, and answer it if needed. The
         * fault thread will be woken up later, once we are sure that
         * the source is ready to reply to page requests.
         */
        qemu_sem_post(&mis->postcopy_pause_sem_dst);
        return true;
    }

    return false;
}

void migration_fd_process_incoming(QEMUFile *f, Error **errp)
{
    Error *local_err = NULL;

    if (postcopy_try_recover(f)) {
        return;
    }

    if (migration_incoming_setup(f, &local_err)) {
        error_propagate(errp, local_err);
        return;
    }
    migration_incoming_process();
}

void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    Error *local_err = NULL;
    bool start_migration;

    if (!mis->from_src_file) {
        /* The first connection (multifd may have multiple) */
        QEMUFile *f = qemu_fopen_channel_input(ioc);

        /* If it's a recovery, we're done */
        if (postcopy_try_recover(f)) {
            return;
        }

        if (migration_incoming_setup(f, &local_err)) {
            error_propagate(errp, local_err);
            return;
        }

        /*
         * Common migration only needs one channel, so we can start
         * right now.  Multifd needs more than one channel, so we wait.
         */
        start_migration = !migrate_use_multifd();
    } else {
        /* Multiple connections */
        assert(migrate_use_multifd());
        start_migration = multifd_recv_new_channel(ioc, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return;
        }
    }

    if (start_migration) {
        migration_incoming_process();
    }
}

/**
 * @migration_has_all_channels: We have received all channels that we need
 *
 * Returns true when we have got connections to all the channels that
 * we need for migration.
 */
bool migration_has_all_channels(void)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    bool all_channels;

    all_channels = multifd_recv_all_channels_created();

    return all_channels && mis->from_src_file != NULL;
}

/*
 * Send a 'SHUT' message on the return channel with the given value
 * to indicate that we've finished with the RP.  Non-0 value indicates
 * error.
 */
void migrate_send_rp_shut(MigrationIncomingState *mis,
                          uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf);
}

/*
 * Send a 'PONG' message on the return channel with the given value
 * (normally in response to a 'PING')
 */
void migrate_send_rp_pong(MigrationIncomingState *mis,
                          uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf);
}

void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
                                 char *block_name)
{
    char buf[512];
    int len;
    int64_t res;

    /*
     * First, we send the header part. It contains only the len of
     * idstr, and the idstr itself.
     */
    len = strlen(block_name);
    buf[0] = len;
    memcpy(buf + 1, block_name, len);

    if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: MSG_RP_RECV_BITMAP only used for recovery",
                     __func__);
        return;
    }

    migrate_send_rp_message(mis, MIG_RP_MSG_RECV_BITMAP, len + 1, buf);

    /*
     * Next, we dump the received bitmap to the stream.
     *
     * TODO: currently we are safe since we are the only one that is
     * using the to_src_file handle (fault thread is still paused),
     * so it would be OK even without taking the mutex. However, the
     * best way is to take the lock before sending the message header,
     * and release the lock after sending the bitmap.
     */
    qemu_mutex_lock(&mis->rp_mutex);
    res = ramblock_recv_bitmap_send(mis->to_src_file, block_name);
    qemu_mutex_unlock(&mis->rp_mutex);

    trace_migrate_send_rp_recv_bitmap(block_name, res);
}

void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(buf), &buf);
}

MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp)
{
    MigrationCapabilityStatusList *head = NULL, **tail = &head;
    MigrationCapabilityStatus *caps;
    MigrationState *s = migrate_get_current();
    int i;

    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
#ifndef CONFIG_LIVE_BLOCK_MIGRATION
        if (i == MIGRATION_CAPABILITY_BLOCK) {
            continue;
        }
#endif
        caps = g_malloc0(sizeof(*caps));
        caps->capability = i;
        caps->state = s->enabled_capabilities[i];
        QAPI_LIST_APPEND(tail, caps);
    }

    return head;
}

MigrationParameters *qmp_query_migrate_parameters(Error **errp)
{
    MigrationParameters *params;
    MigrationState *s = migrate_get_current();

    /* TODO use QAPI_CLONE() instead of duplicating it inline */
    params = g_malloc0(sizeof(*params));
    params->has_compress_level = true;
    params->compress_level = s->parameters.compress_level;
    params->has_compress_threads = true;
    params->compress_threads = s->parameters.compress_threads;
    params->has_compress_wait_thread = true;
    params->compress_wait_thread = s->parameters.compress_wait_thread;
    params->has_decompress_threads = true;
    params->decompress_threads = s->parameters.decompress_threads;
    params->has_throttle_trigger_threshold = true;
    params->throttle_trigger_threshold = s->parameters.throttle_trigger_threshold;
    params->has_cpu_throttle_initial = true;
    params->cpu_throttle_initial = s->parameters.cpu_throttle_initial;
    params->has_cpu_throttle_increment = true;
    params->cpu_throttle_increment = s->parameters.cpu_throttle_increment;
    params->has_cpu_throttle_tailslow = true;
    params->cpu_throttle_tailslow = s->parameters.cpu_throttle_tailslow;
    params->has_tls_creds = true;
    params->tls_creds = g_strdup(s->parameters.tls_creds);
    params->has_tls_hostname = true;
    params->tls_hostname = g_strdup(s->parameters.tls_hostname);
    params->has_tls_authz = true;
    params->tls_authz = g_strdup(s->parameters.tls_authz ?
                                 s->parameters.tls_authz : "");
    params->has_max_bandwidth = true;
    params->max_bandwidth = s->parameters.max_bandwidth;
    params->has_downtime_limit = true;
    params->downtime_limit = s->parameters.downtime_limit;
    params->has_x_checkpoint_delay = true;
    params->x_checkpoint_delay = s->parameters.x_checkpoint_delay;
    params->has_block_incremental = true;
    params->block_incremental = s->parameters.block_incremental;
    params->has_multifd_channels = true;
    params->multifd_channels = s->parameters.multifd_channels;
    params->has_multifd_compression = true;
    params->multifd_compression = s->parameters.multifd_compression;
    params->has_multifd_zlib_level = true;
    params->multifd_zlib_level = s->parameters.multifd_zlib_level;
    params->has_multifd_zstd_level = true;
    params->multifd_zstd_level = s->parameters.multifd_zstd_level;
    params->has_xbzrle_cache_size = true;
    params->xbzrle_cache_size = s->parameters.xbzrle_cache_size;
    params->has_max_postcopy_bandwidth = true;
    params->max_postcopy_bandwidth = s->parameters.max_postcopy_bandwidth;
    params->has_max_cpu_throttle = true;
    params->max_cpu_throttle = s->parameters.max_cpu_throttle;
    params->has_announce_initial = true;
    params->announce_initial = s->parameters.announce_initial;
    params->has_announce_max = true;
    params->announce_max = s->parameters.announce_max;
    params->has_announce_rounds = true;
    params->announce_rounds = s->parameters.announce_rounds;
    params->has_announce_step = true;
    params->announce_step = s->parameters.announce_step;

    if (s->parameters.has_block_bitmap_mapping) {
        params->has_block_bitmap_mapping = true;
        params->block_bitmap_mapping =
            QAPI_CLONE(BitmapMigrationNodeAliasList,
                       s->parameters.block_bitmap_mapping);
    }

    return params;
}

AnnounceParameters *migrate_announce_params(void)
{
    static AnnounceParameters ap;

    MigrationState *s = migrate_get_current();

    ap.initial = s->parameters.announce_initial;
    ap.max = s->parameters.announce_max;
    ap.rounds = s->parameters.announce_rounds;
    ap.step = s->parameters.announce_step;

    return &ap;
}

/*
 * Return true if we're already in the middle of a migration
 * (i.e. any of the active or setup states)
 */
bool migration_is_setup_or_active(int state)
{
    switch (state) {
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
    case MIGRATION_STATUS_WAIT_UNPLUG:
    case MIGRATION_STATUS_COLO:
        return true;

    default:
        return false;

    }
}

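/*
 * Like migration_is_setup_or_active(), but additionally treats
 * CANCELLING as running, and does not count COLO as a running
 * migration.
 */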
bool migration_is_running(int state)
{
    switch (state) {
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
    case MIGRATION_STATUS_WAIT_UNPLUG:
    case MIGRATION_STATUS_CANCELLING:
        return true;

    default:
        return false;

    }
}

static void populate_time_info(MigrationInfo *info, MigrationState *s)
{
    info->has_status = true;
    info->has_setup_time = true;
    info->setup_time = s->setup_time;
    if (s->state == MIGRATION_STATUS_COMPLETED) {
        info->has_total_time = true;
        info->total_time = s->total_time;
        info->has_downtime = true;
        info->downtime = s->downtime;
    } else {
        info->has_total_time = true;
        info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
                           s->start_time;
        info->has_expected_downtime = true;
        info->expected_downtime = s->expected_downtime;
    }
}

static void populate_ram_info(MigrationInfo *info, MigrationState *s)
{
    info->has_ram = true;
    info->ram = g_malloc0(sizeof(*info->ram));
    info->ram->transferred = ram_counters.transferred;
    info->ram->total = ram_bytes_total();
    info->ram->duplicate = ram_counters.duplicate;
    /* legacy value.  It is not used anymore */
    info->ram->skipped = 0;
    info->ram->normal = ram_counters.normal;
    info->ram->normal_bytes = ram_counters.normal *
        qemu_target_page_size();
    info->ram->mbps = s->mbps;
    info->ram->dirty_sync_count = ram_counters.dirty_sync_count;
    info->ram->postcopy_requests = ram_counters.postcopy_requests;
    info->ram->page_size = qemu_target_page_size();
    info->ram->multifd_bytes = ram_counters.multifd_bytes;
    info->ram->pages_per_second = s->pages_per_second;

    if (migrate_use_xbzrle()) {
        info->has_xbzrle_cache = true;
        info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache));
        info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size();
        info->xbzrle_cache->bytes = xbzrle_counters.bytes;
        info->xbzrle_cache->pages = xbzrle_counters.pages;
        info->xbzrle_cache->cache_miss = xbzrle_counters.cache_miss;
        info->xbzrle_cache->cache_miss_rate = xbzrle_counters.cache_miss_rate;
        info->xbzrle_cache->encoding_rate = xbzrle_counters.encoding_rate;
        info->xbzrle_cache->overflow = xbzrle_counters.overflow;
    }

    if (migrate_use_compression()) {
        info->has_compression = true;
        info->compression = g_malloc0(sizeof(*info->compression));
        info->compression->pages = compression_counters.pages;
        info->compression->busy = compression_counters.busy;
        info->compression->busy_rate = compression_counters.busy_rate;
        info->compression->compressed_size =
                                    compression_counters.compressed_size;
        info->compression->compression_rate =
                                    compression_counters.compression_rate;
    }

    if (cpu_throttle_active()) {
        info->has_cpu_throttle_percentage = true;
        info->cpu_throttle_percentage = cpu_throttle_get_percentage();
    }

    if (s->state != MIGRATION_STATUS_COMPLETED) {
        info->ram->remaining = ram_bytes_remaining();
        info->ram->dirty_pages_rate = ram_counters.dirty_pages_rate;
    }
}

static void populate_disk_info(MigrationInfo *info)
{
    if (blk_mig_active()) {
        info->has_disk = true;
        info->disk = g_malloc0(sizeof(*info->disk));
        info->disk->transferred = blk_mig_bytes_transferred();
        info->disk->remaining = blk_mig_bytes_remaining();
        info->disk->total = blk_mig_bytes_total();
    }
}

static void fill_source_migration_info(MigrationInfo *info)
{
    MigrationState *s = migrate_get_current();
    GSList *cur_blocker = migration_blockers;

    info->blocked_reasons = NULL;

    /*
     * There are two kinds of reasons a migration might be blocked:
     * a) devices marked in VMState as non-migratable, and
     * b) explicit migration blockers.
     * We need to add both of them here.
     */
    qemu_savevm_non_migratable_list(&info->blocked_reasons);

    while (cur_blocker) {
        QAPI_LIST_PREPEND(info->blocked_reasons,
                          g_strdup(error_get_pretty(cur_blocker->data)));
        cur_blocker = g_slist_next(cur_blocker);
    }
    info->has_blocked_reasons = info->blocked_reasons != NULL;

    switch (s->state) {
    case MIGRATION_STATUS_NONE:
        /* no migration has happened ever */
        /* do not overwrite destination migration status */
        return;
    case MIGRATION_STATUS_SETUP:
        info->has_status = true;
        info->has_total_time = false;
        break;
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_CANCELLING:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
        /* TODO add some postcopy stats */
        populate_time_info(info, s);
        populate_ram_info(info, s);
        populate_disk_info(info);
        populate_vfio_info(info);
        break;
    case MIGRATION_STATUS_COLO:
        info->has_status = true;
        /* TODO: display COLO specific information (checkpoint info etc.) */
        break;
    case MIGRATION_STATUS_COMPLETED:
        populate_time_info(info, s);
        populate_ram_info(info, s);
        populate_vfio_info(info);
        break;
    case MIGRATION_STATUS_FAILED:
        info->has_status = true;
        if (s->error) {
            info->has_error_desc = true;
            info->error_desc = g_strdup(error_get_pretty(s->error));
        }
        break;
    case MIGRATION_STATUS_CANCELLED:
        info->has_status = true;
        break;
    case MIGRATION_STATUS_WAIT_UNPLUG:
        info->has_status = true;
        break;
    }
    info->status = s->state;
}

typedef enum WriteTrackingSupport {
    WT_SUPPORT_UNKNOWN = 0,
    WT_SUPPORT_ABSENT,
    WT_SUPPORT_AVAILABLE,
    WT_SUPPORT_COMPATIBLE
} WriteTrackingSupport;
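/*
 * The enumerators above are deliberately ordered by increasing level
 * of support, so callers can compare them with "<", as
 * migrate_caps_check() does below.
 */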

static
WriteTrackingSupport migrate_query_write_tracking(void)
{
    /* Check if kernel supports required UFFD features */
    if (!ram_write_tracking_available()) {
        return WT_SUPPORT_ABSENT;
    }
    /*
     * Check if current memory configuration is
     * compatible with required UFFD features.
     */
    if (!ram_write_tracking_compatible()) {
        return WT_SUPPORT_AVAILABLE;
    }

    return WT_SUPPORT_COMPATIBLE;
}

/**
 * @migration_caps_check - check capability validity
 *
 * @cap_list: old capability list, array of bool
 * @params: new capabilities to be applied soon
 * @errp: set *errp if the check failed, with reason
 *
 * Returns true if check passed, otherwise false.
 */
static bool migrate_caps_check(bool *cap_list,
                               MigrationCapabilityStatusList *params,
                               Error **errp)
{
    MigrationCapabilityStatusList *cap;
    bool old_postcopy_cap;
    MigrationIncomingState *mis = migration_incoming_get_current();

    old_postcopy_cap = cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM];

    for (cap = params; cap; cap = cap->next) {
        cap_list[cap->value->capability] = cap->value->state;
    }

#ifndef CONFIG_LIVE_BLOCK_MIGRATION
    if (cap_list[MIGRATION_CAPABILITY_BLOCK]) {
        error_setg(errp, "QEMU compiled without old-style (blk/-b, inc/-i) "
                   "block migration");
        error_append_hint(errp, "Use drive_mirror+NBD instead.\n");
        return false;
    }
#endif

#ifndef CONFIG_REPLICATION
    if (cap_list[MIGRATION_CAPABILITY_X_COLO]) {
        error_setg(errp, "QEMU compiled without replication module"
                   " can't enable COLO");
        error_append_hint(errp, "Please enable replication before COLO.\n");
        return false;
    }
#endif

    if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) {
        /* This check is reasonably expensive, so we only do it the
         * first time the capability is set; also, it's only the
         * destination that needs special support.
         */
        if (!old_postcopy_cap && runstate_check(RUN_STATE_INMIGRATE) &&
            !postcopy_ram_supported_by_host(mis)) {
            /* postcopy_ram_supported_by_host will have emitted a more
             * detailed message
             */
            error_setg(errp, "Postcopy is not supported");
            return false;
        }

        if (cap_list[MIGRATION_CAPABILITY_X_IGNORE_SHARED]) {
            error_setg(errp, "Postcopy is not compatible with ignore-shared");
            return false;
        }
    }

    if (cap_list[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT]) {
        WriteTrackingSupport wt_support;
        int idx;
        /*
         * Check if 'background-snapshot' capability is supported by
         * host kernel and compatible with guest memory configuration.
         */
        wt_support = migrate_query_write_tracking();
        if (wt_support < WT_SUPPORT_AVAILABLE) {
            error_setg(errp, "Background-snapshot is not supported by host kernel");
            return false;
        }
        if (wt_support < WT_SUPPORT_COMPATIBLE) {
            error_setg(errp, "Background-snapshot is not compatible "
                    "with guest memory configuration");
            return false;
        }

        /*
         * Check if there are any migration capabilities
         * incompatible with 'background-snapshot'.
         */
        for (idx = 0; idx < check_caps_background_snapshot.size; idx++) {
            int incomp_cap = check_caps_background_snapshot.caps[idx];
            if (cap_list[incomp_cap]) {
                error_setg(errp,
                        "Background-snapshot is not compatible with %s",
                        MigrationCapability_str(incomp_cap));
                return false;
            }
        }
    }

    return true;
}

static void fill_destination_migration_info(MigrationInfo *info)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (mis->socket_address_list) {
        info->has_socket_address = true;
        info->socket_address =
            QAPI_CLONE(SocketAddressList, mis->socket_address_list);
    }

    switch (mis->state) {
    case MIGRATION_STATUS_NONE:
        return;
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_CANCELLING:
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
    case MIGRATION_STATUS_FAILED:
    case MIGRATION_STATUS_COLO:
        info->has_status = true;
        break;
    case MIGRATION_STATUS_COMPLETED:
        info->has_status = true;
        fill_destination_postcopy_migration_info(info);
        break;
    }
    info->status = mis->state;
}

MigrationInfo *qmp_query_migrate(Error **errp)
{
    MigrationInfo *info = g_malloc0(sizeof(*info));

    fill_destination_migration_info(info);
    fill_source_migration_info(info);

    return info;
}

void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
                                  Error **errp)
{
    MigrationState *s = migrate_get_current();
    MigrationCapabilityStatusList *cap;
    bool cap_list[MIGRATION_CAPABILITY__MAX];

    if (migration_is_running(s->state)) {
        error_setg(errp, QERR_MIGRATION_ACTIVE);
        return;
    }

    memcpy(cap_list, s->enabled_capabilities, sizeof(cap_list));
    if (!migrate_caps_check(cap_list, params, errp)) {
        return;
    }

    for (cap = params; cap; cap = cap->next) {
        s->enabled_capabilities[cap->value->capability] = cap->value->state;
    }
}
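/*
 * Example QMP usage of the command above (illustrative):
 *   { "execute": "migrate-set-capabilities",
 *     "arguments": { "capabilities": [
 *       { "capability": "postcopy-ram", "state": true } ] } }
 */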

/*
 * Check whether the parameters are valid. Error will be put into errp
 * (if provided). Return true if valid, otherwise false.
 */
static bool migrate_params_check(MigrationParameters *params, Error **errp)
{
    if (params->has_compress_level &&
        (params->compress_level > 9)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level",
                   "a value between 0 and 9");
        return false;
    }

    if (params->has_compress_threads && (params->compress_threads < 1)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "compress_threads",
                   "a value between 1 and 255");
        return false;
    }

    if (params->has_decompress_threads && (params->decompress_threads < 1)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "decompress_threads",
                   "a value between 1 and 255");
        return false;
    }

    if (params->has_throttle_trigger_threshold &&
        (params->throttle_trigger_threshold < 1 ||
         params->throttle_trigger_threshold > 100)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "throttle_trigger_threshold",
                   "an integer in the range of 1 to 100");
        return false;
    }

    if (params->has_cpu_throttle_initial &&
        (params->cpu_throttle_initial < 1 ||
         params->cpu_throttle_initial > 99)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "cpu_throttle_initial",
                   "an integer in the range of 1 to 99");
        return false;
    }

    if (params->has_cpu_throttle_increment &&
        (params->cpu_throttle_increment < 1 ||
         params->cpu_throttle_increment > 99)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "cpu_throttle_increment",
                   "an integer in the range of 1 to 99");
        return false;
    }

    if (params->has_max_bandwidth && (params->max_bandwidth > SIZE_MAX)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "max_bandwidth",
                   "an integer in the range of 0 to "stringify(SIZE_MAX)
                   " bytes/second");
        return false;
    }

    if (params->has_downtime_limit &&
        (params->downtime_limit > MAX_MIGRATE_DOWNTIME)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "downtime_limit",
                   "an integer in the range of 0 to "
                    stringify(MAX_MIGRATE_DOWNTIME)" ms");
        return false;
    }

    /* x_checkpoint_delay is now always positive */

    if (params->has_multifd_channels && (params->multifd_channels < 1)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "multifd_channels",
                   "a value between 1 and 255");
        return false;
    }

    if (params->has_multifd_zlib_level &&
        (params->multifd_zlib_level > 9)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zlib_level",
                   "a value between 0 and 9");
        return false;
    }

    if (params->has_multifd_zstd_level &&
        (params->multifd_zstd_level > 20)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "multifd_zstd_level",
                   "a value between 0 and 20");
        return false;
    }

    if (params->has_xbzrle_cache_size &&
        (params->xbzrle_cache_size < qemu_target_page_size() ||
         !is_power_of_2(params->xbzrle_cache_size))) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "xbzrle_cache_size",
                   "a power of two no less than the target page size");
        return false;
    }

    if (params->has_max_cpu_throttle &&
        (params->max_cpu_throttle < params->cpu_throttle_initial ||
         params->max_cpu_throttle > 99)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "max_cpu_throttle",
                   "an integer in the range of cpu_throttle_initial to 99");
        return false;
    }

    if (params->has_announce_initial &&
        params->announce_initial > 100000) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "announce_initial",
                   "a value between 0 and 100000");
        return false;
    }
    if (params->has_announce_max &&
        params->announce_max > 100000) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "announce_max",
                   "a value between 0 and 100000");
        return false;
    }
    if (params->has_announce_rounds &&
        params->announce_rounds > 1000) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "announce_rounds",
                   "a value between 0 and 1000");
        return false;
    }
    if (params->has_announce_step &&
        (params->announce_step < 1 ||
        params->announce_step > 10000)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "announce_step",
                   "a value between 1 and 10000");
        return false;
    }
1460
1461    if (params->has_block_bitmap_mapping &&
1462        !check_dirty_bitmap_mig_alias_map(params->block_bitmap_mapping, errp)) {
1463        error_prepend(errp, "Invalid mapping given for block-bitmap-mapping: ");
1464        return false;
1465    }
1466
1467    return true;
1468}
1469
1470static void migrate_params_test_apply(MigrateSetParameters *params,
1471                                      MigrationParameters *dest)
1472{
1473    *dest = migrate_get_current()->parameters;
1474
1475    /* TODO use QAPI_CLONE() instead of duplicating it inline */
1476
1477    if (params->has_compress_level) {
1478        dest->compress_level = params->compress_level;
1479    }
1480
1481    if (params->has_compress_threads) {
1482        dest->compress_threads = params->compress_threads;
1483    }
1484
1485    if (params->has_compress_wait_thread) {
1486        dest->compress_wait_thread = params->compress_wait_thread;
1487    }
1488
1489    if (params->has_decompress_threads) {
1490        dest->decompress_threads = params->decompress_threads;
1491    }
1492
1493    if (params->has_throttle_trigger_threshold) {
1494        dest->throttle_trigger_threshold = params->throttle_trigger_threshold;
1495    }
1496
1497    if (params->has_cpu_throttle_initial) {
1498        dest->cpu_throttle_initial = params->cpu_throttle_initial;
1499    }
1500
1501    if (params->has_cpu_throttle_increment) {
1502        dest->cpu_throttle_increment = params->cpu_throttle_increment;
1503    }
1504
1505    if (params->has_cpu_throttle_tailslow) {
1506        dest->cpu_throttle_tailslow = params->cpu_throttle_tailslow;
1507    }
1508
1509    if (params->has_tls_creds) {
1510        assert(params->tls_creds->type == QTYPE_QSTRING);
1511        dest->tls_creds = params->tls_creds->u.s;
1512    }
1513
1514    if (params->has_tls_hostname) {
1515        assert(params->tls_hostname->type == QTYPE_QSTRING);
1516        dest->tls_hostname = params->tls_hostname->u.s;
1517    }
1518
1519    if (params->has_max_bandwidth) {
1520        dest->max_bandwidth = params->max_bandwidth;
1521    }
1522
1523    if (params->has_downtime_limit) {
1524        dest->downtime_limit = params->downtime_limit;
1525    }
1526
1527    if (params->has_x_checkpoint_delay) {
1528        dest->x_checkpoint_delay = params->x_checkpoint_delay;
1529    }
1530
1531    if (params->has_block_incremental) {
1532        dest->block_incremental = params->block_incremental;
1533    }
1534    if (params->has_multifd_channels) {
1535        dest->multifd_channels = params->multifd_channels;
1536    }
1537    if (params->has_multifd_compression) {
1538        dest->multifd_compression = params->multifd_compression;
1539    }
1540    if (params->has_xbzrle_cache_size) {
1541        dest->xbzrle_cache_size = params->xbzrle_cache_size;
1542    }
1543    if (params->has_max_postcopy_bandwidth) {
1544        dest->max_postcopy_bandwidth = params->max_postcopy_bandwidth;
1545    }
1546    if (params->has_max_cpu_throttle) {
1547        dest->max_cpu_throttle = params->max_cpu_throttle;
1548    }
1549    if (params->has_announce_initial) {
1550        dest->announce_initial = params->announce_initial;
1551    }
1552    if (params->has_announce_max) {
1553        dest->announce_max = params->announce_max;
1554    }
1555    if (params->has_announce_rounds) {
1556        dest->announce_rounds = params->announce_rounds;
1557    }
1558    if (params->has_announce_step) {
1559        dest->announce_step = params->announce_step;
1560    }
1561
1562    if (params->has_block_bitmap_mapping) {
1563        dest->has_block_bitmap_mapping = true;
1564        dest->block_bitmap_mapping = params->block_bitmap_mapping;
1565    }
1566}
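
    /*
     * Note that the scratch copy filled in above only borrows pointers from
     * 'params' (e.g. the TLS strings and block_bitmap_mapping are not
     * duplicated, unlike in migrate_params_apply() below); it exists purely
     * so migrate_params_check() can validate the merged result, and must
     * not outlive 'params'.
     */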
1567
1568static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
1569{
1570    MigrationState *s = migrate_get_current();
1571
1572    /* TODO use QAPI_CLONE() instead of duplicating it inline */
1573
1574    if (params->has_compress_level) {
1575        s->parameters.compress_level = params->compress_level;
1576    }
1577
1578    if (params->has_compress_threads) {
1579        s->parameters.compress_threads = params->compress_threads;
1580    }
1581
1582    if (params->has_compress_wait_thread) {
1583        s->parameters.compress_wait_thread = params->compress_wait_thread;
1584    }
1585
1586    if (params->has_decompress_threads) {
1587        s->parameters.decompress_threads = params->decompress_threads;
1588    }
1589
1590    if (params->has_throttle_trigger_threshold) {
1591        s->parameters.throttle_trigger_threshold = params->throttle_trigger_threshold;
1592    }
1593
1594    if (params->has_cpu_throttle_initial) {
1595        s->parameters.cpu_throttle_initial = params->cpu_throttle_initial;
1596    }
1597
1598    if (params->has_cpu_throttle_increment) {
1599        s->parameters.cpu_throttle_increment = params->cpu_throttle_increment;
1600    }
1601
1602    if (params->has_cpu_throttle_tailslow) {
1603        s->parameters.cpu_throttle_tailslow = params->cpu_throttle_tailslow;
1604    }
1605
1606    if (params->has_tls_creds) {
1607        g_free(s->parameters.tls_creds);
1608        assert(params->tls_creds->type == QTYPE_QSTRING);
1609        s->parameters.tls_creds = g_strdup(params->tls_creds->u.s);
1610    }
1611
1612    if (params->has_tls_hostname) {
1613        g_free(s->parameters.tls_hostname);
1614        assert(params->tls_hostname->type == QTYPE_QSTRING);
1615        s->parameters.tls_hostname = g_strdup(params->tls_hostname->u.s);
1616    }
1617
1618    if (params->has_tls_authz) {
1619        g_free(s->parameters.tls_authz);
1620        assert(params->tls_authz->type == QTYPE_QSTRING);
1621        s->parameters.tls_authz = g_strdup(params->tls_authz->u.s);
1622    }
1623
1624    if (params->has_max_bandwidth) {
1625        s->parameters.max_bandwidth = params->max_bandwidth;
1626        if (s->to_dst_file && !migration_in_postcopy()) {
1627            qemu_file_set_rate_limit(s->to_dst_file,
1628                                s->parameters.max_bandwidth / XFER_LIMIT_RATIO);
1629        }
1630    }
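
        /*
         * Worked example (illustrative numbers only): with max_bandwidth
         * set to 125000000 bytes/sec (~1 Gbit/s), the limit handed to
         * qemu_file_set_rate_limit() above is
         * 125000000 / XFER_LIMIT_RATIO = 12500000 bytes per 100ms window.
         */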
1631
1632    if (params->has_downtime_limit) {
1633        s->parameters.downtime_limit = params->downtime_limit;
1634    }
1635
1636    if (params->has_x_checkpoint_delay) {
1637        s->parameters.x_checkpoint_delay = params->x_checkpoint_delay;
1638        if (migration_in_colo_state()) {
1639            colo_checkpoint_notify(s);
1640        }
1641    }
1642
1643    if (params->has_block_incremental) {
1644        s->parameters.block_incremental = params->block_incremental;
1645    }
1646    if (params->has_multifd_channels) {
1647        s->parameters.multifd_channels = params->multifd_channels;
1648    }
1649    if (params->has_multifd_compression) {
1650        s->parameters.multifd_compression = params->multifd_compression;
1651    }
1652    if (params->has_xbzrle_cache_size) {
1653        s->parameters.xbzrle_cache_size = params->xbzrle_cache_size;
1654        xbzrle_cache_resize(params->xbzrle_cache_size, errp);
1655    }
1656    if (params->has_max_postcopy_bandwidth) {
1657        s->parameters.max_postcopy_bandwidth = params->max_postcopy_bandwidth;
1658        if (s->to_dst_file && migration_in_postcopy()) {
1659            qemu_file_set_rate_limit(s->to_dst_file,
1660                    s->parameters.max_postcopy_bandwidth / XFER_LIMIT_RATIO);
1661        }
1662    }
1663    if (params->has_max_cpu_throttle) {
1664        s->parameters.max_cpu_throttle = params->max_cpu_throttle;
1665    }
1666    if (params->has_announce_initial) {
1667        s->parameters.announce_initial = params->announce_initial;
1668    }
1669    if (params->has_announce_max) {
1670        s->parameters.announce_max = params->announce_max;
1671    }
1672    if (params->has_announce_rounds) {
1673        s->parameters.announce_rounds = params->announce_rounds;
1674    }
1675    if (params->has_announce_step) {
1676        s->parameters.announce_step = params->announce_step;
1677    }
1678
1679    if (params->has_block_bitmap_mapping) {
1680        qapi_free_BitmapMigrationNodeAliasList(
1681            s->parameters.block_bitmap_mapping);
1682
1683        s->parameters.has_block_bitmap_mapping = true;
1684        s->parameters.block_bitmap_mapping =
1685            QAPI_CLONE(BitmapMigrationNodeAliasList,
1686                       params->block_bitmap_mapping);
1687    }
1688}
1689
1690void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp)
1691{
1692    MigrationParameters tmp;
1693
1694    /* TODO Rewrite "" to null instead */
1695    if (params->has_tls_creds
1696        && params->tls_creds->type == QTYPE_QNULL) {
1697        qobject_unref(params->tls_creds->u.n);
1698        params->tls_creds->type = QTYPE_QSTRING;
1699        params->tls_creds->u.s = strdup("");
1700    }
1701    /* TODO Rewrite "" to null instead */
1702    if (params->has_tls_hostname
1703        && params->tls_hostname->type == QTYPE_QNULL) {
1704        qobject_unref(params->tls_hostname->u.n);
1705        params->tls_hostname->type = QTYPE_QSTRING;
1706        params->tls_hostname->u.s = strdup("");
1707    }
1708
1709    migrate_params_test_apply(params, &tmp);
1710
1711    if (!migrate_params_check(&tmp, errp)) {
1712        /* Invalid parameter */
1713        return;
1714    }
1715
1716    migrate_params_apply(params, errp);
1717}
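
    /*
     * Illustrative QMP usage (not part of this file): a client could set
     * parameters with something like:
     *
     *   { "execute": "migrate-set-parameters",
     *     "arguments": { "max-bandwidth": 33554432, "downtime-limit": 500 } }
     *
     * The new values are first merged into a scratch copy by
     * migrate_params_test_apply() and validated by migrate_params_check();
     * only if validation passes does migrate_params_apply() touch the real
     * MigrationState, so a rejected request leaves the old settings intact.
     */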
1718
1719
1720void qmp_migrate_start_postcopy(Error **errp)
1721{
1722    MigrationState *s = migrate_get_current();
1723
1724    if (!migrate_postcopy()) {
1725        error_setg(errp, "Enable postcopy with migrate_set_capability before"
1726                         " the start of migration");
1727        return;
1728    }
1729
1730    if (s->state == MIGRATION_STATUS_NONE) {
1731        error_setg(errp, "Postcopy must be started after migration has been"
1732                         " started");
1733        return;
1734    }
1735    /*
1736     * We don't error if migration has finished, since that would be racy
1737     * with issuing this command.
1738     */
1739    qatomic_set(&s->start_postcopy, true);
1740}
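
    /*
     * Illustrative QMP usage: after starting a migration with the
     * postcopy-ram capability enabled, a client switches to postcopy with:
     *
     *   { "execute": "migrate-start-postcopy" }
     *
     * start_postcopy is only consumed by the migration thread, which is
     * why an atomic set (rather than a lock) is sufficient here.
     */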
1741
1742/* shared migration helpers */
1743
1744void migrate_set_state(int *state, int old_state, int new_state)
1745{
1746    assert(new_state < MIGRATION_STATUS__MAX);
1747    if (qatomic_cmpxchg(state, old_state, new_state) == old_state) {
1748        trace_migrate_set_state(MigrationStatus_str(new_state));
1749        migrate_generate_event(new_state);
1750    }
1751}
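
    /*
     * The compare-and-swap above means the transition only happens if the
     * state still equals old_state, so racing updaters cannot clobber a
     * newer state.  For example (hypothetical interleaving): if a cancel
     * has already moved the state to CANCELLING, a stale
     * migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
     * MIGRATION_STATUS_COMPLETED) becomes a no-op instead of resurrecting
     * the migration.
     */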
1752
1753static MigrationCapabilityStatus *migrate_cap_add(MigrationCapability index,
1754                                                  bool state)
1755{
1756    MigrationCapabilityStatus *cap;
1757
1758    cap = g_new0(MigrationCapabilityStatus, 1);
1759    cap->capability = index;
1760    cap->state = state;
1761
1762    return cap;
1763}
1764
1765void migrate_set_block_enabled(bool value, Error **errp)
1766{
1767    MigrationCapabilityStatusList *cap = NULL;
1768
1769    QAPI_LIST_PREPEND(cap, migrate_cap_add(MIGRATION_CAPABILITY_BLOCK, value));
1770    qmp_migrate_set_capabilities(cap, errp);
1771    qapi_free_MigrationCapabilityStatusList(cap);
1772}
1773
1774static void migrate_set_block_incremental(MigrationState *s, bool value)
1775{
1776    s->parameters.block_incremental = value;
1777}
1778
1779static void block_cleanup_parameters(MigrationState *s)
1780{
1781    if (s->must_remove_block_options) {
1782        /* setting to false can never fail */
1783        migrate_set_block_enabled(false, &error_abort);
1784        migrate_set_block_incremental(s, false);
1785        s->must_remove_block_options = false;
1786    }
1787}
1788
1789static void migrate_fd_cleanup(MigrationState *s)
1790{
1791    qemu_bh_delete(s->cleanup_bh);
1792    s->cleanup_bh = NULL;
1793
1794    qemu_savevm_state_cleanup();
1795
1796    if (s->to_dst_file) {
1797        QEMUFile *tmp;
1798
1799        trace_migrate_fd_cleanup();
1800        qemu_mutex_unlock_iothread();
1801        if (s->migration_thread_running) {
1802            qemu_thread_join(&s->thread);
1803            s->migration_thread_running = false;
1804        }
1805        qemu_mutex_lock_iothread();
1806
1807        multifd_save_cleanup();
1808        qemu_mutex_lock(&s->qemu_file_lock);
1809        tmp = s->to_dst_file;
1810        s->to_dst_file = NULL;
1811        qemu_mutex_unlock(&s->qemu_file_lock);
1812        /*
1813         * Close the file handle without the lock to make sure the
1814         * critical section won't block for long.
1815         */
1816        migration_ioc_unregister_yank_from_file(tmp);
1817        qemu_fclose(tmp);
1818    }
1819
1820    assert(!migration_is_active(s));
1821
1822    if (s->state == MIGRATION_STATUS_CANCELLING) {
1823        migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING,
1824                          MIGRATION_STATUS_CANCELLED);
1825    }
1826
1827    if (s->error) {
1828        /* It is still used by 'info migrate', so we can't free it */
1829        error_report_err(error_copy(s->error));
1830    }
1831    notifier_list_notify(&migration_state_notifiers, s);
1832    block_cleanup_parameters(s);
1833    yank_unregister_instance(MIGRATION_YANK_INSTANCE);
1834}
1835
1836static void migrate_fd_cleanup_schedule(MigrationState *s)
1837{
1838    /*
1839     * Take a ref on the state for the bottom half, because it may run
1840     * when there are already no other refs
1841     */
1842    object_ref(OBJECT(s));
1843    qemu_bh_schedule(s->cleanup_bh);
1844}
1845
1846static void migrate_fd_cleanup_bh(void *opaque)
1847{
1848    MigrationState *s = opaque;
1849    migrate_fd_cleanup(s);
1850    object_unref(OBJECT(s));
1851}
1852
1853void migrate_set_error(MigrationState *s, const Error *error)
1854{
1855    QEMU_LOCK_GUARD(&s->error_mutex);
1856    if (!s->error) {
1857        s->error = error_copy(error);
1858    }
1859}
1860
1861static void migrate_error_free(MigrationState *s)
1862{
1863    QEMU_LOCK_GUARD(&s->error_mutex);
1864    if (s->error) {
1865        error_free(s->error);
1866        s->error = NULL;
1867    }
1868}
1869
1870void migrate_fd_error(MigrationState *s, const Error *error)
1871{
1872    trace_migrate_fd_error(error_get_pretty(error));
1873    assert(s->to_dst_file == NULL);
1874    migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
1875                      MIGRATION_STATUS_FAILED);
1876    migrate_set_error(s, error);
1877}
1878
1879static void migrate_fd_cancel(MigrationState *s)
1880{
1881    int old_state;
1882    QEMUFile *f = migrate_get_current()->to_dst_file;
1883    trace_migrate_fd_cancel();
1884
1885    WITH_QEMU_LOCK_GUARD(&s->qemu_file_lock) {
1886        if (s->rp_state.from_dst_file) {
1887            /* Shut down the rp socket, causing the rp thread to shut down */
1888            qemu_file_shutdown(s->rp_state.from_dst_file);
1889        }
1890    }
1891
1892    do {
1893        old_state = s->state;
1894        if (!migration_is_running(old_state)) {
1895            break;
1896        }
1897        /* If the migration is paused, kick it out of the pause */
1898        if (old_state == MIGRATION_STATUS_PRE_SWITCHOVER) {
1899            qemu_sem_post(&s->pause_sem);
1900        }
1901        migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING);
1902    } while (s->state != MIGRATION_STATUS_CANCELLING);
1903
1904    /*
1905     * If we're unlucky the migration code might be stuck somewhere in a
1906     * send/write while the network has failed and is waiting to time out;
1907     * if we've got shutdown(2) available then we can force it to quit.
1908     * The outgoing qemu file gets closed in migrate_fd_cleanup(), which is
1909     * called in a bh, so there is no race against this cancel.
1910     */
1911    if (s->state == MIGRATION_STATUS_CANCELLING && f) {
1912        qemu_file_shutdown(f);
1913    }
1914    if (s->state == MIGRATION_STATUS_CANCELLING && s->block_inactive) {
1915        Error *local_err = NULL;
1916
1917        bdrv_invalidate_cache_all(&local_err);
1918        if (local_err) {
1919            error_report_err(local_err);
1920        } else {
1921            s->block_inactive = false;
1922        }
1923    }
1924}
1925
1926void add_migration_state_change_notifier(Notifier *notify)
1927{
1928    notifier_list_add(&migration_state_notifiers, notify);
1929}
1930
1931void remove_migration_state_change_notifier(Notifier *notify)
1932{
1933    notifier_remove(notify);
1934}
1935
1936bool migration_in_setup(MigrationState *s)
1937{
1938    return s->state == MIGRATION_STATUS_SETUP;
1939}
1940
1941bool migration_has_finished(MigrationState *s)
1942{
1943    return s->state == MIGRATION_STATUS_COMPLETED;
1944}
1945
1946bool migration_has_failed(MigrationState *s)
1947{
1948    return (s->state == MIGRATION_STATUS_CANCELLED ||
1949            s->state == MIGRATION_STATUS_FAILED);
1950}
1951
1952bool migration_in_postcopy(void)
1953{
1954    MigrationState *s = migrate_get_current();
1955
1956    switch (s->state) {
1957    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
1958    case MIGRATION_STATUS_POSTCOPY_PAUSED:
1959    case MIGRATION_STATUS_POSTCOPY_RECOVER:
1960        return true;
1961    default:
1962        return false;
1963    }
1964}
1965
1966bool migration_in_postcopy_after_devices(MigrationState *s)
1967{
1968    return migration_in_postcopy() && s->postcopy_after_devices;
1969}
1970
1971bool migration_in_incoming_postcopy(void)
1972{
1973    PostcopyState ps = postcopy_state_get();
1974
1975    return ps >= POSTCOPY_INCOMING_DISCARD && ps < POSTCOPY_INCOMING_END;
1976}
1977
1978bool migration_in_bg_snapshot(void)
1979{
1980    MigrationState *s = migrate_get_current();
1981
1982    return migrate_background_snapshot() &&
1983            migration_is_setup_or_active(s->state);
1984}
1985
1986bool migration_is_idle(void)
1987{
1988    MigrationState *s = current_migration;
1989
1990    if (!s) {
1991        return true;
1992    }
1993
1994    switch (s->state) {
1995    case MIGRATION_STATUS_NONE:
1996    case MIGRATION_STATUS_CANCELLED:
1997    case MIGRATION_STATUS_COMPLETED:
1998    case MIGRATION_STATUS_FAILED:
1999        return true;
2000    case MIGRATION_STATUS_SETUP:
2001    case MIGRATION_STATUS_CANCELLING:
2002    case MIGRATION_STATUS_ACTIVE:
2003    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
2004    case MIGRATION_STATUS_COLO:
2005    case MIGRATION_STATUS_PRE_SWITCHOVER:
2006    case MIGRATION_STATUS_DEVICE:
2007    case MIGRATION_STATUS_WAIT_UNPLUG:
2008        return false;
2009    case MIGRATION_STATUS__MAX:
2010        g_assert_not_reached();
2011    }
2012
2013    return false;
2014}
2015
2016bool migration_is_active(MigrationState *s)
2017{
2018    return (s->state == MIGRATION_STATUS_ACTIVE ||
2019            s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
2020}
2021
2022void migrate_init(MigrationState *s)
2023{
2024    /*
2025     * Reinitialise all migration state, except
2026     * parameters/capabilities that the user set, and
2027     * locks.
2028     */
2029    s->cleanup_bh = 0;
2030    s->vm_start_bh = 0;
2031    s->to_dst_file = NULL;
2032    s->state = MIGRATION_STATUS_NONE;
2033    s->rp_state.from_dst_file = NULL;
2034    s->rp_state.error = false;
2035    s->mbps = 0.0;
2036    s->pages_per_second = 0.0;
2037    s->downtime = 0;
2038    s->expected_downtime = 0;
2039    s->setup_time = 0;
2040    s->start_postcopy = false;
2041    s->postcopy_after_devices = false;
2042    s->migration_thread_running = false;
2043    error_free(s->error);
2044    s->error = NULL;
2045    s->hostname = NULL;
2046
2047    migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);
2048
2049    s->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
2050    s->total_time = 0;
2051    s->vm_was_running = false;
2052    s->iteration_initial_bytes = 0;
2053    s->threshold_size = 0;
2054}
2055
2056int migrate_add_blocker(Error *reason, Error **errp)
2057{
2058    if (only_migratable) {
2059        error_propagate_prepend(errp, error_copy(reason),
2060                                "disallowing migration blocker "
2061                                "(--only-migratable) for: ");
2062        return -EACCES;
2063    }
2064
2065    if (migration_is_idle()) {
2066        migration_blockers = g_slist_prepend(migration_blockers, reason);
2067        return 0;
2068    }
2069
2070    error_propagate_prepend(errp, error_copy(reason),
2071                            "disallowing migration blocker "
2072                            "(migration in progress) for: ");
2073    return -EBUSY;
2074}
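
    /*
     * Sketch of typical usage by a device that cannot be migrated (the
     * device name here is hypothetical):
     *
     *   Error *blocker = NULL;
     *   error_setg(&blocker, "Device 'foo' does not support migration");
     *   if (migrate_add_blocker(blocker, errp) < 0) {
     *       error_free(blocker);
     *       blocker = NULL;
     *   }
     *
     * with a matching migrate_del_blocker(blocker) plus error_free() once
     * the blocking condition goes away.
     */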
2075
2076void migrate_del_blocker(Error *reason)
2077{
2078    migration_blockers = g_slist_remove(migration_blockers, reason);
2079}
2080
2081void qmp_migrate_incoming(const char *uri, Error **errp)
2082{
2083    Error *local_err = NULL;
2084    static bool once = true;
2085
2086    if (!once) {
2087        error_setg(errp, "The incoming migration has already been started");
2088        return;
2089    }
2090    if (!runstate_check(RUN_STATE_INMIGRATE)) {
2091        error_setg(errp, "'-incoming' was not specified on the command line");
2092        return;
2093    }
2094
2095    if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
2096        return;
2097    }
2098
2099    qemu_start_incoming_migration(uri, &local_err);
2100
2101    if (local_err) {
2102        yank_unregister_instance(MIGRATION_YANK_INSTANCE);
2103        error_propagate(errp, local_err);
2104        return;
2105    }
2106
2107    once = false;
2108}
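
    /*
     * Illustrative usage: start the destination with "-incoming defer" and
     * then issue:
     *
     *   { "execute": "migrate-incoming", "arguments": { "uri": "tcp:0:4444" } }
     *
     * The 'once' latch above makes a second migrate-incoming fail instead
     * of trying to rebind the listening channel.
     */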
2109
2110void qmp_migrate_recover(const char *uri, Error **errp)
2111{
2112    MigrationIncomingState *mis = migration_incoming_get_current();
2113
2114    /*
2115     * Don't even bother to use ERRP_GUARD() as it _must_ always be set by
2116     * callers (no one should ignore a recover failure); if one does, it's a
2117     * programming error.
2118     */
2119    assert(errp);
2120
2121    if (mis->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
2122        error_setg(errp, "Migrate recover can only be run "
2123                   "when postcopy is paused");
2124        return;
2125    }
2126
2127    if (qatomic_cmpxchg(&mis->postcopy_recover_triggered,
2128                       false, true) == true) {
2129        error_setg(errp, "Migrate recovery is triggered already");
2130        return;
2131    }
2132
2133    /*
2134     * Note that this call will never start a real migration; it will
2135     * only re-establish the migration stream and poke the existing
2136     * migration to continue using the newly established channel.
2137     */
2138    qemu_start_incoming_migration(uri, errp);
2139
2140    /* Safe to dereference with the assert above */
2141    if (*errp) {
2142        /* Reset the flag so the user can retry */
2143        qatomic_set(&mis->postcopy_recover_triggered, false);
2144    }
2145}
2146
2147void qmp_migrate_pause(Error **errp)
2148{
2149    MigrationState *ms = migrate_get_current();
2150    MigrationIncomingState *mis = migration_incoming_get_current();
2151    int ret;
2152
2153    if (ms->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2154        /* Source side, during postcopy */
2155        qemu_mutex_lock(&ms->qemu_file_lock);
2156        ret = qemu_file_shutdown(ms->to_dst_file);
2157        qemu_mutex_unlock(&ms->qemu_file_lock);
2158        if (ret) {
2159            error_setg(errp, "Failed to pause source migration");
2160        }
2161        return;
2162    }
2163
2164    if (mis->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2165        ret = qemu_file_shutdown(mis->from_src_file);
2166        if (ret) {
2167            error_setg(errp, "Failed to pause destination migration");
2168        }
2169        return;
2170    }
2171
2172    error_setg(errp, "migrate-pause is currently only supported "
2173               "during postcopy-active state");
2174}
2175
2176bool migration_is_blocked(Error **errp)
2177{
2178    if (qemu_savevm_state_blocked(errp)) {
2179        return true;
2180    }
2181
2182    if (migration_blockers) {
2183        error_propagate(errp, error_copy(migration_blockers->data));
2184        return true;
2185    }
2186
2187    return false;
2188}
2189
2190/* Returns true to continue migrating, or false if an error was detected */
2191static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc,
2192                            bool resume, Error **errp)
2193{
2194    Error *local_err = NULL;
2195
2196    if (resume) {
2197        if (s->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
2198            error_setg(errp, "Cannot resume if there is no "
2199                       "paused migration");
2200            return false;
2201        }
2202
2203        /*
2204         * Postcopy recovery won't work well with the release-ram
2205         * capability, since release-ram drops each page buffer as
2206         * soon as the page is put into the send buffer.  So if a
2207         * network failure happens, any page buffers that have not
2208         * yet reached the destination VM but have already been sent
2209         * from the source VM will be lost forever.  Refuse to let
2210         * the client resume such a postcopy migration.  Luckily
2211         * release-ram was designed to only be used when the source
2212         * and destination VMs are on the same host, so it should be
2213         * fine.
2214         */
2215        if (migrate_release_ram()) {
2216            error_setg(errp, "Postcopy recovery cannot work "
2217                       "when release-ram capability is set");
2218            return false;
2219        }
2220
2221        /* This is a resume, skip init status */
2222        return true;
2223    }
2224
2225    if (migration_is_running(s->state)) {
2226        error_setg(errp, QERR_MIGRATION_ACTIVE);
2227        return false;
2228    }
2229
2230    if (runstate_check(RUN_STATE_INMIGRATE)) {
2231        error_setg(errp, "Guest is waiting for an incoming migration");
2232        return false;
2233    }
2234
2235    if (runstate_check(RUN_STATE_POSTMIGRATE)) {
2236        error_setg(errp, "Can't migrate a VM that was paused due to "
2237                   "a previous migration");
2238        return false;
2239    }
2240
2241    if (migration_is_blocked(errp)) {
2242        return false;
2243    }
2244
2245    if (blk || blk_inc) {
2246        if (migrate_colo_enabled()) {
2247            error_setg(errp, "No disk migration is required in COLO mode");
2248            return false;
2249        }
2250        if (migrate_use_block() || migrate_use_block_incremental()) {
2251            error_setg(errp, "Command options are incompatible with "
2252                       "current migration capabilities");
2253            return false;
2254        }
2255        migrate_set_block_enabled(true, &local_err);
2256        if (local_err) {
2257            error_propagate(errp, local_err);
2258            return false;
2259        }
2260        s->must_remove_block_options = true;
2261    }
2262
2263    if (blk_inc) {
2264        migrate_set_block_incremental(s, true);
2265    }
2266
2267    migrate_init(s);
2268    /*
2269     * Zero the ram_counters memory for the
2270     * new migration
2271     */
2272    memset(&ram_counters, 0, sizeof(ram_counters));
2273
2274    return true;
2275}
2276
2277void qmp_migrate(const char *uri, bool has_blk, bool blk,
2278                 bool has_inc, bool inc, bool has_detach, bool detach,
2279                 bool has_resume, bool resume, Error **errp)
2280{
2281    Error *local_err = NULL;
2282    MigrationState *s = migrate_get_current();
2283    const char *p = NULL;
2284
2285    if (!migrate_prepare(s, has_blk && blk, has_inc && inc,
2286                         has_resume && resume, errp)) {
2287        /* Error detected, put into errp */
2288        return;
2289    }
2290
2291    if (!(has_resume && resume)) {
2292        if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
2293            return;
2294        }
2295    }
2296
2297    if (strstart(uri, "tcp:", &p) ||
2298        strstart(uri, "unix:", NULL) ||
2299        strstart(uri, "vsock:", NULL)) {
2300        socket_start_outgoing_migration(s, p ? p : uri, &local_err);
2301#ifdef CONFIG_RDMA
2302    } else if (strstart(uri, "rdma:", &p)) {
2303        rdma_start_outgoing_migration(s, p, &local_err);
2304#endif
2305    } else if (strstart(uri, "exec:", &p)) {
2306        exec_start_outgoing_migration(s, p, &local_err);
2307    } else if (strstart(uri, "fd:", &p)) {
2308        fd_start_outgoing_migration(s, p, &local_err);
2309    } else {
2310        if (!(has_resume && resume)) {
2311            yank_unregister_instance(MIGRATION_YANK_INSTANCE);
2312        }
2313        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri",
2314                   "a valid migration protocol");
2315        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
2316                          MIGRATION_STATUS_FAILED);
2317        block_cleanup_parameters(s);
2318        return;
2319    }
2320
2321    if (local_err) {
2322        if (!(has_resume && resume)) {
2323            yank_unregister_instance(MIGRATION_YANK_INSTANCE);
2324        }
2325        migrate_fd_error(s, local_err);
2326        error_propagate(errp, local_err);
2327        return;
2328    }
2329}
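
    /*
     * Examples of URIs accepted above (illustrative values):
     *
     *   tcp:dst.example.com:4444
     *   unix:/tmp/migrate.sock
     *   vsock:3:4444
     *   rdma:192.168.0.2:4444   (only when built with CONFIG_RDMA)
     *   exec:nc dst.example.com 4444
     *   fd:migfd                (a file descriptor previously added via getfd)
     *
     * Note the asymmetry in the parsing: for "tcp:", "rdma:", "exec:" and
     * "fd:" the prefix is stripped (p points past it), while "unix:" and
     * "vsock:" URIs are handed to socket_start_outgoing_migration() whole.
     */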
2330
2331void qmp_migrate_cancel(Error **errp)
2332{
2333    migration_cancel();
2334}
2335
2336void qmp_migrate_continue(MigrationStatus state, Error **errp)
2337{
2338    MigrationState *s = migrate_get_current();
2339    if (s->state != state) {
2340        error_setg(errp,  "Migration not in expected state: %s",
2341                   MigrationStatus_str(s->state));
2342        return;
2343    }
2344    qemu_sem_post(&s->pause_sem);
2345}
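
    /*
     * Illustrative QMP usage, paired with the pause-before-switchover
     * capability: once the migration reports the pre-switchover state, a
     * client releases it with:
     *
     *   { "execute": "migrate-continue",
     *     "arguments": { "state": "pre-switchover" } }
     */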
2346
2347bool migrate_release_ram(void)
2348{
2349    MigrationState *s;
2350
2351    s = migrate_get_current();
2352
2353    return s->enabled_capabilities[MIGRATION_CAPABILITY_RELEASE_RAM];
2354}
2355
2356bool migrate_postcopy_ram(void)
2357{
2358    MigrationState *s;
2359
2360    s = migrate_get_current();
2361
2362    return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM];
2363}
2364
2365bool migrate_postcopy(void)
2366{
2367    return migrate_postcopy_ram() || migrate_dirty_bitmaps();
2368}
2369
2370bool migrate_auto_converge(void)
2371{
2372    MigrationState *s;
2373
2374    s = migrate_get_current();
2375
2376    return s->enabled_capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE];
2377}
2378
2379bool migrate_zero_blocks(void)
2380{
2381    MigrationState *s;
2382
2383    s = migrate_get_current();
2384
2385    return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS];
2386}
2387
2388bool migrate_postcopy_blocktime(void)
2389{
2390    MigrationState *s;
2391
2392    s = migrate_get_current();
2393
2394    return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME];
2395}
2396
2397bool migrate_use_compression(void)
2398{
2399    MigrationState *s;
2400
2401    s = migrate_get_current();
2402
2403    return s->enabled_capabilities[MIGRATION_CAPABILITY_COMPRESS];
2404}
2405
2406int migrate_compress_level(void)
2407{
2408    MigrationState *s;
2409
2410    s = migrate_get_current();
2411
2412    return s->parameters.compress_level;
2413}
2414
2415int migrate_compress_threads(void)
2416{
2417    MigrationState *s;
2418
2419    s = migrate_get_current();
2420
2421    return s->parameters.compress_threads;
2422}
2423
2424int migrate_compress_wait_thread(void)
2425{
2426    MigrationState *s;
2427
2428    s = migrate_get_current();
2429
2430    return s->parameters.compress_wait_thread;
2431}
2432
2433int migrate_decompress_threads(void)
2434{
2435    MigrationState *s;
2436
2437    s = migrate_get_current();
2438
2439    return s->parameters.decompress_threads;
2440}
2441
2442bool migrate_dirty_bitmaps(void)
2443{
2444    MigrationState *s;
2445
2446    s = migrate_get_current();
2447
2448    return s->enabled_capabilities[MIGRATION_CAPABILITY_DIRTY_BITMAPS];
2449}
2450
2451bool migrate_ignore_shared(void)
2452{
2453    MigrationState *s;
2454
2455    s = migrate_get_current();
2456
2457    return s->enabled_capabilities[MIGRATION_CAPABILITY_X_IGNORE_SHARED];
2458}
2459
2460bool migrate_validate_uuid(void)
2461{
2462    MigrationState *s;
2463
2464    s = migrate_get_current();
2465
2466    return s->enabled_capabilities[MIGRATION_CAPABILITY_VALIDATE_UUID];
2467}
2468
2469bool migrate_use_events(void)
2470{
2471    MigrationState *s;
2472
2473    s = migrate_get_current();
2474
2475    return s->enabled_capabilities[MIGRATION_CAPABILITY_EVENTS];
2476}
2477
2478bool migrate_use_multifd(void)
2479{
2480    MigrationState *s;
2481
2482    s = migrate_get_current();
2483
2484    return s->enabled_capabilities[MIGRATION_CAPABILITY_MULTIFD];
2485}
2486
2487bool migrate_pause_before_switchover(void)
2488{
2489    MigrationState *s;
2490
2491    s = migrate_get_current();
2492
2493    return s->enabled_capabilities[
2494        MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER];
2495}
2496
2497int migrate_multifd_channels(void)
2498{
2499    MigrationState *s;
2500
2501    s = migrate_get_current();
2502
2503    return s->parameters.multifd_channels;
2504}
2505
2506MultiFDCompression migrate_multifd_compression(void)
2507{
2508    MigrationState *s;
2509
2510    s = migrate_get_current();
2511
2512    return s->parameters.multifd_compression;
2513}
2514
2515int migrate_multifd_zlib_level(void)
2516{
2517    MigrationState *s;
2518
2519    s = migrate_get_current();
2520
2521    return s->parameters.multifd_zlib_level;
2522}
2523
2524int migrate_multifd_zstd_level(void)
2525{
2526    MigrationState *s;
2527
2528    s = migrate_get_current();
2529
2530    return s->parameters.multifd_zstd_level;
2531}
2532
2533int migrate_use_xbzrle(void)
2534{
2535    MigrationState *s;
2536
2537    s = migrate_get_current();
2538
2539    return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE];
2540}
2541
2542uint64_t migrate_xbzrle_cache_size(void)
2543{
2544    MigrationState *s;
2545
2546    s = migrate_get_current();
2547
2548    return s->parameters.xbzrle_cache_size;
2549}
2550
2551static int64_t migrate_max_postcopy_bandwidth(void)
2552{
2553    MigrationState *s;
2554
2555    s = migrate_get_current();
2556
2557    return s->parameters.max_postcopy_bandwidth;
2558}
2559
2560bool migrate_use_block(void)
2561{
2562    MigrationState *s;
2563
2564    s = migrate_get_current();
2565
2566    return s->enabled_capabilities[MIGRATION_CAPABILITY_BLOCK];
2567}
2568
2569bool migrate_use_return_path(void)
2570{
2571    MigrationState *s;
2572
2573    s = migrate_get_current();
2574
2575    return s->enabled_capabilities[MIGRATION_CAPABILITY_RETURN_PATH];
2576}
2577
2578bool migrate_use_block_incremental(void)
2579{
2580    MigrationState *s;
2581
2582    s = migrate_get_current();
2583
2584    return s->parameters.block_incremental;
2585}
2586
2587bool migrate_background_snapshot(void)
2588{
2589    MigrationState *s;
2590
2591    s = migrate_get_current();
2592
2593    return s->enabled_capabilities[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT];
2594}
2595
2596/* migration thread support */
2597/*
2598 * Something bad happened to the RP stream; mark an error.
2599 * The caller shall print or trace something to indicate why.
2600 */
2601static void mark_source_rp_bad(MigrationState *s)
2602{
2603    s->rp_state.error = true;
2604}
2605
2606static struct rp_cmd_args {
2607    ssize_t     len; /* -1 = variable */
2608    const char *name;
2609} rp_cmd_args[] = {
2610    [MIG_RP_MSG_INVALID]        = { .len = -1, .name = "INVALID" },
2611    [MIG_RP_MSG_SHUT]           = { .len =  4, .name = "SHUT" },
2612    [MIG_RP_MSG_PONG]           = { .len =  4, .name = "PONG" },
2613    [MIG_RP_MSG_REQ_PAGES]      = { .len = 12, .name = "REQ_PAGES" },
2614    [MIG_RP_MSG_REQ_PAGES_ID]   = { .len = -1, .name = "REQ_PAGES_ID" },
2615    [MIG_RP_MSG_RECV_BITMAP]    = { .len = -1, .name = "RECV_BITMAP" },
2616    [MIG_RP_MSG_RESUME_ACK]     = { .len =  4, .name = "RESUME_ACK" },
2617    [MIG_RP_MSG_MAX]            = { .len = -1, .name = "MAX" },
2618};
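
    /*
     * Wire format, as read by source_return_path_thread() below: each
     * return-path message is a 16-bit big-endian type, then a 16-bit
     * big-endian length, then 'len' bytes of payload.  For example a
     * REQ_PAGES payload is 12 bytes: a 64-bit start address followed by a
     * 32-bit length, both big-endian, decoded with ldq_be_p()/ldl_be_p().
     */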
2619
2620/*
2621 * Process a request for pages received on the return path.
2622 * We're allowed to send more than requested (e.g. to round to our page size)
2623 * and we don't need to send pages that have already been sent.
2624 */
2625static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
2626                                       ram_addr_t start, size_t len)
2627{
2628    long our_host_ps = qemu_real_host_page_size;
2629
2630    trace_migrate_handle_rp_req_pages(rbname, start, len);
2631
2632    /*
2633     * Since we currently insist on matching page sizes, just sanity check
2634     * we're being asked for whole host pages.
2635     */
2636    if (start & (our_host_ps - 1) ||
2637       (len & (our_host_ps - 1))) {
2638        error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
2639                     " len: %zd", __func__, start, len);
2640        mark_source_rp_bad(ms);
2641        return;
2642    }
2643
2644    if (ram_save_queue_pages(rbname, start, len)) {
2645        mark_source_rp_bad(ms);
2646    }
2647}
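
    /*
     * The alignment test above relies on the host page size being a power
     * of two: with 4 KiB pages, (start | len) & 0xfff must be zero.  E.g.
     * a (hypothetical) request of start=0x10000 len=0x2000 passes, while
     * start=0x10800 would be rejected as misaligned.
     */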
2648
2649/* Return true to retry, false to quit */
2650static bool postcopy_pause_return_path_thread(MigrationState *s)
2651{
2652    trace_postcopy_pause_return_path();
2653
2654    qemu_sem_wait(&s->postcopy_pause_rp_sem);
2655
2656    trace_postcopy_pause_return_path_continued();
2657
2658    return true;
2659}
2660
2661static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name)
2662{
2663    RAMBlock *block = qemu_ram_block_by_name(block_name);
2664
2665    if (!block) {
2666        error_report("%s: invalid block name '%s'", __func__, block_name);
2667        return -EINVAL;
2668    }
2669
2670    /* Fetch the received bitmap and refresh the dirty bitmap */
2671    return ram_dirty_bitmap_reload(s, block);
2672}
2673
2674static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value)
2675{
2676    trace_source_return_path_thread_resume_ack(value);
2677
2678    if (value != MIGRATION_RESUME_ACK_VALUE) {
2679        error_report("%s: illegal resume_ack value %"PRIu32,
2680                     __func__, value);
2681        return -1;
2682    }
2683
2684    /* Now both sides are active. */
2685    migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
2686                      MIGRATION_STATUS_POSTCOPY_ACTIVE);
2687
2688    /* Notify the send thread that it's time to continue sending pages */
2689    qemu_sem_post(&s->rp_state.rp_sem);
2690
2691    return 0;
2692}
2693
2694/* Release ms->rp_state.from_dst_file in a safe way */
2695static void migration_release_from_dst_file(MigrationState *ms)
2696{
2697    QEMUFile *file;
2698
2699    WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
2700        /*
2701         * Reset the from_dst_file pointer first before releasing it, as we
2702         * can't block within the lock section
2703         */
2704        file = ms->rp_state.from_dst_file;
2705        ms->rp_state.from_dst_file = NULL;
2706    }
2707
2708    qemu_fclose(file);
2709}
2710
2711/*
2712 * Handles messages sent on the return path towards the source VM.
2713 *
2714 */
2715static void *source_return_path_thread(void *opaque)
2716{
2717    MigrationState *ms = opaque;
2718    QEMUFile *rp = ms->rp_state.from_dst_file;
2719    uint16_t header_len, header_type;
2720    uint8_t buf[512];
2721    uint32_t tmp32, sibling_error;
2722    ram_addr_t start = 0; /* =0 to silence warning */
2723    size_t  len = 0, expected_len;
2724    int res;
2725
2726    trace_source_return_path_thread_entry();
2727    rcu_register_thread();
2728
2729retry:
2730    while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
2731           migration_is_setup_or_active(ms->state)) {
2732        trace_source_return_path_thread_loop_top();
2733        header_type = qemu_get_be16(rp);
2734        header_len = qemu_get_be16(rp);
2735
2736        if (qemu_file_get_error(rp)) {
2737            mark_source_rp_bad(ms);
2738            goto out;
2739        }
2740
2741        if (header_type >= MIG_RP_MSG_MAX ||
2742            header_type == MIG_RP_MSG_INVALID) {
2743            error_report("RP: Received invalid message 0x%04x length 0x%04x",
2744                         header_type, header_len);
2745            mark_source_rp_bad(ms);
2746            goto out;
2747        }
2748
2749        if ((rp_cmd_args[header_type].len != -1 &&
2750            header_len != rp_cmd_args[header_type].len) ||
2751            header_len > sizeof(buf)) {
2752            error_report("RP: Received '%s' message (0x%04x) with"
2753                         " incorrect length %d expecting %zu",
2754                         rp_cmd_args[header_type].name, header_type, header_len,
2755                         (size_t)rp_cmd_args[header_type].len);
2756            mark_source_rp_bad(ms);
2757            goto out;
2758        }
2759
2760        /* We know we've got a valid header by this point */
2761        res = qemu_get_buffer(rp, buf, header_len);
2762        if (res != header_len) {
2763            error_report("RP: Failed reading data for message 0x%04x"
2764                         " read %d expected %d",
2765                         header_type, res, header_len);
2766            mark_source_rp_bad(ms);
2767            goto out;
2768        }
2769
2770        /* OK, we have the message and the data */
2771        switch (header_type) {
2772        case MIG_RP_MSG_SHUT:
2773            sibling_error = ldl_be_p(buf);
2774            trace_source_return_path_thread_shut(sibling_error);
2775            if (sibling_error) {
2776                error_report("RP: Sibling indicated error %d", sibling_error);
2777                mark_source_rp_bad(ms);
2778            }
2779            /*
2780             * We'll let the main thread deal with closing the RP;
2781             * we could do a shutdown(2) on it, but we're the only user
2782             * anyway, so there's nothing gained.
2783             */
2784            goto out;
2785
2786        case MIG_RP_MSG_PONG:
2787            tmp32 = ldl_be_p(buf);
2788            trace_source_return_path_thread_pong(tmp32);
2789            break;
2790
2791        case MIG_RP_MSG_REQ_PAGES:
2792            start = ldq_be_p(buf);
2793            len = ldl_be_p(buf + 8);
2794            migrate_handle_rp_req_pages(ms, NULL, start, len);
2795            break;
2796
2797        case MIG_RP_MSG_REQ_PAGES_ID:
2798            expected_len = 12 + 1; /* header + termination */
2799
2800            if (header_len >= expected_len) {
2801                start = ldq_be_p(buf);
2802                len = ldl_be_p(buf + 8);
2803                /* Now we expect an idstr */
2804                tmp32 = buf[12]; /* Length of the following idstr */
2805                buf[13 + tmp32] = '\0';
2806                expected_len += tmp32;
2807            }
2808            if (header_len != expected_len) {
2809                error_report("RP: Req_Page_id with length %d expecting %zd",
2810                             header_len, expected_len);
2811                mark_source_rp_bad(ms);
2812                goto out;
2813            }
2814            migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len);
2815            break;
2816
2817        case MIG_RP_MSG_RECV_BITMAP:
2818            if (header_len < 1) {
2819                error_report("%s: missing block name", __func__);
2820                mark_source_rp_bad(ms);
2821                goto out;
2822            }
2823            /* Format: len (1B) + idstr (<255B); NUL-terminate the idstr */
2824            buf[buf[0] + 1] = '\0';
2825            if (migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1))) {
2826                mark_source_rp_bad(ms);
2827                goto out;
2828            }
2829            break;
2830
2831        case MIG_RP_MSG_RESUME_ACK:
2832            tmp32 = ldl_be_p(buf);
2833            if (migrate_handle_rp_resume_ack(ms, tmp32)) {
2834                mark_source_rp_bad(ms);
2835                goto out;
2836            }
2837            break;
2838
2839        default:
2840            break;
2841        }
2842    }
2843
2844out:
2845    res = qemu_file_get_error(rp);
2846    if (res) {
2847        if (res == -EIO && migration_in_postcopy()) {
2848            /*
2849             * Maybe there is something we can do: it looks like a
2850             * network-down issue, and we pause to await recovery.
2851             */
2852            migration_release_from_dst_file(ms);
2853            rp = NULL;
2854            if (postcopy_pause_return_path_thread(ms)) {
2855                /*
2856                 * Reload rp, reset the rest.  Referencing it is safe since
2857                 * it's reset only by us above, or when migration completes
2858                 */
2859                rp = ms->rp_state.from_dst_file;
2860                ms->rp_state.error = false;
2861                goto retry;
2862            }
2863        }
2864
2865        trace_source_return_path_thread_bad_end();
2866        mark_source_rp_bad(ms);
2867    }
2868
2869    trace_source_return_path_thread_end();
2870    migration_release_from_dst_file(ms);
2871    rcu_unregister_thread();
2872    return NULL;
2873}
2874
2875static int open_return_path_on_source(MigrationState *ms,
2876                                      bool create_thread)
2877{
2878    ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
2879    if (!ms->rp_state.from_dst_file) {
2880        return -1;
2881    }
2882
2883    trace_open_return_path_on_source();
2884
2885    if (!create_thread) {
2886        /* We're done */
2887        return 0;
2888    }
2889
2890    qemu_thread_create(&ms->rp_state.rp_thread, "return path",
2891                       source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
2892    ms->rp_state.rp_thread_created = true;
2893
2894    trace_open_return_path_on_source_continue();
2895
2896    return 0;
2897}
2898
2899/* Returns 0 if the RP was ok, otherwise there was an error on the RP */
2900static int await_return_path_close_on_source(MigrationState *ms)
2901{
2902    /*
2903     * If this is a normal exit then the destination will send a SHUT and the
2904     * rp_thread will exit; however, if there's an error we need to cause
2905     * it to exit.
2906     */
2907    if (qemu_file_get_error(ms->to_dst_file) && ms->rp_state.from_dst_file) {
2908        /*
2909         * shutdown(2), if we have it, will cause it to unblock if it's stuck
2910         * waiting for the destination.
2911         */
2912        qemu_file_shutdown(ms->rp_state.from_dst_file);
2913        mark_source_rp_bad(ms);
2914    }
2915    trace_await_return_path_close_on_source_joining();
2916    qemu_thread_join(&ms->rp_state.rp_thread);
2917    ms->rp_state.rp_thread_created = false;
2918    trace_await_return_path_close_on_source_close();
2919    return ms->rp_state.error;
2920}
2921
2922/*
2923 * Switch from normal iteration to postcopy
2924 * Returns non-0 on error
2925 */
2926static int postcopy_start(MigrationState *ms)
2927{
2928    int ret;
2929    QIOChannelBuffer *bioc;
2930    QEMUFile *fb;
2931    int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
2932    int64_t bandwidth = migrate_max_postcopy_bandwidth();
2933    bool restart_block = false;
2934    int cur_state = MIGRATION_STATUS_ACTIVE;
2935    if (!migrate_pause_before_switchover()) {
2936        migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE,
2937                          MIGRATION_STATUS_POSTCOPY_ACTIVE);
2938    }
2939
2940    trace_postcopy_start();
2941    qemu_mutex_lock_iothread();
2942    trace_postcopy_start_set_run();
2943
2944    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
2945    global_state_store();
2946    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
2947    if (ret < 0) {
2948        goto fail;
2949    }
2950
2951    ret = migration_maybe_pause(ms, &cur_state,
2952                                MIGRATION_STATUS_POSTCOPY_ACTIVE);
2953    if (ret < 0) {
2954        goto fail;
2955    }
2956
2957    ret = bdrv_inactivate_all();
2958    if (ret < 0) {
2959        goto fail;
2960    }
2961    restart_block = true;
2962
2963    /*
2964     * Cause any non-postcopiable, but iterative devices to
2965     * send out their final data.
2966     */
2967    qemu_savevm_state_complete_precopy(ms->to_dst_file, true, false);
2968
2969    /*
2970     * In the FINISH_MIGRATE run state, and with the io-lock held,
2971     * everything should be quiet; but we've potentially still got dirty
2972     * pages, and we need to tell the destination to throw away any pages
2973     * it has already received that are dirty
2974     */
2975    if (migrate_postcopy_ram()) {
2976        if (ram_postcopy_send_discard_bitmap(ms)) {
2977            error_report("postcopy send discard bitmap failed");
2978            goto fail;
2979        }
2980    }
2981
2982    /*
2983     * Send the rest of the state - note that devices doing postcopy
2984     * will notice we're in POSTCOPY_ACTIVE and won't actually
2985     * wrap their state up here
2986     */
2987    /* 0 max-postcopy-bandwidth means unlimited */
2988    if (!bandwidth) {
2989        qemu_file_set_rate_limit(ms->to_dst_file, INT64_MAX);
2990    } else {
2991        qemu_file_set_rate_limit(ms->to_dst_file, bandwidth / XFER_LIMIT_RATIO);
2992    }
2993    if (migrate_postcopy_ram()) {
2994        /* Ping just for debugging, helps line traces up */
2995        qemu_savevm_send_ping(ms->to_dst_file, 2);
2996    }
2997
2998    /*
2999     * While loading the device state we may trigger page transfer
3000     * requests and the fd must be free to process those, and thus
3001     * the destination must read the whole device state off the fd before
3002     * it starts processing it.  Unfortunately the ad-hoc migration format
3003     * doesn't allow the destination to know the size to read without fully
3004     * parsing it through each device's load-state code (especially the open
3005     * coded devices that use get/put).
3006     * So we wrap the device state up in a package with a length at the start;
3007     * to do this we use a qemu_buf to hold the whole of the device state.
3008     */
3009    bioc = qio_channel_buffer_new(4096);
3010    qio_channel_set_name(QIO_CHANNEL(bioc), "migration-postcopy-buffer");
3011    fb = qemu_fopen_channel_output(QIO_CHANNEL(bioc));
3012    object_unref(OBJECT(bioc));
3013
3014    /*
3015     * Make sure the receiver can get incoming pages before we send the rest
3016     * of the state
3017     */
3018    qemu_savevm_send_postcopy_listen(fb);
3019
3020    qemu_savevm_state_complete_precopy(fb, false, false);
3021    if (migrate_postcopy_ram()) {
3022        qemu_savevm_send_ping(fb, 3);
3023    }
3024
3025    qemu_savevm_send_postcopy_run(fb);
3026
3027    /* <><> end of stuff going into the package */
3028
3029    /* Last point of recovery; as soon as we send the package the destination
3030     * can open devices and potentially start running.
3031     * Let's just check again that we've not got any errors.
3032     */
3033    ret = qemu_file_get_error(ms->to_dst_file);
3034    if (ret) {
3035        error_report("postcopy_start: Migration stream errored (pre package)");
3036        goto fail_closefb;
3037    }
3038
3039    restart_block = false;
3040
3041    /* Now send that blob */
3042    if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) {
3043        goto fail_closefb;
3044    }
3045    qemu_fclose(fb);
3046
3047    /* Send a notify to give a chance for anything that needs to happen
3048     * at the transition to postcopy and after the device state; in particular
3049     * SPICE needs to trigger a transition now
3050     */
3051    ms->postcopy_after_devices = true;
3052    notifier_list_notify(&migration_state_notifiers, ms);
3053
3054    ms->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
3055
3056    qemu_mutex_unlock_iothread();
3057
3058    if (migrate_postcopy_ram()) {
3059        /*
3060         * Although this ping is just for debug, it could potentially be
3061         * used for getting a better measurement of downtime at the source.
3062         */
3063        qemu_savevm_send_ping(ms->to_dst_file, 4);
3064    }
3065
3066    if (migrate_release_ram()) {
3067        ram_postcopy_migrated_memory_release(ms);
3068    }
3069
3070    ret = qemu_file_get_error(ms->to_dst_file);
3071    if (ret) {
3072        error_report("postcopy_start: Migration stream errored");
3073        migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
3074                              MIGRATION_STATUS_FAILED);
3075    }
3076
3077    return ret;
3078
3079fail_closefb:
3080    qemu_fclose(fb);
3081fail:
3082    migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
3083                          MIGRATION_STATUS_FAILED);
3084    if (restart_block) {
3085        /* A failure happened early enough that we know the destination hasn't
3086         * accessed block devices, so we're safe to recover.
3087         */
3088        Error *local_err = NULL;
3089
3090        bdrv_invalidate_cache_all(&local_err);
3091        if (local_err) {
3092            error_report_err(local_err);
3093        }
3094    }
3095    qemu_mutex_unlock_iothread();
3096    return -1;
3097}
3098
3099/**
3100 * migration_maybe_pause: Pause if required by
3101 * migrate_pause_before_switchover; called with the iothread locked
3102 * Returns: 0 on success
3103 */
3104static int migration_maybe_pause(MigrationState *s,
3105                                 int *current_active_state,
3106                                 int new_state)
3107{
3108    if (!migrate_pause_before_switchover()) {
3109        return 0;
3110    }
3111
3112    /* Since leaving this state is not atomic with posting the semaphore,
3113     * it's possible that someone could have issued multiple migrate_continue
3114     * and the semaphore is incorrectly positive at this point;
3115     * the docs say it's undefined to reinit a semaphore that's already
3116     * init'd, so use timedwait to eat up any existing posts.
3117     */
3118    while (qemu_sem_timedwait(&s->pause_sem, 1) == 0) {
3119        /* This block intentionally left blank */
3120    }
3121
3122    /*
3123     * If the migration is cancelled when it is in the completion phase,
3124     * the migration state is set to MIGRATION_STATUS_CANCELLING.
3125     * In that case we don't need to wait on the semaphore; otherwise we
3126     * would block forever on 'pause_sem'.
3127     */
3128    if (s->state != MIGRATION_STATUS_CANCELLING) {
3129        qemu_mutex_unlock_iothread();
3130        migrate_set_state(&s->state, *current_active_state,
3131                          MIGRATION_STATUS_PRE_SWITCHOVER);
3132        qemu_sem_wait(&s->pause_sem);
3133        migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER,
3134                          new_state);
3135        *current_active_state = new_state;
3136        qemu_mutex_lock_iothread();
3137    }
3138
3139    return s->state == new_state ? 0 : -EINVAL;
3140}
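
    /*
     * Draining pause_sem with the 1ms qemu_sem_timedwait() loop above acts
     * as a non-blocking "down": e.g. if two stray migrate-continue commands
     * had already posted the semaphore, both posts are consumed before we
     * publish PRE_SWITCHOVER, so the later qemu_sem_wait() really does wait
     * for a fresh migrate-continue.
     */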
3141
3142/**
3143 * migration_completion: Used by migration_thread when there's not much left.
3144 *   The caller 'breaks' the loop when this returns.
3145 *
3146 * @s: Current migration state
3147 */
3148static void migration_completion(MigrationState *s)
3149{
3150    int ret;
3151    int current_active_state = s->state;
3152
3153    if (s->state == MIGRATION_STATUS_ACTIVE) {
3154        qemu_mutex_lock_iothread();
3155        s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3156        qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
3157        s->vm_was_running = runstate_is_running();
3158        ret = global_state_store();
3159
3160        if (!ret) {
3161            bool inactivate = !migrate_colo_enabled();
3162            ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
3163            trace_migration_completion_vm_stop(ret);
3164            if (ret >= 0) {
3165                ret = migration_maybe_pause(s, &current_active_state,
3166                                            MIGRATION_STATUS_DEVICE);
3167            }
3168            if (ret >= 0) {
3169                qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
3170                ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
3171                                                         inactivate);
3172            }
3173            if (inactivate && ret >= 0) {
3174                s->block_inactive = true;
3175            }
3176        }
3177        qemu_mutex_unlock_iothread();
3178
3179        if (ret < 0) {
3180            goto fail;
3181        }
3182    } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
3183        trace_migration_completion_postcopy_end();
3184
3185        qemu_savevm_state_complete_postcopy(s->to_dst_file);
3186        trace_migration_completion_postcopy_end_after_complete();
3187    } else if (s->state == MIGRATION_STATUS_CANCELLING) {
3188        goto fail;
3189    }
3190
3191    /*
3192     * If rp was opened we must clean up the thread before
3193     * cleaning everything else up (since if there are no failures
3194     * it will wait for the destination to send its status in
3195     * a SHUT command).
3196     */
3197    if (s->rp_state.rp_thread_created) {
3198        int rp_error;
3199        trace_migration_return_path_end_before();
3200        rp_error = await_return_path_close_on_source(s);
3201        trace_migration_return_path_end_after(rp_error);
3202        if (rp_error) {
3203            goto fail_invalidate;
3204        }
3205    }
3206
3207    if (qemu_file_get_error(s->to_dst_file)) {
3208        trace_migration_completion_file_err();
3209        goto fail_invalidate;
3210    }
3211
3212    if (!migrate_colo_enabled()) {
3213        migrate_set_state(&s->state, current_active_state,
3214                          MIGRATION_STATUS_COMPLETED);
3215    }
3216
3217    return;
3218
3219fail_invalidate:
3220    /* If not doing postcopy, vm_start() will be called: let's regain
3221     * control of the images.
3222     */
3223    if (s->state == MIGRATION_STATUS_ACTIVE ||
3224        s->state == MIGRATION_STATUS_DEVICE) {
3225        Error *local_err = NULL;
3226
3227        qemu_mutex_lock_iothread();
3228        bdrv_invalidate_cache_all(&local_err);
3229        if (local_err) {
3230            error_report_err(local_err);
3231        } else {
3232            s->block_inactive = false;
3233        }
3234        qemu_mutex_unlock_iothread();
3235    }
3236
3237fail:
3238    migrate_set_state(&s->state, current_active_state,
3239                      MIGRATION_STATUS_FAILED);
3240}
3241
3242/**
3243 * bg_migration_completion: Used by bg_migration_thread after all the
3244 *   RAM has been saved. The caller 'breaks' the loop when this returns.
3245 *
3246 * @s: Current migration state
3247 */
3248static void bg_migration_completion(MigrationState *s)
3249{
3250    int current_active_state = s->state;
3251
3252    /*
3253     * Stop tracking RAM writes - un-protect memory, un-register UFFD
3254     * memory ranges, flush kernel wait queues and wake up threads
3255     * waiting for write fault to be resolved.
3256     */
3257    ram_write_tracking_stop();
3258
3259    if (s->state == MIGRATION_STATUS_ACTIVE) {
3260        /*
3261         * By this moment we have RAM content saved into the migration stream.
3262         * The next step is to flush the non-RAM content (device state)
3263         * right after the ram content. The device state has been stored into
3264         * the temporary buffer before RAM saving started.
3265         */
3266        qemu_put_buffer(s->to_dst_file, s->bioc->data, s->bioc->usage);
3267        qemu_fflush(s->to_dst_file);
3268    } else if (s->state == MIGRATION_STATUS_CANCELLING) {
3269        goto fail;
3270    }
3271
3272    if (qemu_file_get_error(s->to_dst_file)) {
3273        trace_migration_completion_file_err();
3274        goto fail;
3275    }
3276
3277    migrate_set_state(&s->state, current_active_state,
3278                      MIGRATION_STATUS_COMPLETED);
3279    return;
3280
3281fail:
3282    migrate_set_state(&s->state, current_active_state,
3283                      MIGRATION_STATUS_FAILED);
3284}
3285
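    /* Return true if the x-colo migration capability is enabled. */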
3286bool migrate_colo_enabled(void)
3287{
3288    MigrationState *s = migrate_get_current();
3289    return s->enabled_capabilities[MIGRATION_CAPABILITY_X_COLO];
3290}
3291
3292typedef enum MigThrError {
3293    /* No error detected */
3294    MIG_THR_ERR_NONE = 0,
3295    /* Detected error, but resumed successfully */
3296    MIG_THR_ERR_RECOVERED = 1,
3297    /* Detected fatal error, need to exit */
3298    MIG_THR_ERR_FATAL = 2,
3299} MigThrError;
3300
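    /*
     * Send a RESUME command to the destination, then wait on rp_sem until
     * something moves the state out of POSTCOPY_RECOVER.  Returns 0 if we
     * ended up back in POSTCOPY_ACTIVE, or -1 otherwise.
     */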
3301static int postcopy_resume_handshake(MigrationState *s)
3302{
3303    qemu_savevm_send_postcopy_resume(s->to_dst_file);
3304
3305    while (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
3306        qemu_sem_wait(&s->rp_state.rp_sem);
3307    }
3308
3309    if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
3310        return 0;
3311    }
3312
3313    return -1;
3314}
3315
3316/* Return zero on success, or <0 on error */
3317static int postcopy_do_resume(MigrationState *s)
3318{
3319    int ret;
3320
3321    /*
3322     * Call all the resume_prepare() hooks, so that modules can be
3323     * ready for the migration resume.
3324     */
3325    ret = qemu_savevm_state_resume_prepare(s);
3326    if (ret) {
3327        error_report("%s: resume_prepare() failure detected: %d",
3328                     __func__, ret);
3329        return ret;
3330    }
3331
3332    /*
3333     * Last handshake with destination on the resume (destination will
3334     * switch to postcopy-active afterwards)
3335     */
3336    ret = postcopy_resume_handshake(s);
3337    if (ret) {
3338        error_report("%s: handshake failed: %d", __func__, ret);
3339        return ret;
3340    }
3341
3342    return 0;
3343}
3344
3345/*
3346 * We don't return until we are in a safe state to continue the current
3347 * postcopy migration.  Returns MIG_THR_ERR_RECOVERED if recovered, or
3348 * MIG_THR_ERR_FATAL if an unrecoverable failure happened.
3349 */
3350static MigThrError postcopy_pause(MigrationState *s)
3351{
3352    assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
3353
3354    while (true) {
3355        QEMUFile *file;
3356
3357        /*
3358         * The current channel is possibly broken. Release it.  Note that
3359         * this is guaranteed even without the lock because to_dst_file is
3360         * only modified by the migration thread.  That also guarantees the
3361         * unregister of yank is safe without the lock.  It would be safe
3362         * even within the qemu_file_lock, but we didn't do that to avoid
3363         * taking another mutex (yank_lock) within qemu_file_lock.  TL;DR:
3364         * keep the qemu_file_lock critical section as small as possible.
3365         */
3366        assert(s->to_dst_file);
3367        migration_ioc_unregister_yank_from_file(s->to_dst_file);
3368        qemu_mutex_lock(&s->qemu_file_lock);
3369        file = s->to_dst_file;
3370        s->to_dst_file = NULL;
3371        qemu_mutex_unlock(&s->qemu_file_lock);
3372
3373        qemu_file_shutdown(file);
3374        qemu_fclose(file);
3375
3376        migrate_set_state(&s->state, s->state,
3377                          MIGRATION_STATUS_POSTCOPY_PAUSED);
3378
3379        error_report("Detected IO failure for postcopy. "
3380                     "Migration paused.");
3381
3382        /*
3383         * We wait until things are fixed up. Then someone will set the
3384         * status back for us.
3385         */
3386        while (s->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
3387            qemu_sem_wait(&s->postcopy_pause_sem);
3388        }
3389
3390        if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
3391            /* Woken up by a recover procedure. Give it a shot */
3392
3393            /*
3394             * Firstly, let's wake up the return path now, with a new
3395             * return path channel.
3396             */
3397            qemu_sem_post(&s->postcopy_pause_rp_sem);
3398
3399            /* Do the resume logic */
3400            if (postcopy_do_resume(s) == 0) {
3401                /* Let's continue! */
3402                trace_postcopy_pause_continued();
3403                return MIG_THR_ERR_RECOVERED;
3404            } else {
3405                /*
3406                 * Something went wrong during the recovery, so let's
3407                 * pause again. Pausing is always better than throwing
3408                 * data away.
3409                 */
3410                continue;
3411            }
3412        } else {
3413            /* This is not right... Time to quit. */
3414            return MIG_THR_ERR_FATAL;
3415        }
3416    }
3417}
3418
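    /*
     * Check the migration stream for errors and decide how the migration
     * thread should react: carry on (MIG_THR_ERR_NONE), continue after a
     * successful postcopy pause/recover cycle (MIG_THR_ERR_RECOVERED), or
     * quit (MIG_THR_ERR_FATAL).
     */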
3419static MigThrError migration_detect_error(MigrationState *s)
3420{
3421    int ret;
3422    int state = s->state;
3423    Error *local_error = NULL;
3424
3425    if (state == MIGRATION_STATUS_CANCELLING ||
3426        state == MIGRATION_STATUS_CANCELLED) {
3427        /* End the migration, but don't set the state to failed */
3428        return MIG_THR_ERR_FATAL;
3429    }
3430
3431    /* Try to detect any file errors */
3432    ret = qemu_file_get_error_obj(s->to_dst_file, &local_error);
3433    if (!ret) {
3434        /* Everything is fine */
3435        assert(!local_error);
3436        return MIG_THR_ERR_NONE;
3437    }
3438
3439    if (local_error) {
3440        migrate_set_error(s, local_error);
3441        error_free(local_error);
3442    }
3443
3444    if (state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret == -EIO) {
3445        /*
3446         * For postcopy, we allow the network to be down for a
3447         * while. After that, it can be continued by a
3448         * recovery phase.
3449         */
3450        return postcopy_pause(s);
3451    } else {
3452        /*
3453         * For precopy (or postcopy with an error outside IO), we fail
3454         * immediately.
3455         */
3456        migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED);
3457        trace_migration_thread_file_err();
3458
3459        /* Time to stop the migration, now. */
3460        return MIG_THR_ERR_FATAL;
3461    }
3462}
3463
3464/* How many bytes have we transferred since the beginning of the migration */
3465static uint64_t migration_total_bytes(MigrationState *s)
3466{
3467    return qemu_ftell(s->to_dst_file) + ram_counters.multifd_bytes;
3468}
3469
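    /* Fill in the final timing and bandwidth statistics of a migration. */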
3470static void migration_calculate_complete(MigrationState *s)
3471{
3472    uint64_t bytes = migration_total_bytes(s);
3473    int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3474    int64_t transfer_time;
3475
3476    s->total_time = end_time - s->start_time;
3477    if (!s->downtime) {
3478        /*
3479         * It's still not set, so this is a precopy migration.  For
3480         * postcopy, downtime is calculated during postcopy_start().
3481         */
3482        s->downtime = end_time - s->downtime_start;
3483    }
3484
3485    transfer_time = s->total_time - s->setup_time;
3486    if (transfer_time) {
3487        s->mbps = ((double) bytes * 8.0) / transfer_time / 1000;
3488    }
3489}
3490
3491static void update_iteration_initial_status(MigrationState *s)
3492{
3493    /*
3494     * Update these three fields at the same time to avoid mismatched data
3495     * leading to wrong speed calculations.
3496     */
3497    s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3498    s->iteration_initial_bytes = migration_total_bytes(s);
3499    s->iteration_initial_pages = ram_get_total_transferred_pages();
3500}
3501
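    /*
     * Recalculate bandwidth, throughput and expected downtime from what was
     * transferred during the last iteration window, then start a new window.
     * This is a no-op when called less than BUFFER_DELAY ms into the window.
     */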
3502static void migration_update_counters(MigrationState *s,
3503                                      int64_t current_time)
3504{
3505    uint64_t transferred, transferred_pages, time_spent;
3506    uint64_t current_bytes; /* bytes transferred since the beginning */
3507    double bandwidth;
3508
3509    if (current_time < s->iteration_start_time + BUFFER_DELAY) {
3510        return;
3511    }
3512
3513    current_bytes = migration_total_bytes(s);
3514    transferred = current_bytes - s->iteration_initial_bytes;
3515    time_spent = current_time - s->iteration_start_time;
3516    bandwidth = (double)transferred / time_spent;
3517    s->threshold_size = bandwidth * s->parameters.downtime_limit;
3518
3519    s->mbps = (((double) transferred * 8.0) /
3520               ((double) time_spent / 1000.0)) / 1000.0 / 1000.0;
3521
3522    transferred_pages = ram_get_total_transferred_pages() -
3523                            s->iteration_initial_pages;
3524    s->pages_per_second = (double) transferred_pages /
3525                             (((double) time_spent / 1000.0));
3526
3527    /*
3528     * If we haven't sent anything, we don't want to
3529     * recalculate. 10000 is a small enough number for our purposes.
3530     */
3531    if (ram_counters.dirty_pages_rate && transferred > 10000) {
3532        s->expected_downtime = ram_counters.remaining / bandwidth;
3533    }
3534
3535    qemu_file_reset_rate_limit(s->to_dst_file);
3536
3537    update_iteration_initial_status(s);
3538
3539    trace_migrate_transferred(transferred, time_spent,
3540                              bandwidth, s->threshold_size);
3541}
3542
3543/* Migration thread iteration status */
3544typedef enum {
3545    MIG_ITERATE_RESUME,         /* Resume current iteration */
3546    MIG_ITERATE_SKIP,           /* Skip current iteration */
3547    MIG_ITERATE_BREAK,          /* Break the loop */
3548} MigIterateState;
3549
3550/*
3551 * Return MIG_ITERATE_RESUME to continue with the next iteration,
3552 * MIG_ITERATE_SKIP to skip it, or MIG_ITERATE_BREAK to end the loop.
3553 */
3554static MigIterateState migration_iteration_run(MigrationState *s)
3555{
3556    uint64_t pending_size, pend_pre, pend_compat, pend_post;
3557    bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
3558
3559    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
3560                              &pend_compat, &pend_post);
3561    pending_size = pend_pre + pend_compat + pend_post;
3562
3563    trace_migrate_pending(pending_size, s->threshold_size,
3564                          pend_pre, pend_compat, pend_post);
3565
3566    if (pending_size && pending_size >= s->threshold_size) {
3567        /* Still a significant amount to transfer */
3568        if (!in_postcopy && pend_pre <= s->threshold_size &&
3569            qatomic_read(&s->start_postcopy)) {
3570            if (postcopy_start(s)) {
3571                error_report("%s: postcopy failed to start", __func__);
3572            }
3573            return MIG_ITERATE_SKIP;
3574        }
3575        /* Just another iteration step */
3576        qemu_savevm_state_iterate(s->to_dst_file, in_postcopy);
3577    } else {
3578        trace_migration_thread_low_pending(pending_size);
3579        migration_completion(s);
3580        return MIG_ITERATE_BREAK;
3581    }
3582
3583    return MIG_ITERATE_RESUME;
3584}
3585
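    /*
     * Runs once the migration loop has ended: compute the final statistics
     * on success, hand off to COLO if enabled, restart the VM or enter the
     * postmigrate runstate as appropriate, and schedule the cleanup bottom
     * half.
     */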
3586static void migration_iteration_finish(MigrationState *s)
3587{
3588    /* If we enabled cpu throttling for auto-converge, turn it off. */
3589    cpu_throttle_stop();
3590
3591    qemu_mutex_lock_iothread();
3592    switch (s->state) {
3593    case MIGRATION_STATUS_COMPLETED:
3594        migration_calculate_complete(s);
3595        runstate_set(RUN_STATE_POSTMIGRATE);
3596        break;
3597
3598    case MIGRATION_STATUS_ACTIVE:
3599        /*
3600         * We should really assert here, but since it's during
3601         * migration, let's try to reduce the usage of assertions.
3602         */
3603        if (!migrate_colo_enabled()) {
3604            error_report("%s: critical error: calling COLO code without "
3605                         "COLO enabled", __func__);
3606        }
3607        migrate_start_colo_process(s);
3608        /*
3609         * FIXME: we run the VM in COLO regardless of its previous running
3610         * state. After exiting COLO, the VM keeps running.
3611         */
3612        s->vm_was_running = true;
3613        /* Fallthrough */
3614    case MIGRATION_STATUS_FAILED:
3615    case MIGRATION_STATUS_CANCELLED:
3616    case MIGRATION_STATUS_CANCELLING:
3617        if (s->vm_was_running) {
3618            vm_start();
3619        } else {
3620            if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
3621                runstate_set(RUN_STATE_POSTMIGRATE);
3622            }
3623        }
3624        break;
3625
3626    default:
3627        /* Should not reach here, but if so, forgive the VM. */
3628        error_report("%s: Unknown ending state %d", __func__, s->state);
3629        break;
3630    }
3631    migrate_fd_cleanup_schedule(s);
3632    qemu_mutex_unlock_iothread();
3633}
3634
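    /*
     * Counterpart of migration_iteration_finish() for background snapshots:
     * compute the final statistics on success and schedule the cleanup
     * bottom half; no runstate changes are needed here.
     */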
3635static void bg_migration_iteration_finish(MigrationState *s)
3636{
3637    qemu_mutex_lock_iothread();
3638    switch (s->state) {
3639    case MIGRATION_STATUS_COMPLETED:
3640        migration_calculate_complete(s);
3641        break;
3642
3643    case MIGRATION_STATUS_ACTIVE:
3644    case MIGRATION_STATUS_FAILED:
3645    case MIGRATION_STATUS_CANCELLED:
3646    case MIGRATION_STATUS_CANCELLING:
3647        break;
3648
3649    default:
3650        /* Should not reach here, but if so, forgive the VM. */
3651        error_report("%s: Unknown ending state %d", __func__, s->state);
3652        break;
3653    }
3654
3655    migrate_fd_cleanup_schedule(s);
3656    qemu_mutex_unlock_iothread();
3657}
3658
3659/*
3660 * Return MIG_ITERATE_RESUME to continue with the next iteration, or
3661 * MIG_ITERATE_BREAK once saving has completed.
3662 */
3663static MigIterateState bg_migration_iteration_run(MigrationState *s)
3664{
3665    int res;
3666
3667    res = qemu_savevm_state_iterate(s->to_dst_file, false);
3668    if (res > 0) {
3669        bg_migration_completion(s);
3670        return MIG_ITERATE_BREAK;
3671    }
3672
3673    return MIG_ITERATE_RESUME;
3674}
3675
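    /*
     * Post the rate limiting semaphore so that a sender blocked in
     * migration_rate_limit() wakes up early to service something urgent;
     * migration_consume_urgent_request() takes one such wakeup back.
     */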
3676void migration_make_urgent_request(void)
3677{
3678    qemu_sem_post(&migrate_get_current()->rate_limit_sem);
3679}
3680
3681void migration_consume_urgent_request(void)
3682{
3683    qemu_sem_wait(&migrate_get_current()->rate_limit_sem);
3684}
3685
3686/* Returns true if the rate limiting was broken by an urgent request */
3687bool migration_rate_limit(void)
3688{
3689    int64_t now = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3690    MigrationState *s = migrate_get_current();
3691
3692    bool urgent = false;
3693    migration_update_counters(s, now);
3694    if (qemu_file_rate_limit(s->to_dst_file)) {
3695
3696        if (qemu_file_get_error(s->to_dst_file)) {
3697            return false;
3698        }
3699        /*
3700         * Wait for a delay to do rate limiting OR
3701         * something urgent to post the semaphore.
3702         */
3703        int ms = s->iteration_start_time + BUFFER_DELAY - now;
3704        trace_migration_rate_limit_pre(ms);
3705        if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) {
3706            /*
3707             * We were woken by one or more urgent things but
3708             * the timedwait will have consumed one of them.
3709             * The service routine for the urgent wake will decrement
3710             * the semaphore itself for each item it consumes,
3711             * so post back the one we just consumed.
3712             */
3713            qemu_sem_post(&s->rate_limit_sem);
3714            urgent = true;
3715        }
3716        trace_migration_rate_limit_post(urgent);
3717    }
3718    return urgent;
3719}
3720
3721/*
3722 * If failover devices are present, wait until they are completely
3723 * unplugged.
3724 */
3725
3726static void qemu_savevm_wait_unplug(MigrationState *s, int old_state,
3727                                    int new_state)
3728{
3729    if (qemu_savevm_state_guest_unplug_pending()) {
3730        migrate_set_state(&s->state, old_state, MIGRATION_STATUS_WAIT_UNPLUG);
3731
3732        while (s->state == MIGRATION_STATUS_WAIT_UNPLUG &&
3733               qemu_savevm_state_guest_unplug_pending()) {
3734            qemu_sem_timedwait(&s->wait_unplug_sem, 250);
3735        }
3736        if (s->state != MIGRATION_STATUS_WAIT_UNPLUG) {
3737            int timeout = 120; /* 30 seconds */
3738            /*
3739             * Migration has been cancelled, but as we have started
3740             * an unplug we must wait for it to finish to be able
3741             * to plug the card back in.
3742             */
3743            while (timeout-- && qemu_savevm_state_guest_unplug_pending()) {
3744                qemu_sem_timedwait(&s->wait_unplug_sem, 250);
3745            }
3746            if (qemu_savevm_state_guest_unplug_pending()) {
3747                warn_report("migration: partially unplugged device on "
3748                            "failure");
3749            }
3750        }
3751
3752        migrate_set_state(&s->state, MIGRATION_STATUS_WAIT_UNPLUG, new_state);
3753    } else {
3754        migrate_set_state(&s->state, old_state, new_state);
3755    }
3756}
3757
3758/*
3759 * Master migration thread on the source VM.
3760 * It drives the migration and pumps the data down the outgoing channel.
3761 */
3762static void *migration_thread(void *opaque)
3763{
3764    MigrationState *s = opaque;
3765    int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
3766    MigThrError thr_error;
3767    bool urgent = false;
3768
3769    rcu_register_thread();
3770
3771    object_ref(OBJECT(s));
3772    update_iteration_initial_status(s);
3773
3774    qemu_savevm_state_header(s->to_dst_file);
3775
3776    /*
3777     * If we opened the return path, we need to make sure dst has it
3778     * opened as well.
3779     */
3780    if (s->rp_state.rp_thread_created) {
3781        /* Now tell the dest that it should open its end so it can reply */
3782        qemu_savevm_send_open_return_path(s->to_dst_file);
3783
3784        /* And do a ping that will make stuff easier to debug */
3785        qemu_savevm_send_ping(s->to_dst_file, 1);
3786    }
3787
3788    if (migrate_postcopy()) {
3789        /*
3790         * Tell the destination that we *might* want to do postcopy later;
3791         * if the other end can't do postcopy it should fail now, nice and
3792         * early.
3793         */
3794        qemu_savevm_send_postcopy_advise(s->to_dst_file);
3795    }
3796
3797    if (migrate_colo_enabled()) {
3798        /* Notify migration destination that we enable COLO */
3799        qemu_savevm_send_colo_enable(s->to_dst_file);
3800    }
3801
3802    qemu_savevm_state_setup(s->to_dst_file);
3803
3804    qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
3805                               MIGRATION_STATUS_ACTIVE);
3806
3807    s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
3808
3809    trace_migration_thread_setup_complete();
3810
3811    while (migration_is_active(s)) {
3812        if (urgent || !qemu_file_rate_limit(s->to_dst_file)) {
3813            MigIterateState iter_state = migration_iteration_run(s);
3814            if (iter_state == MIG_ITERATE_SKIP) {
3815                continue;
3816            } else if (iter_state == MIG_ITERATE_BREAK) {
3817                break;
3818            }
3819        }
3820
3821        /*
3822         * Try to detect any kind of failures, and see whether we
3823         * should stop the migration now.
3824         */
3825        thr_error = migration_detect_error(s);
3826        if (thr_error == MIG_THR_ERR_FATAL) {
3827            /* Stop migration */
3828            break;
3829        } else if (thr_error == MIG_THR_ERR_RECOVERED) {
3830            /*
3831             * Just recovered from e.g. a network failure; reset all
3832             * the local variables. This is important to avoid
3833             * breaking transferred_bytes and bandwidth calculation
3834             */
3835            update_iteration_initial_status(s);
3836        }
3837
3838        urgent = migration_rate_limit();
3839    }
3840
3841    trace_migration_thread_after_loop();
3842    migration_iteration_finish(s);
3843    object_unref(OBJECT(s));
3844    rcu_unregister_thread();
3845    return NULL;
3846}
3847
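    /*
     * Bottom half that restarts the VM once the background snapshot setup
     * has write-protected RAM; the stopped period is accounted as downtime.
     */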
3848static void bg_migration_vm_start_bh(void *opaque)
3849{
3850    MigrationState *s = opaque;
3851
3852    qemu_bh_delete(s->vm_start_bh);
3853    s->vm_start_bh = NULL;
3854
3855    vm_start();
3856    s->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - s->downtime_start;
3857}
3858
3859/**
3860 * Background snapshot thread, based on live migration code.
3861 * This is an alternative implementation of the live migration mechanism,
3862 * introduced specifically to support background snapshots.
3863 *
3864 * It takes advantage of the userfault_fd write-protection mechanism
3865 * introduced in the v5.7 kernel. Compared to the existing dirty page
3866 * logging migration it produces much less stream traffic, and thus
3867 * smaller snapshot images, because no duplicate pages can get into
3868 * the stream.
3869 *
3870 * Another key point: the generated vmstate stream reflects the machine
3871 * state 'frozen' at the start of snapshot creation, whereas with dirty
3872 * page logging the saved snapshot is the VM state at the end of the process.
3873 */
3874static void *bg_migration_thread(void *opaque)
3875{
3876    MigrationState *s = opaque;
3877    int64_t setup_start;
3878    MigThrError thr_error;
3879    QEMUFile *fb;
3880    bool early_fail = true;
3881
3882    rcu_register_thread();
3883    object_ref(OBJECT(s));
3884
3885    qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
3886
3887    setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
3888    /*
3889     * We want to save vmstate for the moment when migration has been
3890     * initiated but also we want to save RAM content while VM is running.
3891     * The RAM content should appear first in the vmstate. So, we first
3892     * stash the non-RAM part of the vmstate to the temporary buffer,
3893     * then write RAM part of the vmstate to the migration stream
3894     * with vCPUs running and, finally, write stashed non-RAM part of
3895     * the vmstate from the buffer to the migration stream.
3896     */
3897    s->bioc = qio_channel_buffer_new(512 * 1024);
3898    qio_channel_set_name(QIO_CHANNEL(s->bioc), "vmstate-buffer");
3899    fb = qemu_fopen_channel_output(QIO_CHANNEL(s->bioc));
3900    object_unref(OBJECT(s->bioc));
3901
3902    update_iteration_initial_status(s);
3903
3904    /*
3905     * Prepare for tracking memory writes with UFFD-WP - populate
3906     * RAM pages before protecting.
3907     */
3908#ifdef __linux__
3909    ram_write_tracking_prepare();
3910#endif
3911
3912    qemu_savevm_state_header(s->to_dst_file);
3913    qemu_savevm_state_setup(s->to_dst_file);
3914
3915    qemu_savevm_wait_unplug(s, MIGRATION_STATUS_SETUP,
3916                               MIGRATION_STATUS_ACTIVE);
3917
3918    s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
3919
3920    trace_migration_thread_setup_complete();
3921    s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3922
3923    qemu_mutex_lock_iothread();
3924
3925    /*
3926     * If the VM is currently suspended, then, to make a valid runstate
3927     * transition in vm_stop_force_state(), we need to wake it up first.
3928     */
3929    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
3930    s->vm_was_running = runstate_is_running();
3931
3932    if (global_state_store()) {
3933        goto fail;
3934    }
3935    /* Forcibly stop VM before saving state of vCPUs and devices */
3936    if (vm_stop_force_state(RUN_STATE_PAUSED)) {
3937        goto fail;
3938    }
3939    /*
3940     * Put vCPUs in sync with shadow context structures, then
3941     * save their state to channel-buffer along with devices.
3942     */
3943    cpu_synchronize_all_states();
3944    if (qemu_savevm_state_complete_precopy_non_iterable(fb, false, false)) {
3945        goto fail;
3946    }
3947    /*
3948     * Since we are going to get non-iterable state data directly
3949     * from s->bioc->data, explicit flush is needed here.
3950     */
3951    qemu_fflush(fb);
3952
3953    /* Now initialize UFFD context and start tracking RAM writes */
3954    if (ram_write_tracking_start()) {
3955        goto fail;
3956    }
3957    early_fail = false;
3958
3959    /*
3960     * Start VM from BH handler to avoid write-fault lock here.
3961     * UFFD-WP protection for the whole RAM is already enabled so
3962     * calling VM state change notifiers from vm_start() would initiate
3963     * writes to virtio VQs memory which is in a write-protected region.
3964     */
3965    s->vm_start_bh = qemu_bh_new(bg_migration_vm_start_bh, s);
3966    qemu_bh_schedule(s->vm_start_bh);
3967
3968    qemu_mutex_unlock_iothread();
3969
3970    while (migration_is_active(s)) {
3971        MigIterateState iter_state = bg_migration_iteration_run(s);
3972        if (iter_state == MIG_ITERATE_SKIP) {
3973            continue;
3974        } else if (iter_state == MIG_ITERATE_BREAK) {
3975            break;
3976        }
3977
3978        /*
3979         * Try to detect any kind of failures, and see whether we
3980         * should stop the migration now.
3981         */
3982        thr_error = migration_detect_error(s);
3983        if (thr_error == MIG_THR_ERR_FATAL) {
3984            /* Stop migration */
3985            break;
3986        }
3987
3988        migration_update_counters(s, qemu_clock_get_ms(QEMU_CLOCK_REALTIME));
3989    }
3990
3991    trace_migration_thread_after_loop();
3992
3993fail:
3994    if (early_fail) {
3995        migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
3996                MIGRATION_STATUS_FAILED);
3997        qemu_mutex_unlock_iothread();
3998    }
3999
4000    bg_migration_iteration_finish(s);
4001
4002    qemu_fclose(fb);
4003    object_unref(OBJECT(s));
4004    rcu_unregister_thread();
4005
4006    return NULL;
4007}
4008
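    /*
     * Entry point once an outgoing channel has been established: set up
     * rate limiting and the return path, then either wake up a paused
     * postcopy migration or spawn the appropriate migration thread.
     */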
4009void migrate_fd_connect(MigrationState *s, Error *error_in)
4010{
4011    Error *local_err = NULL;
4012    int64_t rate_limit;
4013    bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
4014
4015    /*
4016     * If there's a previous error, free it and prepare for another one.
4017     * This also ensures that no stale error is dumped when
4018     * migrate_fd_cleanup() is called after a successful migration.
4019     */
4020    migrate_error_free(s);
4021
4022    s->expected_downtime = s->parameters.downtime_limit;
4023    if (resume) {
4024        assert(s->cleanup_bh);
4025    } else {
4026        assert(!s->cleanup_bh);
4027        s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup_bh, s);
4028    }
4029    if (error_in) {
4030        migrate_fd_error(s, error_in);
4031        if (resume) {
4032            /*
4033             * Don't do cleanup for resume if channel is invalid, but only dump
4034             * the error.  We wait for another channel connect from the user.
4035             * The error_report still gives the HMP user a hint on what failed.
4036             * It's normally done in migrate_fd_cleanup(), but call it here
4037             * explicitly.
4038             */
4039            error_report_err(error_copy(s->error));
4040        } else {
4041            migrate_fd_cleanup(s);
4042        }
4043        return;
4044    }
4045
4046    if (resume) {
4047        /* This is a resumed migration */
4048        rate_limit = s->parameters.max_postcopy_bandwidth /
4049            XFER_LIMIT_RATIO;
4050    } else {
4051        /* This is a brand-new migration */
4052        rate_limit = s->parameters.max_bandwidth / XFER_LIMIT_RATIO;
4053
4054        /* Notify before starting migration thread */
4055        notifier_list_notify(&migration_state_notifiers, s);
4056    }
4057
4058    qemu_file_set_rate_limit(s->to_dst_file, rate_limit);
4059    qemu_file_set_blocking(s->to_dst_file, true);
4060
4061    /*
4062     * Open the return path. For postcopy, it is always used. For
4063     * precopy, QEMU uses the return path only if the user enabled the
4064     * "return-path" capability.
4065     */
4066    if (migrate_postcopy_ram() || migrate_use_return_path()) {
4067        if (open_return_path_on_source(s, !resume)) {
4068            error_report("Unable to open return-path for postcopy");
4069            migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
4070            migrate_fd_cleanup(s);
4071            return;
4072        }
4073    }
4074
4075    if (resume) {
4076        /* Wakeup the main migration thread to do the recovery */
4077        migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
4078                          MIGRATION_STATUS_POSTCOPY_RECOVER);
4079        qemu_sem_post(&s->postcopy_pause_sem);
4080        return;
4081    }
4082
4083    if (multifd_save_setup(&local_err) != 0) {
4084        error_report_err(local_err);
4085        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
4086                          MIGRATION_STATUS_FAILED);
4087        migrate_fd_cleanup(s);
4088        return;
4089    }
4090
4091    if (migrate_background_snapshot()) {
4092        qemu_thread_create(&s->thread, "bg_snapshot",
4093                bg_migration_thread, s, QEMU_THREAD_JOINABLE);
4094    } else {
4095        qemu_thread_create(&s->thread, "live_migration",
4096                migration_thread, s, QEMU_THREAD_JOINABLE);
4097    }
4098    s->migration_thread_running = true;
4099}
4100
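    /* Dump the migration global properties to the given monitor. */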
4101void migration_global_dump(Monitor *mon)
4102{
4103    MigrationState *ms = migrate_get_current();
4104
4105    monitor_printf(mon, "globals:\n");
4106    monitor_printf(mon, "store-global-state: %s\n",
4107                   ms->store_global_state ? "on" : "off");
4108    monitor_printf(mon, "only-migratable: %s\n",
4109                   only_migratable ? "on" : "off");
4110    monitor_printf(mon, "send-configuration: %s\n",
4111                   ms->send_configuration ? "on" : "off");
4112    monitor_printf(mon, "send-section-footer: %s\n",
4113                   ms->send_section_footer ? "on" : "off");
4114    monitor_printf(mon, "decompress-error-check: %s\n",
4115                   ms->decompress_error_check ? "on" : "off");
4116    monitor_printf(mon, "clear-bitmap-shift: %u\n",
4117                   ms->clear_bitmap_shift);
4118}
4119
4120#define DEFINE_PROP_MIG_CAP(name, x)             \
4121    DEFINE_PROP_BOOL(name, MigrationState, enabled_capabilities[x], false)
4122
4123static Property migration_properties[] = {
4124    DEFINE_PROP_BOOL("store-global-state", MigrationState,
4125                     store_global_state, true),
4126    DEFINE_PROP_BOOL("send-configuration", MigrationState,
4127                     send_configuration, true),
4128    DEFINE_PROP_BOOL("send-section-footer", MigrationState,
4129                     send_section_footer, true),
4130    DEFINE_PROP_BOOL("decompress-error-check", MigrationState,
4131                      decompress_error_check, true),
4132    DEFINE_PROP_UINT8("x-clear-bitmap-shift", MigrationState,
4133                      clear_bitmap_shift, CLEAR_BITMAP_SHIFT_DEFAULT),
4134
4135    /* Migration parameters */
4136    DEFINE_PROP_UINT8("x-compress-level", MigrationState,
4137                      parameters.compress_level,
4138                      DEFAULT_MIGRATE_COMPRESS_LEVEL),
4139    DEFINE_PROP_UINT8("x-compress-threads", MigrationState,
4140                      parameters.compress_threads,
4141                      DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT),
4142    DEFINE_PROP_BOOL("x-compress-wait-thread", MigrationState,
4143                      parameters.compress_wait_thread, true),
4144    DEFINE_PROP_UINT8("x-decompress-threads", MigrationState,
4145                      parameters.decompress_threads,
4146                      DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT),
4147    DEFINE_PROP_UINT8("x-throttle-trigger-threshold", MigrationState,
4148                      parameters.throttle_trigger_threshold,
4149                      DEFAULT_MIGRATE_THROTTLE_TRIGGER_THRESHOLD),
4150    DEFINE_PROP_UINT8("x-cpu-throttle-initial", MigrationState,
4151                      parameters.cpu_throttle_initial,
4152                      DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL),
4153    DEFINE_PROP_UINT8("x-cpu-throttle-increment", MigrationState,
4154                      parameters.cpu_throttle_increment,
4155                      DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT),
4156    DEFINE_PROP_BOOL("x-cpu-throttle-tailslow", MigrationState,
4157                      parameters.cpu_throttle_tailslow, false),
4158    DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState,
4159                      parameters.max_bandwidth, MAX_THROTTLE),
4160    DEFINE_PROP_UINT64("x-downtime-limit", MigrationState,
4161                      parameters.downtime_limit,
4162                      DEFAULT_MIGRATE_SET_DOWNTIME),
4163    DEFINE_PROP_UINT32("x-checkpoint-delay", MigrationState,
4164                      parameters.x_checkpoint_delay,
4165                      DEFAULT_MIGRATE_X_CHECKPOINT_DELAY),
4166    DEFINE_PROP_UINT8("multifd-channels", MigrationState,
4167                      parameters.multifd_channels,
4168                      DEFAULT_MIGRATE_MULTIFD_CHANNELS),
4169    DEFINE_PROP_MULTIFD_COMPRESSION("multifd-compression", MigrationState,
4170                      parameters.multifd_compression,
4171                      DEFAULT_MIGRATE_MULTIFD_COMPRESSION),
4172    DEFINE_PROP_UINT8("multifd-zlib-level", MigrationState,
4173                      parameters.multifd_zlib_level,
4174                      DEFAULT_MIGRATE_MULTIFD_ZLIB_LEVEL),
4175    DEFINE_PROP_UINT8("multifd-zstd-level", MigrationState,
4176                      parameters.multifd_zstd_level,
4177                      DEFAULT_MIGRATE_MULTIFD_ZSTD_LEVEL),
4178    DEFINE_PROP_SIZE("xbzrle-cache-size", MigrationState,
4179                      parameters.xbzrle_cache_size,
4180                      DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE),
4181    DEFINE_PROP_SIZE("max-postcopy-bandwidth", MigrationState,
4182                      parameters.max_postcopy_bandwidth,
4183                      DEFAULT_MIGRATE_MAX_POSTCOPY_BANDWIDTH),
4184    DEFINE_PROP_UINT8("max-cpu-throttle", MigrationState,
4185                      parameters.max_cpu_throttle,
4186                      DEFAULT_MIGRATE_MAX_CPU_THROTTLE),
4187    DEFINE_PROP_SIZE("announce-initial", MigrationState,
4188                      parameters.announce_initial,
4189                      DEFAULT_MIGRATE_ANNOUNCE_INITIAL),
4190    DEFINE_PROP_SIZE("announce-max", MigrationState,
4191                      parameters.announce_max,
4192                      DEFAULT_MIGRATE_ANNOUNCE_MAX),
4193    DEFINE_PROP_SIZE("announce-rounds", MigrationState,
4194                      parameters.announce_rounds,
4195                      DEFAULT_MIGRATE_ANNOUNCE_ROUNDS),
4196    DEFINE_PROP_SIZE("announce-step", MigrationState,
4197                      parameters.announce_step,
4198                      DEFAULT_MIGRATE_ANNOUNCE_STEP),
4199
4200    /* Migration capabilities */
4201    DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
4202    DEFINE_PROP_MIG_CAP("x-rdma-pin-all", MIGRATION_CAPABILITY_RDMA_PIN_ALL),
4203    DEFINE_PROP_MIG_CAP("x-auto-converge", MIGRATION_CAPABILITY_AUTO_CONVERGE),
4204    DEFINE_PROP_MIG_CAP("x-zero-blocks", MIGRATION_CAPABILITY_ZERO_BLOCKS),
4205    DEFINE_PROP_MIG_CAP("x-compress", MIGRATION_CAPABILITY_COMPRESS),
4206    DEFINE_PROP_MIG_CAP("x-events", MIGRATION_CAPABILITY_EVENTS),
4207    DEFINE_PROP_MIG_CAP("x-postcopy-ram", MIGRATION_CAPABILITY_POSTCOPY_RAM),
4208    DEFINE_PROP_MIG_CAP("x-colo", MIGRATION_CAPABILITY_X_COLO),
4209    DEFINE_PROP_MIG_CAP("x-release-ram", MIGRATION_CAPABILITY_RELEASE_RAM),
4210    DEFINE_PROP_MIG_CAP("x-block", MIGRATION_CAPABILITY_BLOCK),
4211    DEFINE_PROP_MIG_CAP("x-return-path", MIGRATION_CAPABILITY_RETURN_PATH),
4212    DEFINE_PROP_MIG_CAP("x-multifd", MIGRATION_CAPABILITY_MULTIFD),
4213    DEFINE_PROP_MIG_CAP("x-background-snapshot",
4214            MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT),
4215
4216    DEFINE_PROP_END_OF_LIST(),
4217};
4218
4219static void migration_class_init(ObjectClass *klass, void *data)
4220{
4221    DeviceClass *dc = DEVICE_CLASS(klass);
4222
4223    dc->user_creatable = false;
4224    device_class_set_props(dc, migration_properties);
4225}
4226
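    /*
     * Undo migration_instance_init(): destroy the locks and semaphores
     * and free the owned strings and error.
     */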
4227static void migration_instance_finalize(Object *obj)
4228{
4229    MigrationState *ms = MIGRATION_OBJ(obj);
4230    MigrationParameters *params = &ms->parameters;
4231
4232    qemu_mutex_destroy(&ms->error_mutex);
4233    qemu_mutex_destroy(&ms->qemu_file_lock);
4234    g_free(params->tls_hostname);
4235    g_free(params->tls_creds);
4236    qemu_sem_destroy(&ms->wait_unplug_sem);
4237    qemu_sem_destroy(&ms->rate_limit_sem);
4238    qemu_sem_destroy(&ms->pause_sem);
4239    qemu_sem_destroy(&ms->postcopy_pause_sem);
4240    qemu_sem_destroy(&ms->postcopy_pause_rp_sem);
4241    qemu_sem_destroy(&ms->rp_state.rp_sem);
4242    error_free(ms->error);
4243}
4244
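    /*
     * Instance initializer for the migration object: set the initial
     * state, create the synchronization primitives, and mark every
     * parameter as present so that parameter checks cover all of them.
     */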
4245static void migration_instance_init(Object *obj)
4246{
4247    MigrationState *ms = MIGRATION_OBJ(obj);
4248    MigrationParameters *params = &ms->parameters;
4249
4250    ms->state = MIGRATION_STATUS_NONE;
4251    ms->mbps = -1;
4252    ms->pages_per_second = -1;
4253    qemu_sem_init(&ms->pause_sem, 0);
4254    qemu_mutex_init(&ms->error_mutex);
4255
4256    params->tls_hostname = g_strdup("");
4257    params->tls_creds = g_strdup("");
4258
4259    /* Set has_* up only for parameter checks */
4260    params->has_compress_level = true;
4261    params->has_compress_threads = true;
4262    params->has_decompress_threads = true;
4263    params->has_throttle_trigger_threshold = true;
4264    params->has_cpu_throttle_initial = true;
4265    params->has_cpu_throttle_increment = true;
4266    params->has_cpu_throttle_tailslow = true;
4267    params->has_max_bandwidth = true;
4268    params->has_downtime_limit = true;
4269    params->has_x_checkpoint_delay = true;
4270    params->has_block_incremental = true;
4271    params->has_multifd_channels = true;
4272    params->has_multifd_compression = true;
4273    params->has_multifd_zlib_level = true;
4274    params->has_multifd_zstd_level = true;
4275    params->has_xbzrle_cache_size = true;
4276    params->has_max_postcopy_bandwidth = true;
4277    params->has_max_cpu_throttle = true;
4278    params->has_announce_initial = true;
4279    params->has_announce_max = true;
4280    params->has_announce_rounds = true;
4281    params->has_announce_step = true;
4282
4283    qemu_sem_init(&ms->postcopy_pause_sem, 0);
4284    qemu_sem_init(&ms->postcopy_pause_rp_sem, 0);
4285    qemu_sem_init(&ms->rp_state.rp_sem, 0);
4286    qemu_sem_init(&ms->rate_limit_sem, 0);
4287    qemu_sem_init(&ms->wait_unplug_sem, 0);
4288    qemu_mutex_init(&ms->qemu_file_lock);
4289}
4290
4291/*
4292 * Return true if the checks pass, false otherwise.  The error will be
4293 * put inside errp if provided.
4294 */
4295static bool migration_object_check(MigrationState *ms, Error **errp)
4296{
4297    MigrationCapabilityStatusList *head = NULL;
4298    /* Assuming all off */
4299    bool cap_list[MIGRATION_CAPABILITY__MAX] = { 0 }, ret;
4300    int i;
4301
4302    if (!migrate_params_check(&ms->parameters, errp)) {
4303        return false;
4304    }
4305
4306    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
4307        if (ms->enabled_capabilities[i]) {
4308            QAPI_LIST_PREPEND(head, migrate_cap_add(i, true));
4309        }
4310    }
4311
4312    ret = migrate_caps_check(cap_list, head, errp);
4313
4314    /* It works with head == NULL */
4315    qapi_free_MigrationCapabilityStatusList(head);
4316
4317    return ret;
4318}
4319
4320static const TypeInfo migration_type = {
4321    .name = TYPE_MIGRATION,
4322    /*
4323     * NOTE: TYPE_MIGRATION is not really a device, as the object is
4324     * not created using qdev_new(), it is not attached to the qdev
4325     * device tree, and it is never realized.
4326     *
4327     * TODO: Make this TYPE_OBJECT once QOM provides something like
4328     * TYPE_DEVICE's "-global" properties.
4329     */
4330    .parent = TYPE_DEVICE,
4331    .class_init = migration_class_init,
4332    .class_size = sizeof(MigrationClass),
4333    .instance_size = sizeof(MigrationState),
4334    .instance_init = migration_instance_init,
4335    .instance_finalize = migration_instance_finalize,
4336};
4337
4338static void register_migration_types(void)
4339{
4340    type_register_static(&migration_type);
4341}
4342
4343type_init(register_migration_types);
4344