qemu/migration/migration.c
/*
 * QEMU live migration
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "migration/blocker.h"
#include "exec.h"
#include "fd.h"
#include "socket.h"
#include "rdma.h"
#include "ram.h"
#include "migration/global_state.h"
#include "migration/misc.h"
#include "migration.h"
#include "savevm.h"
#include "qemu-file-channel.h"
#include "qemu-file.h"
#include "migration/vmstate.h"
#include "block/block.h"
#include "qapi/error.h"
#include "qapi/qapi-commands-migration.h"
#include "qapi/qapi-events-migration.h"
#include "qapi/qmp/qerror.h"
#include "qapi/qmp/qnull.h"
#include "qemu/rcu.h"
#include "block.h"
#include "postcopy-ram.h"
#include "qemu/thread.h"
#include "trace.h"
#include "exec/target_page.h"
#include "io/channel-buffer.h"
#include "migration/colo.h"
#include "hw/boards.h"
#include "monitor/monitor.h"

#define MAX_THROTTLE  (32 << 20)      /* Migration transfer speed throttling */

/* Amount of time to allocate to each "chunk" of bandwidth-throttled
 * data. */
#define BUFFER_DELAY     100
#define XFER_LIMIT_RATIO (1000 / BUFFER_DELAY)

/* Time in milliseconds we are allowed to stop the source,
 * for sending the last part */
#define DEFAULT_MIGRATE_SET_DOWNTIME 300

/* Maximum migrate downtime set to 2000 seconds */
#define MAX_MIGRATE_DOWNTIME_SECONDS 2000
#define MAX_MIGRATE_DOWNTIME (MAX_MIGRATE_DOWNTIME_SECONDS * 1000)

/* Default compression thread count */
#define DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT 8
/* Default decompression thread count, usually decompression is at
 * least 4 times as fast as compression.*/
#define DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT 2
/* 0: no compression, 1: best speed, ... 9: best compression ratio */
#define DEFAULT_MIGRATE_COMPRESS_LEVEL 1
/* Define default autoconverge cpu throttle migration parameters */
#define DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL 20
#define DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT 10

/* Migration XBZRLE default cache size */
#define DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE (64 * 1024 * 1024)

/* The delay time (in ms) between two COLO checkpoints
 * Note: Please change this default value to 10000 when we support hybrid mode.
 */
#define DEFAULT_MIGRATE_X_CHECKPOINT_DELAY 200
#define DEFAULT_MIGRATE_MULTIFD_CHANNELS 2
#define DEFAULT_MIGRATE_MULTIFD_PAGE_COUNT 16

/* Background transfer rate for postcopy, 0 means unlimited, note
 * that page requests can still exceed this limit.
 */
#define DEFAULT_MIGRATE_MAX_POSTCOPY_BANDWIDTH 0

static NotifierList migration_state_notifiers =
    NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);

static bool deferred_incoming;

/* Messages sent on the return path from destination to source */
enum mig_rp_message_type {
    MIG_RP_MSG_INVALID = 0,  /* Must be 0 */
    MIG_RP_MSG_SHUT,         /* sibling will not send any more RP messages */
    MIG_RP_MSG_PONG,         /* Response to a PING; data (seq: be32 ) */

    MIG_RP_MSG_REQ_PAGES_ID, /* data (start: be64, len: be32, id: string) */
    MIG_RP_MSG_REQ_PAGES,    /* data (start: be64, len: be32) */
    MIG_RP_MSG_RECV_BITMAP,  /* send recved_bitmap back to source */
    MIG_RP_MSG_RESUME_ACK,   /* tell source that we are ready to resume */

    MIG_RP_MSG_MAX
};

/* When we add fault tolerance, we could have several
   migrations at once.  For now we don't need to add
   dynamic creation of migration */

static MigrationState *current_migration;
static MigrationIncomingState *current_incoming;

static bool migration_object_check(MigrationState *ms, Error **errp);
static int migration_maybe_pause(MigrationState *s,
                                 int *current_active_state,
                                 int new_state);

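/**
 * migration_object_init: Create the global migration objects
 *
 * Called once during startup: allocates the outgoing MigrationState
 * and the MigrationIncomingState, initialises their locks, events and
 * semaphores, and exits QEMU if the configured capabilities or
 * parameters fail validation.
 */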
void migration_object_init(void)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    Error *err = NULL;

    /* This can only be called once. */
    assert(!current_migration);
    current_migration = MIGRATION_OBJ(object_new(TYPE_MIGRATION));

    /*
     * Init the migrate incoming object as well, no matter whether
     * we'll use it or not.
     */
    assert(!current_incoming);
    current_incoming = g_new0(MigrationIncomingState, 1);
    current_incoming->state = MIGRATION_STATUS_NONE;
    current_incoming->postcopy_remote_fds =
        g_array_new(FALSE, TRUE, sizeof(struct PostCopyFD));
    qemu_mutex_init(&current_incoming->rp_mutex);
    qemu_event_init(&current_incoming->main_thread_load_event, false);
    qemu_sem_init(&current_incoming->postcopy_pause_sem_dst, 0);
    qemu_sem_init(&current_incoming->postcopy_pause_sem_fault, 0);

    init_dirty_bitmap_incoming_migration();

    if (!migration_object_check(current_migration, &err)) {
        error_report_err(err);
        exit(1);
    }

    /*
     * We cannot really do this in migration_instance_init() since at
     * that time global properties are not yet applied, so this value
     * would definitely be replaced by something else.
     */
    if (ms->enforce_config_section) {
        current_migration->send_configuration = true;
    }
}

void migration_object_finalize(void)
{
    object_unref(OBJECT(current_migration));
}

/* For outgoing */
MigrationState *migrate_get_current(void)
{
    /* This can only be called after the object has been created. */
    assert(current_migration);
    return current_migration;
}

MigrationIncomingState *migration_incoming_get_current(void)
{
    assert(current_incoming);
    return current_incoming;
}

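/**
 * migration_incoming_state_destroy: Tear down the incoming state
 *
 * Sends a final SHUT message on the return path if one is open,
 * closes both directions of the migration stream, and frees the
 * postcopy file-descriptor array.
 */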
void migration_incoming_state_destroy(void)
{
    struct MigrationIncomingState *mis = migration_incoming_get_current();

    if (mis->to_src_file) {
        /* Tell source that we are done */
        migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);
        qemu_fclose(mis->to_src_file);
        mis->to_src_file = NULL;
    }

    if (mis->from_src_file) {
        qemu_fclose(mis->from_src_file);
        mis->from_src_file = NULL;
    }
    if (mis->postcopy_remote_fds) {
        g_array_free(mis->postcopy_remote_fds, TRUE);
        mis->postcopy_remote_fds = NULL;
    }

    qemu_event_reset(&mis->main_thread_load_event);
}

static void migrate_generate_event(int new_state)
{
    if (migrate_use_events()) {
        qapi_event_send_migration(new_state, &error_abort);
    }
}

static bool migrate_late_block_activate(void)
{
    MigrationState *s;

    s = migrate_get_current();

    return s->enabled_capabilities[
        MIGRATION_CAPABILITY_LATE_BLOCK_ACTIVATE];
}

/*
 * Called on -incoming with a defer: uri.
 * The migration can be started later after any parameters have been
 * changed.
 */
static void deferred_incoming_migration(Error **errp)
{
    if (deferred_incoming) {
        error_setg(errp, "Incoming migration already deferred");
    }
    deferred_incoming = true;
}

/*
 * Send a message on the return channel back to the source
 * of the migration.
 */
static int migrate_send_rp_message(MigrationIncomingState *mis,
                                   enum mig_rp_message_type message_type,
                                   uint16_t len, void *data)
{
    int ret = 0;

    trace_migrate_send_rp_message((int)message_type, len);
    qemu_mutex_lock(&mis->rp_mutex);

    /*
     * It's possible that the file handle got lost due to network
     * failures.
     */
    if (!mis->to_src_file) {
        ret = -EIO;
        goto error;
    }

    qemu_put_be16(mis->to_src_file, (unsigned int)message_type);
    qemu_put_be16(mis->to_src_file, len);
    qemu_put_buffer(mis->to_src_file, data, len);
    qemu_fflush(mis->to_src_file);

    /* It's possible that the qemu file got an error during sending */
    ret = qemu_file_get_error(mis->to_src_file);

error:
    qemu_mutex_unlock(&mis->rp_mutex);
    return ret;
}

/* Request a range of pages from the source VM at the given
 * start address.
 *   rbname: Name of the RAMBlock to request the page in, if NULL it's the same
 *           as the last request (a name must have been given previously)
 *   Start: Address offset within the RB
 *   Len: Length in bytes required - must be a multiple of pagesize
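 *
 * For example (hypothetical values), to ask the source to resend one
 * 4 KiB page at offset 0x2000 of the RAMBlock named "pc.ram":
 *
 *   migrate_send_rp_req_pages(mis, "pc.ram", 0x2000, 4096);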
 */
int migrate_send_rp_req_pages(MigrationIncomingState *mis, const char *rbname,
                              ram_addr_t start, size_t len)
{
    uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname up to 256 */
    size_t msglen = 12; /* start + len */
    enum mig_rp_message_type msg_type;

    *(uint64_t *)bufc = cpu_to_be64((uint64_t)start);
    *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len);

    if (rbname) {
        int rbname_len = strlen(rbname);
        assert(rbname_len < 256);

        bufc[msglen++] = rbname_len;
        memcpy(bufc + msglen, rbname, rbname_len);
        msglen += rbname_len;
        msg_type = MIG_RP_MSG_REQ_PAGES_ID;
    } else {
        msg_type = MIG_RP_MSG_REQ_PAGES;
    }

    return migrate_send_rp_message(mis, msg_type, msglen, bufc);
}

void qemu_start_incoming_migration(const char *uri, Error **errp)
{
    const char *p;

    qapi_event_send_migration(MIGRATION_STATUS_SETUP, &error_abort);
    if (!strcmp(uri, "defer")) {
        deferred_incoming_migration(errp);
    } else if (strstart(uri, "tcp:", &p)) {
        tcp_start_incoming_migration(p, errp);
#ifdef CONFIG_RDMA
    } else if (strstart(uri, "rdma:", &p)) {
        rdma_start_incoming_migration(p, errp);
#endif
    } else if (strstart(uri, "exec:", &p)) {
        exec_start_incoming_migration(p, errp);
    } else if (strstart(uri, "unix:", &p)) {
        unix_start_incoming_migration(p, errp);
    } else if (strstart(uri, "fd:", &p)) {
        fd_start_incoming_migration(p, errp);
    } else {
        error_setg(errp, "unknown migration protocol: %s", uri);
    }
}

static void process_incoming_migration_bh(void *opaque)
{
    Error *local_err = NULL;
    MigrationIncomingState *mis = opaque;

    /* If capability late_block_activate is set:
     * Only fire up the block code now if we're going to restart the
     * VM, else 'cont' will do it.
     * This causes file locking to happen; so we don't want it to happen
     * unless we really are starting the VM.
     */
    if (!migrate_late_block_activate() ||
         (autostart && (!global_state_received() ||
            global_state_get_runstate() == RUN_STATE_RUNNING))) {
        /* Make sure all file formats flush their mutable metadata.
         * If we get an error here, just don't restart the VM yet. */
        bdrv_invalidate_cache_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
            local_err = NULL;
            autostart = false;
        }
    }

    /*
     * This must happen after all error conditions are dealt with and
     * we're sure the VM is going to be running on this host.
     */
    qemu_announce_self();

    if (multifd_load_cleanup(&local_err) != 0) {
        error_report_err(local_err);
        autostart = false;
    }
    /* If global state section was not received or we are in running
       state, we need to obey autostart. Any other state is set with
       runstate_set. */

    dirty_bitmap_mig_before_vm_start();

    if (!global_state_received() ||
        global_state_get_runstate() == RUN_STATE_RUNNING) {
        if (autostart) {
            vm_start();
        } else {
            runstate_set(RUN_STATE_PAUSED);
        }
    } else {
        runstate_set(global_state_get_runstate());
    }
    /*
     * This must happen after any state changes since as soon as an external
     * observer sees this event they might start to prod at the VM assuming
     * it's ready to use.
     */
    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                      MIGRATION_STATUS_COMPLETED);
    qemu_bh_delete(mis->bh);
    migration_incoming_state_destroy();
}

static void process_incoming_migration_co(void *opaque)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    PostcopyState ps;
    int ret;

    assert(mis->from_src_file);
    mis->largest_page_size = qemu_ram_pagesize_largest();
    postcopy_state_set(POSTCOPY_INCOMING_NONE);
    migrate_set_state(&mis->state, MIGRATION_STATUS_NONE,
                      MIGRATION_STATUS_ACTIVE);
    ret = qemu_loadvm_state(mis->from_src_file);

    ps = postcopy_state_get();
    trace_process_incoming_migration_co_end(ret, ps);
    if (ps != POSTCOPY_INCOMING_NONE) {
        if (ps == POSTCOPY_INCOMING_ADVISE) {
            /*
             * Where a migration had postcopy enabled (and thus went to advise)
             * but managed to complete within the precopy period, we can use
             * the normal exit.
             */
            postcopy_ram_incoming_cleanup(mis);
        } else if (ret >= 0) {
            /*
             * Postcopy was started, cleanup should happen at the end of the
             * postcopy thread.
             */
            trace_process_incoming_migration_co_postcopy_end_main();
            return;
        }
        /* Else if something went wrong then just fall out of the normal exit */
    }

    /* we get COLO info, and know if we are in COLO mode */
    if (!ret && migration_incoming_enable_colo()) {
        mis->migration_incoming_co = qemu_coroutine_self();
        qemu_thread_create(&mis->colo_incoming_thread, "COLO incoming",
             colo_process_incoming_thread, mis, QEMU_THREAD_JOINABLE);
        mis->have_colo_incoming_thread = true;
        qemu_coroutine_yield();

        /* Wait for the COLO incoming thread to exit before freeing resources */
        qemu_thread_join(&mis->colo_incoming_thread);
    }

    if (ret < 0) {
        Error *local_err = NULL;

        migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                          MIGRATION_STATUS_FAILED);
        error_report("load of migration failed: %s", strerror(-ret));
        qemu_fclose(mis->from_src_file);
        if (multifd_load_cleanup(&local_err) != 0) {
            error_report_err(local_err);
        }
        exit(EXIT_FAILURE);
    }
    mis->bh = qemu_bh_new(process_incoming_migration_bh, mis);
    qemu_bh_schedule(mis->bh);
}

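/**
 * migration_incoming_setup: Prepare to receive a migration stream
 *
 * Sets up the multifd receive side (exiting QEMU on failure), records
 * @f as the stream from the source if none is set yet, and switches
 * the file to non-blocking mode for use from the main-loop coroutine.
 */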
static void migration_incoming_setup(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (multifd_load_setup() != 0) {
        /* We haven't been able to create multifd threads;
           nothing better to do. */
        exit(EXIT_FAILURE);
    }

    if (!mis->from_src_file) {
        mis->from_src_file = f;
    }
    qemu_file_set_blocking(f, false);
}

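/**
 * migration_incoming_process: Start handling the incoming stream
 *
 * Runs process_incoming_migration_co() in a new coroutine, which loads
 * the VM state and drives the migration to COMPLETED or FAILED.
 */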
void migration_incoming_process(void)
{
    Coroutine *co = qemu_coroutine_create(process_incoming_migration_co, NULL);
    qemu_coroutine_enter(co);
}

/* Returns true if recovered from a paused migration, otherwise false */
static bool postcopy_try_recover(QEMUFile *f)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
        /* Resumed from a paused postcopy migration */

        mis->from_src_file = f;
        /* Postcopy has a standalone thread to do the vm load */
        qemu_file_set_blocking(f, true);

        /* Re-configure the return path */
        mis->to_src_file = qemu_file_get_return_path(f);

        migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
                          MIGRATION_STATUS_POSTCOPY_RECOVER);

        /*
         * Here, we only wake up the main loading thread (while the
         * fault thread will still be waiting), so that we can receive
         * commands from the source now, and answer them if needed. The
         * fault thread will be woken up later, once we are sure that
         * the source is ready to reply to page requests.
         */
        qemu_sem_post(&mis->postcopy_pause_sem_dst);
        return true;
    }

    return false;
}

void migration_fd_process_incoming(QEMUFile *f)
{
    if (postcopy_try_recover(f)) {
        return;
    }

    migration_incoming_setup(f);
    migration_incoming_process();
}

void migration_ioc_process_incoming(QIOChannel *ioc)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    bool start_migration;

    if (!mis->from_src_file) {
        /* The first connection (multifd may have multiple) */
        QEMUFile *f = qemu_fopen_channel_input(ioc);

        /* If it's a recovery, we're done */
        if (postcopy_try_recover(f)) {
            return;
        }

        migration_incoming_setup(f);

        /*
         * Common migration only needs one channel, so we can start
         * right now.  Multifd needs more than one channel, we wait.
         */
        start_migration = !migrate_use_multifd();
    } else {
        /* Multiple connections */
        assert(migrate_use_multifd());
        start_migration = multifd_recv_new_channel(ioc);
    }

    if (start_migration) {
        migration_incoming_process();
    }
}

/**
 * @migration_has_all_channels: We have received all channels that we need
 *
 * Returns true when we have got connections to all the channels that
 * we need for migration.
 */
bool migration_has_all_channels(void)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    bool all_channels;

    all_channels = multifd_recv_all_channels_created();

    return all_channels && mis->from_src_file != NULL;
}

/*
 * Send a 'SHUT' message on the return channel with the given value
 * to indicate that we've finished with the RP.  Non-0 value indicates
 * error.
 */
void migrate_send_rp_shut(MigrationIncomingState *mis,
                          uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf);
}

/*
 * Send a 'PONG' message on the return channel with the given value
 * (normally in response to a 'PING')
 */
void migrate_send_rp_pong(MigrationIncomingState *mis,
                          uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf);
}

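/**
 * migrate_send_rp_recv_bitmap: Report the received bitmap of a block
 *
 * Only used during postcopy recovery: sends the length-prefixed block
 * name as a MIG_RP_MSG_RECV_BITMAP message, then streams the bitmap of
 * pages already received for that RAMBlock back to the source.
 */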
void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
                                 char *block_name)
{
    char buf[512];
    int len;
    int64_t res;

    /*
     * First, we send the header part. It contains only the len of
     * idstr, and the idstr itself.
     */
    len = strlen(block_name);
    buf[0] = len;
    memcpy(buf + 1, block_name, len);

    if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: MSG_RP_RECV_BITMAP only used for recovery",
                     __func__);
        return;
    }

    migrate_send_rp_message(mis, MIG_RP_MSG_RECV_BITMAP, len + 1, buf);

    /*
     * Next, we dump the received bitmap to the stream.
     *
     * TODO: currently we are safe since we are the only one that is
     * using the to_src_file handle (fault thread is still paused),
     * and it's ok even not taking the mutex. However the best way is
     * to take the lock before sending the message header, and release
     * the lock after sending the bitmap.
     */
    qemu_mutex_lock(&mis->rp_mutex);
    res = ramblock_recv_bitmap_send(mis->to_src_file, block_name);
    qemu_mutex_unlock(&mis->rp_mutex);

    trace_migrate_send_rp_recv_bitmap(block_name, res);
}

void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value)
{
    uint32_t buf;

    buf = cpu_to_be32(value);
    migrate_send_rp_message(mis, MIG_RP_MSG_RESUME_ACK, sizeof(buf), &buf);
}

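/*
 * Return the current on/off state of every migration capability as a
 * newly allocated list; block migration is omitted when QEMU is built
 * without CONFIG_LIVE_BLOCK_MIGRATION.
 */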
MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp)
{
    MigrationCapabilityStatusList *head = NULL;
    MigrationCapabilityStatusList *caps;
    MigrationState *s = migrate_get_current();
    int i;

    caps = NULL; /* silence compiler warning */
    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
#ifndef CONFIG_LIVE_BLOCK_MIGRATION
        if (i == MIGRATION_CAPABILITY_BLOCK) {
            continue;
        }
#endif
        if (head == NULL) {
            head = g_malloc0(sizeof(*caps));
            caps = head;
        } else {
            caps->next = g_malloc0(sizeof(*caps));
            caps = caps->next;
        }
        caps->value =
            g_malloc(sizeof(*caps->value));
        caps->value->capability = i;
        caps->value->state = s->enabled_capabilities[i];
    }

    return head;
}

MigrationParameters *qmp_query_migrate_parameters(Error **errp)
{
    MigrationParameters *params;
    MigrationState *s = migrate_get_current();

    /* TODO use QAPI_CLONE() instead of duplicating it inline */
    params = g_malloc0(sizeof(*params));
    params->has_compress_level = true;
    params->compress_level = s->parameters.compress_level;
    params->has_compress_threads = true;
    params->compress_threads = s->parameters.compress_threads;
    params->has_decompress_threads = true;
    params->decompress_threads = s->parameters.decompress_threads;
    params->has_cpu_throttle_initial = true;
    params->cpu_throttle_initial = s->parameters.cpu_throttle_initial;
    params->has_cpu_throttle_increment = true;
    params->cpu_throttle_increment = s->parameters.cpu_throttle_increment;
    params->has_tls_creds = true;
    params->tls_creds = g_strdup(s->parameters.tls_creds);
    params->has_tls_hostname = true;
    params->tls_hostname = g_strdup(s->parameters.tls_hostname);
    params->has_max_bandwidth = true;
    params->max_bandwidth = s->parameters.max_bandwidth;
    params->has_downtime_limit = true;
    params->downtime_limit = s->parameters.downtime_limit;
    params->has_x_checkpoint_delay = true;
    params->x_checkpoint_delay = s->parameters.x_checkpoint_delay;
    params->has_block_incremental = true;
    params->block_incremental = s->parameters.block_incremental;
    params->has_x_multifd_channels = true;
    params->x_multifd_channels = s->parameters.x_multifd_channels;
    params->has_x_multifd_page_count = true;
    params->x_multifd_page_count = s->parameters.x_multifd_page_count;
    params->has_xbzrle_cache_size = true;
    params->xbzrle_cache_size = s->parameters.xbzrle_cache_size;
    params->has_max_postcopy_bandwidth = true;
    params->max_postcopy_bandwidth = s->parameters.max_postcopy_bandwidth;

    return params;
}

/*
 * Return true if we're already in the middle of a migration
 * (i.e. any of the active or setup states)
 */
static bool migration_is_setup_or_active(int state)
{
    switch (state) {
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
        return true;

    default:
        return false;

    }
}

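/*
 * Fill in the RAM statistics of @info for 'query-migrate': copies the
 * global ram_counters, plus the XBZRLE cache statistics and the CPU
 * throttle percentage when those features are in use.
 */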
static void populate_ram_info(MigrationInfo *info, MigrationState *s)
{
    info->has_ram = true;
    info->ram = g_malloc0(sizeof(*info->ram));
    info->ram->transferred = ram_counters.transferred;
    info->ram->total = ram_bytes_total();
    info->ram->duplicate = ram_counters.duplicate;
    /* legacy value.  It is not used anymore */
    info->ram->skipped = 0;
    info->ram->normal = ram_counters.normal;
    info->ram->normal_bytes = ram_counters.normal *
        qemu_target_page_size();
    info->ram->mbps = s->mbps;
    info->ram->dirty_sync_count = ram_counters.dirty_sync_count;
    info->ram->postcopy_requests = ram_counters.postcopy_requests;
    info->ram->page_size = qemu_target_page_size();
    info->ram->multifd_bytes = ram_counters.multifd_bytes;

    if (migrate_use_xbzrle()) {
        info->has_xbzrle_cache = true;
        info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache));
        info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size();
        info->xbzrle_cache->bytes = xbzrle_counters.bytes;
        info->xbzrle_cache->pages = xbzrle_counters.pages;
        info->xbzrle_cache->cache_miss = xbzrle_counters.cache_miss;
        info->xbzrle_cache->cache_miss_rate = xbzrle_counters.cache_miss_rate;
        info->xbzrle_cache->overflow = xbzrle_counters.overflow;
    }

    if (cpu_throttle_active()) {
        info->has_cpu_throttle_percentage = true;
        info->cpu_throttle_percentage = cpu_throttle_get_percentage();
    }

    if (s->state != MIGRATION_STATUS_COMPLETED) {
        info->ram->remaining = ram_bytes_remaining();
        info->ram->dirty_pages_rate = ram_counters.dirty_pages_rate;
    }
}

static void populate_disk_info(MigrationInfo *info)
{
    if (blk_mig_active()) {
        info->has_disk = true;
        info->disk = g_malloc0(sizeof(*info->disk));
        info->disk->transferred = blk_mig_bytes_transferred();
        info->disk->remaining = blk_mig_bytes_remaining();
        info->disk->total = blk_mig_bytes_total();
    }
}

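/*
 * Fill in @info with the source-side view of the migration: timing and
 * RAM/disk statistics while active or completed, the error description
 * on failure, and nothing at all when no migration has ever run (so
 * that a destination-side status is not overwritten).
 */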
static void fill_source_migration_info(MigrationInfo *info)
{
    MigrationState *s = migrate_get_current();

    switch (s->state) {
    case MIGRATION_STATUS_NONE:
        /* no migration has happened ever */
        /* do not overwrite destination migration status */
        return;
        break;
    case MIGRATION_STATUS_SETUP:
        info->has_status = true;
        info->has_total_time = false;
        break;
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_CANCELLING:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
        /* TODO add some postcopy stats */
        info->has_status = true;
        info->has_total_time = true;
        info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME)
            - s->start_time;
        info->has_expected_downtime = true;
        info->expected_downtime = s->expected_downtime;
        info->has_setup_time = true;
        info->setup_time = s->setup_time;

        populate_ram_info(info, s);
        populate_disk_info(info);
        break;
    case MIGRATION_STATUS_COLO:
        info->has_status = true;
        /* TODO: display COLO specific information (checkpoint info etc.) */
        break;
    case MIGRATION_STATUS_COMPLETED:
        info->has_status = true;
        info->has_total_time = true;
        info->total_time = s->total_time;
        info->has_downtime = true;
        info->downtime = s->downtime;
        info->has_setup_time = true;
        info->setup_time = s->setup_time;

        populate_ram_info(info, s);
        break;
    case MIGRATION_STATUS_FAILED:
        info->has_status = true;
        if (s->error) {
            info->has_error_desc = true;
            info->error_desc = g_strdup(error_get_pretty(s->error));
        }
        break;
    case MIGRATION_STATUS_CANCELLED:
        info->has_status = true;
        break;
    }
    info->status = s->state;
}

/**
 * @migration_caps_check - check capability validity
 *
 * @cap_list: old capability list, array of bool
 * @params: new capabilities to be applied soon
 * @errp: set *errp if the check failed, with reason
 *
 * Returns true if check passed, otherwise false.
 */
static bool migrate_caps_check(bool *cap_list,
                               MigrationCapabilityStatusList *params,
                               Error **errp)
{
    MigrationCapabilityStatusList *cap;
    bool old_postcopy_cap;
    MigrationIncomingState *mis = migration_incoming_get_current();

    old_postcopy_cap = cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM];

    for (cap = params; cap; cap = cap->next) {
        cap_list[cap->value->capability] = cap->value->state;
    }

#ifndef CONFIG_LIVE_BLOCK_MIGRATION
    if (cap_list[MIGRATION_CAPABILITY_BLOCK]) {
        error_setg(errp, "QEMU compiled without old-style (blk/-b, inc/-i) "
                   "block migration");
        error_append_hint(errp, "Use drive_mirror+NBD instead.\n");
        return false;
    }
#endif

    if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) {
        if (cap_list[MIGRATION_CAPABILITY_COMPRESS]) {
            /* The decompression threads asynchronously write into RAM
             * rather than use the atomic copies needed to avoid
             * userfaulting.  It should be possible to fix the decompression
             * threads for compatibility in future.
             */
            error_setg(errp, "Postcopy is not currently compatible "
                       "with compression");
            return false;
        }

        /* This check is reasonably expensive, so only do it when the
         * capability is being set for the first time; also, only the
         * destination needs the special support.
         */
        if (!old_postcopy_cap && runstate_check(RUN_STATE_INMIGRATE) &&
            !postcopy_ram_supported_by_host(mis)) {
            /* postcopy_ram_supported_by_host will have emitted a more
             * detailed message
             */
            error_setg(errp, "Postcopy is not supported");
            return false;
        }
    }

    return true;
}

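/*
 * The destination-side counterpart of fill_source_migration_info();
 * on completion it also adds the postcopy statistics.
 */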
static void fill_destination_migration_info(MigrationInfo *info)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    switch (mis->state) {
    case MIGRATION_STATUS_NONE:
        return;
        break;
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_CANCELLING:
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_PAUSED:
    case MIGRATION_STATUS_POSTCOPY_RECOVER:
    case MIGRATION_STATUS_FAILED:
    case MIGRATION_STATUS_COLO:
        info->has_status = true;
        break;
    case MIGRATION_STATUS_COMPLETED:
        info->has_status = true;
        fill_destination_postcopy_migration_info(info);
        break;
    }
    info->status = mis->state;
}

MigrationInfo *qmp_query_migrate(Error **errp)
{
    MigrationInfo *info = g_malloc0(sizeof(*info));

    fill_destination_migration_info(info);
    fill_source_migration_info(info);

    return info;
}

void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
                                  Error **errp)
{
    MigrationState *s = migrate_get_current();
    MigrationCapabilityStatusList *cap;
    bool cap_list[MIGRATION_CAPABILITY__MAX];

    if (migration_is_setup_or_active(s->state)) {
        error_setg(errp, QERR_MIGRATION_ACTIVE);
        return;
    }

    memcpy(cap_list, s->enabled_capabilities, sizeof(cap_list));
    if (!migrate_caps_check(cap_list, params, errp)) {
        return;
    }

    for (cap = params; cap; cap = cap->next) {
        s->enabled_capabilities[cap->value->capability] = cap->value->state;
    }
}

/*
 * Check whether the parameters are valid. Error will be put into errp
 * (if provided). Return true if valid, otherwise false.
 */
static bool migrate_params_check(MigrationParameters *params, Error **errp)
{
    if (params->has_compress_level &&
        (params->compress_level > 9)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level",
                   "is invalid, it should be in the range of 0 to 9");
        return false;
    }

    if (params->has_compress_threads && (params->compress_threads < 1)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "compress_threads",
                   "is invalid, it should be in the range of 1 to 255");
        return false;
    }

    if (params->has_decompress_threads && (params->decompress_threads < 1)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "decompress_threads",
                   "is invalid, it should be in the range of 1 to 255");
        return false;
    }

    if (params->has_cpu_throttle_initial &&
        (params->cpu_throttle_initial < 1 ||
         params->cpu_throttle_initial > 99)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "cpu_throttle_initial",
                   "an integer in the range of 1 to 99");
        return false;
    }

    if (params->has_cpu_throttle_increment &&
        (params->cpu_throttle_increment < 1 ||
         params->cpu_throttle_increment > 99)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "cpu_throttle_increment",
                   "an integer in the range of 1 to 99");
        return false;
    }

    if (params->has_max_bandwidth && (params->max_bandwidth > SIZE_MAX)) {
        error_setg(errp, "Parameter 'max_bandwidth' expects an integer in the"
                         " range of 0 to %zu bytes/second", SIZE_MAX);
        return false;
    }

    if (params->has_downtime_limit &&
        (params->downtime_limit > MAX_MIGRATE_DOWNTIME)) {
        error_setg(errp, "Parameter 'downtime_limit' expects an integer in "
                         "the range of 0 to %d milliseconds",
                         MAX_MIGRATE_DOWNTIME);
        return false;
    }

    /* x_checkpoint_delay is now always positive */

    if (params->has_x_multifd_channels && (params->x_multifd_channels < 1)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "multifd_channels",
                   "is invalid, it should be in the range of 1 to 255");
        return false;
    }
    if (params->has_x_multifd_page_count &&
        (params->x_multifd_page_count < 1 ||
         params->x_multifd_page_count > 10000)) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "multifd_page_count",
                   "is invalid, it should be in the range of 1 to 10000");
        return false;
    }

    if (params->has_xbzrle_cache_size &&
        (params->xbzrle_cache_size < qemu_target_page_size() ||
         !is_power_of_2(params->xbzrle_cache_size))) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
                   "xbzrle_cache_size",
                   "is invalid, it should be bigger than target page size"
                   " and a power of two");
        return false;
    }

    return true;
}

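/*
 * Build in @dest a copy of the current parameters with @params applied
 * on top, so the result can be validated with migrate_params_check()
 * before anything is committed to the real MigrationState.
 */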
static void migrate_params_test_apply(MigrateSetParameters *params,
                                      MigrationParameters *dest)
{
    *dest = migrate_get_current()->parameters;

    /* TODO use QAPI_CLONE() instead of duplicating it inline */

    if (params->has_compress_level) {
        dest->compress_level = params->compress_level;
    }

    if (params->has_compress_threads) {
        dest->compress_threads = params->compress_threads;
    }

    if (params->has_decompress_threads) {
        dest->decompress_threads = params->decompress_threads;
    }

    if (params->has_cpu_throttle_initial) {
        dest->cpu_throttle_initial = params->cpu_throttle_initial;
    }

    if (params->has_cpu_throttle_increment) {
        dest->cpu_throttle_increment = params->cpu_throttle_increment;
    }

    if (params->has_tls_creds) {
        assert(params->tls_creds->type == QTYPE_QSTRING);
        dest->tls_creds = g_strdup(params->tls_creds->u.s);
    }

    if (params->has_tls_hostname) {
        assert(params->tls_hostname->type == QTYPE_QSTRING);
        dest->tls_hostname = g_strdup(params->tls_hostname->u.s);
    }

    if (params->has_max_bandwidth) {
        dest->max_bandwidth = params->max_bandwidth;
    }

    if (params->has_downtime_limit) {
        dest->downtime_limit = params->downtime_limit;
    }

    if (params->has_x_checkpoint_delay) {
        dest->x_checkpoint_delay = params->x_checkpoint_delay;
    }

    if (params->has_block_incremental) {
        dest->block_incremental = params->block_incremental;
    }
    if (params->has_x_multifd_channels) {
        dest->x_multifd_channels = params->x_multifd_channels;
    }
    if (params->has_x_multifd_page_count) {
        dest->x_multifd_page_count = params->x_multifd_page_count;
    }
    if (params->has_xbzrle_cache_size) {
        dest->xbzrle_cache_size = params->xbzrle_cache_size;
    }
    if (params->has_max_postcopy_bandwidth) {
        dest->max_postcopy_bandwidth = params->max_postcopy_bandwidth;
    }
}

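/*
 * Commit @params to the global MigrationState.  Unlike
 * migrate_params_test_apply() this also triggers the side effects:
 * updating the rate limit, notifying COLO of a new checkpoint delay,
 * and resizing the XBZRLE cache.
 */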
static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
{
    MigrationState *s = migrate_get_current();

    /* TODO use QAPI_CLONE() instead of duplicating it inline */

    if (params->has_compress_level) {
        s->parameters.compress_level = params->compress_level;
    }

    if (params->has_compress_threads) {
        s->parameters.compress_threads = params->compress_threads;
    }

    if (params->has_decompress_threads) {
        s->parameters.decompress_threads = params->decompress_threads;
    }

    if (params->has_cpu_throttle_initial) {
        s->parameters.cpu_throttle_initial = params->cpu_throttle_initial;
    }

    if (params->has_cpu_throttle_increment) {
        s->parameters.cpu_throttle_increment = params->cpu_throttle_increment;
    }

    if (params->has_tls_creds) {
        g_free(s->parameters.tls_creds);
        assert(params->tls_creds->type == QTYPE_QSTRING);
        s->parameters.tls_creds = g_strdup(params->tls_creds->u.s);
    }

    if (params->has_tls_hostname) {
        g_free(s->parameters.tls_hostname);
        assert(params->tls_hostname->type == QTYPE_QSTRING);
        s->parameters.tls_hostname = g_strdup(params->tls_hostname->u.s);
    }

    if (params->has_max_bandwidth) {
        s->parameters.max_bandwidth = params->max_bandwidth;
        if (s->to_dst_file) {
            qemu_file_set_rate_limit(s->to_dst_file,
                                s->parameters.max_bandwidth / XFER_LIMIT_RATIO);
        }
    }

    if (params->has_downtime_limit) {
        s->parameters.downtime_limit = params->downtime_limit;
    }

    if (params->has_x_checkpoint_delay) {
        s->parameters.x_checkpoint_delay = params->x_checkpoint_delay;
        if (migration_in_colo_state()) {
            colo_checkpoint_notify(s);
        }
    }

    if (params->has_block_incremental) {
        s->parameters.block_incremental = params->block_incremental;
    }
    if (params->has_x_multifd_channels) {
        s->parameters.x_multifd_channels = params->x_multifd_channels;
    }
    if (params->has_x_multifd_page_count) {
        s->parameters.x_multifd_page_count = params->x_multifd_page_count;
    }
    if (params->has_xbzrle_cache_size) {
        s->parameters.xbzrle_cache_size = params->xbzrle_cache_size;
        xbzrle_cache_resize(params->xbzrle_cache_size, errp);
    }
    if (params->has_max_postcopy_bandwidth) {
        s->parameters.max_postcopy_bandwidth = params->max_postcopy_bandwidth;
    }
}

void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp)
{
    MigrationParameters tmp;

    /* TODO Rewrite "" to null instead */
    if (params->has_tls_creds
        && params->tls_creds->type == QTYPE_QNULL) {
        qobject_unref(params->tls_creds->u.n);
        params->tls_creds->type = QTYPE_QSTRING;
        params->tls_creds->u.s = strdup("");
    }
    /* TODO Rewrite "" to null instead */
    if (params->has_tls_hostname
        && params->tls_hostname->type == QTYPE_QNULL) {
        qobject_unref(params->tls_hostname->u.n);
        params->tls_hostname->type = QTYPE_QSTRING;
        params->tls_hostname->u.s = strdup("");
    }

    migrate_params_test_apply(params, &tmp);

    if (!migrate_params_check(&tmp, errp)) {
        /* Invalid parameter */
        return;
    }

    migrate_params_apply(params, errp);
}


void qmp_migrate_start_postcopy(Error **errp)
{
    MigrationState *s = migrate_get_current();

    if (!migrate_postcopy()) {
        error_setg(errp, "Enable postcopy with migrate_set_capability before"
                         " the start of migration");
        return;
    }

    if (s->state == MIGRATION_STATUS_NONE) {
        error_setg(errp, "Postcopy must be started after migration has been"
                         " started");
        return;
    }
    /*
     * we don't error if migration has finished since that would be racy
     * with issuing this command.
     */
    atomic_set(&s->start_postcopy, true);
}

/* shared migration helpers */

void migrate_set_state(int *state, int old_state, int new_state)
{
    assert(new_state < MIGRATION_STATUS__MAX);
    if (atomic_cmpxchg(state, old_state, new_state) == old_state) {
        trace_migrate_set_state(MigrationStatus_str(new_state));
        migrate_generate_event(new_state);
    }
}

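/*
 * Prepend one capability/state pair to @list and return the new head.
 * The caller owns the result and eventually frees it with
 * qapi_free_MigrationCapabilityStatusList().
 */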
static MigrationCapabilityStatusList *migrate_cap_add(
    MigrationCapabilityStatusList *list,
    MigrationCapability index,
    bool state)
{
    MigrationCapabilityStatusList *cap;

    cap = g_new0(MigrationCapabilityStatusList, 1);
    cap->value = g_new0(MigrationCapabilityStatus, 1);
    cap->value->capability = index;
    cap->value->state = state;
    cap->next = list;

    return cap;
}

void migrate_set_block_enabled(bool value, Error **errp)
{
    MigrationCapabilityStatusList *cap;

    cap = migrate_cap_add(NULL, MIGRATION_CAPABILITY_BLOCK, value);
    qmp_migrate_set_capabilities(cap, errp);
    qapi_free_MigrationCapabilityStatusList(cap);
}

static void migrate_set_block_incremental(MigrationState *s, bool value)
{
    s->parameters.block_incremental = value;
}

static void block_cleanup_parameters(MigrationState *s)
{
    if (s->must_remove_block_options) {
        /* setting to false can never fail */
        migrate_set_block_enabled(false, &error_abort);
        migrate_set_block_incremental(s, false);
        s->must_remove_block_options = false;
    }
}

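/*
 * Bottom half run once an outgoing migration ends: joins the migration
 * thread, tears down multifd, closes the outgoing stream outside the
 * file lock, moves CANCELLING to CANCELLED, and notifies the
 * state-change listeners.
 */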
static void migrate_fd_cleanup(void *opaque)
{
    MigrationState *s = opaque;

    qemu_bh_delete(s->cleanup_bh);
    s->cleanup_bh = NULL;

    qemu_savevm_state_cleanup();

    if (s->to_dst_file) {
        Error *local_err = NULL;
        QEMUFile *tmp;

        trace_migrate_fd_cleanup();
        qemu_mutex_unlock_iothread();
        if (s->migration_thread_running) {
            qemu_thread_join(&s->thread);
            s->migration_thread_running = false;
        }
        qemu_mutex_lock_iothread();

        if (multifd_save_cleanup(&local_err) != 0) {
            error_report_err(local_err);
        }
        qemu_mutex_lock(&s->qemu_file_lock);
        tmp = s->to_dst_file;
        s->to_dst_file = NULL;
        qemu_mutex_unlock(&s->qemu_file_lock);
        /*
         * Close the file handle without the lock to make sure the
         * critical section won't block for long.
         */
        qemu_fclose(tmp);
    }

    assert((s->state != MIGRATION_STATUS_ACTIVE) &&
           (s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE));

    if (s->state == MIGRATION_STATUS_CANCELLING) {
        migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING,
                          MIGRATION_STATUS_CANCELLED);
    }

    if (s->error) {
        /* It is used on info migrate.  We can't free it */
        error_report_err(error_copy(s->error));
    }
    notifier_list_notify(&migration_state_notifiers, s);
    block_cleanup_parameters(s);
}

void migrate_set_error(MigrationState *s, const Error *error)
{
    qemu_mutex_lock(&s->error_mutex);
    if (!s->error) {
        s->error = error_copy(error);
    }
    qemu_mutex_unlock(&s->error_mutex);
}

void migrate_fd_error(MigrationState *s, const Error *error)
{
    trace_migrate_fd_error(error_get_pretty(error));
    assert(s->to_dst_file == NULL);
    migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
                      MIGRATION_STATUS_FAILED);
    migrate_set_error(s, error);
}

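/*
 * Force an outgoing migration to stop: move any setup/active state to
 * CANCELLING, kick a migration paused at pre-switchover, shut down the
 * sockets so blocked I/O returns, and re-activate the block devices if
 * they had already been inactivated for the switchover.
 */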
static void migrate_fd_cancel(MigrationState *s)
{
    int old_state;
    QEMUFile *f = migrate_get_current()->to_dst_file;
    trace_migrate_fd_cancel();

    if (s->rp_state.from_dst_file) {
        /* shut down the rp socket, causing the rp thread to shut down */
        qemu_file_shutdown(s->rp_state.from_dst_file);
    }

    do {
        old_state = s->state;
        if (!migration_is_setup_or_active(old_state)) {
            break;
        }
        /* If the migration is paused, kick it out of the pause */
        if (old_state == MIGRATION_STATUS_PRE_SWITCHOVER) {
            qemu_sem_post(&s->pause_sem);
        }
        migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING);
    } while (s->state != MIGRATION_STATUS_CANCELLING);

    /*
     * If we're unlucky the migration code might be stuck somewhere in a
     * send/write while the network has failed and is waiting to timeout;
     * if we've got shutdown(2) available then we can force it to quit.
     * The outgoing qemu file gets closed in migrate_fd_cleanup that is
     * called in a bh, so there is no race against this cancel.
     */
    if (s->state == MIGRATION_STATUS_CANCELLING && f) {
        qemu_file_shutdown(f);
    }
    if (s->state == MIGRATION_STATUS_CANCELLING && s->block_inactive) {
        Error *local_err = NULL;

        bdrv_invalidate_cache_all(&local_err);
        if (local_err) {
            error_report_err(local_err);
        } else {
            s->block_inactive = false;
        }
    }
}

void add_migration_state_change_notifier(Notifier *notify)
{
    notifier_list_add(&migration_state_notifiers, notify);
}

void remove_migration_state_change_notifier(Notifier *notify)
{
    notifier_remove(notify);
}

bool migration_in_setup(MigrationState *s)
{
    return s->state == MIGRATION_STATUS_SETUP;
}

bool migration_has_finished(MigrationState *s)
{
    return s->state == MIGRATION_STATUS_COMPLETED;
}

bool migration_has_failed(MigrationState *s)
{
    return (s->state == MIGRATION_STATUS_CANCELLED ||
            s->state == MIGRATION_STATUS_FAILED);
}

bool migration_in_postcopy(void)
{
    MigrationState *s = migrate_get_current();

    return (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
}

bool migration_in_postcopy_after_devices(MigrationState *s)
{
    return migration_in_postcopy() && s->postcopy_after_devices;
}

bool migration_is_idle(void)
{
    MigrationState *s = migrate_get_current();

    switch (s->state) {
    case MIGRATION_STATUS_NONE:
    case MIGRATION_STATUS_CANCELLED:
    case MIGRATION_STATUS_COMPLETED:
    case MIGRATION_STATUS_FAILED:
        return true;
    case MIGRATION_STATUS_SETUP:
    case MIGRATION_STATUS_CANCELLING:
    case MIGRATION_STATUS_ACTIVE:
    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
    case MIGRATION_STATUS_COLO:
    case MIGRATION_STATUS_PRE_SWITCHOVER:
    case MIGRATION_STATUS_DEVICE:
        return false;
    case MIGRATION_STATUS__MAX:
        g_assert_not_reached();
    }

    return false;
}

void migrate_init(MigrationState *s)
{
    /*
     * Reinitialise all migration state, except
     * parameters/capabilities that the user set, and
     * locks.
     */
    s->bytes_xfer = 0;
    s->xfer_limit = 0;
    s->cleanup_bh = 0;
    s->to_dst_file = NULL;
    s->state = MIGRATION_STATUS_NONE;
    s->rp_state.from_dst_file = NULL;
    s->rp_state.error = false;
    s->mbps = 0.0;
    s->downtime = 0;
    s->expected_downtime = 0;
    s->setup_time = 0;
    s->start_postcopy = false;
    s->postcopy_after_devices = false;
    s->migration_thread_running = false;
    error_free(s->error);
    s->error = NULL;

    migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);

    s->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    s->total_time = 0;
    s->vm_was_running = false;
    s->iteration_initial_bytes = 0;
    s->threshold_size = 0;
}

static GSList *migration_blockers;

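/*
 * Register @reason as blocking migration: fails with -EACCES when QEMU
 * runs with --only_migratable and with -EBUSY when a migration is
 * already in flight.  On success the caller later removes the blocker
 * with migrate_del_blocker().
 */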
int migrate_add_blocker(Error *reason, Error **errp)
{
    if (migrate_get_current()->only_migratable) {
        error_propagate(errp, error_copy(reason));
        error_prepend(errp, "disallowing migration blocker "
                          "(--only_migratable) for: ");
        return -EACCES;
    }

    if (migration_is_idle()) {
        migration_blockers = g_slist_prepend(migration_blockers, reason);
        return 0;
    }

    error_propagate(errp, error_copy(reason));
    error_prepend(errp, "disallowing migration blocker (migration in "
                      "progress) for: ");
    return -EBUSY;
}

void migrate_del_blocker(Error *reason)
{
    migration_blockers = g_slist_remove(migration_blockers, reason);
}

void qmp_migrate_incoming(const char *uri, Error **errp)
{
    Error *local_err = NULL;
    static bool once = true;

    if (!deferred_incoming) {
        error_setg(errp, "For use with '-incoming defer'");
        return;
    }
    if (!once) {
        error_setg(errp, "The incoming migration has already been started");
        return;
    }
1542
1543    qemu_start_incoming_migration(uri, &local_err);
1544
1545    if (local_err) {
1546        error_propagate(errp, local_err);
1547        return;
1548    }
1549
1550    once = false;
1551}
1552
1553void qmp_migrate_recover(const char *uri, Error **errp)
1554{
1555    MigrationIncomingState *mis = migration_incoming_get_current();
1556
1557    if (mis->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
1558        error_setg(errp, "Migrate recover can only be run "
1559                   "when postcopy is paused.");
1560        return;
1561    }
1562
1563    if (atomic_cmpxchg(&mis->postcopy_recover_triggered,
1564                       false, true) == true) {
1565        error_setg(errp, "Migrate recovery is triggered already");
1566        return;
1567    }
1568
1569    /*
1570     * Note that this call will never start a real migration; it will
1571     * only re-setup the migration stream and poke existing migration
1572     * to continue using that newly established channel.
1573     */
1574    qemu_start_incoming_migration(uri, errp);
1575}
1576
1577void qmp_migrate_pause(Error **errp)
1578{
1579    MigrationState *ms = migrate_get_current();
1580    MigrationIncomingState *mis = migration_incoming_get_current();
1581    int ret;
1582
1583    if (ms->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
1584        /* Source side, during postcopy */
1585        qemu_mutex_lock(&ms->qemu_file_lock);
1586        ret = qemu_file_shutdown(ms->to_dst_file);
1587        qemu_mutex_unlock(&ms->qemu_file_lock);
1588        if (ret) {
1589            error_setg(errp, "Failed to pause source migration");
1590        }
1591        return;
1592    }
1593
1594    if (mis->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
1595        ret = qemu_file_shutdown(mis->from_src_file);
1596        if (ret) {
1597            error_setg(errp, "Failed to pause destination migration");
1598        }
1599        return;
1600    }
1601
1602    error_setg(errp, "migrate-pause is currently only supported "
1603               "during postcopy-active state");
1604}
1605
1606bool migration_is_blocked(Error **errp)
1607{
1608    if (qemu_savevm_state_blocked(errp)) {
1609        return true;
1610    }
1611
1612    if (migration_blockers) {
1613        error_propagate(errp, error_copy(migration_blockers->data));
1614        return true;
1615    }
1616
1617    return false;
1618}
1619
1620/* Returns true if migration should continue, or false if an error was detected */
1621static bool migrate_prepare(MigrationState *s, bool blk, bool blk_inc,
1622                            bool resume, Error **errp)
1623{
1624    Error *local_err = NULL;
1625
1626    if (resume) {
1627        if (s->state != MIGRATION_STATUS_POSTCOPY_PAUSED) {
1628            error_setg(errp, "Cannot resume if there is no "
1629                       "paused migration");
1630            return false;
1631        }
1632
1633        /*
1634         * Postcopy recovery won't work well with the release-ram
1635         * capability, since release-ram drops the page buffer as soon
1636         * as the page is put into the send buffer.  So if a network
1637         * failure happens, any page buffers that have not yet reached
1638         * the destination VM but have already been sent from the
1639         * source VM will be lost forever.  Let's refuse to let the
1640         * client resume such a postcopy migration.  Luckily
1641         * release-ram was designed to only be used when src and
1642         * destination VMs are on the same host, so it should be
1643         * fine.
1644         */
1645        if (migrate_release_ram()) {
1646            error_setg(errp, "Postcopy recovery cannot work "
1647                       "when release-ram capability is set");
1648            return false;
1649        }
1650
1651        /* This is a resume, skip init status */
1652        return true;
1653    }
1654
1655    if (migration_is_setup_or_active(s->state) ||
1656        s->state == MIGRATION_STATUS_CANCELLING ||
1657        s->state == MIGRATION_STATUS_COLO) {
1658        error_setg(errp, QERR_MIGRATION_ACTIVE);
1659        return false;
1660    }
1661
1662    if (runstate_check(RUN_STATE_INMIGRATE)) {
1663        error_setg(errp, "Guest is waiting for an incoming migration");
1664        return false;
1665    }
1666
1667    if (migration_is_blocked(errp)) {
1668        return false;
1669    }
1670
1671    if (blk || blk_inc) {
1672        if (migrate_use_block() || migrate_use_block_incremental()) {
1673            error_setg(errp, "Command options are incompatible with "
1674                       "current migration capabilities");
1675            return false;
1676        }
1677        migrate_set_block_enabled(true, &local_err);
1678        if (local_err) {
1679            error_propagate(errp, local_err);
1680            return false;
1681        }
1682        s->must_remove_block_options = true;
1683    }
1684
1685    if (blk_inc) {
1686        migrate_set_block_incremental(s, true);
1687    }
1688
1689    migrate_init(s);
1690
1691    return true;
1692}
1693
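    /*
     * QMP entry point for starting an outgoing migration.  A sketch of
     * the wire command (host and port are placeholders):
     *     { "execute": "migrate", "arguments": { "uri": "tcp:dst:4444" } }
     */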
1694void qmp_migrate(const char *uri, bool has_blk, bool blk,
1695                 bool has_inc, bool inc, bool has_detach, bool detach,
1696                 bool has_resume, bool resume, Error **errp)
1697{
1698    Error *local_err = NULL;
1699    MigrationState *s = migrate_get_current();
1700    const char *p;
1701
1702    if (!migrate_prepare(s, has_blk && blk, has_inc && inc,
1703                         has_resume && resume, errp)) {
1704        /* Error detected, put into errp */
1705        return;
1706    }
1707
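        /*
         * Dispatch on the URI scheme.  Example forms (hosts, ports and
         * paths are placeholders): "tcp:host:4444", "unix:/tmp/mig.sock",
         * "exec:gzip -c > state.gz", "fd:fdname" and, when compiled in,
         * "rdma:host:port".
         */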
1708    if (strstart(uri, "tcp:", &p)) {
1709        tcp_start_outgoing_migration(s, p, &local_err);
1710#ifdef CONFIG_RDMA
1711    } else if (strstart(uri, "rdma:", &p)) {
1712        rdma_start_outgoing_migration(s, p, &local_err);
1713#endif
1714    } else if (strstart(uri, "exec:", &p)) {
1715        exec_start_outgoing_migration(s, p, &local_err);
1716    } else if (strstart(uri, "unix:", &p)) {
1717        unix_start_outgoing_migration(s, p, &local_err);
1718    } else if (strstart(uri, "fd:", &p)) {
1719        fd_start_outgoing_migration(s, p, &local_err);
1720    } else {
1721        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri",
1722                   "a valid migration protocol");
1723        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
1724                          MIGRATION_STATUS_FAILED);
1725        block_cleanup_parameters(s);
1726        return;
1727    }
1728
1729    if (local_err) {
1730        migrate_fd_error(s, local_err);
1731        error_propagate(errp, local_err);
1732        return;
1733    }
1734}
1735
1736void qmp_migrate_cancel(Error **errp)
1737{
1738    migrate_fd_cancel(migrate_get_current());
1739}
1740
1741void qmp_migrate_continue(MigrationStatus state, Error **errp)
1742{
1743    MigrationState *s = migrate_get_current();
1744    if (s->state != state) {
1745        error_setg(errp, "Migration not in expected state: %s",
1746                   MigrationStatus_str(s->state));
1747        return;
1748    }
1749    qemu_sem_post(&s->pause_sem);
1750}
1751
1752void qmp_migrate_set_cache_size(int64_t value, Error **errp)
1753{
1754    MigrateSetParameters p = {
1755        .has_xbzrle_cache_size = true,
1756        .xbzrle_cache_size = value,
1757    };
1758
1759    qmp_migrate_set_parameters(&p, errp);
1760}
1761
1762int64_t qmp_query_migrate_cache_size(Error **errp)
1763{
1764    return migrate_xbzrle_cache_size();
1765}
1766
1767void qmp_migrate_set_speed(int64_t value, Error **errp)
1768{
1769    MigrateSetParameters p = {
1770        .has_max_bandwidth = true,
1771        .max_bandwidth = value,
1772    };
1773
1774    qmp_migrate_set_parameters(&p, errp);
1775}
1776
1777void qmp_migrate_set_downtime(double value, Error **errp)
1778{
1779    if (value < 0 || value > MAX_MIGRATE_DOWNTIME_SECONDS) {
1780        error_setg(errp, "Parameter 'downtime_limit' expects a value in "
1781                         "the range of 0 to %d seconds",
1782                         MAX_MIGRATE_DOWNTIME_SECONDS);
1783        return;
1784    }
1785
1786    value *= 1000; /* Convert to milliseconds */
1787    value = MAX(0, MIN(INT64_MAX, value));
1788
1789    MigrateSetParameters p = {
1790        .has_downtime_limit = true,
1791        .downtime_limit = value,
1792    };
1793
1794    qmp_migrate_set_parameters(&p, errp);
1795}
1796
1797bool migrate_release_ram(void)
1798{
1799    MigrationState *s;
1800
1801    s = migrate_get_current();
1802
1803    return s->enabled_capabilities[MIGRATION_CAPABILITY_RELEASE_RAM];
1804}
1805
1806bool migrate_postcopy_ram(void)
1807{
1808    MigrationState *s;
1809
1810    s = migrate_get_current();
1811
1812    return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM];
1813}
1814
1815bool migrate_postcopy(void)
1816{
1817    return migrate_postcopy_ram() || migrate_dirty_bitmaps();
1818}
1819
1820bool migrate_auto_converge(void)
1821{
1822    MigrationState *s;
1823
1824    s = migrate_get_current();
1825
1826    return s->enabled_capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE];
1827}
1828
1829bool migrate_zero_blocks(void)
1830{
1831    MigrationState *s;
1832
1833    s = migrate_get_current();
1834
1835    return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS];
1836}
1837
1838bool migrate_postcopy_blocktime(void)
1839{
1840    MigrationState *s;
1841
1842    s = migrate_get_current();
1843
1844    return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_BLOCKTIME];
1845}
1846
1847bool migrate_use_compression(void)
1848{
1849    MigrationState *s;
1850
1851    s = migrate_get_current();
1852
1853    return s->enabled_capabilities[MIGRATION_CAPABILITY_COMPRESS];
1854}
1855
1856int migrate_compress_level(void)
1857{
1858    MigrationState *s;
1859
1860    s = migrate_get_current();
1861
1862    return s->parameters.compress_level;
1863}
1864
1865int migrate_compress_threads(void)
1866{
1867    MigrationState *s;
1868
1869    s = migrate_get_current();
1870
1871    return s->parameters.compress_threads;
1872}
1873
1874int migrate_decompress_threads(void)
1875{
1876    MigrationState *s;
1877
1878    s = migrate_get_current();
1879
1880    return s->parameters.decompress_threads;
1881}
1882
1883bool migrate_dirty_bitmaps(void)
1884{
1885    MigrationState *s;
1886
1887    s = migrate_get_current();
1888
1889    return s->enabled_capabilities[MIGRATION_CAPABILITY_DIRTY_BITMAPS];
1890}
1891
1892bool migrate_use_events(void)
1893{
1894    MigrationState *s;
1895
1896    s = migrate_get_current();
1897
1898    return s->enabled_capabilities[MIGRATION_CAPABILITY_EVENTS];
1899}
1900
1901bool migrate_use_multifd(void)
1902{
1903    MigrationState *s;
1904
1905    s = migrate_get_current();
1906
1907    return s->enabled_capabilities[MIGRATION_CAPABILITY_X_MULTIFD];
1908}
1909
1910bool migrate_pause_before_switchover(void)
1911{
1912    MigrationState *s;
1913
1914    s = migrate_get_current();
1915
1916    return s->enabled_capabilities[
1917        MIGRATION_CAPABILITY_PAUSE_BEFORE_SWITCHOVER];
1918}
1919
1920int migrate_multifd_channels(void)
1921{
1922    MigrationState *s;
1923
1924    s = migrate_get_current();
1925
1926    return s->parameters.x_multifd_channels;
1927}
1928
1929int migrate_multifd_page_count(void)
1930{
1931    MigrationState *s;
1932
1933    s = migrate_get_current();
1934
1935    return s->parameters.x_multifd_page_count;
1936}
1937
1938int migrate_use_xbzrle(void)
1939{
1940    MigrationState *s;
1941
1942    s = migrate_get_current();
1943
1944    return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE];
1945}
1946
1947int64_t migrate_xbzrle_cache_size(void)
1948{
1949    MigrationState *s;
1950
1951    s = migrate_get_current();
1952
1953    return s->parameters.xbzrle_cache_size;
1954}
1955
1956static int64_t migrate_max_postcopy_bandwidth(void)
1957{
1958    MigrationState *s;
1959
1960    s = migrate_get_current();
1961
1962    return s->parameters.max_postcopy_bandwidth;
1963}
1964
1965
1966bool migrate_use_block(void)
1967{
1968    MigrationState *s;
1969
1970    s = migrate_get_current();
1971
1972    return s->enabled_capabilities[MIGRATION_CAPABILITY_BLOCK];
1973}
1974
1975bool migrate_use_return_path(void)
1976{
1977    MigrationState *s;
1978
1979    s = migrate_get_current();
1980
1981    return s->enabled_capabilities[MIGRATION_CAPABILITY_RETURN_PATH];
1982}
1983
1984bool migrate_use_block_incremental(void)
1985{
1986    MigrationState *s;
1987
1988    s = migrate_get_current();
1989
1990    return s->parameters.block_incremental;
1991}
1992
1993/* migration thread support */
1994/*
1995 * Something bad happened to the RP stream, mark an error
1996 * The caller shall print or trace something to indicate why
1997 */
1998static void mark_source_rp_bad(MigrationState *s)
1999{
2000    s->rp_state.error = true;
2001}
2002
2003static struct rp_cmd_args {
2004    ssize_t     len; /* -1 = variable */
2005    const char *name;
2006} rp_cmd_args[] = {
2007    [MIG_RP_MSG_INVALID]        = { .len = -1, .name = "INVALID" },
2008    [MIG_RP_MSG_SHUT]           = { .len =  4, .name = "SHUT" },
2009    [MIG_RP_MSG_PONG]           = { .len =  4, .name = "PONG" },
2010    [MIG_RP_MSG_REQ_PAGES]      = { .len = 12, .name = "REQ_PAGES" },
2011    [MIG_RP_MSG_REQ_PAGES_ID]   = { .len = -1, .name = "REQ_PAGES_ID" },
2012    [MIG_RP_MSG_RECV_BITMAP]    = { .len = -1, .name = "RECV_BITMAP" },
2013    [MIG_RP_MSG_RESUME_ACK]     = { .len =  4, .name = "RESUME_ACK" },
2014    [MIG_RP_MSG_MAX]            = { .len = -1, .name = "MAX" },
2015};
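
    /*
     * Wire format of a return-path message, as parsed in
     * source_return_path_thread() below: a big-endian 16-bit message
     * type, a big-endian 16-bit payload length, then the payload.
     * E.g. a SHUT carrying error code 0 is the 8 bytes:
     *     00 01  00 04  00 00 00 00   (type=SHUT, len=4, be32 status 0)
     */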
2016
2017/*
2018 * Process a request for pages received on the return path.
2019 * We're allowed to send more than requested (e.g. to round to our page size)
2020 * and we don't need to send pages that have already been sent.
2021 */
2022static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
2023                                       ram_addr_t start, size_t len)
2024{
2025    long our_host_ps = getpagesize();
2026
2027    trace_migrate_handle_rp_req_pages(rbname, start, len);
2028
2029    /*
2030     * Since we currently insist on matching page sizes, just sanity check
2031     * we're being asked for whole host pages.
2032     */
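    /*
     * E.g. with a 4 KiB host page size, our_host_ps - 1 == 0xfff, so a
     * request with start = 0x2000 and len = 0x1000 passes, while
     * start = 0x2800 or any sub-page length is rejected as misaligned.
     */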
2033    if ((start & (our_host_ps - 1)) ||
2034        (len & (our_host_ps - 1))) {
2035        error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
2036                     " len: %zd", __func__, start, len);
2037        mark_source_rp_bad(ms);
2038        return;
2039    }
2040
2041    if (ram_save_queue_pages(rbname, start, len)) {
2042        mark_source_rp_bad(ms);
2043    }
2044}
2045
2046/* Return true to retry, false to quit */
2047static bool postcopy_pause_return_path_thread(MigrationState *s)
2048{
2049    trace_postcopy_pause_return_path();
2050
2051    qemu_sem_wait(&s->postcopy_pause_rp_sem);
2052
2053    trace_postcopy_pause_return_path_continued();
2054
2055    return true;
2056}
2057
2058static int migrate_handle_rp_recv_bitmap(MigrationState *s, char *block_name)
2059{
2060    RAMBlock *block = qemu_ram_block_by_name(block_name);
2061
2062    if (!block) {
2063        error_report("%s: invalid block name '%s'", __func__, block_name);
2064        return -EINVAL;
2065    }
2066
2067    /* Fetch the received bitmap and refresh the dirty bitmap */
2068    return ram_dirty_bitmap_reload(s, block);
2069}
2070
2071static int migrate_handle_rp_resume_ack(MigrationState *s, uint32_t value)
2072{
2073    trace_source_return_path_thread_resume_ack(value);
2074
2075    if (value != MIGRATION_RESUME_ACK_VALUE) {
2076        error_report("%s: illegal resume_ack value %"PRIu32,
2077                     __func__, value);
2078        return -1;
2079    }
2080
2081    /* Now both sides are active. */
2082    migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
2083                      MIGRATION_STATUS_POSTCOPY_ACTIVE);
2084
2085    /* Notify the send thread that it's time to continue sending pages */
2086    qemu_sem_post(&s->rp_state.rp_sem);
2087
2088    return 0;
2089}
2090
2091/*
2092 * Handles messages sent on the return path towards the source VM
2093 *
2094 */
2095static void *source_return_path_thread(void *opaque)
2096{
2097    MigrationState *ms = opaque;
2098    QEMUFile *rp = ms->rp_state.from_dst_file;
2099    uint16_t header_len, header_type;
2100    uint8_t buf[512];
2101    uint32_t tmp32, sibling_error;
2102    ram_addr_t start = 0; /* =0 to silence warning */
2103    size_t  len = 0, expected_len;
2104    int res;
2105
2106    trace_source_return_path_thread_entry();
2107
2108retry:
2109    while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
2110           migration_is_setup_or_active(ms->state)) {
2111        trace_source_return_path_thread_loop_top();
2112        header_type = qemu_get_be16(rp);
2113        header_len = qemu_get_be16(rp);
2114
2115        if (qemu_file_get_error(rp)) {
2116            mark_source_rp_bad(ms);
2117            goto out;
2118        }
2119
2120        if (header_type >= MIG_RP_MSG_MAX ||
2121            header_type == MIG_RP_MSG_INVALID) {
2122            error_report("RP: Received invalid message 0x%04x length 0x%04x",
2123                    header_type, header_len);
2124            mark_source_rp_bad(ms);
2125            goto out;
2126        }
2127
2128        if ((rp_cmd_args[header_type].len != -1 &&
2129            header_len != rp_cmd_args[header_type].len) ||
2130            header_len > sizeof(buf)) {
2131            error_report("RP: Received '%s' message (0x%04x) with"
2132                    " incorrect length %d expecting %zu",
2133                    rp_cmd_args[header_type].name, header_type, header_len,
2134                    (size_t)rp_cmd_args[header_type].len);
2135            mark_source_rp_bad(ms);
2136            goto out;
2137        }
2138
2139        /* We know we've got a valid header by this point */
2140        res = qemu_get_buffer(rp, buf, header_len);
2141        if (res != header_len) {
2142            error_report("RP: Failed reading data for message 0x%04x"
2143                         " read %d expected %d",
2144                         header_type, res, header_len);
2145            mark_source_rp_bad(ms);
2146            goto out;
2147        }
2148
2149        /* OK, we have the message and the data */
2150        switch (header_type) {
2151        case MIG_RP_MSG_SHUT:
2152            sibling_error = ldl_be_p(buf);
2153            trace_source_return_path_thread_shut(sibling_error);
2154            if (sibling_error) {
2155                error_report("RP: Sibling indicated error %d", sibling_error);
2156                mark_source_rp_bad(ms);
2157            }
2158            /*
2159             * We'll let the main thread deal with closing the RP;
2160             * we could do a shutdown(2) on it, but we're the only user
2161             * anyway, so there's nothing gained.
2162             */
2163            goto out;
2164
2165        case MIG_RP_MSG_PONG:
2166            tmp32 = ldl_be_p(buf);
2167            trace_source_return_path_thread_pong(tmp32);
2168            break;
2169
2170        case MIG_RP_MSG_REQ_PAGES:
2171            start = ldq_be_p(buf);
2172            len = ldl_be_p(buf + 8);
2173            migrate_handle_rp_req_pages(ms, NULL, start, len);
2174            break;
2175
2176        case MIG_RP_MSG_REQ_PAGES_ID:
2177            expected_len = 12 + 1; /* header + termination */
2178
2179            if (header_len >= expected_len) {
2180                start = ldq_be_p(buf);
2181                len = ldl_be_p(buf + 8);
2182                /* Now we expect an idstr */
2183                tmp32 = buf[12]; /* Length of the following idstr */
2184                buf[13 + tmp32] = '\0';
2185                expected_len += tmp32;
2186            }
2187            if (header_len != expected_len) {
2188                error_report("RP: Req_Page_id with length %d expecting %zd",
2189                        header_len, expected_len);
2190                mark_source_rp_bad(ms);
2191                goto out;
2192            }
2193            migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len);
2194            break;
2195
2196        case MIG_RP_MSG_RECV_BITMAP:
2197            if (header_len < 1) {
2198                error_report("%s: missing block name", __func__);
2199                mark_source_rp_bad(ms);
2200                goto out;
2201            }
2202            /* Format: len (1B) + idstr (<255B); NUL-terminate the idstr */
2203            buf[buf[0] + 1] = '\0';
2204            if (migrate_handle_rp_recv_bitmap(ms, (char *)(buf + 1))) {
2205                mark_source_rp_bad(ms);
2206                goto out;
2207            }
2208            break;
2209
2210        case MIG_RP_MSG_RESUME_ACK:
2211            tmp32 = ldl_be_p(buf);
2212            if (migrate_handle_rp_resume_ack(ms, tmp32)) {
2213                mark_source_rp_bad(ms);
2214                goto out;
2215            }
2216            break;
2217
2218        default:
2219            break;
2220        }
2221    }
2222
2223out:
2224    res = qemu_file_get_error(rp);
2225    if (res) {
2226        if (res == -EIO) {
2227            /*
2228             * Maybe there is something we can do: it looks like a
2229             * network-down issue, so we pause and wait for a recovery.
2230             */
2231            if (postcopy_pause_return_path_thread(ms)) {
2232                /* Reload rp, reset the rest */
2233                rp = ms->rp_state.from_dst_file;
2234                ms->rp_state.error = false;
2235                goto retry;
2236            }
2237        }
2238
2239        trace_source_return_path_thread_bad_end();
2240        mark_source_rp_bad(ms);
2241    }
2242
2243    trace_source_return_path_thread_end();
2244    ms->rp_state.from_dst_file = NULL;
2245    qemu_fclose(rp);
2246    return NULL;
2247}
2248
2249static int open_return_path_on_source(MigrationState *ms,
2250                                      bool create_thread)
2251{
2252
2253    ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file);
2254    if (!ms->rp_state.from_dst_file) {
2255        return -1;
2256    }
2257
2258    trace_open_return_path_on_source();
2259
2260    if (!create_thread) {
2261        /* We're done */
2262        return 0;
2263    }
2264
2265    qemu_thread_create(&ms->rp_state.rp_thread, "return path",
2266                       source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
2267
2268    trace_open_return_path_on_source_continue();
2269
2270    return 0;
2271}
2272
2273/* Returns 0 if the RP was ok, otherwise there was an error on the RP */
2274static int await_return_path_close_on_source(MigrationState *ms)
2275{
2276    /*
2277     * If this is a normal exit then the destination will send a SHUT and the
2278     * rp_thread will exit, however if there's an error we need to cause
2279     * it to exit.
2280     */
2281    if (qemu_file_get_error(ms->to_dst_file) && ms->rp_state.from_dst_file) {
2282        /*
2283         * shutdown(2), if we have it, will cause it to unblock if it's stuck
2284         * waiting for the destination.
2285         */
2286        qemu_file_shutdown(ms->rp_state.from_dst_file);
2287        mark_source_rp_bad(ms);
2288    }
2289    trace_await_return_path_close_on_source_joining();
2290    qemu_thread_join(&ms->rp_state.rp_thread);
2291    trace_await_return_path_close_on_source_close();
2292    return ms->rp_state.error;
2293}
2294
2295/*
2296 * Switch from normal iteration to postcopy
2297 * Returns non-0 on error
2298 */
2299static int postcopy_start(MigrationState *ms)
2300{
2301    int ret;
2302    QIOChannelBuffer *bioc;
2303    QEMUFile *fb;
2304    int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
2305    int64_t bandwidth = migrate_max_postcopy_bandwidth();
2306    bool restart_block = false;
2307    int cur_state = MIGRATION_STATUS_ACTIVE;
2308    if (!migrate_pause_before_switchover()) {
2309        migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE,
2310                          MIGRATION_STATUS_POSTCOPY_ACTIVE);
2311    }
2312
2313    trace_postcopy_start();
2314    qemu_mutex_lock_iothread();
2315    trace_postcopy_start_set_run();
2316
2317    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
2318    global_state_store();
2319    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
2320    if (ret < 0) {
2321        goto fail;
2322    }
2323
2324    ret = migration_maybe_pause(ms, &cur_state,
2325                                MIGRATION_STATUS_POSTCOPY_ACTIVE);
2326    if (ret < 0) {
2327        goto fail;
2328    }
2329
2330    ret = bdrv_inactivate_all();
2331    if (ret < 0) {
2332        goto fail;
2333    }
2334    restart_block = true;
2335
2336    /*
2337     * Cause any non-postcopiable, but iterative devices to
2338     * send out their final data.
2339     */
2340    qemu_savevm_state_complete_precopy(ms->to_dst_file, true, false);
2341
2342    /*
2343     * In the "finish migrate" state, with the io-lock held, everything
2344     * should be quiet, but we've potentially still got dirty pages and we
2345     * need to tell the destination to throw away any pages it's already
2346     * received that are dirty.
2347     */
2348    if (migrate_postcopy_ram()) {
2349        if (ram_postcopy_send_discard_bitmap(ms)) {
2350            error_report("postcopy send discard bitmap failed");
2351            goto fail;
2352        }
2353    }
2354
2355    /*
2356     * send rest of state - note things that are doing postcopy
2357     * will notice we're in POSTCOPY_ACTIVE and not actually
2358     * wrap their state up here
2359     */
2360    /* 0 max-postcopy-bandwidth means unlimited */
2361    if (!bandwidth) {
2362        qemu_file_set_rate_limit(ms->to_dst_file, INT64_MAX);
2363    } else {
2364        qemu_file_set_rate_limit(ms->to_dst_file, bandwidth / XFER_LIMIT_RATIO);
2365    }
2366    if (migrate_postcopy_ram()) {
2367        /* Ping just for debugging, helps line traces up */
2368        qemu_savevm_send_ping(ms->to_dst_file, 2);
2369    }
2370
2371    /*
2372     * While loading the device state we may trigger page transfer
2373     * requests and the fd must be free to process those, and thus
2374     * the destination must read the whole device state off the fd before
2375     * it starts processing it.  Unfortunately the ad-hoc migration format
2376     * doesn't allow the destination to know the size to read without fully
2377     * parsing it through each device's load-state code (especially the open
2378     * coded devices that use get/put).
2379     * So we wrap the device state up in a package with a length at the start;
2380     * to do this we use a qemu_buf to hold the whole of the device state.
2381     */
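    /*
     * Concretely (see qemu_savevm_send_packaged() in savevm.c) the
     * buffered bytes end up on the wire as a single MIG_CMD_PACKAGED
     * command whose payload is a 32-bit length followed by the blob, so
     * the destination knows how much to read before loading it.
     */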
2382    bioc = qio_channel_buffer_new(4096);
2383    qio_channel_set_name(QIO_CHANNEL(bioc), "migration-postcopy-buffer");
2384    fb = qemu_fopen_channel_output(QIO_CHANNEL(bioc));
2385    object_unref(OBJECT(bioc));
2386
2387    /*
2388     * Make sure the receiver can get incoming pages before we send the rest
2389     * of the state
2390     */
2391    qemu_savevm_send_postcopy_listen(fb);
2392
2393    qemu_savevm_state_complete_precopy(fb, false, false);
2394    if (migrate_postcopy_ram()) {
2395        qemu_savevm_send_ping(fb, 3);
2396    }
2397
2398    qemu_savevm_send_postcopy_run(fb);
2399
2400    /* <><> end of stuff going into the package */
2401
2402    /* Last point of recovery; as soon as we send the package the destination
2403     * can open devices and potentially start running.
2404     * Let's just check again that we've not got any errors.
2405     */
2406    ret = qemu_file_get_error(ms->to_dst_file);
2407    if (ret) {
2408        error_report("postcopy_start: Migration stream errored (pre package)");
2409        goto fail_closefb;
2410    }
2411
2412    restart_block = false;
2413
2414    /* Now send that blob */
2415    if (qemu_savevm_send_packaged(ms->to_dst_file, bioc->data, bioc->usage)) {
2416        goto fail_closefb;
2417    }
2418    qemu_fclose(fb);
2419
2420    /* Send a notify to give a chance for anything that needs to happen
2421     * at the transition to postcopy and after the device state; in particular
2422     * spice needs to trigger a transition now
2423     */
2424    ms->postcopy_after_devices = true;
2425    notifier_list_notify(&migration_state_notifiers, ms);
2426
2427    ms->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
2428
2429    qemu_mutex_unlock_iothread();
2430
2431    if (migrate_postcopy_ram()) {
2432        /*
2433         * Although this ping is just for debug, it could potentially be
2434         * used for getting a better measurement of downtime at the source.
2435         */
2436        qemu_savevm_send_ping(ms->to_dst_file, 4);
2437    }
2438
2439    if (migrate_release_ram()) {
2440        ram_postcopy_migrated_memory_release(ms);
2441    }
2442
2443    ret = qemu_file_get_error(ms->to_dst_file);
2444    if (ret) {
2445        error_report("postcopy_start: Migration stream errored");
2446        migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
2447                              MIGRATION_STATUS_FAILED);
2448    }
2449
2450    return ret;
2451
2452fail_closefb:
2453    qemu_fclose(fb);
2454fail:
2455    migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
2456                          MIGRATION_STATUS_FAILED);
2457    if (restart_block) {
2458        /* A failure happened early enough that we know the destination hasn't
2459         * accessed block devices, so we're safe to recover.
2460         */
2461        Error *local_err = NULL;
2462
2463        bdrv_invalidate_cache_all(&local_err);
2464        if (local_err) {
2465            error_report_err(local_err);
2466        }
2467    }
2468    qemu_mutex_unlock_iothread();
2469    return -1;
2470}
2471
2472/**
2473 * migration_maybe_pause: Pause if required by
2474 * migrate_pause_before_switchover(); called with the iothread locked.
2475 * Returns: 0 on success
2476 */
2477static int migration_maybe_pause(MigrationState *s,
2478                                 int *current_active_state,
2479                                 int new_state)
2480{
2481    if (!migrate_pause_before_switchover()) {
2482        return 0;
2483    }
2484
2485    /* Since leaving this state is not atomic with posting the semaphore
2486     * it's possible that someone could have issued multiple migrate_continue
2487     * and the semaphore is incorrectly positive at this point;
2488     * the docs say it's undefined to reinit a semaphore that's already
2489     * init'd, so use timedwait to eat up any existing posts.
2490     */
2491    while (qemu_sem_timedwait(&s->pause_sem, 1) == 0) {
2492        /* This block intentionally left blank */
2493    }
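
    /*
     * E.g. two premature migrate-continue commands would leave pause_sem
     * at 2; the loop above eats both posts so the qemu_sem_wait() below
     * really blocks until the next migrate-continue arrives while we are
     * in PRE_SWITCHOVER.
     */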
2494
2495    qemu_mutex_unlock_iothread();
2496    migrate_set_state(&s->state, *current_active_state,
2497                      MIGRATION_STATUS_PRE_SWITCHOVER);
2498    qemu_sem_wait(&s->pause_sem);
2499    migrate_set_state(&s->state, MIGRATION_STATUS_PRE_SWITCHOVER,
2500                      new_state);
2501    *current_active_state = new_state;
2502    qemu_mutex_lock_iothread();
2503
2504    return s->state == new_state ? 0 : -EINVAL;
2505}
2506
2507/**
2508 * migration_completion: Used by migration_thread when there's not much left.
2509 *   The caller 'breaks' the loop when this returns.
2510 *
2511 * @s: Current migration state
2512 */
2513static void migration_completion(MigrationState *s)
2514{
2515    int ret;
2516    int current_active_state = s->state;
2517
2518    if (s->state == MIGRATION_STATUS_ACTIVE) {
2519        qemu_mutex_lock_iothread();
2520        s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
2521        qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
2522        s->vm_was_running = runstate_is_running();
2523        ret = global_state_store();
2524
2525        if (!ret) {
2526            bool inactivate = !migrate_colo_enabled();
2527            ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
2528            if (ret >= 0) {
2529                ret = migration_maybe_pause(s, &current_active_state,
2530                                            MIGRATION_STATUS_DEVICE);
2531            }
2532            if (ret >= 0) {
2533                qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
2534                ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
2535                                                         inactivate);
2536            }
2537            if (inactivate && ret >= 0) {
2538                s->block_inactive = true;
2539            }
2540        }
2541        qemu_mutex_unlock_iothread();
2542
2543        if (ret < 0) {
2544            goto fail;
2545        }
2546    } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2547        trace_migration_completion_postcopy_end();
2548
2549        qemu_savevm_state_complete_postcopy(s->to_dst_file);
2550        trace_migration_completion_postcopy_end_after_complete();
2551    }
2552
2553    /*
2554     * If rp was opened we must clean up the thread before
2555     * cleaning everything else up (since if there are no failures
2556     * it will wait for the destination to send its status in
2557     * a SHUT command).
2558     */
2559    if (s->rp_state.from_dst_file) {
2560        int rp_error;
2561        trace_migration_return_path_end_before();
2562        rp_error = await_return_path_close_on_source(s);
2563        trace_migration_return_path_end_after(rp_error);
2564        if (rp_error) {
2565            goto fail_invalidate;
2566        }
2567    }
2568
2569    if (qemu_file_get_error(s->to_dst_file)) {
2570        trace_migration_completion_file_err();
2571        goto fail_invalidate;
2572    }
2573
2574    if (!migrate_colo_enabled()) {
2575        migrate_set_state(&s->state, current_active_state,
2576                          MIGRATION_STATUS_COMPLETED);
2577    }
2578
2579    return;
2580
2581fail_invalidate:
2582    /* If not doing postcopy, vm_start() will be called: let's regain
2583     * control of the images.
2584     */
2585    if (s->state == MIGRATION_STATUS_ACTIVE ||
2586        s->state == MIGRATION_STATUS_DEVICE) {
2587        Error *local_err = NULL;
2588
2589        qemu_mutex_lock_iothread();
2590        bdrv_invalidate_cache_all(&local_err);
2591        if (local_err) {
2592            error_report_err(local_err);
2593        } else {
2594            s->block_inactive = false;
2595        }
2596        qemu_mutex_unlock_iothread();
2597    }
2598
2599fail:
2600    migrate_set_state(&s->state, current_active_state,
2601                      MIGRATION_STATUS_FAILED);
2602}
2603
2604bool migrate_colo_enabled(void)
2605{
2606    MigrationState *s = migrate_get_current();
2607    return s->enabled_capabilities[MIGRATION_CAPABILITY_X_COLO];
2608}
2609
2610typedef enum MigThrError {
2611    /* No error detected */
2612    MIG_THR_ERR_NONE = 0,
2613    /* Detected error, but resumed successfully */
2614    MIG_THR_ERR_RECOVERED = 1,
2615    /* Detected fatal error, need to exit */
2616    MIG_THR_ERR_FATAL = 2,
2617} MigThrError;
2618
2619static int postcopy_resume_handshake(MigrationState *s)
2620{
2621    qemu_savevm_send_postcopy_resume(s->to_dst_file);
2622
2623    while (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
2624        qemu_sem_wait(&s->rp_state.rp_sem);
2625    }
2626
2627    if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2628        return 0;
2629    }
2630
2631    return -1;
2632}
2633
2634/* Return zero if success, or <0 for error */
2635static int postcopy_do_resume(MigrationState *s)
2636{
2637    int ret;
2638
2639    /*
2640     * Call all the resume_prepare() hooks, so that modules can be
2641     * ready for the migration resume.
2642     */
2643    ret = qemu_savevm_state_resume_prepare(s);
2644    if (ret) {
2645        error_report("%s: resume_prepare() failure detected: %d",
2646                     __func__, ret);
2647        return ret;
2648    }
2649
2650    /*
2651     * Last handshake with destination on the resume (destination will
2652     * switch to postcopy-active afterwards)
2653     */
2654    ret = postcopy_resume_handshake(s);
2655    if (ret) {
2656        error_report("%s: handshake failed: %d", __func__, ret);
2657        return ret;
2658    }
2659
2660    return 0;
2661}
2662
2663/*
2664 * We don't return until we are in a safe state to continue the current
2665 * postcopy migration.  Returns MIG_THR_ERR_RECOVERED if recovered, or
2666 * MIG_THR_ERR_FATAL if an unrecoverable failure happened.
2667 */
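    /*
     * Recovery flow, all within this file: the destination runs
     * qmp_migrate_recover() with a fresh URI, and re-connecting the source
     * ends up in migrate_fd_connect() with resume == true, which moves the
     * state to POSTCOPY_RECOVER and posts postcopy_pause_sem, waking the
     * wait loop below so that postcopy_do_resume() can be attempted.
     */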
2668static MigThrError postcopy_pause(MigrationState *s)
2669{
2670    assert(s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
2671
2672    while (true) {
2673        QEMUFile *file;
2674
2675        migrate_set_state(&s->state, s->state,
2676                          MIGRATION_STATUS_POSTCOPY_PAUSED);
2677
2678        /* Current channel is possibly broken. Release it. */
2679        assert(s->to_dst_file);
2680        qemu_mutex_lock(&s->qemu_file_lock);
2681        file = s->to_dst_file;
2682        s->to_dst_file = NULL;
2683        qemu_mutex_unlock(&s->qemu_file_lock);
2684
2685        qemu_file_shutdown(file);
2686        qemu_fclose(file);
2687
2688        error_report("Detected IO failure for postcopy. "
2689                     "Migration paused.");
2690
2691        /*
2692         * We wait here until things are fixed up. Then someone will set
2693         * the status back for us.
2694         */
2695        while (s->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
2696            qemu_sem_wait(&s->postcopy_pause_sem);
2697        }
2698
2699        if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
2700            /* Woken up by a recover procedure. Give it a shot */
2701
2702            /*
2703             * Firstly, let's wake up the return path now, with a new
2704             * return path channel.
2705             */
2706            qemu_sem_post(&s->postcopy_pause_rp_sem);
2707
2708            /* Do the resume logic */
2709            if (postcopy_do_resume(s) == 0) {
2710                /* Let's continue! */
2711                trace_postcopy_pause_continued();
2712                return MIG_THR_ERR_RECOVERED;
2713            } else {
2714                /*
2715                 * Something went wrong during the recovery, let's
2716                 * pause again. Pause is always better than throwing
2717                 * data away.
2718                 */
2719                continue;
2720            }
2721        } else {
2722            /* This is not right... Time to quit. */
2723            return MIG_THR_ERR_FATAL;
2724        }
2725    }
2726}
2727
2728static MigThrError migration_detect_error(MigrationState *s)
2729{
2730    int ret;
2731
2732    /* Try to detect any file errors */
2733    ret = qemu_file_get_error(s->to_dst_file);
2734
2735    if (!ret) {
2736        /* Everything is fine */
2737        return MIG_THR_ERR_NONE;
2738    }
2739
2740    if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE && ret == -EIO) {
2741        /*
2742         * For postcopy, we allow the network to be down for a
2743         * while. After that, it can be continued by a
2744         * recovery phase.
2745         */
2746        return postcopy_pause(s);
2747    } else {
2748        /*
2749         * For precopy (or postcopy with an error outside IO), we fail
2750         * immediately.
2751         */
2752        migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
2753        trace_migration_thread_file_err();
2754
2755        /* Time to stop the migration, now. */
2756        return MIG_THR_ERR_FATAL;
2757    }
2758}
2759
2760/* How many bytes have we transferred since the beginning of the migration */
2761static uint64_t migration_total_bytes(MigrationState *s)
2762{
2763    return qemu_ftell(s->to_dst_file) + ram_counters.multifd_bytes;
2764}
2765
2766static void migration_calculate_complete(MigrationState *s)
2767{
2768    uint64_t bytes = migration_total_bytes(s);
2769    int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
2770    int64_t transfer_time;
2771
2772    s->total_time = end_time - s->start_time;
2773    if (!s->downtime) {
2774        /*
2775         * It's still not set, so we are precopy migration.  For
2776         * postcopy, downtime is calculated during postcopy_start().
2777         */
2778        s->downtime = end_time - s->downtime_start;
2779    }
2780
2781    transfer_time = s->total_time - s->setup_time;
2782    if (transfer_time) {
2783        s->mbps = ((double) bytes * 8.0) / transfer_time / 1000;
2784    }
2785}
2786
2787static void migration_update_counters(MigrationState *s,
2788                                      int64_t current_time)
2789{
2790    uint64_t transferred, time_spent;
2791    uint64_t current_bytes; /* bytes transferred since the beginning */
2792    double bandwidth;
2793
2794    if (current_time < s->iteration_start_time + BUFFER_DELAY) {
2795        return;
2796    }
2797
2798    current_bytes = migration_total_bytes(s);
2799    transferred = current_bytes - s->iteration_initial_bytes;
2800    time_spent = current_time - s->iteration_start_time;
2801    bandwidth = (double)transferred / time_spent;
2802    s->threshold_size = bandwidth * s->parameters.downtime_limit;
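
    /*
     * Worked example: 100 MB transferred over a 100 ms window gives
     * bandwidth = 1 MB/ms; with downtime_limit = 300 ms that makes
     * threshold_size 300 MB, i.e. what we believe we can still send
     * within the allowed downtime.
     */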
2803
2804    s->mbps = (((double) transferred * 8.0) /
2805               ((double) time_spent / 1000.0)) / 1000.0 / 1000.0;
2806
2807    /*
2808     * If we haven't sent anything, we don't want to
2809     * recalculate. 10000 is a small enough number for our purposes.
2810     */
2811    if (ram_counters.dirty_pages_rate && transferred > 10000) {
2812        s->expected_downtime = ram_counters.remaining / bandwidth;
2813    }
2814
2815    qemu_file_reset_rate_limit(s->to_dst_file);
2816
2817    s->iteration_start_time = current_time;
2818    s->iteration_initial_bytes = current_bytes;
2819
2820    trace_migrate_transferred(transferred, time_spent,
2821                              bandwidth, s->threshold_size);
2822}
2823
2824/* Migration thread iteration status */
2825typedef enum {
2826    MIG_ITERATE_RESUME,         /* Resume current iteration */
2827    MIG_ITERATE_SKIP,           /* Skip current iteration */
2828    MIG_ITERATE_BREAK,          /* Break the loop */
2829} MigIterateState;
2830
2831/*
2832 * Run a single migration iteration and return the resulting
2833 * MigIterateState, telling the caller whether to resume, skip or break.
2834 */
2835static MigIterateState migration_iteration_run(MigrationState *s)
2836{
2837    uint64_t pending_size, pend_pre, pend_compat, pend_post;
2838    bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
2839
2840    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
2841                              &pend_compat, &pend_post);
2842    pending_size = pend_pre + pend_compat + pend_post;
2843
2844    trace_migrate_pending(pending_size, s->threshold_size,
2845                          pend_pre, pend_compat, pend_post);
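
    /*
     * threshold_size was computed in migration_update_counters() as the
     * measured bandwidth times the downtime limit: once everything still
     * pending fits under it, we stop iterating and complete instead.
     */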
2846
2847    if (pending_size && pending_size >= s->threshold_size) {
2848        /* Still a significant amount to transfer */
2849        if (migrate_postcopy() && !in_postcopy &&
2850            pend_pre <= s->threshold_size &&
2851            atomic_read(&s->start_postcopy)) {
2852            if (postcopy_start(s)) {
2853                error_report("%s: postcopy failed to start", __func__);
2854            }
2855            return MIG_ITERATE_SKIP;
2856        }
2857        /* Just another iteration step */
2858        qemu_savevm_state_iterate(s->to_dst_file,
2859            s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
2860    } else {
2861        trace_migration_thread_low_pending(pending_size);
2862        migration_completion(s);
2863        return MIG_ITERATE_BREAK;
2864    }
2865
2866    return MIG_ITERATE_RESUME;
2867}
2868
2869static void migration_iteration_finish(MigrationState *s)
2870{
2871    /* If we enabled cpu throttling for auto-converge, turn it off. */
2872    cpu_throttle_stop();
2873
2874    qemu_mutex_lock_iothread();
2875    switch (s->state) {
2876    case MIGRATION_STATUS_COMPLETED:
2877        migration_calculate_complete(s);
2878        runstate_set(RUN_STATE_POSTMIGRATE);
2879        break;
2880
2881    case MIGRATION_STATUS_ACTIVE:
2882        /*
2883         * We should really assert here, but since it's during
2884         * migration, let's try to reduce the usage of assertions.
2885         */
2886        if (!migrate_colo_enabled()) {
2887            error_report("%s: critical error: calling COLO code without "
2888                         "COLO enabled", __func__);
2889        }
2890        migrate_start_colo_process(s);
2891        /*
2892         * Fixme: we will run the VM in COLO no matter what its old
2893         * running state was. After exiting COLO, we will keep running.
2894         */
2895        s->vm_was_running = true;
2896        /* Fallthrough */
2897    case MIGRATION_STATUS_FAILED:
2898    case MIGRATION_STATUS_CANCELLED:
2899    case MIGRATION_STATUS_CANCELLING:
2900        if (s->vm_was_running) {
2901            vm_start();
2902        } else {
2903            if (runstate_check(RUN_STATE_FINISH_MIGRATE)) {
2904                runstate_set(RUN_STATE_POSTMIGRATE);
2905            }
2906        }
2907        break;
2908
2909    default:
2910        /* Should not reach here, but if so, forgive the VM. */
2911        error_report("%s: Unknown ending state %d", __func__, s->state);
2912        break;
2913    }
2914    qemu_bh_schedule(s->cleanup_bh);
2915    qemu_mutex_unlock_iothread();
2916}
2917
2918void migration_make_urgent_request(void)
2919{
2920    qemu_sem_post(&migrate_get_current()->rate_limit_sem);
2921}
2922
2923void migration_consume_urgent_request(void)
2924{
2925    qemu_sem_wait(&migrate_get_current()->rate_limit_sem);
2926}
2927
2928/*
2929 * Master migration thread on the source VM.
2930 * It drives the migration and pumps the data down the outgoing channel.
2931 */
2932static void *migration_thread(void *opaque)
2933{
2934    MigrationState *s = opaque;
2935    int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
2936    MigThrError thr_error;
2937    bool urgent = false;
2938
2939    rcu_register_thread();
2940
2941    s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
2942
2943    qemu_savevm_state_header(s->to_dst_file);
2944
2945    /*
2946     * If we opened the return path, we need to make sure dst has it
2947     * opened as well.
2948     */
2949    if (s->rp_state.from_dst_file) {
2950        /* Now tell the dest that it should open its end so it can reply */
2951        qemu_savevm_send_open_return_path(s->to_dst_file);
2952
2953        /* And do a ping that will make stuff easier to debug */
2954        qemu_savevm_send_ping(s->to_dst_file, 1);
2955    }
2956
2957    if (migrate_postcopy()) {
2958        /*
2959         * Tell the destination that we *might* want to do postcopy later;
2960         * if the other end can't do postcopy it should fail now, nice and
2961         * early.
2962         */
2963        qemu_savevm_send_postcopy_advise(s->to_dst_file);
2964    }
2965
2966    qemu_savevm_state_setup(s->to_dst_file);
2967
2968    s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
2969    migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
2970                      MIGRATION_STATUS_ACTIVE);
2971
2972    trace_migration_thread_setup_complete();
2973
2974    while (s->state == MIGRATION_STATUS_ACTIVE ||
2975           s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
2976        int64_t current_time;
2977
2978        if (urgent || !qemu_file_rate_limit(s->to_dst_file)) {
2979            MigIterateState iter_state = migration_iteration_run(s);
2980            if (iter_state == MIG_ITERATE_SKIP) {
2981                continue;
2982            } else if (iter_state == MIG_ITERATE_BREAK) {
2983                break;
2984            }
2985        }
2986
2987        /*
2988         * Try to detect any kind of failures, and see whether we
2989         * should stop the migration now.
2990         */
2991        thr_error = migration_detect_error(s);
2992        if (thr_error == MIG_THR_ERR_FATAL) {
2993            /* Stop migration */
2994            break;
2995        } else if (thr_error == MIG_THR_ERR_RECOVERED) {
2996            /*
2997             * Just recovered from e.g. a network failure; reset all
2998             * the local variables. This is important to avoid
2999             * breaking the transferred_bytes and bandwidth calculation.
3000             */
3001            s->iteration_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3002            s->iteration_initial_bytes = 0;
3003        }
3004
3005        current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
3006
3007        migration_update_counters(s, current_time);
3008
3009        urgent = false;
3010        if (qemu_file_rate_limit(s->to_dst_file)) {
3011            /* Wait for a delay to do rate limiting OR
3012             * something urgent to post the semaphore.
3013             */
3014            int ms = s->iteration_start_time + BUFFER_DELAY - current_time;
3015            trace_migration_thread_ratelimit_pre(ms);
3016            if (qemu_sem_timedwait(&s->rate_limit_sem, ms) == 0) {
3017                /* We were woken by one or more urgent things but
3018                 * the timedwait will have consumed one of them.
3019                 * The service routine for the urgent wake does its
3020                 * own wait on the semaphore for each item it consumes,
3021                 * so post back the one we just ate.
3022                 */
3023                qemu_sem_post(&s->rate_limit_sem);
3024                urgent = true;
3025            }
3026            trace_migration_thread_ratelimit_post(urgent);
3027        }
3028    }
3029
3030    trace_migration_thread_after_loop();
3031    migration_iteration_finish(s);
3032    rcu_unregister_thread();
3033    return NULL;
3034}
3035
3036void migrate_fd_connect(MigrationState *s, Error *error_in)
3037{
3038    int64_t rate_limit;
3039    bool resume = s->state == MIGRATION_STATUS_POSTCOPY_PAUSED;
3040
3041    s->expected_downtime = s->parameters.downtime_limit;
3042    s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);
3043    if (error_in) {
3044        migrate_fd_error(s, error_in);
3045        migrate_fd_cleanup(s);
3046        return;
3047    }
3048
3049    if (resume) {
3050        /* This is a resumed migration */
3051        rate_limit = INT64_MAX;
3052    } else {
3053        /* This is a fresh new migration */
3054        rate_limit = s->parameters.max_bandwidth / XFER_LIMIT_RATIO;
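        /*
         * max_bandwidth is in bytes/s; XFER_LIMIT_RATIO (1000/BUFFER_DELAY)
         * turns it into a budget per 100 ms buffering window, e.g. a
         * 32 MiB/s cap becomes roughly a 3.2 MiB quota per window.
         */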
3055
3056        /* Notify before starting migration thread */
3057        notifier_list_notify(&migration_state_notifiers, s);
3058    }
3059
3060    qemu_file_set_rate_limit(s->to_dst_file, rate_limit);
3061    qemu_file_set_blocking(s->to_dst_file, true);
3062
3063    /*
3064     * Open the return path. For postcopy, it is used exclusively. For
3065     * precopy, QEMU uses the return path only if the user enabled the
3066     * "return-path" capability.
3067     */
3068    if (migrate_postcopy_ram() || migrate_use_return_path()) {
3069        if (open_return_path_on_source(s, !resume)) {
3070            error_report("Unable to open return-path for postcopy");
3071            migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED);
3072            migrate_fd_cleanup(s);
3073            return;
3074        }
3075    }
3076
3077    if (resume) {
3078        /* Wakeup the main migration thread to do the recovery */
3079        migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
3080                          MIGRATION_STATUS_POSTCOPY_RECOVER);
3081        qemu_sem_post(&s->postcopy_pause_sem);
3082        return;
3083    }
3084
3085    if (multifd_save_setup() != 0) {
3086        migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
3087                          MIGRATION_STATUS_FAILED);
3088        migrate_fd_cleanup(s);
3089        return;
3090    }
3091    qemu_thread_create(&s->thread, "live_migration", migration_thread, s,
3092                       QEMU_THREAD_JOINABLE);
3093    s->migration_thread_running = true;
3094}
3095
3096void migration_global_dump(Monitor *mon)
3097{
3098    MigrationState *ms = migrate_get_current();
3099
3100    monitor_printf(mon, "globals:\n");
3101    monitor_printf(mon, "store-global-state: %s\n",
3102                   ms->store_global_state ? "on" : "off");
3103    monitor_printf(mon, "only-migratable: %s\n",
3104                   ms->only_migratable ? "on" : "off");
3105    monitor_printf(mon, "send-configuration: %s\n",
3106                   ms->send_configuration ? "on" : "off");
3107    monitor_printf(mon, "send-section-footer: %s\n",
3108                   ms->send_section_footer ? "on" : "off");
3109    monitor_printf(mon, "decompress-error-check: %s\n",
3110                   ms->decompress_error_check ? "on" : "off");
3111}
3112
3113#define DEFINE_PROP_MIG_CAP(name, x)             \
3114    DEFINE_PROP_BOOL(name, MigrationState, enabled_capabilities[x], false)
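
    /*
     * TYPE_MIGRATION is (for now) a device, so everything below can be
     * tweaked like any other -global property, e.g.:
     *     -global migration.x-multifd-channels=4
     * (see the note on the TypeInfo at the end of this file).
     */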
3115
3116static Property migration_properties[] = {
3117    DEFINE_PROP_BOOL("store-global-state", MigrationState,
3118                     store_global_state, true),
3119    DEFINE_PROP_BOOL("only-migratable", MigrationState, only_migratable, false),
3120    DEFINE_PROP_BOOL("send-configuration", MigrationState,
3121                     send_configuration, true),
3122    DEFINE_PROP_BOOL("send-section-footer", MigrationState,
3123                     send_section_footer, true),
3124    DEFINE_PROP_BOOL("decompress-error-check", MigrationState,
3125                      decompress_error_check, true),
3126
3127    /* Migration parameters */
3128    DEFINE_PROP_UINT8("x-compress-level", MigrationState,
3129                      parameters.compress_level,
3130                      DEFAULT_MIGRATE_COMPRESS_LEVEL),
3131    DEFINE_PROP_UINT8("x-compress-threads", MigrationState,
3132                      parameters.compress_threads,
3133                      DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT),
3134    DEFINE_PROP_UINT8("x-decompress-threads", MigrationState,
3135                      parameters.decompress_threads,
3136                      DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT),
3137    DEFINE_PROP_UINT8("x-cpu-throttle-initial", MigrationState,
3138                      parameters.cpu_throttle_initial,
3139                      DEFAULT_MIGRATE_CPU_THROTTLE_INITIAL),
3140    DEFINE_PROP_UINT8("x-cpu-throttle-increment", MigrationState,
3141                      parameters.cpu_throttle_increment,
3142                      DEFAULT_MIGRATE_CPU_THROTTLE_INCREMENT),
3143    DEFINE_PROP_SIZE("x-max-bandwidth", MigrationState,
3144                      parameters.max_bandwidth, MAX_THROTTLE),
3145    DEFINE_PROP_UINT64("x-downtime-limit", MigrationState,
3146                      parameters.downtime_limit,
3147                      DEFAULT_MIGRATE_SET_DOWNTIME),
3148    DEFINE_PROP_UINT32("x-checkpoint-delay", MigrationState,
3149                      parameters.x_checkpoint_delay,
3150                      DEFAULT_MIGRATE_X_CHECKPOINT_DELAY),
3151    DEFINE_PROP_UINT8("x-multifd-channels", MigrationState,
3152                      parameters.x_multifd_channels,
3153                      DEFAULT_MIGRATE_MULTIFD_CHANNELS),
3154    DEFINE_PROP_UINT32("x-multifd-page-count", MigrationState,
3155                      parameters.x_multifd_page_count,
3156                      DEFAULT_MIGRATE_MULTIFD_PAGE_COUNT),
3157    DEFINE_PROP_SIZE("xbzrle-cache-size", MigrationState,
3158                      parameters.xbzrle_cache_size,
3159                      DEFAULT_MIGRATE_XBZRLE_CACHE_SIZE),
3160    DEFINE_PROP_SIZE("max-postcopy-bandwidth", MigrationState,
3161                      parameters.max_postcopy_bandwidth,
3162                      DEFAULT_MIGRATE_MAX_POSTCOPY_BANDWIDTH),
3163
3164    /* Migration capabilities */
3165    DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
3166    DEFINE_PROP_MIG_CAP("x-rdma-pin-all", MIGRATION_CAPABILITY_RDMA_PIN_ALL),
3167    DEFINE_PROP_MIG_CAP("x-auto-converge", MIGRATION_CAPABILITY_AUTO_CONVERGE),
3168    DEFINE_PROP_MIG_CAP("x-zero-blocks", MIGRATION_CAPABILITY_ZERO_BLOCKS),
3169    DEFINE_PROP_MIG_CAP("x-compress", MIGRATION_CAPABILITY_COMPRESS),
3170    DEFINE_PROP_MIG_CAP("x-events", MIGRATION_CAPABILITY_EVENTS),
3171    DEFINE_PROP_MIG_CAP("x-postcopy-ram", MIGRATION_CAPABILITY_POSTCOPY_RAM),
3172    DEFINE_PROP_MIG_CAP("x-colo", MIGRATION_CAPABILITY_X_COLO),
3173    DEFINE_PROP_MIG_CAP("x-release-ram", MIGRATION_CAPABILITY_RELEASE_RAM),
3174    DEFINE_PROP_MIG_CAP("x-block", MIGRATION_CAPABILITY_BLOCK),
3175    DEFINE_PROP_MIG_CAP("x-return-path", MIGRATION_CAPABILITY_RETURN_PATH),
3176    DEFINE_PROP_MIG_CAP("x-multifd", MIGRATION_CAPABILITY_X_MULTIFD),
3177
3178    DEFINE_PROP_END_OF_LIST(),
3179};
3180
3181static void migration_class_init(ObjectClass *klass, void *data)
3182{
3183    DeviceClass *dc = DEVICE_CLASS(klass);
3184
3185    dc->user_creatable = false;
3186    dc->props = migration_properties;
3187}
3188
3189static void migration_instance_finalize(Object *obj)
3190{
3191    MigrationState *ms = MIGRATION_OBJ(obj);
3192    MigrationParameters *params = &ms->parameters;
3193
3194    qemu_mutex_destroy(&ms->error_mutex);
3195    qemu_mutex_destroy(&ms->qemu_file_lock);
3196    g_free(params->tls_hostname);
3197    g_free(params->tls_creds);
3198    qemu_sem_destroy(&ms->rate_limit_sem);
3199    qemu_sem_destroy(&ms->pause_sem);
3200    qemu_sem_destroy(&ms->postcopy_pause_sem);
3201    qemu_sem_destroy(&ms->postcopy_pause_rp_sem);
3202    qemu_sem_destroy(&ms->rp_state.rp_sem);
3203    error_free(ms->error);
3204}
3205
3206static void migration_instance_init(Object *obj)
3207{
3208    MigrationState *ms = MIGRATION_OBJ(obj);
3209    MigrationParameters *params = &ms->parameters;
3210
3211    ms->state = MIGRATION_STATUS_NONE;
3212    ms->mbps = -1;
3213    qemu_sem_init(&ms->pause_sem, 0);
3214    qemu_mutex_init(&ms->error_mutex);
3215
3216    params->tls_hostname = g_strdup("");
3217    params->tls_creds = g_strdup("");
3218
3219    /* Set has_* up only for parameter checks */
3220    params->has_compress_level = true;
3221    params->has_compress_threads = true;
3222    params->has_decompress_threads = true;
3223    params->has_cpu_throttle_initial = true;
3224    params->has_cpu_throttle_increment = true;
3225    params->has_max_bandwidth = true;
3226    params->has_downtime_limit = true;
3227    params->has_x_checkpoint_delay = true;
3228    params->has_block_incremental = true;
3229    params->has_x_multifd_channels = true;
3230    params->has_x_multifd_page_count = true;
3231    params->has_xbzrle_cache_size = true;
3232    params->has_max_postcopy_bandwidth = true;
3233
3234    qemu_sem_init(&ms->postcopy_pause_sem, 0);
3235    qemu_sem_init(&ms->postcopy_pause_rp_sem, 0);
3236    qemu_sem_init(&ms->rp_state.rp_sem, 0);
3237    qemu_sem_init(&ms->rate_limit_sem, 0);
3238    qemu_mutex_init(&ms->qemu_file_lock);
3239}
3240
3241/*
3242 * Return true if the check passes, false otherwise. Any error will be
3243 * stored in errp if provided.
3244 */
3245static bool migration_object_check(MigrationState *ms, Error **errp)
3246{
3247    MigrationCapabilityStatusList *head = NULL;
3248    /* Assuming all off */
3249    bool cap_list[MIGRATION_CAPABILITY__MAX] = { 0 }, ret;
3250    int i;
3251
3252    if (!migrate_params_check(&ms->parameters, errp)) {
3253        return false;
3254    }
3255
3256    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
3257        if (ms->enabled_capabilities[i]) {
3258            head = migrate_cap_add(head, i, true);
3259        }
3260    }
3261
3262    ret = migrate_caps_check(cap_list, head, errp);
3263
3264    /* It works with head == NULL */
3265    qapi_free_MigrationCapabilityStatusList(head);
3266
3267    return ret;
3268}
3269
3270static const TypeInfo migration_type = {
3271    .name = TYPE_MIGRATION,
3272    /*
3273     * NOTE: TYPE_MIGRATION is not really a device, as the object is
3274     * not created using qdev_create(), it is not attached to the qdev
3275     * device tree, and it is never realized.
3276     *
3277     * TODO: Make this TYPE_OBJECT once QOM provides something like
3278     * TYPE_DEVICE's "-global" properties.
3279     */
3280    .parent = TYPE_DEVICE,
3281    .class_init = migration_class_init,
3282    .class_size = sizeof(MigrationClass),
3283    .instance_size = sizeof(MigrationState),
3284    .instance_init = migration_instance_init,
3285    .instance_finalize = migration_instance_finalize,
3286};
3287
3288static void register_migration_types(void)
3289{
3290    type_register_static(&migration_type);
3291}
3292
3293type_init(register_migration_types);
3294