qemu/migration/savevm.c
<<
>>
Prefs
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2009-2015 Red Hat Inc
   6 *
   7 * Authors:
   8 *  Juan Quintela <quintela@redhat.com>
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28
  29#include "qemu/osdep.h"
  30#include "hw/boards.h"
  31#include "net/net.h"
  32#include "migration.h"
  33#include "migration/snapshot.h"
  34#include "migration-stats.h"
  35#include "migration/vmstate.h"
  36#include "migration/misc.h"
  37#include "migration/register.h"
  38#include "migration/global_state.h"
  39#include "migration/channel-block.h"
  40#include "multifd.h"
  41#include "ram.h"
  42#include "qemu-file.h"
  43#include "savevm.h"
  44#include "postcopy-ram.h"
  45#include "qapi/error.h"
  46#include "qapi/qapi-commands-migration.h"
  47#include "qapi/clone-visitor.h"
  48#include "qapi/qapi-builtin-visit.h"
  49#include "qemu/error-report.h"
  50#include "system/cpus.h"
  51#include "system/memory.h"
  52#include "exec/target_page.h"
  53#include "exec/page-vary.h"
  54#include "trace.h"
  55#include "qemu/iov.h"
  56#include "qemu/job.h"
  57#include "qemu/main-loop.h"
  58#include "block/snapshot.h"
  59#include "block/thread-pool.h"
  60#include "qemu/cutils.h"
  61#include "io/channel-buffer.h"
  62#include "io/channel-file.h"
  63#include "system/replay.h"
  64#include "system/runstate.h"
  65#include "system/system.h"
  66#include "system/xen.h"
  67#include "migration/colo.h"
  68#include "qemu/bitmap.h"
  69#include "net/announce.h"
  70#include "qemu/yank.h"
  71#include "yank_functions.h"
  72#include "system/qtest.h"
  73#include "options.h"
  74
/*
 * Declared without an initializer; as a file-scope object it is
 * zero-initialized.
 * NOTE(review): presumably the version tag used for the
 * MIG_CMD_POSTCOPY_RAM_DISCARD payload -- confirm against its users.
 */
const unsigned int postcopy_ram_discard_version;
  76
  77/* Subcommands for QEMU_VM_COMMAND */
  78enum qemu_vm_cmd {
  79    MIG_CMD_INVALID = 0,   /* Must be 0 */
  80    MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
  81    MIG_CMD_PING,              /* Request a PONG on the RP */
  82
  83    MIG_CMD_POSTCOPY_ADVISE,       /* Prior to any page transfers, just
  84                                      warn we might want to do PC */
  85    MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
  86                                      pages as it's running. */
  87    MIG_CMD_POSTCOPY_RUN,          /* Start execution */
  88
  89    MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
  90                                      were previously sent during
  91                                      precopy but are dirty. */
  92    MIG_CMD_PACKAGED,          /* Send a wrapped stream within this stream */
  93    MIG_CMD_ENABLE_COLO,       /* Enable COLO */
  94    MIG_CMD_POSTCOPY_RESUME,   /* resume postcopy on dest */
  95    MIG_CMD_RECV_BITMAP,       /* Request for recved bitmap on dst */
  96    MIG_CMD_SWITCHOVER_START,  /* Switchover start notification */
  97    MIG_CMD_MAX
  98};
  99
 100#define MAX_VM_CMD_PACKAGED_SIZE UINT32_MAX
 101static struct mig_cmd_args {
 102    ssize_t     len; /* -1 = variable */
 103    const char *name;
 104} mig_cmd_args[] = {
 105    [MIG_CMD_INVALID]          = { .len = -1, .name = "INVALID" },
 106    [MIG_CMD_OPEN_RETURN_PATH] = { .len =  0, .name = "OPEN_RETURN_PATH" },
 107    [MIG_CMD_PING]             = { .len = sizeof(uint32_t), .name = "PING" },
 108    [MIG_CMD_POSTCOPY_ADVISE]  = { .len = -1, .name = "POSTCOPY_ADVISE" },
 109    [MIG_CMD_POSTCOPY_LISTEN]  = { .len =  0, .name = "POSTCOPY_LISTEN" },
 110    [MIG_CMD_POSTCOPY_RUN]     = { .len =  0, .name = "POSTCOPY_RUN" },
 111    [MIG_CMD_POSTCOPY_RAM_DISCARD] = {
 112                                   .len = -1, .name = "POSTCOPY_RAM_DISCARD" },
 113    [MIG_CMD_POSTCOPY_RESUME]  = { .len =  0, .name = "POSTCOPY_RESUME" },
 114    [MIG_CMD_PACKAGED]         = { .len =  4, .name = "PACKAGED" },
 115    [MIG_CMD_RECV_BITMAP]      = { .len = -1, .name = "RECV_BITMAP" },
 116    [MIG_CMD_SWITCHOVER_START] = { .len =  0, .name = "SWITCHOVER_START" },
 117    [MIG_CMD_MAX]              = { .len = -1, .name = "MAX" },
 118};
 119
/* Note for MIG_CMD_POSTCOPY_ADVISE:
 * The format of the arguments depends on the postcopy mode:
 * - postcopy RAM only
 *   uint64_t host page size
 *   uint64_t target page size
 *
 * - postcopy RAM and postcopy dirty bitmaps
 *   format is the same as for postcopy RAM only
 *
 * - postcopy dirty bitmaps only
 *   Nothing. Command length field is 0.
 *
 * Be careful: adding a new postcopy entity with some other parameters should
 * not break the format's self-description ability. A good way is to introduce
 * a generic, extendable format with an exception for the two old entities.
 */
 136
 137/***********************************************************/
 138/* Optional load threads pool support */
 139
 140static void qemu_loadvm_thread_pool_create(MigrationIncomingState *mis)
 141{
 142    assert(!mis->load_threads);
 143    mis->load_threads = thread_pool_new();
 144    mis->load_threads_abort = false;
 145}
 146
/*
 * Tear down the load thread pool stored in @mis.
 *
 * The abort flag is raised first, then the pool is freed (and the pointer
 * cleared) with the BQL dropped, and the BQL is re-taken before returning.
 */
static void qemu_loadvm_thread_pool_destroy(MigrationIncomingState *mis)
{
    /* Raise the abort flag before the pool is torn down below */
    qatomic_set(&mis->load_threads_abort, true);

    bql_unlock(); /* Load threads might be waiting for BQL */
    g_clear_pointer(&mis->load_threads, thread_pool_free);
    bql_lock();
}
 155
 156static bool qemu_loadvm_thread_pool_wait(MigrationState *s,
 157                                         MigrationIncomingState *mis)
 158{
 159    bql_unlock(); /* Let load threads do work requiring BQL */
 160    thread_pool_wait(mis->load_threads);
 161    bql_lock();
 162
 163    return !migrate_has_error(s);
 164}
 165
 166/***********************************************************/
 167/* savevm/loadvm support */
 168
 169static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
 170{
 171    if (is_writable) {
 172        return qemu_file_new_output(QIO_CHANNEL(qio_channel_block_new(bs)));
 173    } else {
 174        return qemu_file_new_input(QIO_CHANNEL(qio_channel_block_new(bs)));
 175    }
 176}
 177
 178
 179/* QEMUFile timer support.
 180 * Not in qemu-file.c to not add qemu-timer.c as dependency to qemu-file.c
 181 */
 182
 183void timer_put(QEMUFile *f, QEMUTimer *ts)
 184{
 185    uint64_t expire_time;
 186
 187    expire_time = timer_expire_time_ns(ts);
 188    qemu_put_be64(f, expire_time);
 189}
 190
 191void timer_get(QEMUFile *f, QEMUTimer *ts)
 192{
 193    uint64_t expire_time;
 194
 195    expire_time = qemu_get_be64(f);
 196    if (expire_time != -1) {
 197        timer_mod_ns(ts, expire_time);
 198    } else {
 199        timer_del(ts);
 200    }
 201}
 202
 203
 204/* VMState timer support.
 205 * Not in vmstate.c to not add qemu-timer.c as dependency to vmstate.c
 206 */
 207
 208static int get_timer(QEMUFile *f, void *pv, size_t size,
 209                     const VMStateField *field)
 210{
 211    QEMUTimer *v = pv;
 212    timer_get(f, v);
 213    return 0;
 214}
 215
 216static int put_timer(QEMUFile *f, void *pv, size_t size,
 217                     const VMStateField *field, JSONWriter *vmdesc)
 218{
 219    QEMUTimer *v = pv;
 220    timer_put(f, v);
 221
 222    return 0;
 223}
 224
/* VMStateInfo for QEMUTimer fields, wired to get_timer()/put_timer(). */
const VMStateInfo vmstate_info_timer = {
    .name = "timer",
    .get  = get_timer,
    .put  = put_timer,
};
 230
 231
/*
 * Alternative (idstr, instance_id) pair attached to a SaveStateEntry.
 * It is filled in by vmstate_register_with_alias_id() when the registering
 * object provides an id; idstr then holds the bare vmsd name.
 */
typedef struct CompatEntry {
    char idstr[256];
    int instance_id;
} CompatEntry;
 236
/* One registered savevm handler, linked into savevm_state.handlers. */
typedef struct SaveStateEntry {
    QTAILQ_ENTRY(SaveStateEntry) entry;
    /* Identifier; "<object id>/<name>" when the owner provides an id */
    char idstr[256];
    uint32_t instance_id;
    int alias_id;
    int version_id;
    /* version id read from the stream */
    int load_version_id;
    /* Assigned from savevm_state.global_section_id at registration */
    int section_id;
    /* section id read from the stream */
    int load_section_id;
    /* Handlers registered through register_savevm_live() */
    const SaveVMHandlers *ops;
    /* Description registered through vmstate_register_with_alias_id() */
    const VMStateDescription *vmsd;
    void *opaque;
    /* Optional alternative identity, see CompatEntry; may be NULL */
    CompatEntry *compat;
    /* Set to 1 by register_savevm_live() when ops->save_setup is set */
    int is_ram;
} SaveStateEntry;
 254
/*
 * File-wide savevm bookkeeping: the handler list plus the fields carried
 * by the "configuration" vmstate section.
 */
typedef struct SaveState {
    /* All registered entries */
    QTAILQ_HEAD(, SaveStateEntry) handlers;
    /* First entry of each priority in @handlers, or NULL if none */
    SaveStateEntry *handler_pri_head[MIG_PRI_MAX + 1];
    /* Next section_id handed out at registration time */
    int global_section_id;
    /* Length of @name in bytes */
    uint32_t len;
    /* Machine type name (not owned on the save side, allocated on load) */
    const char *name;
    uint32_t target_page_bits;
    /* Number of elements in @capabilities */
    uint32_t caps_count;
    MigrationCapability *capabilities;
    QemuUUID uuid;
} SaveState;
 266
/* The single SaveState instance: empty handler list, section ids from 0. */
static SaveState savevm_state = {
    .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
    .handler_pri_head = { [0 ... MIG_PRI_MAX] = NULL },
    .global_section_id = 0,
};

/* Forward declaration; used by savevm_state_handler_insert() below. */
static SaveStateEntry *find_se(const char *idstr, uint32_t instance_id);
 274
 275static bool should_validate_capability(int capability)
 276{
 277    assert(capability >= 0 && capability < MIGRATION_CAPABILITY__MAX);
 278    /* Validate only new capabilities to keep compatibility. */
 279    switch (capability) {
 280    case MIGRATION_CAPABILITY_X_IGNORE_SHARED:
 281    case MIGRATION_CAPABILITY_MAPPED_RAM:
 282        return true;
 283    default:
 284        return false;
 285    }
 286}
 287
 288static uint32_t get_validatable_capabilities_count(void)
 289{
 290    MigrationState *s = migrate_get_current();
 291    uint32_t result = 0;
 292    int i;
 293    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
 294        if (should_validate_capability(i) && s->capabilities[i]) {
 295            result++;
 296        }
 297    }
 298    return result;
 299}
 300
 301static int configuration_pre_save(void *opaque)
 302{
 303    SaveState *state = opaque;
 304    const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
 305    MigrationState *s = migrate_get_current();
 306    int i, j;
 307
 308    state->len = strlen(current_name);
 309    state->name = current_name;
 310    state->target_page_bits = qemu_target_page_bits();
 311
 312    state->caps_count = get_validatable_capabilities_count();
 313    state->capabilities = g_renew(MigrationCapability, state->capabilities,
 314                                  state->caps_count);
 315    for (i = j = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
 316        if (should_validate_capability(i) && s->capabilities[i]) {
 317            state->capabilities[j++] = i;
 318        }
 319    }
 320    state->uuid = qemu_uuid;
 321
 322    return 0;
 323}
 324
 325static int configuration_post_save(void *opaque)
 326{
 327    SaveState *state = opaque;
 328
 329    g_free(state->capabilities);
 330    state->capabilities = NULL;
 331    state->caps_count = 0;
 332    return 0;
 333}
 334
 335static int configuration_pre_load(void *opaque)
 336{
 337    SaveState *state = opaque;
 338
 339    /* If there is no target-page-bits subsection it means the source
 340     * predates the variable-target-page-bits support and is using the
 341     * minimum possible value for this CPU.
 342     */
 343    state->target_page_bits = migration_legacy_page_bits();
 344    return 0;
 345}
 346
 347static bool configuration_validate_capabilities(SaveState *state)
 348{
 349    bool ret = true;
 350    MigrationState *s = migrate_get_current();
 351    unsigned long *source_caps_bm;
 352    int i;
 353
 354    source_caps_bm = bitmap_new(MIGRATION_CAPABILITY__MAX);
 355    for (i = 0; i < state->caps_count; i++) {
 356        MigrationCapability capability = state->capabilities[i];
 357        set_bit(capability, source_caps_bm);
 358    }
 359
 360    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
 361        bool source_state, target_state;
 362        if (!should_validate_capability(i)) {
 363            continue;
 364        }
 365        source_state = test_bit(i, source_caps_bm);
 366        target_state = s->capabilities[i];
 367        if (source_state != target_state) {
 368            error_report("Capability %s is %s, but received capability is %s",
 369                         MigrationCapability_str(i),
 370                         target_state ? "on" : "off",
 371                         source_state ? "on" : "off");
 372            ret = false;
 373            /* Don't break here to report all failed capabilities */
 374        }
 375    }
 376
 377    g_free(source_caps_bm);
 378    return ret;
 379}
 380
 381static int configuration_post_load(void *opaque, int version_id)
 382{
 383    SaveState *state = opaque;
 384    const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
 385    int ret = 0;
 386
 387    if (strncmp(state->name, current_name, state->len) != 0) {
 388        error_report("Machine type received is '%.*s' and local is '%s'",
 389                     (int) state->len, state->name, current_name);
 390        ret = -EINVAL;
 391        goto out;
 392    }
 393
 394    if (state->target_page_bits != qemu_target_page_bits()) {
 395        error_report("Received TARGET_PAGE_BITS is %d but local is %d",
 396                     state->target_page_bits, qemu_target_page_bits());
 397        ret = -EINVAL;
 398        goto out;
 399    }
 400
 401    if (!configuration_validate_capabilities(state)) {
 402        ret = -EINVAL;
 403        goto out;
 404    }
 405
 406out:
 407    g_free((void *)state->name);
 408    state->name = NULL;
 409    state->len = 0;
 410    g_free(state->capabilities);
 411    state->capabilities = NULL;
 412    state->caps_count = 0;
 413
 414    return ret;
 415}
 416
 417static int get_capability(QEMUFile *f, void *pv, size_t size,
 418                          const VMStateField *field)
 419{
 420    MigrationCapability *capability = pv;
 421    char capability_str[UINT8_MAX + 1];
 422    uint8_t len;
 423    int i;
 424
 425    len = qemu_get_byte(f);
 426    qemu_get_buffer(f, (uint8_t *)capability_str, len);
 427    capability_str[len] = '\0';
 428    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
 429        if (!strcmp(MigrationCapability_str(i), capability_str)) {
 430            *capability = i;
 431            return 0;
 432        }
 433    }
 434    error_report("Received unknown capability %s", capability_str);
 435    return -EINVAL;
 436}
 437
 438static int put_capability(QEMUFile *f, void *pv, size_t size,
 439                          const VMStateField *field, JSONWriter *vmdesc)
 440{
 441    MigrationCapability *capability = pv;
 442    const char *capability_str = MigrationCapability_str(*capability);
 443    size_t len = strlen(capability_str);
 444    assert(len <= UINT8_MAX);
 445
 446    qemu_put_byte(f, len);
 447    qemu_put_buffer(f, (uint8_t *)capability_str, len);
 448    return 0;
 449}
 450
/* VMStateInfo for MigrationCapability elements, sent by name. */
static const VMStateInfo vmstate_info_capability = {
    .name = "capability",
    .get  = get_capability,
    .put  = put_capability,
};
 456
/*
 * The target-page-bits subsection is present only if the
 * target page size is not the same as the default (i.e. the
 * minimum page size for a variable-page-size guest CPU).
 * If it is present then it contains the actual target page
 * bits for the machine, and migration will fail if the
 * two ends don't agree about it.
 *
 * Returns true when the current target page bits exceed the legacy value.
 * @opaque is unused.
 */
static bool vmstate_target_page_bits_needed(void *opaque)
{
    return qemu_target_page_bits() > migration_legacy_page_bits();
}
 468
/*
 * Subsection of "configuration" carrying SaveState.target_page_bits;
 * sent only when vmstate_target_page_bits_needed() returns true.
 */
static const VMStateDescription vmstate_target_page_bits = {
    .name = "configuration/target-page-bits",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = vmstate_target_page_bits_needed,
    .fields = (const VMStateField[]) {
        VMSTATE_UINT32(target_page_bits, SaveState),
        VMSTATE_END_OF_LIST()
    }
};
 479
 480static bool vmstate_capabilites_needed(void *opaque)
 481{
 482    return get_validatable_capabilities_count() > 0;
 483}
 484
/*
 * Subsection of "configuration" carrying the capability list: the element
 * count followed by caps_count entries encoded by vmstate_info_capability.
 * Sent only when vmstate_capabilites_needed() returns true.
 */
static const VMStateDescription vmstate_capabilites = {
    .name = "configuration/capabilities",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = vmstate_capabilites_needed,
    .fields = (const VMStateField[]) {
        VMSTATE_UINT32_V(caps_count, SaveState, 1),
        VMSTATE_VARRAY_UINT32_ALLOC(capabilities, SaveState, caps_count, 1,
                                    vmstate_info_capability,
                                    MigrationCapability),
        VMSTATE_END_OF_LIST()
    }
};
 498
 499static bool vmstate_uuid_needed(void *opaque)
 500{
 501    return qemu_uuid_set && migrate_validate_uuid();
 502}
 503
 504static int vmstate_uuid_post_load(void *opaque, int version_id)
 505{
 506    SaveState *state = opaque;
 507    char uuid_src[UUID_STR_LEN];
 508    char uuid_dst[UUID_STR_LEN];
 509
 510    if (!qemu_uuid_set) {
 511        /*
 512         * It's warning because user might not know UUID in some cases,
 513         * e.g. load an old snapshot
 514         */
 515        qemu_uuid_unparse(&state->uuid, uuid_src);
 516        warn_report("UUID is received %s, but local uuid isn't set",
 517                     uuid_src);
 518        return 0;
 519    }
 520    if (!qemu_uuid_is_equal(&state->uuid, &qemu_uuid)) {
 521        qemu_uuid_unparse(&state->uuid, uuid_src);
 522        qemu_uuid_unparse(&qemu_uuid, uuid_dst);
 523        error_report("UUID received is %s and local is %s", uuid_src, uuid_dst);
 524        return -EINVAL;
 525    }
 526    return 0;
 527}
 528
/*
 * Subsection of "configuration" carrying the raw UUID bytes; sent only
 * when vmstate_uuid_needed() returns true and checked on load by
 * vmstate_uuid_post_load().
 */
static const VMStateDescription vmstate_uuid = {
    .name = "configuration/uuid",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = vmstate_uuid_needed,
    .post_load = vmstate_uuid_post_load,
    .fields = (const VMStateField[]) {
        VMSTATE_UINT8_ARRAY_V(uuid.data, SaveState, sizeof(QemuUUID), 1),
        VMSTATE_END_OF_LIST()
    }
};
 540
/*
 * The "configuration" section: the machine type name as a length-prefixed
 * buffer, plus the optional target-page-bits, capabilities and uuid
 * subsections. The pre/post hooks fill, validate and release the SaveState
 * fields involved.
 */
static const VMStateDescription vmstate_configuration = {
    .name = "configuration",
    .version_id = 1,
    .pre_load = configuration_pre_load,
    .post_load = configuration_post_load,
    .pre_save = configuration_pre_save,
    .post_save = configuration_post_save,
    .fields = (const VMStateField[]) {
        VMSTATE_UINT32(len, SaveState),
        VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription * const []) {
        &vmstate_target_page_bits,
        &vmstate_capabilites,
        &vmstate_uuid,
        NULL
    }
};
 560
/*
 * Forward declaration: dump_vmstate_vmsf() and dump_vmstate_vmsd() call
 * each other.
 */
static void dump_vmstate_vmsd(FILE *out_file,
                              const VMStateDescription *vmsd, int indent,
                              bool is_subsection);
 564
 565static void dump_vmstate_vmsf(FILE *out_file, const VMStateField *field,
 566                              int indent)
 567{
 568    fprintf(out_file, "%*s{\n", indent, "");
 569    indent += 2;
 570    fprintf(out_file, "%*s\"field\": \"%s\",\n", indent, "", field->name);
 571    fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
 572            field->version_id);
 573    fprintf(out_file, "%*s\"field_exists\": %s,\n", indent, "",
 574            field->field_exists ? "true" : "false");
 575    if (field->flags & VMS_ARRAY) {
 576        fprintf(out_file, "%*s\"num\": %d,\n", indent, "", field->num);
 577    }
 578    fprintf(out_file, "%*s\"size\": %zu", indent, "", field->size);
 579    if (field->vmsd != NULL) {
 580        fprintf(out_file, ",\n");
 581        dump_vmstate_vmsd(out_file, field->vmsd, indent, false);
 582    }
 583    fprintf(out_file, "\n%*s}", indent - 2, "");
 584}
 585
 586static void dump_vmstate_vmss(FILE *out_file,
 587                              const VMStateDescription *subsection,
 588                              int indent)
 589{
 590    if (subsection != NULL) {
 591        dump_vmstate_vmsd(out_file, subsection, indent, true);
 592    }
 593}
 594
 595static void dump_vmstate_vmsd(FILE *out_file,
 596                              const VMStateDescription *vmsd, int indent,
 597                              bool is_subsection)
 598{
 599    if (is_subsection) {
 600        fprintf(out_file, "%*s{\n", indent, "");
 601    } else {
 602        fprintf(out_file, "%*s\"%s\": {\n", indent, "", "Description");
 603    }
 604    indent += 2;
 605    fprintf(out_file, "%*s\"name\": \"%s\",\n", indent, "", vmsd->name);
 606    fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
 607            vmsd->version_id);
 608    fprintf(out_file, "%*s\"minimum_version_id\": %d", indent, "",
 609            vmsd->minimum_version_id);
 610    if (vmsd->fields != NULL) {
 611        const VMStateField *field = vmsd->fields;
 612        bool first;
 613
 614        fprintf(out_file, ",\n%*s\"Fields\": [\n", indent, "");
 615        first = true;
 616        while (field->name != NULL) {
 617            if (field->flags & VMS_MUST_EXIST) {
 618                /* Ignore VMSTATE_VALIDATE bits; these don't get migrated */
 619                field++;
 620                continue;
 621            }
 622            if (!first) {
 623                fprintf(out_file, ",\n");
 624            }
 625            dump_vmstate_vmsf(out_file, field, indent + 2);
 626            field++;
 627            first = false;
 628        }
 629        assert(field->flags == VMS_END);
 630        fprintf(out_file, "\n%*s]", indent, "");
 631    }
 632    if (vmsd->subsections != NULL) {
 633        const VMStateDescription * const *subsection = vmsd->subsections;
 634        bool first;
 635
 636        fprintf(out_file, ",\n%*s\"Subsections\": [\n", indent, "");
 637        first = true;
 638        while (*subsection != NULL) {
 639            if (!first) {
 640                fprintf(out_file, ",\n");
 641            }
 642            dump_vmstate_vmss(out_file, *subsection, indent + 2);
 643            subsection++;
 644            first = false;
 645        }
 646        fprintf(out_file, "\n%*s]", indent, "");
 647    }
 648    fprintf(out_file, "\n%*s}", indent - 2, "");
 649}
 650
 651static void dump_machine_type(FILE *out_file)
 652{
 653    MachineClass *mc;
 654
 655    mc = MACHINE_GET_CLASS(current_machine);
 656
 657    fprintf(out_file, "  \"vmschkmachine\": {\n");
 658    fprintf(out_file, "    \"Name\": \"%s\"\n", mc->name);
 659    fprintf(out_file, "  },\n");
 660}
 661
 662void dump_vmstate_json_to_file(FILE *out_file)
 663{
 664    GSList *list, *elt;
 665    bool first;
 666
 667    fprintf(out_file, "{\n");
 668    dump_machine_type(out_file);
 669
 670    first = true;
 671    list = object_class_get_list(TYPE_DEVICE, true);
 672    for (elt = list; elt; elt = elt->next) {
 673        DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data,
 674                                             TYPE_DEVICE);
 675        const char *name;
 676        int indent = 2;
 677
 678        if (!dc->vmsd) {
 679            continue;
 680        }
 681
 682        if (!first) {
 683            fprintf(out_file, ",\n");
 684        }
 685        name = object_class_get_name(OBJECT_CLASS(dc));
 686        fprintf(out_file, "%*s\"%s\": {\n", indent, "", name);
 687        indent += 2;
 688        fprintf(out_file, "%*s\"Name\": \"%s\",\n", indent, "", name);
 689        fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
 690                dc->vmsd->version_id);
 691        fprintf(out_file, "%*s\"minimum_version_id\": %d,\n", indent, "",
 692                dc->vmsd->minimum_version_id);
 693
 694        dump_vmstate_vmsd(out_file, dc->vmsd, indent, false);
 695
 696        fprintf(out_file, "\n%*s}", indent - 2, "");
 697        first = false;
 698    }
 699    fprintf(out_file, "\n}\n");
 700    fclose(out_file);
 701    g_slist_free(list);
 702}
 703
 704static uint32_t calculate_new_instance_id(const char *idstr)
 705{
 706    SaveStateEntry *se;
 707    uint32_t instance_id = 0;
 708
 709    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
 710        if (strcmp(idstr, se->idstr) == 0
 711            && instance_id <= se->instance_id) {
 712            instance_id = se->instance_id + 1;
 713        }
 714    }
 715    /* Make sure we never loop over without being noticed */
 716    assert(instance_id != VMSTATE_INSTANCE_ID_ANY);
 717    return instance_id;
 718}
 719
 720static int calculate_compat_instance_id(const char *idstr)
 721{
 722    SaveStateEntry *se;
 723    int instance_id = 0;
 724
 725    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
 726        if (!se->compat) {
 727            continue;
 728        }
 729
 730        if (strcmp(idstr, se->compat->idstr) == 0
 731            && instance_id <= se->compat->instance_id) {
 732            instance_id = se->compat->instance_id + 1;
 733        }
 734    }
 735    return instance_id;
 736}
 737
 738static inline MigrationPriority save_state_priority(SaveStateEntry *se)
 739{
 740    if (se->vmsd && se->vmsd->priority) {
 741        return se->vmsd->priority;
 742    }
 743    return MIG_PRI_DEFAULT;
 744}
 745
 746static void savevm_state_handler_insert(SaveStateEntry *nse)
 747{
 748    MigrationPriority priority = save_state_priority(nse);
 749    SaveStateEntry *se;
 750    int i;
 751
 752    assert(priority <= MIG_PRI_MAX);
 753
 754    /*
 755     * This should never happen otherwise migration will probably fail
 756     * silently somewhere because we can be wrongly applying one
 757     * object properties upon another one.  Bail out ASAP.
 758     */
 759    if (find_se(nse->idstr, nse->instance_id)) {
 760        error_report("%s: Detected duplicate SaveStateEntry: "
 761                     "id=%s, instance_id=0x%"PRIx32, __func__,
 762                     nse->idstr, nse->instance_id);
 763        exit(EXIT_FAILURE);
 764    }
 765
 766    for (i = priority - 1; i >= 0; i--) {
 767        se = savevm_state.handler_pri_head[i];
 768        if (se != NULL) {
 769            assert(save_state_priority(se) < priority);
 770            break;
 771        }
 772    }
 773
 774    if (i >= 0) {
 775        QTAILQ_INSERT_BEFORE(se, nse, entry);
 776    } else {
 777        QTAILQ_INSERT_TAIL(&savevm_state.handlers, nse, entry);
 778    }
 779
 780    if (savevm_state.handler_pri_head[priority] == NULL) {
 781        savevm_state.handler_pri_head[priority] = nse;
 782    }
 783}
 784
 785static void savevm_state_handler_remove(SaveStateEntry *se)
 786{
 787    SaveStateEntry *next;
 788    MigrationPriority priority = save_state_priority(se);
 789
 790    if (se == savevm_state.handler_pri_head[priority]) {
 791        next = QTAILQ_NEXT(se, entry);
 792        if (next != NULL && save_state_priority(next) == priority) {
 793            savevm_state.handler_pri_head[priority] = next;
 794        } else {
 795            savevm_state.handler_pri_head[priority] = NULL;
 796        }
 797    }
 798    QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
 799}
 800
 801/* TODO: Individual devices generally have very little idea about the rest
 802   of the system, so instance_id should be removed/replaced.
 803   Meanwhile pass -1 as instance_id if you do not already have a clearly
 804   distinguishing id for all instances of your device class. */
 805int register_savevm_live(const char *idstr,
 806                         uint32_t instance_id,
 807                         int version_id,
 808                         const SaveVMHandlers *ops,
 809                         void *opaque)
 810{
 811    SaveStateEntry *se;
 812
 813    se = g_new0(SaveStateEntry, 1);
 814    se->version_id = version_id;
 815    se->section_id = savevm_state.global_section_id++;
 816    se->ops = ops;
 817    se->opaque = opaque;
 818    se->vmsd = NULL;
 819    /* if this is a live_savem then set is_ram */
 820    if (ops->save_setup != NULL) {
 821        se->is_ram = 1;
 822    }
 823
 824    pstrcat(se->idstr, sizeof(se->idstr), idstr);
 825
 826    if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
 827        se->instance_id = calculate_new_instance_id(se->idstr);
 828    } else {
 829        se->instance_id = instance_id;
 830    }
 831    assert(!se->compat || se->instance_id == 0);
 832    savevm_state_handler_insert(se);
 833    return 0;
 834}
 835
 836void unregister_savevm(VMStateIf *obj, const char *idstr, void *opaque)
 837{
 838    SaveStateEntry *se, *new_se;
 839    char id[256] = "";
 840
 841    if (obj) {
 842        char *oid = vmstate_if_get_id(obj);
 843        if (oid) {
 844            pstrcpy(id, sizeof(id), oid);
 845            pstrcat(id, sizeof(id), "/");
 846            g_free(oid);
 847        }
 848    }
 849    pstrcat(id, sizeof(id), idstr);
 850
 851    QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
 852        if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) {
 853            savevm_state_handler_remove(se);
 854            g_free(se->compat);
 855            g_free(se);
 856        }
 857    }
 858}
 859
 860/*
 861 * Perform some basic checks on vmsd's at registration
 862 * time.
 863 */
 864static void vmstate_check(const VMStateDescription *vmsd)
 865{
 866    const VMStateField *field = vmsd->fields;
 867    const VMStateDescription * const *subsection = vmsd->subsections;
 868
 869    if (field) {
 870        while (field->name) {
 871            if (field->flags & (VMS_STRUCT | VMS_VSTRUCT)) {
 872                /* Recurse to sub structures */
 873                vmstate_check(field->vmsd);
 874            }
 875            /* Carry on */
 876            field++;
 877        }
 878        /* Check for the end of field list canary */
 879        if (field->flags != VMS_END) {
 880            error_report("VMSTATE not ending with VMS_END: %s", vmsd->name);
 881            g_assert_not_reached();
 882        }
 883    }
 884
 885    while (subsection && *subsection) {
 886        /*
 887         * The name of a subsection should start with the name of the
 888         * current object.
 889         */
 890        assert(!strncmp(vmsd->name, (*subsection)->name, strlen(vmsd->name)));
 891        vmstate_check(*subsection);
 892        subsection++;
 893    }
 894}
 895
 896
/*
 * Register @vmsd, with @opaque as its state pointer, as a new
 * SaveStateEntry on the global handler list.
 *
 * The entry's idstr is the vmsd name.  When @obj is non-NULL and
 * vmstate_if_get_id() yields an id for it, the idstr is prefixed with
 * "<id>/", a CompatEntry carrying the bare vmsd name is attached, and the
 * instance id of the entry itself is always calculated automatically.
 *
 * Returns: 0 on success; -1 with @errp set when the id prefix does not
 * fit into se->idstr.
 */
int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id,
                                   const VMStateDescription *vmsd,
                                   void *opaque, int alias_id,
                                   int required_for_version,
                                   Error **errp)
{
    SaveStateEntry *se;

    /* If this triggers, alias support can be dropped for the vmsd. */
    assert(alias_id == -1 || required_for_version >= vmsd->minimum_version_id);

    se = g_new0(SaveStateEntry, 1);
    se->version_id = vmsd->version_id;
    se->section_id = savevm_state.global_section_id++;
    se->opaque = opaque;
    se->vmsd = vmsd;
    se->alias_id = alias_id;

    if (obj) {
        char *id = vmstate_if_get_id(obj);
        if (id) {
            /* snprintf() returning >= the buffer size means truncation */
            if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
                sizeof(se->idstr)) {
                error_setg(errp, "Path too long for VMState (%s)", id);
                g_free(id);
                g_free(se);

                return -1;
            }
            g_free(id);

            /*
             * The compat record keeps the unprefixed name together with
             * either the caller's instance id or a calculated one.
             */
            se->compat = g_new0(CompatEntry, 1);
            pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name);
            se->compat->instance_id = instance_id == VMSTATE_INSTANCE_ID_ANY ?
                         calculate_compat_instance_id(vmsd->name) : instance_id;
            /* Force the automatic instance id calculation below */
            instance_id = VMSTATE_INSTANCE_ID_ANY;
        }
    }
    /* idstr is either empty or "<id>/" at this point; append the name */
    pstrcat(se->idstr, sizeof(se->idstr), vmsd->name);

    if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
        se->instance_id = calculate_new_instance_id(se->idstr);
    } else {
        se->instance_id = instance_id;
    }

    /* Perform a recursive sanity check during the test runs */
    if (qtest_enabled()) {
        vmstate_check(vmsd);
    }
    /* An entry with a compat record is expected to get instance id 0 */
    assert(!se->compat || se->instance_id == 0);
    savevm_state_handler_insert(se);
    return 0;
}
 951
 952void vmstate_unregister(VMStateIf *obj, const VMStateDescription *vmsd,
 953                        void *opaque)
 954{
 955    SaveStateEntry *se, *new_se;
 956
 957    QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
 958        if (se->vmsd == vmsd && se->opaque == opaque) {
 959            savevm_state_handler_remove(se);
 960            g_free(se->compat);
 961            g_free(se);
 962        }
 963    }
 964}
 965
 966static int vmstate_load(QEMUFile *f, SaveStateEntry *se)
 967{
 968    trace_vmstate_load(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
 969    if (!se->vmsd) {         /* Old style */
 970        return se->ops->load_state(f, se->opaque, se->load_version_id);
 971    }
 972    return vmstate_load_state(f, se->vmsd, se->opaque, se->load_version_id);
 973}
 974
 975static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se,
 976                                   JSONWriter *vmdesc)
 977{
 978    uint64_t old_offset = qemu_file_transferred(f);
 979    se->ops->save_state(f, se->opaque);
 980    uint64_t size = qemu_file_transferred(f) - old_offset;
 981
 982    if (vmdesc) {
 983        json_writer_int64(vmdesc, "size", size);
 984        json_writer_start_array(vmdesc, "fields");
 985        json_writer_start_object(vmdesc, NULL);
 986        json_writer_str(vmdesc, "name", "data");
 987        json_writer_int64(vmdesc, "size", size);
 988        json_writer_str(vmdesc, "type", "buffer");
 989        json_writer_end_object(vmdesc);
 990        json_writer_end_array(vmdesc);
 991    }
 992}
 993
 994/*
 995 * Write the header for device section (QEMU_VM_SECTION START/END/PART/FULL)
 996 */
 997static void save_section_header(QEMUFile *f, SaveStateEntry *se,
 998                                uint8_t section_type)
 999{
1000    qemu_put_byte(f, section_type);
1001    qemu_put_be32(f, se->section_id);
1002
1003    if (section_type == QEMU_VM_SECTION_FULL ||
1004        section_type == QEMU_VM_SECTION_START) {
1005        /* ID string */
1006        size_t len = strlen(se->idstr);
1007        qemu_put_byte(f, len);
1008        qemu_put_buffer(f, (uint8_t *)se->idstr, len);
1009
1010        qemu_put_be32(f, se->instance_id);
1011        qemu_put_be32(f, se->version_id);
1012    }
1013}
1014
1015/*
1016 * Write a footer onto device sections that catches cases misformatted device
1017 * sections.
1018 */
1019static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
1020{
1021    if (migrate_get_current()->send_section_footer) {
1022        qemu_put_byte(f, QEMU_VM_SECTION_FOOTER);
1023        qemu_put_be32(f, se->section_id);
1024    }
1025}
1026
1027static int vmstate_save(QEMUFile *f, SaveStateEntry *se, JSONWriter *vmdesc,
1028                        Error **errp)
1029{
1030    int ret;
1031
1032    if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1033        return 0;
1034    }
1035    if (se->vmsd && !vmstate_section_needed(se->vmsd, se->opaque)) {
1036        trace_savevm_section_skip(se->idstr, se->section_id);
1037        return 0;
1038    }
1039
1040    trace_savevm_section_start(se->idstr, se->section_id);
1041    save_section_header(f, se, QEMU_VM_SECTION_FULL);
1042    if (vmdesc) {
1043        json_writer_start_object(vmdesc, NULL);
1044        json_writer_str(vmdesc, "name", se->idstr);
1045        json_writer_int64(vmdesc, "instance_id", se->instance_id);
1046    }
1047
1048    trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
1049    if (!se->vmsd) {
1050        vmstate_save_old_style(f, se, vmdesc);
1051    } else {
1052        ret = vmstate_save_state_with_err(f, se->vmsd, se->opaque, vmdesc,
1053                                          errp);
1054        if (ret) {
1055            return ret;
1056        }
1057    }
1058
1059    trace_savevm_section_end(se->idstr, se->section_id, 0);
1060    save_section_footer(f, se);
1061    if (vmdesc) {
1062        json_writer_end_object(vmdesc);
1063    }
1064    return 0;
1065}
/**
 * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
 *                           command and associated data.
 *
 * The element is written as: the QEMU_VM_COMMAND byte, the command as be16,
 * the data length as be16, then @len bytes of data.  The file is flushed
 * afterwards.
 *
 * @f: File to send command on
 * @command: Command type to send
 * @len: Length of associated data
 * @data: Data associated with command.
 */
static void qemu_savevm_command_send(QEMUFile *f,
                                     enum qemu_vm_cmd command,
                                     uint16_t len,
                                     uint8_t *data)
{
    trace_savevm_command_send(command, len);
    qemu_put_byte(f, QEMU_VM_COMMAND);
    qemu_put_be16(f, (uint16_t)command);
    qemu_put_be16(f, len);
    qemu_put_buffer(f, data, len);
    qemu_fflush(f);
}
1087
/* Send a MIG_CMD_ENABLE_COLO command, which carries no payload, on @f. */
void qemu_savevm_send_colo_enable(QEMUFile *f)
{
    trace_savevm_send_colo_enable();
    qemu_savevm_command_send(f, MIG_CMD_ENABLE_COLO, 0, NULL);
}
1093
1094void qemu_savevm_send_ping(QEMUFile *f, uint32_t value)
1095{
1096    uint32_t buf;
1097
1098    trace_savevm_send_ping(value);
1099    buf = cpu_to_be32(value);
1100    qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf);
1101}
1102
/* Send a MIG_CMD_OPEN_RETURN_PATH command, without payload, on @f. */
void qemu_savevm_send_open_return_path(QEMUFile *f)
{
    trace_savevm_send_open_return_path();
    qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL);
}
1108
1109/* We have a buffer of data to send; we don't want that all to be loaded
1110 * by the command itself, so the command contains just the length of the
1111 * extra buffer that we then send straight after it.
1112 * TODO: Must be a better way to organise that
1113 *
1114 * Returns:
1115 *    0 on success
1116 *    -ve on error
1117 */
1118int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len)
1119{
1120    uint32_t tmp;
1121    MigrationState *ms = migrate_get_current();
1122    Error *local_err = NULL;
1123
1124    if (len > MAX_VM_CMD_PACKAGED_SIZE) {
1125        error_setg(&local_err, "%s: Unreasonably large packaged state: %zu",
1126                     __func__, len);
1127        migrate_set_error(ms, local_err);
1128        error_report_err(local_err);
1129        return -1;
1130    }
1131
1132    tmp = cpu_to_be32(len);
1133
1134    trace_qemu_savevm_send_packaged();
1135    qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp);
1136
1137    qemu_put_buffer(f, buf, len);
1138
1139    return 0;
1140}
1141
1142/* Send prior to any postcopy transfer */
1143void qemu_savevm_send_postcopy_advise(QEMUFile *f)
1144{
1145    if (migrate_postcopy_ram()) {
1146        uint64_t tmp[2];
1147        tmp[0] = cpu_to_be64(ram_pagesize_summary());
1148        tmp[1] = cpu_to_be64(qemu_target_page_size());
1149
1150        trace_qemu_savevm_send_postcopy_advise();
1151        qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE,
1152                                 16, (uint8_t *)tmp);
1153    } else {
1154        qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 0, NULL);
1155    }
1156}
1157
1158/* Sent prior to starting the destination running in postcopy, discard pages
1159 * that have already been sent but redirtied on the source.
1160 * CMD_POSTCOPY_RAM_DISCARD consist of:
1161 *      byte   version (0)
1162 *      byte   Length of name field (not including 0)
1163 *  n x byte   RAM block name
1164 *      byte   0 terminator (just for safety)
1165 *  n x        Byte ranges within the named RAMBlock
1166 *      be64   Start of the range
1167 *      be64   Length
1168 *
1169 *  name:  RAMBlock name that these entries are part of
1170 *  len: Number of page entries
1171 *  start_list: 'len' addresses
1172 *  length_list: 'len' addresses
1173 *
1174 */
1175void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
1176                                           uint16_t len,
1177                                           uint64_t *start_list,
1178                                           uint64_t *length_list)
1179{
1180    uint8_t *buf;
1181    uint16_t tmplen;
1182    uint16_t t;
1183    size_t name_len = strlen(name);
1184
1185    trace_qemu_savevm_send_postcopy_ram_discard(name, len);
1186    assert(name_len < 256);
1187    buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len);
1188    buf[0] = postcopy_ram_discard_version;
1189    buf[1] = name_len;
1190    memcpy(buf + 2, name, name_len);
1191    tmplen = 2 + name_len;
1192    buf[tmplen++] = '\0';
1193
1194    for (t = 0; t < len; t++) {
1195        stq_be_p(buf + tmplen, start_list[t]);
1196        tmplen += 8;
1197        stq_be_p(buf + tmplen, length_list[t]);
1198        tmplen += 8;
1199    }
1200    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
1201    g_free(buf);
1202}
1203
/*
 * Get the destination into a state where it can receive postcopy data.
 * Sends a MIG_CMD_POSTCOPY_LISTEN command without payload.
 */
void qemu_savevm_send_postcopy_listen(QEMUFile *f)
{
    trace_savevm_send_postcopy_listen();
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
}
1210
/*
 * Kick the destination into running.
 * Sends a MIG_CMD_POSTCOPY_RUN command without payload.
 */
void qemu_savevm_send_postcopy_run(QEMUFile *f)
{
    trace_savevm_send_postcopy_run();
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
}
1217
/* Send a MIG_CMD_POSTCOPY_RESUME command, without payload, on @f. */
void qemu_savevm_send_postcopy_resume(QEMUFile *f)
{
    trace_savevm_send_postcopy_resume();
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RESUME, 0, NULL);
}
1223
1224void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name)
1225{
1226    size_t len;
1227    char buf[256];
1228
1229    trace_savevm_send_recv_bitmap(block_name);
1230
1231    buf[0] = len = strlen(block_name);
1232    memcpy(buf + 1, block_name, len);
1233
1234    qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf);
1235}
1236
/* Send a MIG_CMD_SWITCHOVER_START command, without payload, on @f. */
static void qemu_savevm_send_switchover_start(QEMUFile *f)
{
    trace_savevm_send_switchover_start();
    qemu_savevm_command_send(f, MIG_CMD_SWITCHOVER_START, 0, NULL);
}
1242
1243void qemu_savevm_maybe_send_switchover_start(QEMUFile *f)
1244{
1245    if (migrate_send_switchover_start()) {
1246        qemu_savevm_send_switchover_start(f);
1247    }
1248}
1249
1250bool qemu_savevm_state_blocked(Error **errp)
1251{
1252    SaveStateEntry *se;
1253
1254    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1255        if (se->vmsd && se->vmsd->unmigratable) {
1256            error_setg(errp, "State blocked by non-migratable device '%s'",
1257                       se->idstr);
1258            return true;
1259        }
1260    }
1261    return false;
1262}
1263
1264void qemu_savevm_non_migratable_list(strList **reasons)
1265{
1266    SaveStateEntry *se;
1267
1268    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1269        if (se->vmsd && se->vmsd->unmigratable) {
1270            QAPI_LIST_PREPEND(*reasons,
1271                              g_strdup_printf("non-migratable device: %s",
1272                                              se->idstr));
1273        }
1274    }
1275}
1276
1277void qemu_savevm_state_header(QEMUFile *f)
1278{
1279    MigrationState *s = migrate_get_current();
1280    JSONWriter *vmdesc = s->vmdesc;
1281
1282    trace_savevm_state_header();
1283    qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1284    qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1285
1286    if (s->send_configuration) {
1287        qemu_put_byte(f, QEMU_VM_CONFIGURATION);
1288
1289        if (vmdesc) {
1290            /*
1291             * This starts the main json object and is paired with the
1292             * json_writer_end_object in
1293             * qemu_savevm_state_complete_precopy_non_iterable
1294             */
1295            json_writer_start_object(vmdesc, NULL);
1296            json_writer_start_object(vmdesc, "configuration");
1297        }
1298
1299        vmstate_save_state(f, &vmstate_configuration, &savevm_state, vmdesc);
1300
1301        if (vmdesc) {
1302            json_writer_end_object(vmdesc);
1303        }
1304    }
1305}
1306
1307bool qemu_savevm_state_guest_unplug_pending(void)
1308{
1309    SaveStateEntry *se;
1310
1311    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1312        if (se->vmsd && se->vmsd->dev_unplug_pending &&
1313            se->vmsd->dev_unplug_pending(se->opaque)) {
1314            return true;
1315        }
1316    }
1317
1318    return false;
1319}
1320
1321int qemu_savevm_state_prepare(Error **errp)
1322{
1323    SaveStateEntry *se;
1324    int ret;
1325
1326    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1327        if (!se->ops || !se->ops->save_prepare) {
1328            continue;
1329        }
1330        if (se->ops->is_active) {
1331            if (!se->ops->is_active(se->opaque)) {
1332                continue;
1333            }
1334        }
1335
1336        ret = se->ops->save_prepare(se->opaque, errp);
1337        if (ret < 0) {
1338            return ret;
1339        }
1340    }
1341
1342    return 0;
1343}
1344
/*
 * Start a save on @f: write the full sections of early_setup devices and
 * run the save_setup() hook of every other active handler inside a
 * QEMU_VM_SECTION_START section.
 *
 * When the migration state has a vmdesc writer, the "page_size" member and
 * the opening of the "devices" array are emitted first.
 *
 * Returns: the first non-zero handler result on failure (the error is also
 * recorded on @f), otherwise the result of
 * precopy_notify(PRECOPY_NOTIFY_SETUP).
 */
int qemu_savevm_state_setup(QEMUFile *f, Error **errp)
{
    ERRP_GUARD();
    MigrationState *ms = migrate_get_current();
    JSONWriter *vmdesc = ms->vmdesc;
    SaveStateEntry *se;
    int ret = 0;

    if (vmdesc) {
        json_writer_int64(vmdesc, "page_size", qemu_target_page_size());
        json_writer_start_array(vmdesc, "devices");
    }

    trace_savevm_state_setup();
    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        /* early_setup devices are saved in full right here */
        if (se->vmsd && se->vmsd->early_setup) {
            ret = vmstate_save(f, se, vmdesc, errp);
            if (ret) {
                /* *errp is read here, hence the ERRP_GUARD() above */
                migrate_set_error(ms, *errp);
                qemu_file_set_error(f, ret);
                break;
            }
            continue;
        }

        if (!se->ops || !se->ops->save_setup) {
            continue;
        }
        if (se->ops->is_active) {
            if (!se->ops->is_active(se->opaque)) {
                continue;
            }
        }
        save_section_header(f, se, QEMU_VM_SECTION_START);

        ret = se->ops->save_setup(f, se->opaque, errp);
        /* The footer is written whether or not the hook succeeded */
        save_section_footer(f, se);
        if (ret < 0) {
            qemu_file_set_error(f, ret);
            break;
        }
    }

    if (ret) {
        return ret;
    }

    /* TODO: Should we check that errp is set in case of failure ? */
    return precopy_notify(PRECOPY_NOTIFY_SETUP, errp);
}
1395
1396int qemu_savevm_state_resume_prepare(MigrationState *s)
1397{
1398    SaveStateEntry *se;
1399    int ret;
1400
1401    trace_savevm_state_resume_prepare();
1402
1403    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1404        if (!se->ops || !se->ops->resume_prepare) {
1405            continue;
1406        }
1407        if (se->ops->is_active) {
1408            if (!se->ops->is_active(se->opaque)) {
1409                continue;
1410            }
1411        }
1412        ret = se->ops->resume_prepare(s, se->opaque);
1413        if (ret < 0) {
1414            return ret;
1415        }
1416    }
1417
1418    return 0;
1419}
1420
1421/*
1422 * this function has three return values:
1423 *   negative: there was one error, and we have -errno.
1424 *   0 : We haven't finished, caller have to go again
1425 *   1 : We have finished, we can go to complete phase
1426 */
1427int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
1428{
1429    SaveStateEntry *se;
1430    bool all_finished = true;
1431    int ret;
1432
1433    trace_savevm_state_iterate();
1434    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1435        if (!se->ops || !se->ops->save_live_iterate) {
1436            continue;
1437        }
1438        if (se->ops->is_active &&
1439            !se->ops->is_active(se->opaque)) {
1440            continue;
1441        }
1442        if (se->ops->is_active_iterate &&
1443            !se->ops->is_active_iterate(se->opaque)) {
1444            continue;
1445        }
1446        /*
1447         * In the postcopy phase, any device that doesn't know how to
1448         * do postcopy should have saved it's state in the _complete
1449         * call that's already run, it might get confused if we call
1450         * iterate afterwards.
1451         */
1452        if (postcopy &&
1453            !(se->ops->has_postcopy && se->ops->has_postcopy(se->opaque))) {
1454            continue;
1455        }
1456        if (migration_rate_exceeded(f)) {
1457            return 0;
1458        }
1459        trace_savevm_section_start(se->idstr, se->section_id);
1460
1461        save_section_header(f, se, QEMU_VM_SECTION_PART);
1462
1463        ret = se->ops->save_live_iterate(f, se->opaque);
1464        trace_savevm_section_end(se->idstr, se->section_id, ret);
1465        save_section_footer(f, se);
1466
1467        if (ret < 0) {
1468            error_report("failed to save SaveStateEntry with id(name): "
1469                         "%d(%s): %d",
1470                         se->section_id, se->idstr, ret);
1471            qemu_file_set_error(f, ret);
1472            return ret;
1473        } else if (!ret) {
1474            all_finished = false;
1475        }
1476    }
1477    return all_finished;
1478}
1479
1480bool should_send_vmdesc(void)
1481{
1482    MachineState *machine = MACHINE(qdev_get_machine());
1483
1484    return !machine->suppress_vmdesc;
1485}
1486
1487static bool qemu_savevm_complete_exists(SaveStateEntry *se)
1488{
1489    return se->ops && se->ops->save_complete;
1490}
1491
1492/*
1493 * Invoke the ->save_complete() if necessary.
1494 * Returns: 0 if skip the current SE or succeeded, <0 if error happened.
1495 */
1496static int qemu_savevm_complete(SaveStateEntry *se, QEMUFile *f)
1497{
1498    int ret;
1499
1500    if (se->ops->is_active) {
1501        if (!se->ops->is_active(se->opaque)) {
1502            return 0;
1503        }
1504    }
1505
1506    trace_savevm_section_start(se->idstr, se->section_id);
1507    save_section_header(f, se, QEMU_VM_SECTION_END);
1508    ret = se->ops->save_complete(f, se->opaque);
1509    trace_savevm_section_end(se->idstr, se->section_id, ret);
1510    save_section_footer(f, se);
1511
1512    if (ret < 0) {
1513        qemu_file_set_error(f, ret);
1514    }
1515
1516    return ret;
1517}
1518
1519/*
1520 * Complete saving any postcopy-able devices.
1521 *
1522 * Note postcopy also calls qemu_savevm_state_complete_precopy to complete
1523 * all the other devices, but that happens at the point we switch to postcopy.
1524 */
1525void qemu_savevm_state_complete_postcopy(QEMUFile *f)
1526{
1527    SaveStateEntry *se;
1528
1529    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1530        if (!qemu_savevm_complete_exists(se)) {
1531            continue;
1532        }
1533
1534        if (qemu_savevm_complete(se, f) < 0) {
1535            return;
1536        }
1537    }
1538
1539    qemu_put_byte(f, QEMU_VM_EOF);
1540    qemu_fflush(f);
1541}
1542
1543bool qemu_savevm_state_postcopy_prepare(QEMUFile *f, Error **errp)
1544{
1545    SaveStateEntry *se;
1546    bool ret;
1547
1548    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1549        if (!se->ops || !se->ops->save_postcopy_prepare) {
1550            continue;
1551        }
1552
1553        if (se->ops->is_active) {
1554            if (!se->ops->is_active(se->opaque)) {
1555                continue;
1556            }
1557        }
1558
1559        trace_savevm_section_start(se->idstr, se->section_id);
1560
1561        save_section_header(f, se, QEMU_VM_SECTION_PART);
1562        ret = se->ops->save_postcopy_prepare(f, se->opaque, errp);
1563        save_section_footer(f, se);
1564
1565        trace_savevm_section_end(se->idstr, se->section_id, ret);
1566
1567        if (!ret) {
1568            assert(*errp);
1569            return false;
1570        }
1571    }
1572
1573    return true;
1574}
1575
/*
 * Run the save_complete() hook of every handler that provides one, i.e.
 * write their QEMU_VM_SECTION_END sections.
 *
 * With @in_postcopy set, handlers whose has_postcopy() hook returns true
 * are skipped.  When multifd device state transfer is supported, the
 * save_complete_precopy_thread hooks are spawned as threads first and
 * joined before returning.
 *
 * Returns: 0 on success, -1 on failure.
 */
int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
{
    int64_t start_ts_each, end_ts_each;
    SaveStateEntry *se;
    bool multifd_device_state = multifd_device_state_supported();

    if (multifd_device_state) {
        /* Kick off the per-device save threads before the serial pass */
        QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
            SaveCompletePrecopyThreadHandler hdlr;

            if (!se->ops || (in_postcopy && se->ops->has_postcopy &&
                             se->ops->has_postcopy(se->opaque)) ||
                !se->ops->save_complete_precopy_thread) {
                continue;
            }

            hdlr = se->ops->save_complete_precopy_thread;
            multifd_spawn_device_state_save_thread(hdlr,
                                                   se->idstr, se->instance_id,
                                                   se->opaque);
        }
    }

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!qemu_savevm_complete_exists(se)) {
            continue;
        }

        if (in_postcopy && se->ops->has_postcopy &&
            se->ops->has_postcopy(se->opaque)) {
            /*
             * If postcopy will start soon, and if the SE supports
             * postcopy, then we can skip the SE for the postcopy phase.
             */
            continue;
        }

        /* Time each entry for the downtime trace point below */
        start_ts_each = qemu_clock_get_us(QEMU_CLOCK_REALTIME);
        if (qemu_savevm_complete(se, f) < 0) {
            goto ret_fail_abort_threads;
        }
        end_ts_each = qemu_clock_get_us(QEMU_CLOCK_REALTIME);

        trace_vmstate_downtime_save("iterable", se->idstr, se->instance_id,
                                    end_ts_each - start_ts_each);
    }

    if (multifd_device_state) {
        /* An error recorded meanwhile aborts the threads before the join */
        if (migrate_has_error(migrate_get_current())) {
            multifd_abort_device_state_save_threads();
        }

        if (!multifd_join_device_state_save_threads()) {
            qemu_file_set_error(f, -EINVAL);
            return -1;
        }
    }

    trace_vmstate_downtime_checkpoint("src-iterable-saved");

    return 0;

ret_fail_abort_threads:
    /* Serial pass failed: stop and reap any threads spawned above */
    if (multifd_device_state) {
        multifd_abort_device_state_save_threads();
        multifd_join_device_state_save_threads();
    }

    return -1;
}
1646
/*
 * Save every handler that was not already saved as an early_setup device,
 * each through vmstate_save().
 *
 * Unless @in_postcopy is set, QEMU_VM_EOF is written afterwards, followed
 * by the JSON vmdesc (QEMU_VM_VMDESCRIPTION byte, be32 length, text) when
 * a vmdesc writer is present.
 *
 * Returns: 0 on success, otherwise the non-zero result of the failing
 * vmstate_save(); the error is recorded on the migration state and on @f.
 */
int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
                                                    bool in_postcopy)
{
    MigrationState *ms = migrate_get_current();
    int64_t start_ts_each, end_ts_each;
    JSONWriter *vmdesc = ms->vmdesc;
    int vmdesc_len;
    SaveStateEntry *se;
    Error *local_err = NULL;
    int ret;

    /* Making sure cpu states are synchronized before saving non-iterable */
    cpu_synchronize_all_states();

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (se->vmsd && se->vmsd->early_setup) {
            /* Already saved during qemu_savevm_state_setup(). */
            continue;
        }

        /* Time each entry for the downtime trace point below */
        start_ts_each = qemu_clock_get_us(QEMU_CLOCK_REALTIME);

        ret = vmstate_save(f, se, vmdesc, &local_err);
        if (ret) {
            migrate_set_error(ms, local_err);
            error_report_err(local_err);
            qemu_file_set_error(f, ret);
            return ret;
        }

        end_ts_each = qemu_clock_get_us(QEMU_CLOCK_REALTIME);
        trace_vmstate_downtime_save("non-iterable", se->idstr, se->instance_id,
                                    end_ts_each - start_ts_each);
    }

    if (!in_postcopy) {
        /* Postcopy stream will still be going */
        qemu_put_byte(f, QEMU_VM_EOF);

        if (vmdesc) {
            /* Close the "devices" array, then the main JSON object */
            json_writer_end_array(vmdesc);
            json_writer_end_object(vmdesc);
            vmdesc_len = strlen(json_writer_get(vmdesc));

            qemu_put_byte(f, QEMU_VM_VMDESCRIPTION);
            qemu_put_be32(f, vmdesc_len);
            qemu_put_buffer(f, (uint8_t *)json_writer_get(vmdesc), vmdesc_len);
        }
    }

    trace_vmstate_downtime_checkpoint("src-non-iterable-saved");

    return 0;
}
1701
1702int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only)
1703{
1704    int ret;
1705
1706    ret = qemu_savevm_state_complete_precopy_iterable(f, false);
1707    if (ret) {
1708        return ret;
1709    }
1710
1711    if (!iterable_only) {
1712        ret = qemu_savevm_state_complete_precopy_non_iterable(f, false);
1713        if (ret) {
1714            return ret;
1715        }
1716    }
1717
1718    return qemu_fflush(f);
1719}
1720
1721/* Give an estimate of the amount left to be transferred,
1722 * the result is split into the amount for units that can and
1723 * for units that can't do postcopy.
1724 */
1725void qemu_savevm_state_pending_estimate(uint64_t *must_precopy,
1726                                        uint64_t *can_postcopy)
1727{
1728    SaveStateEntry *se;
1729
1730    *must_precopy = 0;
1731    *can_postcopy = 0;
1732
1733    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1734        if (!se->ops || !se->ops->state_pending_estimate) {
1735            continue;
1736        }
1737        if (se->ops->is_active) {
1738            if (!se->ops->is_active(se->opaque)) {
1739                continue;
1740            }
1741        }
1742        se->ops->state_pending_estimate(se->opaque, must_precopy, can_postcopy);
1743    }
1744}
1745
1746void qemu_savevm_state_pending_exact(uint64_t *must_precopy,
1747                                     uint64_t *can_postcopy)
1748{
1749    SaveStateEntry *se;
1750
1751    *must_precopy = 0;
1752    *can_postcopy = 0;
1753
1754    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1755        if (!se->ops || !se->ops->state_pending_exact) {
1756            continue;
1757        }
1758        if (se->ops->is_active) {
1759            if (!se->ops->is_active(se->opaque)) {
1760                continue;
1761            }
1762        }
1763        se->ops->state_pending_exact(se->opaque, must_precopy, can_postcopy);
1764    }
1765}
1766
1767void qemu_savevm_state_cleanup(void)
1768{
1769    SaveStateEntry *se;
1770    Error *local_err = NULL;
1771
1772    if (precopy_notify(PRECOPY_NOTIFY_CLEANUP, &local_err)) {
1773        error_report_err(local_err);
1774    }
1775
1776    trace_savevm_state_cleanup();
1777    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1778        if (se->ops && se->ops->save_cleanup) {
1779            se->ops->save_cleanup(se->opaque);
1780        }
1781    }
1782}
1783
/*
 * Save the whole VM state to @f in one go: header, setup, iteration until
 * the handlers report completion, precopy completion and cleanup.
 *
 * The migration state is moved from SETUP to COMPLETED or FAILED depending
 * on the outcome, and @f is detached from it again before returning.
 *
 * Returns: 0 on success; -EINVAL if a migration is already running;
 * otherwise the failure code of the init/setup step or the error recorded
 * on @f, with @errp set.
 */
static int qemu_savevm_state(QEMUFile *f, Error **errp)
{
    int ret;
    MigrationState *ms = migrate_get_current();
    MigrationStatus status;

    if (migration_is_running()) {
        error_setg(errp, "There's a migration process in progress");
        return -EINVAL;
    }

    ret = migrate_init(ms, errp);
    if (ret) {
        return ret;
    }
    ms->to_dst_file = f;

    qemu_savevm_state_header(f);
    ret = qemu_savevm_state_setup(f, errp);
    if (ret) {
        goto cleanup;
    }

    /* Iterate until a round reports "finished" or the file has an error */
    while (qemu_file_get_error(f) == 0) {
        if (qemu_savevm_state_iterate(f, false) > 0) {
            break;
        }
    }

    ret = qemu_file_get_error(f);
    if (ret == 0) {
        qemu_savevm_maybe_send_switchover_start(f);
        /* Failures of the completion step are picked up from the file */
        qemu_savevm_state_complete_precopy(f, false);
        ret = qemu_file_get_error(f);
    }
    if (ret != 0) {
        error_setg_errno(errp, -ret, "Error while writing VM state");
    }
cleanup:
    qemu_savevm_state_cleanup();

    if (ret != 0) {
        status = MIGRATION_STATUS_FAILED;
    } else {
        status = MIGRATION_STATUS_COMPLETED;
    }
    migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, status);

    /* f is outer parameter, it should not stay in global migration state after
     * this function finished */
    ms->to_dst_file = NULL;

    return ret;
}
1838
/*
 * Complete only the iterable part of the precopy phase on @f and terminate
 * the stream with QEMU_VM_EOF.  The result of the completion step is not
 * checked here.
 */
void qemu_savevm_live_state(QEMUFile *f)
{
    /* save QEMU_VM_SECTION_END section */
    qemu_savevm_state_complete_precopy(f, true);
    qemu_put_byte(f, QEMU_VM_EOF);
}
1845
1846int qemu_save_device_state(QEMUFile *f)
1847{
1848    MigrationState *ms = migrate_get_current();
1849    Error *local_err = NULL;
1850    SaveStateEntry *se;
1851
1852    if (!migration_in_colo_state()) {
1853        qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1854        qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1855    }
1856    cpu_synchronize_all_states();
1857
1858    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1859        int ret;
1860
1861        if (se->is_ram) {
1862            continue;
1863        }
1864        ret = vmstate_save(f, se, NULL, &local_err);
1865        if (ret) {
1866            migrate_set_error(ms, local_err);
1867            error_report_err(local_err);
1868            return ret;
1869        }
1870    }
1871
1872    qemu_put_byte(f, QEMU_VM_EOF);
1873
1874    return qemu_file_get_error(f);
1875}
1876
1877static SaveStateEntry *find_se(const char *idstr, uint32_t instance_id)
1878{
1879    SaveStateEntry *se;
1880
1881    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1882        if (!strcmp(se->idstr, idstr) &&
1883            (instance_id == se->instance_id ||
1884             instance_id == se->alias_id))
1885            return se;
1886        /* Migrating from an older version? */
1887        if (strstr(se->idstr, idstr) && se->compat) {
1888            if (!strcmp(se->compat->idstr, idstr) &&
1889                (instance_id == se->compat->instance_id ||
1890                 instance_id == se->alias_id))
1891                return se;
1892        }
1893    }
1894    return NULL;
1895}
1896
/*
 * Special, positive exit codes for the loadvm code.
 * NOTE(review): presumably returned by the loadvm command handlers and
 * loops; confirm against the users.
 */
enum LoadVMExitCodes {
    /* Allow a command to quit all layers of nested loadvm loops */
    LOADVM_QUIT     =  1,
};
1901
1902/* ------ incoming postcopy messages ------ */
1903/* 'advise' arrives before any transfers just to tell us that a postcopy
1904 * *might* happen - it might be skipped if precopy transferred everything
1905 * quickly.
1906 */
1907static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
1908                                         uint16_t len)
1909{
1910    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
1911    uint64_t remote_pagesize_summary, local_pagesize_summary, remote_tps;
1912    size_t page_size = qemu_target_page_size();
1913    Error *local_err = NULL;
1914
1915    trace_loadvm_postcopy_handle_advise();
1916    if (ps != POSTCOPY_INCOMING_NONE) {
1917        error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
1918        return -1;
1919    }
1920
1921    switch (len) {
1922    case 0:
1923        if (migrate_postcopy_ram()) {
1924            error_report("RAM postcopy is enabled but have 0 byte advise");
1925            return -EINVAL;
1926        }
1927        return 0;
1928    case 8 + 8:
1929        if (!migrate_postcopy_ram()) {
1930            error_report("RAM postcopy is disabled but have 16 byte advise");
1931            return -EINVAL;
1932        }
1933        break;
1934    default:
1935        error_report("CMD_POSTCOPY_ADVISE invalid length (%d)", len);
1936        return -EINVAL;
1937    }
1938
1939    if (!postcopy_ram_supported_by_host(mis, &local_err)) {
1940        error_report_err(local_err);
1941        postcopy_state_set(POSTCOPY_INCOMING_NONE);
1942        return -1;
1943    }
1944
1945    remote_pagesize_summary = qemu_get_be64(mis->from_src_file);
1946    local_pagesize_summary = ram_pagesize_summary();
1947
1948    if (remote_pagesize_summary != local_pagesize_summary)  {
1949        /*
1950         * This detects two potential causes of mismatch:
1951         *   a) A mismatch in host page sizes
1952         *      Some combinations of mismatch are probably possible but it gets
1953         *      a bit more complicated.  In particular we need to place whole
1954         *      host pages on the dest at once, and we need to ensure that we
1955         *      handle dirtying to make sure we never end up sending part of
1956         *      a hostpage on it's own.
1957         *   b) The use of different huge page sizes on source/destination
1958         *      a more fine grain test is performed during RAM block migration
1959         *      but this test here causes a nice early clear failure, and
1960         *      also fails when passed to an older qemu that doesn't
1961         *      do huge pages.
1962         */
1963        error_report("Postcopy needs matching RAM page sizes (s=%" PRIx64
1964                                                             " d=%" PRIx64 ")",
1965                     remote_pagesize_summary, local_pagesize_summary);
1966        return -1;
1967    }
1968
1969    remote_tps = qemu_get_be64(mis->from_src_file);
1970    if (remote_tps != page_size) {
1971        /*
1972         * Again, some differences could be dealt with, but for now keep it
1973         * simple.
1974         */
1975        error_report("Postcopy needs matching target page sizes (s=%d d=%zd)",
1976                     (int)remote_tps, page_size);
1977        return -1;
1978    }
1979
1980    if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_ADVISE, &local_err)) {
1981        error_report_err(local_err);
1982        return -1;
1983    }
1984
1985    if (ram_postcopy_incoming_init(mis)) {
1986        return -1;
1987    }
1988
1989    return 0;
1990}
1991
1992/* After postcopy we will be told to throw some pages away since they're
1993 * dirty and will have to be demand fetched.  Must happen before CPU is
1994 * started.
1995 * There can be 0..many of these messages, each encoding multiple pages.
1996 */
1997static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
1998                                              uint16_t len)
1999{
2000    int tmp;
2001    char ramid[256];
2002    PostcopyState ps = postcopy_state_get();
2003
2004    trace_loadvm_postcopy_ram_handle_discard();
2005
2006    switch (ps) {
2007    case POSTCOPY_INCOMING_ADVISE:
2008        /* 1st discard */
2009        tmp = postcopy_ram_prepare_discard(mis);
2010        if (tmp) {
2011            return tmp;
2012        }
2013        break;
2014
2015    case POSTCOPY_INCOMING_DISCARD:
2016        /* Expected state */
2017        break;
2018
2019    default:
2020        error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
2021                     ps);
2022        return -1;
2023    }
2024    /* We're expecting a
2025     *    Version (0)
2026     *    a RAM ID string (length byte, name, 0 term)
2027     *    then at least 1 16 byte chunk
2028    */
2029    if (len < (1 + 1 + 1 + 1 + 2 * 8)) {
2030        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
2031        return -1;
2032    }
2033
2034    tmp = qemu_get_byte(mis->from_src_file);
2035    if (tmp != postcopy_ram_discard_version) {
2036        error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
2037        return -1;
2038    }
2039
2040    if (!qemu_get_counted_string(mis->from_src_file, ramid)) {
2041        error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
2042        return -1;
2043    }
2044    tmp = qemu_get_byte(mis->from_src_file);
2045    if (tmp != 0) {
2046        error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp);
2047        return -1;
2048    }
2049
2050    len -= 3 + strlen(ramid);
2051    if (len % 16) {
2052        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
2053        return -1;
2054    }
2055    trace_loadvm_postcopy_ram_handle_discard_header(ramid, len);
2056    while (len) {
2057        uint64_t start_addr, block_length;
2058        start_addr = qemu_get_be64(mis->from_src_file);
2059        block_length = qemu_get_be64(mis->from_src_file);
2060
2061        len -= 16;
2062        int ret = ram_discard_range(ramid, start_addr, block_length);
2063        if (ret) {
2064            return ret;
2065        }
2066    }
2067    trace_loadvm_postcopy_ram_handle_discard_end();
2068
2069    return 0;
2070}
2071
/*
 * Triggered by a postcopy_listen command; this thread takes over reading
 * the input stream, leaving the main thread free to carry on loading the rest
 * of the device state (from RAM).
 * (TODO:This could do with being in a postcopy file - but there again it's
 * just another input loop, not that postcopy specific)
 *
 * @opaque: not used; the incoming state is obtained from
 *          migration_incoming_get_current()
 *
 * Returns: NULL.  If loading fails in a way that is not tolerated (see
 *          below) the whole process is terminated with exit(EXIT_FAILURE)
 *          instead of returning.
 */
static void *postcopy_ram_listen_thread(void *opaque)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    QEMUFile *f = mis->from_src_file;
    int load_res;
    MigrationState *migr = migrate_get_current();

    /* Reference dropped again just before this thread returns */
    object_ref(OBJECT(migr));

    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);
    /* NOTE(review): presumably wakes the thread's creator - confirm */
    qemu_event_set(&mis->thread_sync_event);
    trace_postcopy_ram_listen_thread_start();

    rcu_register_thread();
    /*
     * Because we're a thread and not a coroutine we can't yield
     * in qemu_file, and thus we must be blocking now.
     */
    qemu_file_set_blocking(f, true);

    /* TODO: sanity check that only postcopiable data will be loaded here */
    load_res = qemu_loadvm_state_main(f, mis);

    /*
     * This is tricky, but, mis->from_src_file can change after it
     * returns, when postcopy recovery happened. In the future, we may
     * want a wrapper for the QEMUFile handle.
     */
    f = mis->from_src_file;

    /* And non-blocking again so we don't block in any cleanup */
    qemu_file_set_blocking(f, false);

    trace_postcopy_ram_listen_thread_exit();
    if (load_res < 0) {
        /* Record the failure on the stream and cancel incoming bitmaps */
        qemu_file_set_error(f, load_res);
        dirty_bitmap_mig_cancel_incoming();
        if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
            !migrate_postcopy_ram() && migrate_dirty_bitmaps())
        {
            /*
             * Only dirty bitmaps were being postcopied, so the failure is
             * reported but treated as success from here on.
             */
            error_report("%s: loadvm failed during postcopy: %d. All states "
                         "are migrated except dirty bitmaps. Some dirty "
                         "bitmaps may be lost, and present migrated dirty "
                         "bitmaps are correctly migrated and valid.",
                         __func__, load_res);
            load_res = 0; /* prevent further exit() */
        } else {
            error_report("%s: loadvm failed: %d", __func__, load_res);
            migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                                           MIGRATION_STATUS_FAILED);
        }
    }
    if (load_res >= 0) {
        /*
         * This looks good, but it's possible that the device loading in the
         * main thread hasn't finished yet, and so we might not be in 'RUN'
         * state yet; wait for the end of the main thread.
         */
        qemu_event_wait(&mis->main_thread_load_event);
    }
    /* Runs on both the success and the failure path */
    postcopy_ram_incoming_cleanup(mis);

    if (load_res < 0) {
        /*
         * If something went wrong then we have a bad state so exit;
         * depending how far we got it might be possible at this point
         * to leave the guest running and fire MCEs for pages that never
         * arrived as a desperate recovery step.
         */
        rcu_unregister_thread();
        exit(EXIT_FAILURE);
    }

    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                                   MIGRATION_STATUS_COMPLETED);
    /*
     * If everything has worked fine, then the main thread has waited
     * for us to start, and we're the last use of the mis.
     * (If something broke then qemu will have to exit anyway since it's
     * got a bad migration state).
     */
    bql_lock();
    migration_incoming_state_destroy();
    bql_unlock();

    rcu_unregister_thread();
    mis->have_listen_thread = false;
    postcopy_state_set(POSTCOPY_INCOMING_END);

    /* Matches the object_ref() taken at thread start */
    object_unref(OBJECT(migr));

    return NULL;
}
2173
2174/* After this message we must be able to immediately receive postcopy data */
2175static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
2176{
2177    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
2178    Error *local_err = NULL;
2179
2180    trace_loadvm_postcopy_handle_listen("enter");
2181
2182    if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
2183        error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
2184        return -1;
2185    }
2186    if (ps == POSTCOPY_INCOMING_ADVISE) {
2187        /*
2188         * A rare case, we entered listen without having to do any discards,
2189         * so do the setup that's normally done at the time of the 1st discard.
2190         */
2191        if (migrate_postcopy_ram()) {
2192            postcopy_ram_prepare_discard(mis);
2193        }
2194    }
2195
2196    trace_loadvm_postcopy_handle_listen("after discard");
2197
2198    /*
2199     * Sensitise RAM - can now generate requests for blocks that don't exist
2200     * However, at this point the CPU shouldn't be running, and the IO
2201     * shouldn't be doing anything yet so don't actually expect requests
2202     */
2203    if (migrate_postcopy_ram()) {
2204        if (postcopy_ram_incoming_setup(mis)) {
2205            postcopy_ram_incoming_cleanup(mis);
2206            return -1;
2207        }
2208    }
2209
2210    trace_loadvm_postcopy_handle_listen("after uffd");
2211
2212    if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_LISTEN, &local_err)) {
2213        error_report_err(local_err);
2214        return -1;
2215    }
2216
2217    mis->have_listen_thread = true;
2218    postcopy_thread_create(mis, &mis->listen_thread,
2219                           MIGRATION_THREAD_DST_LISTEN,
2220                           postcopy_ram_listen_thread, QEMU_THREAD_DETACHED);
2221    trace_loadvm_postcopy_handle_listen("return");
2222
2223    return 0;
2224}
2225
2226static void loadvm_postcopy_handle_run_bh(void *opaque)
2227{
2228    MigrationIncomingState *mis = opaque;
2229
2230    trace_vmstate_downtime_checkpoint("dst-postcopy-bh-enter");
2231
2232    /* TODO we should move all of this lot into postcopy_ram.c or a shared code
2233     * in migration.c
2234     */
2235    cpu_synchronize_all_post_init();
2236
2237    trace_vmstate_downtime_checkpoint("dst-postcopy-bh-cpu-synced");
2238
2239    qemu_announce_self(&mis->announce_timer, migrate_announce_params());
2240
2241    trace_vmstate_downtime_checkpoint("dst-postcopy-bh-announced");
2242
2243    dirty_bitmap_mig_before_vm_start();
2244
2245    if (autostart) {
2246        /*
2247         * Make sure all file formats throw away their mutable metadata.
2248         * If we get an error here, just don't restart the VM yet.
2249         */
2250        bool success = migration_block_activate(NULL);
2251
2252        trace_vmstate_downtime_checkpoint("dst-postcopy-bh-cache-invalidated");
2253
2254        if (success) {
2255            vm_start();
2256        }
2257    } else {
2258        /* leave it paused and let management decide when to start the CPU */
2259        runstate_set(RUN_STATE_PAUSED);
2260    }
2261
2262    trace_vmstate_downtime_checkpoint("dst-postcopy-bh-vm-started");
2263}
2264
2265/* After all discards we can start running and asking for pages */
2266static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
2267{
2268    PostcopyState ps = postcopy_state_get();
2269
2270    trace_loadvm_postcopy_handle_run();
2271    if (ps != POSTCOPY_INCOMING_LISTENING) {
2272        error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
2273        return -1;
2274    }
2275
2276    postcopy_state_set(POSTCOPY_INCOMING_RUNNING);
2277    migration_bh_schedule(loadvm_postcopy_handle_run_bh, mis);
2278
2279    /* We need to finish reading the stream from the package
2280     * and also stop reading anything more from the stream that loaded the
2281     * package (since it's now being read by the listener thread).
2282     * LOADVM_QUIT will quit all the layers of nested loadvm loops.
2283     */
2284    return LOADVM_QUIT;
2285}
2286
2287/* We must be with page_request_mutex held */
2288static gboolean postcopy_sync_page_req(gpointer key, gpointer value,
2289                                       gpointer data)
2290{
2291    MigrationIncomingState *mis = data;
2292    void *host_addr = (void *) key;
2293    ram_addr_t rb_offset;
2294    RAMBlock *rb;
2295    int ret;
2296
2297    rb = qemu_ram_block_from_host(host_addr, true, &rb_offset);
2298    if (!rb) {
2299        /*
2300         * This should _never_ happen.  However be nice for a migrating VM to
2301         * not crash/assert.  Post an error (note: intended to not use *_once
2302         * because we do want to see all the illegal addresses; and this can
2303         * never be triggered by the guest so we're safe) and move on next.
2304         */
2305        error_report("%s: illegal host addr %p", __func__, host_addr);
2306        /* Try the next entry */
2307        return FALSE;
2308    }
2309
2310    ret = migrate_send_rp_message_req_pages(mis, rb, rb_offset);
2311    if (ret) {
2312        /* Please refer to above comment. */
2313        error_report("%s: send rp message failed for addr %p",
2314                     __func__, host_addr);
2315        return FALSE;
2316    }
2317
2318    trace_postcopy_page_req_sync(host_addr);
2319
2320    return FALSE;
2321}
2322
2323static void migrate_send_rp_req_pages_pending(MigrationIncomingState *mis)
2324{
2325    WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
2326        g_tree_foreach(mis->page_requested, postcopy_sync_page_req, mis);
2327    }
2328}
2329
/*
 * Handle the source's request to resume a postcopy migration.
 *
 * @mis: incoming migration state
 *
 * Only acted upon when mis->state is MIGRATION_STATUS_POSTCOPY_RECOVER;
 * in any other state the command is reported and ignored.
 *
 * Returns: always 0
 */
static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis)
{
    if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: illegal resume received", __func__);
        /* Don't fail the load, only for this. */
        return 0;
    }

    /*
     * Reset the last_rb before we resend any page req to source again, since
     * the source should have it reset already.
     */
    mis->last_rb = NULL;

    /*
     * This means source VM is ready to resume the postcopy migration.
     */
    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
                      MIGRATION_STATUS_POSTCOPY_ACTIVE);

    trace_loadvm_postcopy_handle_resume();

    /* Tell source that "we are ready" */
    migrate_send_rp_resume_ack(mis, MIGRATION_RESUME_ACK_VALUE);

    /*
     * After a postcopy recovery, the source should have lost the postcopy
     * queue, or potentially the requested pages could have been lost during
     * the network down phase.  Let's re-sync with the source VM by re-sending
     * all the pending pages that we eagerly need, so these threads won't get
     * blocked too long due to the recovery.
     *
     * Without this procedure, the faulted destination VM threads (waiting for
     * page requests right before the postcopy is interrupted) can keep hanging
     * until the pages are sent by the source during the background copying of
     * pages, or another thread faulted on the same address accidentally.
     */
    migrate_send_rp_req_pages_pending(mis);

    /*
     * It's time to switch state and release the fault thread to continue
     * service page faults.  Note that this should be explicitly after the
     * above call to migrate_send_rp_req_pages_pending().  In short:
     * migrate_send_rp_message_req_pages() is not thread safe, yet.
     */
    qemu_sem_post(&mis->postcopy_pause_sem_fault);

    if (migrate_postcopy_preempt()) {
        /*
         * The preempt channel will be created in async manner, now let's
         * wait for it and make sure it's created.
         */
        qemu_sem_wait(&mis->postcopy_qemufile_dst_done);
        assert(mis->postcopy_qemufile_dst);
        /* Kick the fast ram load thread too */
        qemu_sem_post(&mis->postcopy_pause_sem_fast_load);
    }

    return 0;
}
2390
2391/**
2392 * Immediately following this command is a blob of data containing an embedded
2393 * chunk of migration stream; read it and load it.
2394 *
2395 * @mis: Incoming state
2396 * @length: Length of packaged data to read
2397 *
2398 * Returns: Negative values on error
2399 *
2400 */
2401static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
2402{
2403    int ret;
2404    size_t length;
2405    QIOChannelBuffer *bioc;
2406
2407    length = qemu_get_be32(mis->from_src_file);
2408    trace_loadvm_handle_cmd_packaged(length);
2409
2410    if (length > MAX_VM_CMD_PACKAGED_SIZE) {
2411        error_report("Unreasonably large packaged state: %zu", length);
2412        return -1;
2413    }
2414
2415    bioc = qio_channel_buffer_new(length);
2416    qio_channel_set_name(QIO_CHANNEL(bioc), "migration-loadvm-buffer");
2417    ret = qemu_get_buffer(mis->from_src_file,
2418                          bioc->data,
2419                          length);
2420    if (ret != length) {
2421        object_unref(OBJECT(bioc));
2422        error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%zu",
2423                     ret, length);
2424        return (ret < 0) ? ret : -EAGAIN;
2425    }
2426    bioc->usage += length;
2427    trace_loadvm_handle_cmd_packaged_received(ret);
2428
2429    QEMUFile *packf = qemu_file_new_input(QIO_CHANNEL(bioc));
2430
2431    /*
2432     * Before loading the guest states, ensure that the preempt channel has
2433     * been ready to use, as some of the states (e.g. via virtio_load) might
2434     * trigger page faults that will be handled through the preempt channel.
2435     * So yield to the main thread in the case that the channel create event
2436     * hasn't been dispatched.
2437     *
2438     * TODO: if we can move migration loadvm out of main thread, then we
2439     * won't block main thread from polling the accept() fds.  We can drop
2440     * this as a whole when that is done.
2441     */
2442    do {
2443        if (!migrate_postcopy_preempt() || !qemu_in_coroutine() ||
2444            mis->postcopy_qemufile_dst) {
2445            break;
2446        }
2447
2448        aio_co_schedule(qemu_get_current_aio_context(), qemu_coroutine_self());
2449        qemu_coroutine_yield();
2450    } while (1);
2451
2452    ret = qemu_loadvm_state_main(packf, mis);
2453    trace_loadvm_handle_cmd_packaged_main(ret);
2454    qemu_fclose(packf);
2455    object_unref(OBJECT(bioc));
2456
2457    return ret;
2458}
2459
2460/*
2461 * Handle request that source requests for recved_bitmap on
2462 * destination. Payload format:
2463 *
2464 * len (1 byte) + ramblock_name (<255 bytes)
2465 */
2466static int loadvm_handle_recv_bitmap(MigrationIncomingState *mis,
2467                                     uint16_t len)
2468{
2469    QEMUFile *file = mis->from_src_file;
2470    RAMBlock *rb;
2471    char block_name[256];
2472    size_t cnt;
2473
2474    cnt = qemu_get_counted_string(file, block_name);
2475    if (!cnt) {
2476        error_report("%s: failed to read block name", __func__);
2477        return -EINVAL;
2478    }
2479
2480    /* Validate before using the data */
2481    if (qemu_file_get_error(file)) {
2482        return qemu_file_get_error(file);
2483    }
2484
2485    if (len != cnt + 1) {
2486        error_report("%s: invalid payload length (%d)", __func__, len);
2487        return -EINVAL;
2488    }
2489
2490    rb = qemu_ram_block_by_name(block_name);
2491    if (!rb) {
2492        error_report("%s: block '%s' not found", __func__, block_name);
2493        return -EINVAL;
2494    }
2495
2496    migrate_send_rp_recv_bitmap(mis, block_name);
2497
2498    trace_loadvm_handle_recv_bitmap(block_name);
2499
2500    return 0;
2501}
2502
2503static int loadvm_process_enable_colo(MigrationIncomingState *mis)
2504{
2505    int ret = migration_incoming_enable_colo();
2506
2507    if (!ret) {
2508        ret = colo_init_ram_cache();
2509        if (ret) {
2510            migration_incoming_disable_colo();
2511        }
2512    }
2513    return ret;
2514}
2515
2516static int loadvm_postcopy_handle_switchover_start(void)
2517{
2518    SaveStateEntry *se;
2519
2520    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2521        int ret;
2522
2523        if (!se->ops || !se->ops->switchover_start) {
2524            continue;
2525        }
2526
2527        ret = se->ops->switchover_start(se->opaque);
2528        if (ret < 0) {
2529            return ret;
2530        }
2531    }
2532
2533    return 0;
2534}
2535
2536/*
2537 * Process an incoming 'QEMU_VM_COMMAND'
2538 * 0           just a normal return
2539 * LOADVM_QUIT All good, but exit the loop
2540 * <0          Error
2541 */
2542static int loadvm_process_command(QEMUFile *f)
2543{
2544    MigrationIncomingState *mis = migration_incoming_get_current();
2545    uint16_t cmd;
2546    uint16_t len;
2547    uint32_t tmp32;
2548
2549    cmd = qemu_get_be16(f);
2550    len = qemu_get_be16(f);
2551
2552    /* Check validity before continue processing of cmds */
2553    if (qemu_file_get_error(f)) {
2554        return qemu_file_get_error(f);
2555    }
2556
2557    if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
2558        error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
2559        return -EINVAL;
2560    }
2561
2562    trace_loadvm_process_command(mig_cmd_args[cmd].name, len);
2563
2564    if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {
2565        error_report("%s received with bad length - expecting %zu, got %d",
2566                     mig_cmd_args[cmd].name,
2567                     (size_t)mig_cmd_args[cmd].len, len);
2568        return -ERANGE;
2569    }
2570
2571    switch (cmd) {
2572    case MIG_CMD_OPEN_RETURN_PATH:
2573        if (mis->to_src_file) {
2574            error_report("CMD_OPEN_RETURN_PATH called when RP already open");
2575            /* Not really a problem, so don't give up */
2576            return 0;
2577        }
2578        mis->to_src_file = qemu_file_get_return_path(f);
2579        if (!mis->to_src_file) {
2580            error_report("CMD_OPEN_RETURN_PATH failed");
2581            return -1;
2582        }
2583
2584        /*
2585         * Switchover ack is enabled but no device uses it, so send an ACK to
2586         * source that it's OK to switchover. Do it here, after return path has
2587         * been created.
2588         */
2589        if (migrate_switchover_ack() && !mis->switchover_ack_pending_num) {
2590            int ret = migrate_send_rp_switchover_ack(mis);
2591            if (ret) {
2592                error_report(
2593                    "Could not send switchover ack RP MSG, err %d (%s)", ret,
2594                    strerror(-ret));
2595                return ret;
2596            }
2597        }
2598        break;
2599
2600    case MIG_CMD_PING:
2601        tmp32 = qemu_get_be32(f);
2602        trace_loadvm_process_command_ping(tmp32);
2603        if (!mis->to_src_file) {
2604            error_report("CMD_PING (0x%x) received with no return path",
2605                         tmp32);
2606            return -1;
2607        }
2608        migrate_send_rp_pong(mis, tmp32);
2609        break;
2610
2611    case MIG_CMD_PACKAGED:
2612        return loadvm_handle_cmd_packaged(mis);
2613
2614    case MIG_CMD_POSTCOPY_ADVISE:
2615        return loadvm_postcopy_handle_advise(mis, len);
2616
2617    case MIG_CMD_POSTCOPY_LISTEN:
2618        return loadvm_postcopy_handle_listen(mis);
2619
2620    case MIG_CMD_POSTCOPY_RUN:
2621        return loadvm_postcopy_handle_run(mis);
2622
2623    case MIG_CMD_POSTCOPY_RAM_DISCARD:
2624        return loadvm_postcopy_ram_handle_discard(mis, len);
2625
2626    case MIG_CMD_POSTCOPY_RESUME:
2627        return loadvm_postcopy_handle_resume(mis);
2628
2629    case MIG_CMD_RECV_BITMAP:
2630        return loadvm_handle_recv_bitmap(mis, len);
2631
2632    case MIG_CMD_ENABLE_COLO:
2633        return loadvm_process_enable_colo(mis);
2634
2635    case MIG_CMD_SWITCHOVER_START:
2636        return loadvm_postcopy_handle_switchover_start();
2637    }
2638
2639    return 0;
2640}
2641
2642/*
2643 * Read a footer off the wire and check that it matches the expected section
2644 *
2645 * Returns: true if the footer was good
2646 *          false if there is a problem (and calls error_report to say why)
2647 */
2648static bool check_section_footer(QEMUFile *f, SaveStateEntry *se)
2649{
2650    int ret;
2651    uint8_t read_mark;
2652    uint32_t read_section_id;
2653
2654    if (!migrate_get_current()->send_section_footer) {
2655        /* No footer to check */
2656        return true;
2657    }
2658
2659    read_mark = qemu_get_byte(f);
2660
2661    ret = qemu_file_get_error(f);
2662    if (ret) {
2663        error_report("%s: Read section footer failed: %d",
2664                     __func__, ret);
2665        return false;
2666    }
2667
2668    if (read_mark != QEMU_VM_SECTION_FOOTER) {
2669        error_report("Missing section footer for %s", se->idstr);
2670        return false;
2671    }
2672
2673    read_section_id = qemu_get_be32(f);
2674    if (read_section_id != se->load_section_id) {
2675        error_report("Mismatched section id in footer for %s -"
2676                     " read 0x%x expected 0x%x",
2677                     se->idstr, read_section_id, se->load_section_id);
2678        return false;
2679    }
2680
2681    /* All good */
2682    return true;
2683}
2684
2685static int
2686qemu_loadvm_section_start_full(QEMUFile *f, uint8_t type)
2687{
2688    bool trace_downtime = (type == QEMU_VM_SECTION_FULL);
2689    uint32_t instance_id, version_id, section_id;
2690    int64_t start_ts, end_ts;
2691    SaveStateEntry *se;
2692    char idstr[256];
2693    int ret;
2694
2695    /* Read section start */
2696    section_id = qemu_get_be32(f);
2697    if (!qemu_get_counted_string(f, idstr)) {
2698        error_report("Unable to read ID string for section %u",
2699                     section_id);
2700        return -EINVAL;
2701    }
2702    instance_id = qemu_get_be32(f);
2703    version_id = qemu_get_be32(f);
2704
2705    ret = qemu_file_get_error(f);
2706    if (ret) {
2707        error_report("%s: Failed to read instance/version ID: %d",
2708                     __func__, ret);
2709        return ret;
2710    }
2711
2712    trace_qemu_loadvm_state_section_startfull(section_id, idstr,
2713            instance_id, version_id);
2714    /* Find savevm section */
2715    se = find_se(idstr, instance_id);
2716    if (se == NULL) {
2717        error_report("Unknown savevm section or instance '%s' %"PRIu32". "
2718                     "Make sure that your current VM setup matches your "
2719                     "saved VM setup, including any hotplugged devices",
2720                     idstr, instance_id);
2721        return -EINVAL;
2722    }
2723
2724    /* Validate version */
2725    if (version_id > se->version_id) {
2726        error_report("savevm: unsupported version %d for '%s' v%d",
2727                     version_id, idstr, se->version_id);
2728        return -EINVAL;
2729    }
2730    se->load_version_id = version_id;
2731    se->load_section_id = section_id;
2732
2733    /* Validate if it is a device's state */
2734    if (xen_enabled() && se->is_ram) {
2735        error_report("loadvm: %s RAM loading not allowed on Xen", idstr);
2736        return -EINVAL;
2737    }
2738
2739    if (trace_downtime) {
2740        start_ts = qemu_clock_get_us(QEMU_CLOCK_REALTIME);
2741    }
2742
2743    ret = vmstate_load(f, se);
2744    if (ret < 0) {
2745        error_report("error while loading state for instance 0x%"PRIx32" of"
2746                     " device '%s'", instance_id, idstr);
2747        return ret;
2748    }
2749
2750    if (trace_downtime) {
2751        end_ts = qemu_clock_get_us(QEMU_CLOCK_REALTIME);
2752        trace_vmstate_downtime_load("non-iterable", se->idstr,
2753                                    se->instance_id, end_ts - start_ts);
2754    }
2755
2756    if (!check_section_footer(f, se)) {
2757        return -EINVAL;
2758    }
2759
2760    return 0;
2761}
2762
2763static int
2764qemu_loadvm_section_part_end(QEMUFile *f, uint8_t type)
2765{
2766    bool trace_downtime = (type == QEMU_VM_SECTION_END);
2767    int64_t start_ts, end_ts;
2768    uint32_t section_id;
2769    SaveStateEntry *se;
2770    int ret;
2771
2772    section_id = qemu_get_be32(f);
2773
2774    ret = qemu_file_get_error(f);
2775    if (ret) {
2776        error_report("%s: Failed to read section ID: %d",
2777                     __func__, ret);
2778        return ret;
2779    }
2780
2781    trace_qemu_loadvm_state_section_partend(section_id);
2782    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2783        if (se->load_section_id == section_id) {
2784            break;
2785        }
2786    }
2787    if (se == NULL) {
2788        error_report("Unknown savevm section %d", section_id);
2789        return -EINVAL;
2790    }
2791
2792    if (trace_downtime) {
2793        start_ts = qemu_clock_get_us(QEMU_CLOCK_REALTIME);
2794    }
2795
2796    ret = vmstate_load(f, se);
2797    if (ret < 0) {
2798        error_report("error while loading state section id %d(%s)",
2799                     section_id, se->idstr);
2800        return ret;
2801    }
2802
2803    if (trace_downtime) {
2804        end_ts = qemu_clock_get_us(QEMU_CLOCK_REALTIME);
2805        trace_vmstate_downtime_load("iterable", se->idstr,
2806                                    se->instance_id, end_ts - start_ts);
2807    }
2808
2809    if (!check_section_footer(f, se)) {
2810        return -EINVAL;
2811    }
2812
2813    return 0;
2814}
2815
2816static int qemu_loadvm_state_header(QEMUFile *f)
2817{
2818    unsigned int v;
2819    int ret;
2820
2821    v = qemu_get_be32(f);
2822    if (v != QEMU_VM_FILE_MAGIC) {
2823        error_report("Not a migration stream");
2824        return -EINVAL;
2825    }
2826
2827    v = qemu_get_be32(f);
2828    if (v == QEMU_VM_FILE_VERSION_COMPAT) {
2829        error_report("SaveVM v2 format is obsolete and don't work anymore");
2830        return -ENOTSUP;
2831    }
2832    if (v != QEMU_VM_FILE_VERSION) {
2833        error_report("Unsupported migration stream version");
2834        return -ENOTSUP;
2835    }
2836
2837    if (migrate_get_current()->send_configuration) {
2838        if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
2839            error_report("Configuration section missing");
2840            return -EINVAL;
2841        }
2842        ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
2843
2844        if (ret) {
2845            return ret;
2846        }
2847    }
2848    return 0;
2849}
2850
2851static void qemu_loadvm_state_switchover_ack_needed(MigrationIncomingState *mis)
2852{
2853    SaveStateEntry *se;
2854
2855    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2856        if (!se->ops || !se->ops->switchover_ack_needed) {
2857            continue;
2858        }
2859
2860        if (se->ops->switchover_ack_needed(se->opaque)) {
2861            mis->switchover_ack_pending_num++;
2862        }
2863    }
2864
2865    trace_loadvm_state_switchover_ack_needed(mis->switchover_ack_pending_num);
2866}
2867
2868static int qemu_loadvm_state_setup(QEMUFile *f, Error **errp)
2869{
2870    ERRP_GUARD();
2871    SaveStateEntry *se;
2872    int ret;
2873
2874    trace_loadvm_state_setup();
2875    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2876        if (!se->ops || !se->ops->load_setup) {
2877            continue;
2878        }
2879        if (se->ops->is_active) {
2880            if (!se->ops->is_active(se->opaque)) {
2881                continue;
2882            }
2883        }
2884
2885        ret = se->ops->load_setup(f, se->opaque, errp);
2886        if (ret < 0) {
2887            error_prepend(errp, "Load state of device %s failed: ",
2888                          se->idstr);
2889            qemu_file_set_error(f, ret);
2890            return ret;
2891        }
2892    }
2893    return 0;
2894}
2895
/*
 * Argument bundle for one load thread: allocated by
 * qemu_loadvm_start_load_thread() and consumed by qemu_loadvm_load_thread().
 */
struct LoadThreadData {
    /* Callback invoked by the load thread */
    MigrationLoadThread function;
    /* Caller-supplied pointer passed as the first argument to @function */
    void *opaque;
};
2900
2901static int qemu_loadvm_load_thread(void *thread_opaque)
2902{
2903    struct LoadThreadData *data = thread_opaque;
2904    MigrationIncomingState *mis = migration_incoming_get_current();
2905    g_autoptr(Error) local_err = NULL;
2906
2907    if (!data->function(data->opaque, &mis->load_threads_abort, &local_err)) {
2908        MigrationState *s = migrate_get_current();
2909
2910        /*
2911         * Can't set load_threads_abort here since processing of main migration
2912         * channel data could still be happening, resulting in launching of new
2913         * load threads.
2914         */
2915
2916        assert(local_err);
2917
2918        /*
2919         * In case of multiple load threads failing which thread error
2920         * return we end setting is purely arbitrary.
2921         */
2922        migrate_set_error(s, local_err);
2923    }
2924
2925    return 0;
2926}
2927
2928void qemu_loadvm_start_load_thread(MigrationLoadThread function,
2929                                   void *opaque)
2930{
2931    MigrationIncomingState *mis = migration_incoming_get_current();
2932    struct LoadThreadData *data;
2933
2934    /* We only set it from this thread so it's okay to read it directly */
2935    assert(!mis->load_threads_abort);
2936
2937    data = g_new(struct LoadThreadData, 1);
2938    data->function = function;
2939    data->opaque = opaque;
2940
2941    thread_pool_submit_immediate(mis->load_threads, qemu_loadvm_load_thread,
2942                                 data, g_free);
2943}
2944
2945void qemu_loadvm_state_cleanup(MigrationIncomingState *mis)
2946{
2947    SaveStateEntry *se;
2948
2949    trace_loadvm_state_cleanup();
2950
2951    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2952        if (se->ops && se->ops->load_cleanup) {
2953            se->ops->load_cleanup(se->opaque);
2954        }
2955    }
2956
2957    qemu_loadvm_thread_pool_destroy(mis);
2958}
2959
/*
 * Pause an incoming postcopy migration after an I/O failure and block until
 * it leaves the paused state.
 *
 * Shuts down and closes mis->from_src_file, mis->to_src_file and, if set,
 * mis->postcopy_qemufile_dst; moves @mis to MIGRATION_STATUS_POSTCOPY_PAUSED;
 * notifies the fault thread; resets the temporary pages; then waits on
 * postcopy_pause_sem_dst until postcopy_is_paused() is false.
 *
 * Return true if we should continue the migration, or false.  As written,
 * every path through this function returns true.
 */
static bool postcopy_pause_incoming(MigrationIncomingState *mis)
{
    int i;

    trace_postcopy_pause_incoming();

    assert(migrate_postcopy_ram());

    /*
     * Unregister yank with either from/to src would work, since ioc behind it
     * is the same
     */
    migration_ioc_unregister_yank_from_file(mis->from_src_file);

    assert(mis->from_src_file);
    qemu_file_shutdown(mis->from_src_file);
    qemu_fclose(mis->from_src_file);
    mis->from_src_file = NULL;

    assert(mis->to_src_file);
    qemu_file_shutdown(mis->to_src_file);
    /* Close and clear to_src_file while holding rp_mutex */
    qemu_mutex_lock(&mis->rp_mutex);
    qemu_fclose(mis->to_src_file);
    mis->to_src_file = NULL;
    qemu_mutex_unlock(&mis->rp_mutex);

    /*
     * NOTE: this must happen before resetting the PostcopyTmpPages below,
     * otherwise it's racy to reset those fields when the fast load thread
     * can be accessing it in parallel.
     */
    if (mis->postcopy_qemufile_dst) {
        qemu_file_shutdown(mis->postcopy_qemufile_dst);
        /* Take the mutex to make sure the fast ram load thread halted */
        qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
        migration_ioc_unregister_yank_from_file(mis->postcopy_qemufile_dst);
        qemu_fclose(mis->postcopy_qemufile_dst);
        mis->postcopy_qemufile_dst = NULL;
        qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);
    }

    /* Current state can be either ACTIVE or RECOVER */
    migrate_set_state(&mis->state, mis->state,
                      MIGRATION_STATUS_POSTCOPY_PAUSED);

    /* Notify the fault thread for the invalidated file handle */
    postcopy_fault_thread_notify(mis);

    /*
     * If network is interrupted, any temp page we received will be useless
     * because we didn't mark them as "received" in receivedmap.  After a
     * proper recovery later (which will sync src dirty bitmap with receivedmap
     * on dest) these cached small pages will be resent again.
     */
    for (i = 0; i < mis->postcopy_channels; i++) {
        postcopy_temp_page_reset(&mis->postcopy_tmp_pages[i]);
    }

    error_report("Detected IO failure for postcopy. "
                 "Migration paused.");

    /* Re-check the state after every wakeup; stay here while still paused */
    do {
        qemu_sem_wait(&mis->postcopy_pause_sem_dst);
    } while (postcopy_is_paused(mis->state));

    trace_postcopy_pause_incoming_continued();

    return true;
}
3030
/*
 * Main loop of the incoming stream: read one section-type byte at a time
 * from @f and dispatch on it until QEMU_VM_EOF, a command returning
 * LOADVM_QUIT, or an error.
 *
 * On a negative result the error is recorded on @f and incoming dirty
 * bitmap migration is cancelled.  If postcopy is in the
 * POSTCOPY_INCOMING_RUNNING state with RAM postcopy enabled, the function
 * then pauses in postcopy_pause_incoming() and, once that returns true,
 * restarts the loop reading from mis->from_src_file.
 *
 * Returns a negative value on failure; otherwise the non-negative value
 * left by the most recently processed section or command.
 */
int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
{
    uint8_t section_type;
    int ret = 0;

retry:
    while (true) {
        section_type = qemu_get_byte(f);

        /* Stop on an error on either @f or the postcopy_qemufile_dst file */
        ret = qemu_file_get_error_obj_any(f, mis->postcopy_qemufile_dst, NULL);
        if (ret) {
            break;
        }

        trace_qemu_loadvm_state_section(section_type);
        switch (section_type) {
        case QEMU_VM_SECTION_START:
        case QEMU_VM_SECTION_FULL:
            ret = qemu_loadvm_section_start_full(f, section_type);
            if (ret < 0) {
                goto out;
            }
            break;
        case QEMU_VM_SECTION_PART:
        case QEMU_VM_SECTION_END:
            ret = qemu_loadvm_section_part_end(f, section_type);
            if (ret < 0) {
                goto out;
            }
            break;
        case QEMU_VM_COMMAND:
            ret = loadvm_process_command(f);
            trace_qemu_loadvm_state_section_command(ret);
            /* LOADVM_QUIT ends the loop without being treated as an error */
            if ((ret < 0) || (ret == LOADVM_QUIT)) {
                goto out;
            }
            break;
        case QEMU_VM_EOF:
            /* This is the end of migration */
            goto out;
        default:
            error_report("Unknown savevm section type %d", section_type);
            ret = -EINVAL;
            goto out;
        }
    }

out:
    if (ret < 0) {
        qemu_file_set_error(f, ret);

        /* Cancel bitmaps incoming regardless of recovery */
        dirty_bitmap_mig_cancel_incoming();

        /*
         * If we are during an active postcopy, then we pause instead
         * of bail out to at least keep the VM's dirty data.  Note
         * that POSTCOPY_INCOMING_LISTENING stage is still not enough,
         * during which we're still receiving device states and we
         * still haven't yet started the VM on destination.
         *
         * Only RAM postcopy supports recovery. Still, if RAM postcopy is
         * enabled, canceled bitmaps postcopy will not affect RAM postcopy
         * recovering.
         */
        if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
            migrate_postcopy_ram() && postcopy_pause_incoming(mis)) {
            /* Reset f to point to the newly created channel */
            f = mis->from_src_file;
            goto retry;
        }
    }
    return ret;
}
3105
/*
 * Load a complete migration stream from @f.
 *
 * Steps, in order: refuse if saving/loading is blocked; create the load
 * thread pool; validate the stream header; run the handlers' load_setup
 * hooks; count pending switchover acks when that capability is enabled;
 * run the main section loop; then, unless a postcopy listen thread is
 * still running, wait for the load threads, optionally consume the
 * trailing VMDESC section and synchronize the CPUs.
 *
 * Returns 0 on success or a negative value on failure.  When a listen
 * thread exists the result of the main loop is returned as is.
 */
int qemu_loadvm_state(QEMUFile *f)
{
    MigrationState *s = migrate_get_current();
    MigrationIncomingState *mis = migration_incoming_get_current();
    Error *local_err = NULL;
    int ret;

    if (qemu_savevm_state_blocked(&local_err)) {
        error_report_err(local_err);
        return -EINVAL;
    }

    qemu_loadvm_thread_pool_create(mis);

    ret = qemu_loadvm_state_header(f);
    if (ret) {
        return ret;
    }

    if (qemu_loadvm_state_setup(f, &local_err) != 0) {
        error_report_err(local_err);
        return -EINVAL;
    }

    if (migrate_switchover_ack()) {
        qemu_loadvm_state_switchover_ack_needed(mis);
    }

    cpu_synchronize_all_pre_loadvm();

    ret = qemu_loadvm_state_main(f, mis);
    /* Signal that the main loop has finished, whatever its result */
    qemu_event_set(&mis->main_thread_load_event);

    trace_qemu_loadvm_state_post_main(ret);

    if (mis->have_listen_thread) {
        /*
         * Postcopy listen thread still going, don't synchronize the
         * cpus yet.
         */
        return ret;
    }

    /* When reaching here, it must be precopy */
    if (ret == 0) {
        /* Fail if an error was already recorded or a load thread failed */
        if (migrate_has_error(migrate_get_current()) ||
            !qemu_loadvm_thread_pool_wait(s, mis)) {
            ret = -EINVAL;
        } else {
            ret = qemu_file_get_error(f);
        }
    }
    /*
     * Set this flag unconditionally so we'll catch further attempts to
     * start additional threads via an appropriate assert()
     */
    qatomic_set(&mis->load_threads_abort, true);

    /*
     * Try to read in the VMDESC section as well, so that dumping tools that
     * intercept our migration stream have the chance to see it.
     */

    /*
     * We've got to be careful; if we don't read the data and just shut the fd
     * then the sender can error if we close while it's still sending.
     * We also mustn't read data that isn't there; some transports (RDMA)
     * will stall waiting for that data when the source has already closed.
     */
    if (ret == 0 && should_send_vmdesc()) {
        uint8_t *buf;
        uint32_t size;
        uint8_t  section_type = qemu_get_byte(f);

        if (section_type != QEMU_VM_VMDESCRIPTION) {
            error_report("Expected vmdescription section, but got %d",
                         section_type);
            /*
             * It doesn't seem worth failing at this point since
             * we apparently have an otherwise valid VM state
             */
        } else {
            /* Read and discard the description in chunks of at most 4 KiB */
            buf = g_malloc(0x1000);
            size = qemu_get_be32(f);

            while (size > 0) {
                uint32_t read_chunk = MIN(size, 0x1000);
                qemu_get_buffer(f, buf, read_chunk);
                size -= read_chunk;
            }
            g_free(buf);
        }
    }

    cpu_synchronize_all_post_init();

    return ret;
}
3203
3204int qemu_load_device_state(QEMUFile *f)
3205{
3206    MigrationIncomingState *mis = migration_incoming_get_current();
3207    int ret;
3208
3209    /* Load QEMU_VM_SECTION_FULL section */
3210    ret = qemu_loadvm_state_main(f, mis);
3211    if (ret < 0) {
3212        error_report("Failed to load device state: %d", ret);
3213        return ret;
3214    }
3215
3216    cpu_synchronize_all_post_init();
3217    return 0;
3218}
3219
3220int qemu_loadvm_approve_switchover(void)
3221{
3222    MigrationIncomingState *mis = migration_incoming_get_current();
3223
3224    if (!mis->switchover_ack_pending_num) {
3225        return -EINVAL;
3226    }
3227
3228    mis->switchover_ack_pending_num--;
3229    trace_loadvm_approve_switchover(mis->switchover_ack_pending_num);
3230
3231    if (mis->switchover_ack_pending_num) {
3232        return 0;
3233    }
3234
3235    return migrate_send_rp_switchover_ack(mis);
3236}
3237
3238bool qemu_loadvm_load_state_buffer(const char *idstr, uint32_t instance_id,
3239                                   char *buf, size_t len, Error **errp)
3240{
3241    SaveStateEntry *se;
3242
3243    se = find_se(idstr, instance_id);
3244    if (!se) {
3245        error_setg(errp,
3246                   "Unknown idstr %s or instance id %u for load state buffer",
3247                   idstr, instance_id);
3248        return false;
3249    }
3250
3251    if (!se->ops || !se->ops->load_state_buffer) {
3252        error_setg(errp,
3253                   "idstr %s / instance %u has no load state buffer operation",
3254                   idstr, instance_id);
3255        return false;
3256    }
3257
3258    return se->ops->load_state_buffer(se->opaque, buf, len, errp);
3259}
3260
3261bool save_snapshot(const char *name, bool overwrite, const char *vmstate,
3262                  bool has_devices, strList *devices, Error **errp)
3263{
3264    BlockDriverState *bs;
3265    QEMUSnapshotInfo sn1, *sn = &sn1;
3266    int ret = -1, ret2;
3267    QEMUFile *f;
3268    RunState saved_state = runstate_get();
3269    uint64_t vm_state_size;
3270    g_autoptr(GDateTime) now = g_date_time_new_now_local();
3271
3272    GLOBAL_STATE_CODE();
3273
3274    if (migration_is_blocked(errp)) {
3275        return false;
3276    }
3277
3278    if (!replay_can_snapshot()) {
3279        error_setg(errp, "Record/replay does not allow making snapshot "
3280                   "right now. Try once more later.");
3281        return false;
3282    }
3283
3284    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
3285        return false;
3286    }
3287
3288    /* Delete old snapshots of the same name */
3289    if (name) {
3290        if (overwrite) {
3291            if (bdrv_all_delete_snapshot(name, has_devices,
3292                                         devices, errp) < 0) {
3293                return false;
3294            }
3295        } else {
3296            ret2 = bdrv_all_has_snapshot(name, has_devices, devices, errp);
3297            if (ret2 < 0) {
3298                return false;
3299            }
3300            if (ret2 == 1) {
3301                error_setg(errp,
3302                           "Snapshot '%s' already exists in one or more devices",
3303                           name);
3304                return false;
3305            }
3306        }
3307    }
3308
3309    bs = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
3310    if (bs == NULL) {
3311        return false;
3312    }
3313
3314    global_state_store();
3315    vm_stop(RUN_STATE_SAVE_VM);
3316
3317    bdrv_drain_all_begin();
3318
3319    memset(sn, 0, sizeof(*sn));
3320
3321    /* fill auxiliary fields */
3322    sn->date_sec = g_date_time_to_unix(now);
3323    sn->date_nsec = g_date_time_get_microsecond(now) * 1000;
3324    sn->vm_clock_nsec = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
3325    if (replay_mode != REPLAY_MODE_NONE) {
3326        sn->icount = replay_get_current_icount();
3327    } else {
3328        sn->icount = -1ULL;
3329    }
3330
3331    if (name) {
3332        pstrcpy(sn->name, sizeof(sn->name), name);
3333    } else {
3334        g_autofree char *autoname = g_date_time_format(now,  "vm-%Y%m%d%H%M%S");
3335        pstrcpy(sn->name, sizeof(sn->name), autoname);
3336    }
3337
3338    /* save the VM state */
3339    f = qemu_fopen_bdrv(bs, 1);
3340    if (!f) {
3341        error_setg(errp, "Could not open VM state file");
3342        goto the_end;
3343    }
3344    ret = qemu_savevm_state(f, errp);
3345    vm_state_size = qemu_file_transferred(f);
3346    ret2 = qemu_fclose(f);
3347    if (ret < 0) {
3348        goto the_end;
3349    }
3350    if (ret2 < 0) {
3351        ret = ret2;
3352        goto the_end;
3353    }
3354
3355    ret = bdrv_all_create_snapshot(sn, bs, vm_state_size,
3356                                   has_devices, devices, errp);
3357    if (ret < 0) {
3358        bdrv_all_delete_snapshot(sn->name, has_devices, devices, NULL);
3359        goto the_end;
3360    }
3361
3362    ret = 0;
3363
3364 the_end:
3365    bdrv_drain_all_end();
3366
3367    vm_resume(saved_state);
3368    return ret == 0;
3369}
3370
3371void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live,
3372                                Error **errp)
3373{
3374    QEMUFile *f;
3375    QIOChannelFile *ioc;
3376    int saved_vm_running;
3377    int ret;
3378
3379    if (!has_live) {
3380        /* live default to true so old version of Xen tool stack can have a
3381         * successful live migration */
3382        live = true;
3383    }
3384
3385    saved_vm_running = runstate_is_running();
3386    vm_stop(RUN_STATE_SAVE_VM);
3387    global_state_store_running();
3388
3389    ioc = qio_channel_file_new_path(filename, O_WRONLY | O_CREAT | O_TRUNC,
3390                                    0660, errp);
3391    if (!ioc) {
3392        goto the_end;
3393    }
3394    qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-save-state");
3395    f = qemu_file_new_output(QIO_CHANNEL(ioc));
3396    object_unref(OBJECT(ioc));
3397    ret = qemu_save_device_state(f);
3398    if (ret < 0 || qemu_fclose(f) < 0) {
3399        error_setg(errp, "saving Xen device state failed");
3400    } else {
3401        /* libxl calls the QMP command "stop" before calling
3402         * "xen-save-devices-state" and in case of migration failure, libxl
3403         * would call "cont".
3404         * So call bdrv_inactivate_all (release locks) here to let the other
3405         * side of the migration take control of the images.
3406         */
3407        if (live && !saved_vm_running) {
3408            migration_block_inactivate();
3409        }
3410    }
3411
3412 the_end:
3413    if (saved_vm_running) {
3414        vm_start();
3415    }
3416}
3417
3418void qmp_xen_load_devices_state(const char *filename, Error **errp)
3419{
3420    QEMUFile *f;
3421    QIOChannelFile *ioc;
3422    int ret;
3423
3424    /* Guest must be paused before loading the device state; the RAM state
3425     * will already have been loaded by xc
3426     */
3427    if (runstate_is_running()) {
3428        error_setg(errp, "Cannot update device state while vm is running");
3429        return;
3430    }
3431    vm_stop(RUN_STATE_RESTORE_VM);
3432
3433    ioc = qio_channel_file_new_path(filename, O_RDONLY | O_BINARY, 0, errp);
3434    if (!ioc) {
3435        return;
3436    }
3437    qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-load-state");
3438    f = qemu_file_new_input(QIO_CHANNEL(ioc));
3439    object_unref(OBJECT(ioc));
3440
3441    ret = qemu_loadvm_state(f);
3442    qemu_fclose(f);
3443    if (ret < 0) {
3444        error_setg(errp, "loading Xen device state failed");
3445    }
3446    migration_incoming_state_destroy();
3447}
3448
/*
 * Revert the block devices and the VM state to the snapshot @name.
 *
 * @name: snapshot to load; it must exist on the selected devices
 * @vmstate, @has_devices, @devices: forwarded to the bdrv_all_*() helpers
 *             that pick the participating block devices and the one that
 *             stores the VM state
 * @errp: set on failure
 *
 * Block I/O is drained while the snapshot is applied.  The system is reset
 * before the VM state is loaded with qemu_loadvm_state().
 *
 * Returns true on success, false with @errp set on failure.
 */
bool load_snapshot(const char *name, const char *vmstate,
                   bool has_devices, strList *devices, Error **errp)
{
    BlockDriverState *bs_vm_state;
    QEMUSnapshotInfo sn;
    QEMUFile *f;
    int ret;
    MigrationIncomingState *mis = migration_incoming_get_current();

    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
        return false;
    }
    ret = bdrv_all_has_snapshot(name, has_devices, devices, errp);
    if (ret < 0) {
        return false;
    }
    if (ret == 0) {
        error_setg(errp, "Snapshot '%s' does not exist in one or more devices",
                   name);
        return false;
    }

    bs_vm_state = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
    if (!bs_vm_state) {
        return false;
    }

    /* Don't even try to load empty VM states */
    ret = bdrv_snapshot_find(bs_vm_state, &sn, name);
    if (ret < 0) {
        error_setg(errp, "Snapshot can not be found");
        return false;
    } else if (sn.vm_state_size == 0) {
        error_setg(errp, "This is a disk-only snapshot. Revert to it "
                   " offline using qemu-img");
        return false;
    }

    /*
     * Flush the record/replay queue. Now the VM state is going
     * to change. Therefore we don't need to preserve its consistency
     */
    replay_flush_events();

    /* Flush all IO requests so they don't interfere with the new state.  */
    bdrv_drain_all_begin();

    ret = bdrv_all_goto_snapshot(name, has_devices, devices, errp);
    if (ret < 0) {
        goto err_drain;
    }

    /* restore the VM state */
    f = qemu_fopen_bdrv(bs_vm_state, 0);
    if (!f) {
        error_setg(errp, "Could not open VM state file");
        goto err_drain;
    }

    qemu_system_reset(SHUTDOWN_CAUSE_SNAPSHOT_LOAD);
    mis->from_src_file = f;

    /*
     * NOTE(review): on this failure path @f is neither closed here nor is
     * migration_incoming_state_destroy() called, so mis->from_src_file
     * appears to stay set to the open file -- confirm whether a later
     * cleanup releases it.
     */
    if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
        ret = -EINVAL;
        goto err_drain;
    }
    ret = qemu_loadvm_state(f);
    migration_incoming_state_destroy();

    bdrv_drain_all_end();

    if (ret < 0) {
        error_setg(errp, "Error %d while loading VM state", ret);
        return false;
    }

    return true;

err_drain:
    bdrv_drain_all_end();
    return false;
}
3531
3532void load_snapshot_resume(RunState state)
3533{
3534    vm_resume(state);
3535    if (state == RUN_STATE_RUNNING && runstate_get() == RUN_STATE_SUSPENDED) {
3536        qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, &error_abort);
3537    }
3538}
3539
3540bool delete_snapshot(const char *name, bool has_devices,
3541                     strList *devices, Error **errp)
3542{
3543    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
3544        return false;
3545    }
3546
3547    if (bdrv_all_delete_snapshot(name, has_devices, devices, errp) < 0) {
3548        return false;
3549    }
3550
3551    return true;
3552}
3553
/*
 * Register the RAM block backing @mr for migration: give it an id string
 * built from the memory region name and @dev, mark it migratable and add a
 * CPR blocker for it (a failure there is fatal, via &error_fatal).
 */
void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
{
    qemu_ram_set_idstr(mr->ram_block,
                       memory_region_name(mr), dev);
    qemu_ram_set_migratable(mr->ram_block);
    ram_block_add_cpr_blocker(mr->ram_block, &error_fatal);
}
3561
/*
 * Undo vmstate_register_ram() for the RAM block backing @mr: clear its id
 * string, clear its migratable flag and remove its CPR blocker.  @dev is
 * not used.
 */
void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev)
{
    qemu_ram_unset_idstr(mr->ram_block);
    qemu_ram_unset_migratable(mr->ram_block);
    ram_block_del_cpr_blocker(mr->ram_block);
}
3568
/* Register the RAM of @mr for migration without an owning device. */
void vmstate_register_ram_global(MemoryRegion *mr)
{
    vmstate_register_ram(mr, NULL);
}
3573
3574bool vmstate_check_only_migratable(const VMStateDescription *vmsd)
3575{
3576    /* check needed if --only-migratable is specified */
3577    if (!only_migratable) {
3578        return true;
3579    }
3580
3581    return !(vmsd && vmsd->unmigratable);
3582}
3583
/* State of one snapshot save/load/delete job started through QMP. */
typedef struct SnapshotJob {
    /* Embedded generic job; container_of() recovers the SnapshotJob */
    Job common;
    /* Snapshot name passed to save/load/delete_snapshot() */
    char *tag;
    /* vmstate argument passed to save/load_snapshot(); unused for delete */
    char *vmstate;
    /* Device list passed to the snapshot helpers */
    strList *devices;
    /* Job coroutine, woken by the bottom half once the work is done */
    Coroutine *co;
    /* Error destination given to the job's run callback */
    Error **errp;
    /* Result of the snapshot operation: true on success */
    bool ret;
} SnapshotJob;
3593
3594static void qmp_snapshot_job_free(SnapshotJob *s)
3595{
3596    g_free(s->tag);
3597    g_free(s->vmstate);
3598    qapi_free_strList(s->devices);
3599}
3600
3601
3602static void snapshot_load_job_bh(void *opaque)
3603{
3604    Job *job = opaque;
3605    SnapshotJob *s = container_of(job, SnapshotJob, common);
3606    RunState orig_state = runstate_get();
3607
3608    job_progress_set_remaining(&s->common, 1);
3609
3610    vm_stop(RUN_STATE_RESTORE_VM);
3611
3612    s->ret = load_snapshot(s->tag, s->vmstate, true, s->devices, s->errp);
3613    if (s->ret) {
3614        load_snapshot_resume(orig_state);
3615    }
3616
3617    job_progress_update(&s->common, 1);
3618
3619    qmp_snapshot_job_free(s);
3620    aio_co_wake(s->co);
3621}
3622
3623static void snapshot_save_job_bh(void *opaque)
3624{
3625    Job *job = opaque;
3626    SnapshotJob *s = container_of(job, SnapshotJob, common);
3627
3628    job_progress_set_remaining(&s->common, 1);
3629    s->ret = save_snapshot(s->tag, false, s->vmstate,
3630                           true, s->devices, s->errp);
3631    job_progress_update(&s->common, 1);
3632
3633    qmp_snapshot_job_free(s);
3634    aio_co_wake(s->co);
3635}
3636
3637static void snapshot_delete_job_bh(void *opaque)
3638{
3639    Job *job = opaque;
3640    SnapshotJob *s = container_of(job, SnapshotJob, common);
3641
3642    job_progress_set_remaining(&s->common, 1);
3643    s->ret = delete_snapshot(s->tag, true, s->devices, s->errp);
3644    job_progress_update(&s->common, 1);
3645
3646    qmp_snapshot_job_free(s);
3647    aio_co_wake(s->co);
3648}
3649
3650static int coroutine_fn snapshot_save_job_run(Job *job, Error **errp)
3651{
3652    SnapshotJob *s = container_of(job, SnapshotJob, common);
3653    s->errp = errp;
3654    s->co = qemu_coroutine_self();
3655    aio_bh_schedule_oneshot(qemu_get_aio_context(),
3656                            snapshot_save_job_bh, job);
3657    qemu_coroutine_yield();
3658    return s->ret ? 0 : -1;
3659}
3660
3661static int coroutine_fn snapshot_load_job_run(Job *job, Error **errp)
3662{
3663    SnapshotJob *s = container_of(job, SnapshotJob, common);
3664    s->errp = errp;
3665    s->co = qemu_coroutine_self();
3666    aio_bh_schedule_oneshot(qemu_get_aio_context(),
3667                            snapshot_load_job_bh, job);
3668    qemu_coroutine_yield();
3669    return s->ret ? 0 : -1;
3670}
3671
3672static int coroutine_fn snapshot_delete_job_run(Job *job, Error **errp)
3673{
3674    SnapshotJob *s = container_of(job, SnapshotJob, common);
3675    s->errp = errp;
3676    s->co = qemu_coroutine_self();
3677    aio_bh_schedule_oneshot(qemu_get_aio_context(),
3678                            snapshot_delete_job_bh, job);
3679    qemu_coroutine_yield();
3680    return s->ret ? 0 : -1;
3681}
3682
3683
/* Job driver used by qmp_snapshot_load(); runs snapshot_load_job_run(). */
static const JobDriver snapshot_load_job_driver = {
    .instance_size = sizeof(SnapshotJob),
    .job_type      = JOB_TYPE_SNAPSHOT_LOAD,
    .run           = snapshot_load_job_run,
};
3689
/* Job driver used by qmp_snapshot_save(); runs snapshot_save_job_run(). */
static const JobDriver snapshot_save_job_driver = {
    .instance_size = sizeof(SnapshotJob),
    .job_type      = JOB_TYPE_SNAPSHOT_SAVE,
    .run           = snapshot_save_job_run,
};
3695
/* Job driver used by qmp_snapshot_delete(); runs snapshot_delete_job_run(). */
static const JobDriver snapshot_delete_job_driver = {
    .instance_size = sizeof(SnapshotJob),
    .job_type      = JOB_TYPE_SNAPSHOT_DELETE,
    .run           = snapshot_delete_job_run,
};
3701
3702
3703void qmp_snapshot_save(const char *job_id,
3704                       const char *tag,
3705                       const char *vmstate,
3706                       strList *devices,
3707                       Error **errp)
3708{
3709    SnapshotJob *s;
3710
3711    s = job_create(job_id, &snapshot_save_job_driver, NULL,
3712                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3713                   NULL, NULL, errp);
3714    if (!s) {
3715        return;
3716    }
3717
3718    s->tag = g_strdup(tag);
3719    s->vmstate = g_strdup(vmstate);
3720    s->devices = QAPI_CLONE(strList, devices);
3721
3722    job_start(&s->common);
3723}
3724
3725void qmp_snapshot_load(const char *job_id,
3726                       const char *tag,
3727                       const char *vmstate,
3728                       strList *devices,
3729                       Error **errp)
3730{
3731    SnapshotJob *s;
3732
3733    s = job_create(job_id, &snapshot_load_job_driver, NULL,
3734                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3735                   NULL, NULL, errp);
3736    if (!s) {
3737        return;
3738    }
3739
3740    s->tag = g_strdup(tag);
3741    s->vmstate = g_strdup(vmstate);
3742    s->devices = QAPI_CLONE(strList, devices);
3743
3744    job_start(&s->common);
3745}
3746
3747void qmp_snapshot_delete(const char *job_id,
3748                         const char *tag,
3749                         strList *devices,
3750                         Error **errp)
3751{
3752    SnapshotJob *s;
3753
3754    s = job_create(job_id, &snapshot_delete_job_driver, NULL,
3755                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3756                   NULL, NULL, errp);
3757    if (!s) {
3758        return;
3759    }
3760
3761    s->tag = g_strdup(tag);
3762    s->devices = QAPI_CLONE(strList, devices);
3763
3764    job_start(&s->common);
3765}
3766