qemu/migration/savevm.c
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2009-2015 Red Hat Inc
   6 *
   7 * Authors:
   8 *  Juan Quintela <quintela@redhat.com>
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28
  29#include "qemu/osdep.h"
  30#include "hw/boards.h"
  31#include "net/net.h"
  32#include "migration.h"
  33#include "migration/snapshot.h"
  34#include "migration/vmstate.h"
  35#include "migration/misc.h"
  36#include "migration/register.h"
  37#include "migration/global_state.h"
  38#include "ram.h"
  39#include "qemu-file-channel.h"
  40#include "qemu-file.h"
  41#include "savevm.h"
  42#include "postcopy-ram.h"
  43#include "qapi/error.h"
  44#include "qapi/qapi-commands-migration.h"
  45#include "qapi/qmp/json-writer.h"
  46#include "qapi/clone-visitor.h"
  47#include "qapi/qapi-builtin-visit.h"
  48#include "qapi/qmp/qerror.h"
  49#include "qemu/error-report.h"
  50#include "sysemu/cpus.h"
  51#include "exec/memory.h"
  52#include "exec/target_page.h"
  53#include "trace.h"
  54#include "qemu/iov.h"
  55#include "qemu/main-loop.h"
  56#include "block/snapshot.h"
  57#include "qemu/cutils.h"
  58#include "io/channel-buffer.h"
  59#include "io/channel-file.h"
  60#include "sysemu/replay.h"
  61#include "sysemu/runstate.h"
  62#include "sysemu/sysemu.h"
  63#include "sysemu/xen.h"
  64#include "migration/colo.h"
  65#include "qemu/bitmap.h"
  66#include "net/announce.h"
  67#include "qemu/yank.h"
  68#include "yank_functions.h"
  69
  70const unsigned int postcopy_ram_discard_version;
  71
  72/* Subcommands for QEMU_VM_COMMAND */
  73enum qemu_vm_cmd {
  74    MIG_CMD_INVALID = 0,   /* Must be 0 */
  75    MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
  76    MIG_CMD_PING,              /* Request a PONG on the RP */
  77
   78    MIG_CMD_POSTCOPY_ADVISE,       /* Prior to any page transfers, just
   79                                      warn we might want to do postcopy */
   80    MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
   81                                      pages while the source is running. */
  82    MIG_CMD_POSTCOPY_RUN,          /* Start execution */
  83
  84    MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
  85                                      were previously sent during
  86                                      precopy but are dirty. */
  87    MIG_CMD_PACKAGED,          /* Send a wrapped stream within this stream */
  88    MIG_CMD_ENABLE_COLO,       /* Enable COLO */
  89    MIG_CMD_POSTCOPY_RESUME,   /* resume postcopy on dest */
  90    MIG_CMD_RECV_BITMAP,       /* Request for recved bitmap on dst */
  91    MIG_CMD_MAX
  92};
  93
  94#define MAX_VM_CMD_PACKAGED_SIZE UINT32_MAX
  95static struct mig_cmd_args {
  96    ssize_t     len; /* -1 = variable */
  97    const char *name;
  98} mig_cmd_args[] = {
  99    [MIG_CMD_INVALID]          = { .len = -1, .name = "INVALID" },
 100    [MIG_CMD_OPEN_RETURN_PATH] = { .len =  0, .name = "OPEN_RETURN_PATH" },
 101    [MIG_CMD_PING]             = { .len = sizeof(uint32_t), .name = "PING" },
 102    [MIG_CMD_POSTCOPY_ADVISE]  = { .len = -1, .name = "POSTCOPY_ADVISE" },
 103    [MIG_CMD_POSTCOPY_LISTEN]  = { .len =  0, .name = "POSTCOPY_LISTEN" },
 104    [MIG_CMD_POSTCOPY_RUN]     = { .len =  0, .name = "POSTCOPY_RUN" },
 105    [MIG_CMD_POSTCOPY_RAM_DISCARD] = {
 106                                   .len = -1, .name = "POSTCOPY_RAM_DISCARD" },
 107    [MIG_CMD_POSTCOPY_RESUME]  = { .len =  0, .name = "POSTCOPY_RESUME" },
 108    [MIG_CMD_PACKAGED]         = { .len =  4, .name = "PACKAGED" },
 109    [MIG_CMD_RECV_BITMAP]      = { .len = -1, .name = "RECV_BITMAP" },
 110    [MIG_CMD_MAX]              = { .len = -1, .name = "MAX" },
 111};
 112
 113/* Note for MIG_CMD_POSTCOPY_ADVISE:
  114 * The format of the arguments depends on the postcopy mode:
 115 * - postcopy RAM only
 116 *   uint64_t host page size
  117 *   uint64_t target page size
 118 *
 119 * - postcopy RAM and postcopy dirty bitmaps
 120 *   format is the same as for postcopy RAM only
 121 *
 122 * - postcopy dirty bitmaps only
 123 *   Nothing. Command length field is 0.
 124 *
 125 * Be careful: adding a new postcopy entity with some other parameters should
  126 * not break the format's self-description ability. A good way is to introduce
  127 * some generic extensible format with an exception for the two old entities.
 128 */
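     /*
      * Illustrative sketch (editor addition, not from the original file): a
      * stream reader could dispatch on the ADVISE payload length, mirroring
      * the cases above.  handle_advise_payload() is a hypothetical name.
      *
      *     static int handle_advise_payload(QEMUFile *f, uint16_t len)
      *     {
      *         if (len == 0) {                     // dirty bitmaps only
      *             return 0;
      *         }
      *         if (len == 2 * sizeof(uint64_t)) {  // postcopy RAM advise
      *             uint64_t host_ps = qemu_get_be64(f);
      *             uint64_t target_ps = qemu_get_be64(f);
      *             (void)host_ps; (void)target_ps; // validate against local
      *             return 0;
      *         }
      *         return -EINVAL;                     // unknown advise length
      *     }
      */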
 129
 130/***********************************************************/
 131/* savevm/loadvm support */
 132
 133static ssize_t block_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
 134                                   int64_t pos, Error **errp)
 135{
 136    int ret;
 137    QEMUIOVector qiov;
 138
 139    qemu_iovec_init_external(&qiov, iov, iovcnt);
 140    ret = bdrv_writev_vmstate(opaque, &qiov, pos);
 141    if (ret < 0) {
 142        return ret;
 143    }
 144
 145    return qiov.size;
 146}
 147
 148static ssize_t block_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
 149                                size_t size, Error **errp)
 150{
 151    return bdrv_load_vmstate(opaque, buf, pos, size);
 152}
 153
 154static int bdrv_fclose(void *opaque, Error **errp)
 155{
 156    return bdrv_flush(opaque);
 157}
 158
 159static const QEMUFileOps bdrv_read_ops = {
 160    .get_buffer = block_get_buffer,
 161    .close =      bdrv_fclose
 162};
 163
 164static const QEMUFileOps bdrv_write_ops = {
 165    .writev_buffer  = block_writev_buffer,
 166    .close          = bdrv_fclose
 167};
 168
 169static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
 170{
 171    if (is_writable) {
 172        return qemu_fopen_ops(bs, &bdrv_write_ops, false);
 173    }
 174    return qemu_fopen_ops(bs, &bdrv_read_ops, false);
 175}
 176
 177
 178/* QEMUFile timer support.
 179 * Not in qemu-file.c to not add qemu-timer.c as dependency to qemu-file.c
 180 */
 181
 182void timer_put(QEMUFile *f, QEMUTimer *ts)
 183{
 184    uint64_t expire_time;
 185
 186    expire_time = timer_expire_time_ns(ts);
 187    qemu_put_be64(f, expire_time);
 188}
 189
 190void timer_get(QEMUFile *f, QEMUTimer *ts)
 191{
 192    uint64_t expire_time;
 193
 194    expire_time = qemu_get_be64(f);
 195    if (expire_time != -1) {
 196        timer_mod_ns(ts, expire_time);
 197    } else {
 198        timer_del(ts);
 199    }
 200}
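     /*
      * Editor note (not from the original file): the timer wire format is a
      * single be64 expire time in nanoseconds.  timer_expire_time_ns()
      * returns (uint64_t)-1 for a timer that is not pending, which is why
      * timer_get() above maps -1 to timer_del() instead of timer_mod_ns().
      */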
 201
 202
 203/* VMState timer support.
 204 * Not in vmstate.c to not add qemu-timer.c as dependency to vmstate.c
 205 */
 206
 207static int get_timer(QEMUFile *f, void *pv, size_t size,
 208                     const VMStateField *field)
 209{
 210    QEMUTimer *v = pv;
 211    timer_get(f, v);
 212    return 0;
 213}
 214
 215static int put_timer(QEMUFile *f, void *pv, size_t size,
 216                     const VMStateField *field, JSONWriter *vmdesc)
 217{
 218    QEMUTimer *v = pv;
 219    timer_put(f, v);
 220
 221    return 0;
 222}
 223
 224const VMStateInfo vmstate_info_timer = {
 225    .name = "timer",
 226    .get  = get_timer,
 227    .put  = put_timer,
 228};
 229
 230
 231typedef struct CompatEntry {
 232    char idstr[256];
 233    int instance_id;
 234} CompatEntry;
 235
 236typedef struct SaveStateEntry {
 237    QTAILQ_ENTRY(SaveStateEntry) entry;
 238    char idstr[256];
 239    uint32_t instance_id;
 240    int alias_id;
 241    int version_id;
 242    /* version id read from the stream */
 243    int load_version_id;
 244    int section_id;
 245    /* section id read from the stream */
 246    int load_section_id;
 247    const SaveVMHandlers *ops;
 248    const VMStateDescription *vmsd;
 249    void *opaque;
 250    CompatEntry *compat;
 251    int is_ram;
 252} SaveStateEntry;
 253
 254typedef struct SaveState {
 255    QTAILQ_HEAD(, SaveStateEntry) handlers;
 256    SaveStateEntry *handler_pri_head[MIG_PRI_MAX + 1];
 257    int global_section_id;
 258    uint32_t len;
 259    const char *name;
 260    uint32_t target_page_bits;
 261    uint32_t caps_count;
 262    MigrationCapability *capabilities;
 263    QemuUUID uuid;
 264} SaveState;
 265
 266static SaveState savevm_state = {
 267    .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
 268    .handler_pri_head = { [MIG_PRI_DEFAULT ... MIG_PRI_MAX] = NULL },
 269    .global_section_id = 0,
 270};
 271
 272static bool should_validate_capability(int capability)
 273{
 274    assert(capability >= 0 && capability < MIGRATION_CAPABILITY__MAX);
 275    /* Validate only new capabilities to keep compatibility. */
 276    switch (capability) {
 277    case MIGRATION_CAPABILITY_X_IGNORE_SHARED:
 278        return true;
 279    default:
 280        return false;
 281    }
 282}
 283
 284static uint32_t get_validatable_capabilities_count(void)
 285{
 286    MigrationState *s = migrate_get_current();
 287    uint32_t result = 0;
 288    int i;
 289    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
 290        if (should_validate_capability(i) && s->enabled_capabilities[i]) {
 291            result++;
 292        }
 293    }
 294    return result;
 295}
 296
 297static int configuration_pre_save(void *opaque)
 298{
 299    SaveState *state = opaque;
 300    const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
 301    MigrationState *s = migrate_get_current();
 302    int i, j;
 303
 304    state->len = strlen(current_name);
 305    state->name = current_name;
 306    state->target_page_bits = qemu_target_page_bits();
 307
 308    state->caps_count = get_validatable_capabilities_count();
 309    state->capabilities = g_renew(MigrationCapability, state->capabilities,
 310                                  state->caps_count);
 311    for (i = j = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
 312        if (should_validate_capability(i) && s->enabled_capabilities[i]) {
 313            state->capabilities[j++] = i;
 314        }
 315    }
 316    state->uuid = qemu_uuid;
 317
 318    return 0;
 319}
 320
 321static int configuration_post_save(void *opaque)
 322{
 323    SaveState *state = opaque;
 324
 325    g_free(state->capabilities);
 326    state->capabilities = NULL;
 327    state->caps_count = 0;
 328    return 0;
 329}
 330
 331static int configuration_pre_load(void *opaque)
 332{
 333    SaveState *state = opaque;
 334
 335    /* If there is no target-page-bits subsection it means the source
 336     * predates the variable-target-page-bits support and is using the
 337     * minimum possible value for this CPU.
 338     */
 339    state->target_page_bits = qemu_target_page_bits_min();
 340    return 0;
 341}
 342
 343static bool configuration_validate_capabilities(SaveState *state)
 344{
 345    bool ret = true;
 346    MigrationState *s = migrate_get_current();
 347    unsigned long *source_caps_bm;
 348    int i;
 349
 350    source_caps_bm = bitmap_new(MIGRATION_CAPABILITY__MAX);
 351    for (i = 0; i < state->caps_count; i++) {
 352        MigrationCapability capability = state->capabilities[i];
 353        set_bit(capability, source_caps_bm);
 354    }
 355
 356    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
 357        bool source_state, target_state;
 358        if (!should_validate_capability(i)) {
 359            continue;
 360        }
 361        source_state = test_bit(i, source_caps_bm);
 362        target_state = s->enabled_capabilities[i];
 363        if (source_state != target_state) {
 364            error_report("Capability %s is %s, but received capability is %s",
 365                         MigrationCapability_str(i),
 366                         target_state ? "on" : "off",
 367                         source_state ? "on" : "off");
 368            ret = false;
 369            /* Don't break here to report all failed capabilities */
 370        }
 371    }
 372
 373    g_free(source_caps_bm);
 374    return ret;
 375}
 376
 377static int configuration_post_load(void *opaque, int version_id)
 378{
 379    SaveState *state = opaque;
 380    const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
 381    int ret = 0;
 382
 383    if (strncmp(state->name, current_name, state->len) != 0) {
 384        error_report("Machine type received is '%.*s' and local is '%s'",
 385                     (int) state->len, state->name, current_name);
 386        ret = -EINVAL;
 387        goto out;
 388    }
 389
 390    if (state->target_page_bits != qemu_target_page_bits()) {
 391        error_report("Received TARGET_PAGE_BITS is %d but local is %d",
 392                     state->target_page_bits, qemu_target_page_bits());
 393        ret = -EINVAL;
 394        goto out;
 395    }
 396
 397    if (!configuration_validate_capabilities(state)) {
 398        ret = -EINVAL;
 399        goto out;
 400    }
 401
 402out:
 403    g_free((void *)state->name);
 404    state->name = NULL;
 405    state->len = 0;
 406    g_free(state->capabilities);
 407    state->capabilities = NULL;
 408    state->caps_count = 0;
 409
 410    return ret;
 411}
 412
 413static int get_capability(QEMUFile *f, void *pv, size_t size,
 414                          const VMStateField *field)
 415{
 416    MigrationCapability *capability = pv;
 417    char capability_str[UINT8_MAX + 1];
 418    uint8_t len;
 419    int i;
 420
 421    len = qemu_get_byte(f);
 422    qemu_get_buffer(f, (uint8_t *)capability_str, len);
 423    capability_str[len] = '\0';
 424    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
 425        if (!strcmp(MigrationCapability_str(i), capability_str)) {
 426            *capability = i;
 427            return 0;
 428        }
 429    }
 430    error_report("Received unknown capability %s", capability_str);
 431    return -EINVAL;
 432}
 433
 434static int put_capability(QEMUFile *f, void *pv, size_t size,
 435                          const VMStateField *field, JSONWriter *vmdesc)
 436{
 437    MigrationCapability *capability = pv;
 438    const char *capability_str = MigrationCapability_str(*capability);
 439    size_t len = strlen(capability_str);
 440    assert(len <= UINT8_MAX);
 441
 442    qemu_put_byte(f, len);
 443    qemu_put_buffer(f, (uint8_t *)capability_str, len);
 444    return 0;
 445}
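     /*
      * Illustrative sketch (editor addition): a capability is encoded as one
      * length byte followed by the ASCII capability name.  For example
      * "x-ignore-shared" (15 bytes) would appear on the wire as:
      *
      *     0x0f 'x' '-' 'i' 'g' 'n' 'o' 'r' 'e' '-' 's' 'h' 'a' 'r' 'e' 'd'
      */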
 446
 447static const VMStateInfo vmstate_info_capability = {
 448    .name = "capability",
 449    .get  = get_capability,
 450    .put  = put_capability,
 451};
 452
 453/* The target-page-bits subsection is present only if the
 454 * target page size is not the same as the default (ie the
 455 * minimum page size for a variable-page-size guest CPU).
 456 * If it is present then it contains the actual target page
 457 * bits for the machine, and migration will fail if the
 458 * two ends don't agree about it.
 459 */
 460static bool vmstate_target_page_bits_needed(void *opaque)
 461{
 462    return qemu_target_page_bits()
 463        > qemu_target_page_bits_min();
 464}
 465
 466static const VMStateDescription vmstate_target_page_bits = {
 467    .name = "configuration/target-page-bits",
 468    .version_id = 1,
 469    .minimum_version_id = 1,
 470    .needed = vmstate_target_page_bits_needed,
 471    .fields = (VMStateField[]) {
 472        VMSTATE_UINT32(target_page_bits, SaveState),
 473        VMSTATE_END_OF_LIST()
 474    }
 475};
 476
 477static bool vmstate_capabilites_needed(void *opaque)
 478{
 479    return get_validatable_capabilities_count() > 0;
 480}
 481
 482static const VMStateDescription vmstate_capabilites = {
 483    .name = "configuration/capabilities",
 484    .version_id = 1,
 485    .minimum_version_id = 1,
 486    .needed = vmstate_capabilites_needed,
 487    .fields = (VMStateField[]) {
 488        VMSTATE_UINT32_V(caps_count, SaveState, 1),
 489        VMSTATE_VARRAY_UINT32_ALLOC(capabilities, SaveState, caps_count, 1,
 490                                    vmstate_info_capability,
 491                                    MigrationCapability),
 492        VMSTATE_END_OF_LIST()
 493    }
 494};
 495
 496static bool vmstate_uuid_needed(void *opaque)
 497{
 498    return qemu_uuid_set && migrate_validate_uuid();
 499}
 500
 501static int vmstate_uuid_post_load(void *opaque, int version_id)
 502{
 503    SaveState *state = opaque;
 504    char uuid_src[UUID_FMT_LEN + 1];
 505    char uuid_dst[UUID_FMT_LEN + 1];
 506
 507    if (!qemu_uuid_set) {
  508        /*
  509         * This is only a warning because the user might not know the UUID
  510         * in some cases, e.g. when loading an old snapshot.
  511         */
 512        qemu_uuid_unparse(&state->uuid, uuid_src);
 513        warn_report("UUID is received %s, but local uuid isn't set",
 514                     uuid_src);
 515        return 0;
 516    }
 517    if (!qemu_uuid_is_equal(&state->uuid, &qemu_uuid)) {
 518        qemu_uuid_unparse(&state->uuid, uuid_src);
 519        qemu_uuid_unparse(&qemu_uuid, uuid_dst);
 520        error_report("UUID received is %s and local is %s", uuid_src, uuid_dst);
 521        return -EINVAL;
 522    }
 523    return 0;
 524}
 525
 526static const VMStateDescription vmstate_uuid = {
 527    .name = "configuration/uuid",
 528    .version_id = 1,
 529    .minimum_version_id = 1,
 530    .needed = vmstate_uuid_needed,
 531    .post_load = vmstate_uuid_post_load,
 532    .fields = (VMStateField[]) {
 533        VMSTATE_UINT8_ARRAY_V(uuid.data, SaveState, sizeof(QemuUUID), 1),
 534        VMSTATE_END_OF_LIST()
 535    }
 536};
 537
 538static const VMStateDescription vmstate_configuration = {
 539    .name = "configuration",
 540    .version_id = 1,
 541    .pre_load = configuration_pre_load,
 542    .post_load = configuration_post_load,
 543    .pre_save = configuration_pre_save,
 544    .post_save = configuration_post_save,
 545    .fields = (VMStateField[]) {
 546        VMSTATE_UINT32(len, SaveState),
 547        VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len),
 548        VMSTATE_END_OF_LIST()
 549    },
 550    .subsections = (const VMStateDescription *[]) {
 551        &vmstate_target_page_bits,
 552        &vmstate_capabilites,
 553        &vmstate_uuid,
 554        NULL
 555    }
 556};
 557
 558static void dump_vmstate_vmsd(FILE *out_file,
 559                              const VMStateDescription *vmsd, int indent,
 560                              bool is_subsection);
 561
 562static void dump_vmstate_vmsf(FILE *out_file, const VMStateField *field,
 563                              int indent)
 564{
 565    fprintf(out_file, "%*s{\n", indent, "");
 566    indent += 2;
 567    fprintf(out_file, "%*s\"field\": \"%s\",\n", indent, "", field->name);
 568    fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
 569            field->version_id);
 570    fprintf(out_file, "%*s\"field_exists\": %s,\n", indent, "",
 571            field->field_exists ? "true" : "false");
 572    fprintf(out_file, "%*s\"size\": %zu", indent, "", field->size);
 573    if (field->vmsd != NULL) {
 574        fprintf(out_file, ",\n");
 575        dump_vmstate_vmsd(out_file, field->vmsd, indent, false);
 576    }
 577    fprintf(out_file, "\n%*s}", indent - 2, "");
 578}
 579
 580static void dump_vmstate_vmss(FILE *out_file,
 581                              const VMStateDescription **subsection,
 582                              int indent)
 583{
 584    if (*subsection != NULL) {
 585        dump_vmstate_vmsd(out_file, *subsection, indent, true);
 586    }
 587}
 588
 589static void dump_vmstate_vmsd(FILE *out_file,
 590                              const VMStateDescription *vmsd, int indent,
 591                              bool is_subsection)
 592{
 593    if (is_subsection) {
 594        fprintf(out_file, "%*s{\n", indent, "");
 595    } else {
 596        fprintf(out_file, "%*s\"%s\": {\n", indent, "", "Description");
 597    }
 598    indent += 2;
 599    fprintf(out_file, "%*s\"name\": \"%s\",\n", indent, "", vmsd->name);
 600    fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
 601            vmsd->version_id);
 602    fprintf(out_file, "%*s\"minimum_version_id\": %d", indent, "",
 603            vmsd->minimum_version_id);
 604    if (vmsd->fields != NULL) {
 605        const VMStateField *field = vmsd->fields;
 606        bool first;
 607
 608        fprintf(out_file, ",\n%*s\"Fields\": [\n", indent, "");
 609        first = true;
 610        while (field->name != NULL) {
 611            if (field->flags & VMS_MUST_EXIST) {
 612                /* Ignore VMSTATE_VALIDATE bits; these don't get migrated */
 613                field++;
 614                continue;
 615            }
 616            if (!first) {
 617                fprintf(out_file, ",\n");
 618            }
 619            dump_vmstate_vmsf(out_file, field, indent + 2);
 620            field++;
 621            first = false;
 622        }
 623        fprintf(out_file, "\n%*s]", indent, "");
 624    }
 625    if (vmsd->subsections != NULL) {
 626        const VMStateDescription **subsection = vmsd->subsections;
 627        bool first;
 628
 629        fprintf(out_file, ",\n%*s\"Subsections\": [\n", indent, "");
 630        first = true;
 631        while (*subsection != NULL) {
 632            if (!first) {
 633                fprintf(out_file, ",\n");
 634            }
 635            dump_vmstate_vmss(out_file, subsection, indent + 2);
 636            subsection++;
 637            first = false;
 638        }
 639        fprintf(out_file, "\n%*s]", indent, "");
 640    }
 641    fprintf(out_file, "\n%*s}", indent - 2, "");
 642}
 643
 644static void dump_machine_type(FILE *out_file)
 645{
 646    MachineClass *mc;
 647
 648    mc = MACHINE_GET_CLASS(current_machine);
 649
 650    fprintf(out_file, "  \"vmschkmachine\": {\n");
 651    fprintf(out_file, "    \"Name\": \"%s\"\n", mc->name);
 652    fprintf(out_file, "  },\n");
 653}
 654
 655void dump_vmstate_json_to_file(FILE *out_file)
 656{
 657    GSList *list, *elt;
 658    bool first;
 659
 660    fprintf(out_file, "{\n");
 661    dump_machine_type(out_file);
 662
 663    first = true;
 664    list = object_class_get_list(TYPE_DEVICE, true);
 665    for (elt = list; elt; elt = elt->next) {
 666        DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data,
 667                                             TYPE_DEVICE);
 668        const char *name;
 669        int indent = 2;
 670
 671        if (!dc->vmsd) {
 672            continue;
 673        }
 674
 675        if (!first) {
 676            fprintf(out_file, ",\n");
 677        }
 678        name = object_class_get_name(OBJECT_CLASS(dc));
 679        fprintf(out_file, "%*s\"%s\": {\n", indent, "", name);
 680        indent += 2;
 681        fprintf(out_file, "%*s\"Name\": \"%s\",\n", indent, "", name);
 682        fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
 683                dc->vmsd->version_id);
 684        fprintf(out_file, "%*s\"minimum_version_id\": %d,\n", indent, "",
 685                dc->vmsd->minimum_version_id);
 686
 687        dump_vmstate_vmsd(out_file, dc->vmsd, indent, false);
 688
 689        fprintf(out_file, "\n%*s}", indent - 2, "");
 690        first = false;
 691    }
 692    fprintf(out_file, "\n}\n");
 693    fclose(out_file);
 694    g_slist_free(list);
 695}
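     /*
      * Illustrative sketch (editor addition): the JSON emitted above has
      * roughly this shape; the device name and values are made up:
      *
      *     {
      *       "vmschkmachine": { "Name": "pc-i440fx-6.0" },
      *       "some-device": {
      *         "Name": "some-device",
      *         "version_id": 1,
      *         "minimum_version_id": 1,
      *         "Description": { "name": "...", "Fields": [ ... ] }
      *       }
      *     }
      */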
 696
 697static uint32_t calculate_new_instance_id(const char *idstr)
 698{
 699    SaveStateEntry *se;
 700    uint32_t instance_id = 0;
 701
 702    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
 703        if (strcmp(idstr, se->idstr) == 0
 704            && instance_id <= se->instance_id) {
 705            instance_id = se->instance_id + 1;
 706        }
 707    }
  708    /* Make sure we never wrap around without it being noticed */
 709    assert(instance_id != VMSTATE_INSTANCE_ID_ANY);
 710    return instance_id;
 711}
 712
 713static int calculate_compat_instance_id(const char *idstr)
 714{
 715    SaveStateEntry *se;
 716    int instance_id = 0;
 717
 718    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
 719        if (!se->compat) {
 720            continue;
 721        }
 722
 723        if (strcmp(idstr, se->compat->idstr) == 0
 724            && instance_id <= se->compat->instance_id) {
 725            instance_id = se->compat->instance_id + 1;
 726        }
 727    }
 728    return instance_id;
 729}
 730
 731static inline MigrationPriority save_state_priority(SaveStateEntry *se)
 732{
 733    if (se->vmsd) {
 734        return se->vmsd->priority;
 735    }
 736    return MIG_PRI_DEFAULT;
 737}
 738
 739static void savevm_state_handler_insert(SaveStateEntry *nse)
 740{
 741    MigrationPriority priority = save_state_priority(nse);
 742    SaveStateEntry *se;
 743    int i;
 744
 745    assert(priority <= MIG_PRI_MAX);
 746
 747    for (i = priority - 1; i >= 0; i--) {
 748        se = savevm_state.handler_pri_head[i];
 749        if (se != NULL) {
 750            assert(save_state_priority(se) < priority);
 751            break;
 752        }
 753    }
 754
 755    if (i >= 0) {
 756        QTAILQ_INSERT_BEFORE(se, nse, entry);
 757    } else {
 758        QTAILQ_INSERT_TAIL(&savevm_state.handlers, nse, entry);
 759    }
 760
 761    if (savevm_state.handler_pri_head[priority] == NULL) {
 762        savevm_state.handler_pri_head[priority] = nse;
 763    }
 764}
 765
 766static void savevm_state_handler_remove(SaveStateEntry *se)
 767{
 768    SaveStateEntry *next;
 769    MigrationPriority priority = save_state_priority(se);
 770
 771    if (se == savevm_state.handler_pri_head[priority]) {
 772        next = QTAILQ_NEXT(se, entry);
 773        if (next != NULL && save_state_priority(next) == priority) {
 774            savevm_state.handler_pri_head[priority] = next;
 775        } else {
 776            savevm_state.handler_pri_head[priority] = NULL;
 777        }
 778    }
 779    QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
 780}
 781
 782/* TODO: Individual devices generally have very little idea about the rest
 783   of the system, so instance_id should be removed/replaced.
  784   Meanwhile pass VMSTATE_INSTANCE_ID_ANY as instance_id if you do not already
  785   have a clearly distinguishing id for all instances of your device class. */
 786int register_savevm_live(const char *idstr,
 787                         uint32_t instance_id,
 788                         int version_id,
 789                         const SaveVMHandlers *ops,
 790                         void *opaque)
 791{
 792    SaveStateEntry *se;
 793
 794    se = g_new0(SaveStateEntry, 1);
 795    se->version_id = version_id;
 796    se->section_id = savevm_state.global_section_id++;
 797    se->ops = ops;
 798    se->opaque = opaque;
 799    se->vmsd = NULL;
  800    /* if this is a live savevm handler then set is_ram */
 801    if (ops->save_setup != NULL) {
 802        se->is_ram = 1;
 803    }
 804
 805    pstrcat(se->idstr, sizeof(se->idstr), idstr);
 806
 807    if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
 808        se->instance_id = calculate_new_instance_id(se->idstr);
 809    } else {
 810        se->instance_id = instance_id;
 811    }
 812    assert(!se->compat || se->instance_id == 0);
 813    savevm_state_handler_insert(se);
 814    return 0;
 815}
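     /*
      * Illustrative sketch (editor addition): registering a live handler.
      * The my_dev_* names and the state pointer are hypothetical;
      * SaveVMHandlers and register_savevm_live() are the real interfaces.
      *
      *     static SaveVMHandlers my_dev_handlers = {
      *         .save_setup        = my_dev_setup,   // makes the entry is_ram
      *         .save_live_iterate = my_dev_iterate,
      *         .load_state        = my_dev_load,
      *     };
      *
      *     register_savevm_live("my-dev", VMSTATE_INSTANCE_ID_ANY, 1,
      *                          &my_dev_handlers, &my_dev_state);
      */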
 816
 817void unregister_savevm(VMStateIf *obj, const char *idstr, void *opaque)
 818{
 819    SaveStateEntry *se, *new_se;
 820    char id[256] = "";
 821
 822    if (obj) {
 823        char *oid = vmstate_if_get_id(obj);
 824        if (oid) {
 825            pstrcpy(id, sizeof(id), oid);
 826            pstrcat(id, sizeof(id), "/");
 827            g_free(oid);
 828        }
 829    }
 830    pstrcat(id, sizeof(id), idstr);
 831
 832    QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
 833        if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) {
 834            savevm_state_handler_remove(se);
 835            g_free(se->compat);
 836            g_free(se);
 837        }
 838    }
 839}
 840
 841int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id,
 842                                   const VMStateDescription *vmsd,
 843                                   void *opaque, int alias_id,
 844                                   int required_for_version,
 845                                   Error **errp)
 846{
 847    SaveStateEntry *se;
 848
 849    /* If this triggers, alias support can be dropped for the vmsd. */
 850    assert(alias_id == -1 || required_for_version >= vmsd->minimum_version_id);
 851
 852    se = g_new0(SaveStateEntry, 1);
 853    se->version_id = vmsd->version_id;
 854    se->section_id = savevm_state.global_section_id++;
 855    se->opaque = opaque;
 856    se->vmsd = vmsd;
 857    se->alias_id = alias_id;
 858
 859    if (obj) {
 860        char *id = vmstate_if_get_id(obj);
 861        if (id) {
 862            if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
 863                sizeof(se->idstr)) {
 864                error_setg(errp, "Path too long for VMState (%s)", id);
 865                g_free(id);
 866                g_free(se);
 867
 868                return -1;
 869            }
 870            g_free(id);
 871
 872            se->compat = g_new0(CompatEntry, 1);
 873            pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name);
 874            se->compat->instance_id = instance_id == VMSTATE_INSTANCE_ID_ANY ?
 875                         calculate_compat_instance_id(vmsd->name) : instance_id;
 876            instance_id = VMSTATE_INSTANCE_ID_ANY;
 877        }
 878    }
 879    pstrcat(se->idstr, sizeof(se->idstr), vmsd->name);
 880
 881    if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
 882        se->instance_id = calculate_new_instance_id(se->idstr);
 883    } else {
 884        se->instance_id = instance_id;
 885    }
 886    assert(!se->compat || se->instance_id == 0);
 887    savevm_state_handler_insert(se);
 888    return 0;
 889}
 890
 891void vmstate_unregister(VMStateIf *obj, const VMStateDescription *vmsd,
 892                        void *opaque)
 893{
 894    SaveStateEntry *se, *new_se;
 895
 896    QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
 897        if (se->vmsd == vmsd && se->opaque == opaque) {
 898            savevm_state_handler_remove(se);
 899            g_free(se->compat);
 900            g_free(se);
 901        }
 902    }
 903}
 904
 905static int vmstate_load(QEMUFile *f, SaveStateEntry *se)
 906{
 907    trace_vmstate_load(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
 908    if (!se->vmsd) {         /* Old style */
 909        return se->ops->load_state(f, se->opaque, se->load_version_id);
 910    }
 911    return vmstate_load_state(f, se->vmsd, se->opaque, se->load_version_id);
 912}
 913
 914static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se,
 915                                   JSONWriter *vmdesc)
 916{
 917    int64_t old_offset, size;
 918
 919    old_offset = qemu_ftell_fast(f);
 920    se->ops->save_state(f, se->opaque);
 921    size = qemu_ftell_fast(f) - old_offset;
 922
 923    if (vmdesc) {
 924        json_writer_int64(vmdesc, "size", size);
 925        json_writer_start_array(vmdesc, "fields");
 926        json_writer_start_object(vmdesc, NULL);
 927        json_writer_str(vmdesc, "name", "data");
 928        json_writer_int64(vmdesc, "size", size);
 929        json_writer_str(vmdesc, "type", "buffer");
 930        json_writer_end_object(vmdesc);
 931        json_writer_end_array(vmdesc);
 932    }
 933}
 934
 935static int vmstate_save(QEMUFile *f, SaveStateEntry *se,
 936                        JSONWriter *vmdesc)
 937{
 938    trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
 939    if (!se->vmsd) {
 940        vmstate_save_old_style(f, se, vmdesc);
 941        return 0;
 942    }
 943    return vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
 944}
 945
 946/*
  947 * Write the header for a device section (QEMU_VM_SECTION START/END/PART/FULL)
 948 */
 949static void save_section_header(QEMUFile *f, SaveStateEntry *se,
 950                                uint8_t section_type)
 951{
 952    qemu_put_byte(f, section_type);
 953    qemu_put_be32(f, se->section_id);
 954
 955    if (section_type == QEMU_VM_SECTION_FULL ||
 956        section_type == QEMU_VM_SECTION_START) {
 957        /* ID string */
 958        size_t len = strlen(se->idstr);
 959        qemu_put_byte(f, len);
 960        qemu_put_buffer(f, (uint8_t *)se->idstr, len);
 961
 962        qemu_put_be32(f, se->instance_id);
 963        qemu_put_be32(f, se->version_id);
 964    }
 965}
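     /*
      * Illustrative sketch (editor addition): for a FULL section of a device
      * with idstr "ram", section_id 3, instance_id 0 and version_id 4, the
      * header bytes written above would be (values are hypothetical, the
      * field order follows the code):
      *
      *     0x04                 section type (QEMU_VM_SECTION_FULL)
      *     0x00 0x00 0x00 0x03  be32 section_id
      *     0x03 'r' 'a' 'm'     idstr length + bytes (no NUL)
      *     0x00 0x00 0x00 0x00  be32 instance_id
      *     0x00 0x00 0x00 0x04  be32 version_id
      */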
 966
 967/*
  968 * Write a footer onto device sections so that misformatted device
  969 * sections can be caught.
 970 */
 971static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
 972{
 973    if (migrate_get_current()->send_section_footer) {
 974        qemu_put_byte(f, QEMU_VM_SECTION_FOOTER);
 975        qemu_put_be32(f, se->section_id);
 976    }
 977}
 978
 979/**
 980 * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
 981 *                           command and associated data.
 982 *
 983 * @f: File to send command on
 984 * @command: Command type to send
 985 * @len: Length of associated data
 986 * @data: Data associated with command.
 987 */
 988static void qemu_savevm_command_send(QEMUFile *f,
 989                                     enum qemu_vm_cmd command,
 990                                     uint16_t len,
 991                                     uint8_t *data)
 992{
 993    trace_savevm_command_send(command, len);
 994    qemu_put_byte(f, QEMU_VM_COMMAND);
 995    qemu_put_be16(f, (uint16_t)command);
 996    qemu_put_be16(f, len);
 997    qemu_put_buffer(f, data, len);
 998    qemu_fflush(f);
 999}
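     /*
      * Illustrative sketch (editor addition): a PING with value 1 sent
      * through qemu_savevm_command_send() looks like this on the wire:
      *
      *     0x08                 QEMU_VM_COMMAND section type
      *     0x00 0x02            be16 command (MIG_CMD_PING)
      *     0x00 0x04            be16 len (sizeof(uint32_t))
      *     0x00 0x00 0x00 0x01  be32 ping value
      */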
1000
1001void qemu_savevm_send_colo_enable(QEMUFile *f)
1002{
1003    trace_savevm_send_colo_enable();
1004    qemu_savevm_command_send(f, MIG_CMD_ENABLE_COLO, 0, NULL);
1005}
1006
1007void qemu_savevm_send_ping(QEMUFile *f, uint32_t value)
1008{
1009    uint32_t buf;
1010
1011    trace_savevm_send_ping(value);
1012    buf = cpu_to_be32(value);
1013    qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf);
1014}
1015
1016void qemu_savevm_send_open_return_path(QEMUFile *f)
1017{
1018    trace_savevm_send_open_return_path();
1019    qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL);
1020}
1021
 1022/* We have a buffer of data to send; we don't want all of it to be loaded
 1023 * by the command itself, so the command contains just the length of the
 1024 * extra buffer, which we then send straight after it.
 1025 * TODO: There must be a better way to organise this.
1026 *
1027 * Returns:
1028 *    0 on success
1029 *    -ve on error
1030 */
1031int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len)
1032{
1033    uint32_t tmp;
1034
1035    if (len > MAX_VM_CMD_PACKAGED_SIZE) {
1036        error_report("%s: Unreasonably large packaged state: %zu",
1037                     __func__, len);
1038        return -1;
1039    }
1040
1041    tmp = cpu_to_be32(len);
1042
1043    trace_qemu_savevm_send_packaged();
1044    qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp);
1045
1046    qemu_put_buffer(f, buf, len);
1047
1048    return 0;
1049}
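     /*
      * Illustrative sketch (editor addition): the resulting stream is a
      * MIG_CMD_PACKAGED command whose 4-byte payload is the be32 length of
      * the package, immediately followed by the package bytes themselves:
      *
      *     0x08  0x00 0x07  0x00 0x04   QEMU_VM_COMMAND, MIG_CMD_PACKAGED, len
      *     <be32 package length>
      *     <package bytes: a complete nested migration stream>
      */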
1050
1051/* Send prior to any postcopy transfer */
1052void qemu_savevm_send_postcopy_advise(QEMUFile *f)
1053{
1054    if (migrate_postcopy_ram()) {
1055        uint64_t tmp[2];
1056        tmp[0] = cpu_to_be64(ram_pagesize_summary());
1057        tmp[1] = cpu_to_be64(qemu_target_page_size());
1058
1059        trace_qemu_savevm_send_postcopy_advise();
1060        qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE,
1061                                 16, (uint8_t *)tmp);
1062    } else {
1063        qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 0, NULL);
1064    }
1065}
1066
 1067/* Sent prior to starting the destination running in postcopy; tells it to
 1068 * discard pages that have already been sent but were redirtied on the source.
 1069 * CMD_POSTCOPY_RAM_DISCARD consists of:
1070 *      byte   version (0)
1071 *      byte   Length of name field (not including 0)
1072 *  n x byte   RAM block name
1073 *      byte   0 terminator (just for safety)
1074 *  n x        Byte ranges within the named RAMBlock
1075 *      be64   Start of the range
1076 *      be64   Length
1077 *
1078 *  name:  RAMBlock name that these entries are part of
1079 *  len: Number of page entries
1080 *  start_list: 'len' addresses
1081 *  length_list: 'len' addresses
1082 *
1083 */
1084void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
1085                                           uint16_t len,
1086                                           uint64_t *start_list,
1087                                           uint64_t *length_list)
1088{
1089    uint8_t *buf;
1090    uint16_t tmplen;
1091    uint16_t t;
1092    size_t name_len = strlen(name);
1093
1094    trace_qemu_savevm_send_postcopy_ram_discard(name, len);
1095    assert(name_len < 256);
1096    buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len);
1097    buf[0] = postcopy_ram_discard_version;
1098    buf[1] = name_len;
1099    memcpy(buf + 2, name, name_len);
1100    tmplen = 2 + name_len;
1101    buf[tmplen++] = '\0';
1102
1103    for (t = 0; t < len; t++) {
1104        stq_be_p(buf + tmplen, start_list[t]);
1105        tmplen += 8;
1106        stq_be_p(buf + tmplen, length_list[t]);
1107        tmplen += 8;
1108    }
1109    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
1110    g_free(buf);
1111}
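     /*
      * Illustrative sketch (editor addition): a discard command for a
      * RAMBlock named "pc.ram" with a single range at 0x200000 of length
      * 0x1000 would carry this payload (name and values are made up):
      *
      *     0x00                         version
      *     0x06                         name length
      *     'p' 'c' '.' 'r' 'a' 'm'      RAM block name
      *     0x00                         safety terminator
      *     be64 0x200000  be64 0x1000   one (start, length) pair
      */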
1112
1113/* Get the destination into a state where it can receive postcopy data. */
1114void qemu_savevm_send_postcopy_listen(QEMUFile *f)
1115{
1116    trace_savevm_send_postcopy_listen();
1117    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
1118}
1119
1120/* Kick the destination into running */
1121void qemu_savevm_send_postcopy_run(QEMUFile *f)
1122{
1123    trace_savevm_send_postcopy_run();
1124    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
1125}
1126
1127void qemu_savevm_send_postcopy_resume(QEMUFile *f)
1128{
1129    trace_savevm_send_postcopy_resume();
1130    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RESUME, 0, NULL);
1131}
1132
1133void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name)
1134{
1135    size_t len;
1136    char buf[256];
1137
1138    trace_savevm_send_recv_bitmap(block_name);
1139
1140    buf[0] = len = strlen(block_name);
1141    memcpy(buf + 1, block_name, len);
1142
1143    qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf);
1144}
1145
1146bool qemu_savevm_state_blocked(Error **errp)
1147{
1148    SaveStateEntry *se;
1149
1150    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1151        if (se->vmsd && se->vmsd->unmigratable) {
1152            error_setg(errp, "State blocked by non-migratable device '%s'",
1153                       se->idstr);
1154            return true;
1155        }
1156    }
1157    return false;
1158}
1159
1160void qemu_savevm_non_migratable_list(strList **reasons)
1161{
1162    SaveStateEntry *se;
1163
1164    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1165        if (se->vmsd && se->vmsd->unmigratable) {
1166            QAPI_LIST_PREPEND(*reasons,
1167                              g_strdup_printf("non-migratable device: %s",
1168                                              se->idstr));
1169        }
1170    }
1171}
1172
1173void qemu_savevm_state_header(QEMUFile *f)
1174{
1175    trace_savevm_state_header();
1176    qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1177    qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1178
1179    if (migrate_get_current()->send_configuration) {
1180        qemu_put_byte(f, QEMU_VM_CONFIGURATION);
1181        vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
1182    }
1183}
1184
1185bool qemu_savevm_state_guest_unplug_pending(void)
1186{
1187    SaveStateEntry *se;
1188
1189    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1190        if (se->vmsd && se->vmsd->dev_unplug_pending &&
1191            se->vmsd->dev_unplug_pending(se->opaque)) {
1192            return true;
1193        }
1194    }
1195
1196    return false;
1197}
1198
1199void qemu_savevm_state_setup(QEMUFile *f)
1200{
1201    SaveStateEntry *se;
1202    Error *local_err = NULL;
1203    int ret;
1204
1205    trace_savevm_state_setup();
1206    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1207        if (!se->ops || !se->ops->save_setup) {
1208            continue;
1209        }
1210        if (se->ops->is_active) {
1211            if (!se->ops->is_active(se->opaque)) {
1212                continue;
1213            }
1214        }
1215        save_section_header(f, se, QEMU_VM_SECTION_START);
1216
1217        ret = se->ops->save_setup(f, se->opaque);
1218        save_section_footer(f, se);
1219        if (ret < 0) {
1220            qemu_file_set_error(f, ret);
1221            break;
1222        }
1223    }
1224
1225    if (precopy_notify(PRECOPY_NOTIFY_SETUP, &local_err)) {
1226        error_report_err(local_err);
1227    }
1228}
1229
1230int qemu_savevm_state_resume_prepare(MigrationState *s)
1231{
1232    SaveStateEntry *se;
1233    int ret;
1234
1235    trace_savevm_state_resume_prepare();
1236
1237    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1238        if (!se->ops || !se->ops->resume_prepare) {
1239            continue;
1240        }
1241        if (se->ops->is_active) {
1242            if (!se->ops->is_active(se->opaque)) {
1243                continue;
1244            }
1245        }
1246        ret = se->ops->resume_prepare(s, se->opaque);
1247        if (ret < 0) {
1248            return ret;
1249        }
1250    }
1251
1252    return 0;
1253}
1254
1255/*
 1256 * This function has three return values:
 1257 *   negative: there was an error, and we have -errno.
 1258 *   0 : we haven't finished; the caller has to call it again.
 1259 *   1 : we have finished; we can go to the complete phase.
1260 */
1261int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
1262{
1263    SaveStateEntry *se;
1264    int ret = 1;
1265
1266    trace_savevm_state_iterate();
1267    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1268        if (!se->ops || !se->ops->save_live_iterate) {
1269            continue;
1270        }
1271        if (se->ops->is_active &&
1272            !se->ops->is_active(se->opaque)) {
1273            continue;
1274        }
1275        if (se->ops->is_active_iterate &&
1276            !se->ops->is_active_iterate(se->opaque)) {
1277            continue;
1278        }
1279        /*
1280         * In the postcopy phase, any device that doesn't know how to
 1281         * do postcopy should have saved its state in the _complete
 1282         * call that has already run; it might get confused if we call
 1283         * iterate afterwards.
1284         */
1285        if (postcopy &&
1286            !(se->ops->has_postcopy && se->ops->has_postcopy(se->opaque))) {
1287            continue;
1288        }
1289        if (qemu_file_rate_limit(f)) {
1290            return 0;
1291        }
1292        trace_savevm_section_start(se->idstr, se->section_id);
1293
1294        save_section_header(f, se, QEMU_VM_SECTION_PART);
1295
1296        ret = se->ops->save_live_iterate(f, se->opaque);
1297        trace_savevm_section_end(se->idstr, se->section_id, ret);
1298        save_section_footer(f, se);
1299
1300        if (ret < 0) {
1301            error_report("failed to save SaveStateEntry with id(name): %d(%s)",
1302                         se->section_id, se->idstr);
1303            qemu_file_set_error(f, ret);
1304        }
1305        if (ret <= 0) {
 1306            /* Do not proceed to the next vmstate before this one has reported
1307               completion of the current stage. This serializes the migration
1308               and reduces the probability that a faster changing state is
1309               synchronized over and over again. */
1310            break;
1311        }
1312    }
1313    return ret;
1314}
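     /*
      * Illustrative sketch (editor addition) of the driving loop a caller
      * uses, matching the contract above (compare qemu_savevm_state() below):
      *
      *     while (qemu_file_get_error(f) == 0) {
      *         int ret = qemu_savevm_state_iterate(f, false);
      *         if (ret > 0) {
      *             break;          // finished, move to the complete phase
      *         }
      *         // ret == 0: go again; ret < 0 is latched as the file error
      *     }
      */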
1315
1316static bool should_send_vmdesc(void)
1317{
1318    MachineState *machine = MACHINE(qdev_get_machine());
1319    bool in_postcopy = migration_in_postcopy();
1320    return !machine->suppress_vmdesc && !in_postcopy;
1321}
1322
1323/*
1324 * Calls the save_live_complete_postcopy methods
1325 * causing the last few pages to be sent immediately and doing any associated
1326 * cleanup.
1327 * Note postcopy also calls qemu_savevm_state_complete_precopy to complete
1328 * all the other devices, but that happens at the point we switch to postcopy.
1329 */
1330void qemu_savevm_state_complete_postcopy(QEMUFile *f)
1331{
1332    SaveStateEntry *se;
1333    int ret;
1334
1335    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1336        if (!se->ops || !se->ops->save_live_complete_postcopy) {
1337            continue;
1338        }
1339        if (se->ops->is_active) {
1340            if (!se->ops->is_active(se->opaque)) {
1341                continue;
1342            }
1343        }
1344        trace_savevm_section_start(se->idstr, se->section_id);
1345        /* Section type */
1346        qemu_put_byte(f, QEMU_VM_SECTION_END);
1347        qemu_put_be32(f, se->section_id);
1348
1349        ret = se->ops->save_live_complete_postcopy(f, se->opaque);
1350        trace_savevm_section_end(se->idstr, se->section_id, ret);
1351        save_section_footer(f, se);
1352        if (ret < 0) {
1353            qemu_file_set_error(f, ret);
1354            return;
1355        }
1356    }
1357
1358    qemu_put_byte(f, QEMU_VM_EOF);
1359    qemu_fflush(f);
1360}
1361
1362static
1363int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
1364{
1365    SaveStateEntry *se;
1366    int ret;
1367
1368    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1369        if (!se->ops ||
1370            (in_postcopy && se->ops->has_postcopy &&
1371             se->ops->has_postcopy(se->opaque)) ||
1372            !se->ops->save_live_complete_precopy) {
1373            continue;
1374        }
1375
1376        if (se->ops->is_active) {
1377            if (!se->ops->is_active(se->opaque)) {
1378                continue;
1379            }
1380        }
1381        trace_savevm_section_start(se->idstr, se->section_id);
1382
1383        save_section_header(f, se, QEMU_VM_SECTION_END);
1384
1385        ret = se->ops->save_live_complete_precopy(f, se->opaque);
1386        trace_savevm_section_end(se->idstr, se->section_id, ret);
1387        save_section_footer(f, se);
1388        if (ret < 0) {
1389            qemu_file_set_error(f, ret);
1390            return -1;
1391        }
1392    }
1393
1394    return 0;
1395}
1396
1397int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
1398                                                    bool in_postcopy,
1399                                                    bool inactivate_disks)
1400{
1401    g_autoptr(JSONWriter) vmdesc = NULL;
1402    int vmdesc_len;
1403    SaveStateEntry *se;
1404    int ret;
1405
1406    vmdesc = json_writer_new(false);
1407    json_writer_start_object(vmdesc, NULL);
1408    json_writer_int64(vmdesc, "page_size", qemu_target_page_size());
1409    json_writer_start_array(vmdesc, "devices");
1410    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1411
1412        if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1413            continue;
1414        }
1415        if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1416            trace_savevm_section_skip(se->idstr, se->section_id);
1417            continue;
1418        }
1419
1420        trace_savevm_section_start(se->idstr, se->section_id);
1421
1422        json_writer_start_object(vmdesc, NULL);
1423        json_writer_str(vmdesc, "name", se->idstr);
1424        json_writer_int64(vmdesc, "instance_id", se->instance_id);
1425
1426        save_section_header(f, se, QEMU_VM_SECTION_FULL);
1427        ret = vmstate_save(f, se, vmdesc);
1428        if (ret) {
1429            qemu_file_set_error(f, ret);
1430            return ret;
1431        }
1432        trace_savevm_section_end(se->idstr, se->section_id, 0);
1433        save_section_footer(f, se);
1434
1435        json_writer_end_object(vmdesc);
1436    }
1437
1438    if (inactivate_disks) {
1439        /* Inactivate before sending QEMU_VM_EOF so that the
1440         * bdrv_invalidate_cache_all() on the other end won't fail. */
1441        ret = bdrv_inactivate_all();
1442        if (ret) {
1443            error_report("%s: bdrv_inactivate_all() failed (%d)",
1444                         __func__, ret);
1445            qemu_file_set_error(f, ret);
1446            return ret;
1447        }
1448    }
1449    if (!in_postcopy) {
1450        /* Postcopy stream will still be going */
1451        qemu_put_byte(f, QEMU_VM_EOF);
1452    }
1453
1454    json_writer_end_array(vmdesc);
1455    json_writer_end_object(vmdesc);
1456    vmdesc_len = strlen(json_writer_get(vmdesc));
1457
1458    if (should_send_vmdesc()) {
1459        qemu_put_byte(f, QEMU_VM_VMDESCRIPTION);
1460        qemu_put_be32(f, vmdesc_len);
1461        qemu_put_buffer(f, (uint8_t *)json_writer_get(vmdesc), vmdesc_len);
1462    }
1463
1464    return 0;
1465}
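     /*
      * Illustrative sketch (editor addition): the vmdesc JSON built above has
      * roughly this shape (device names and values are made up):
      *
      *     {
      *       "page_size": 4096,
      *       "devices": [
      *         { "name": "timer", "instance_id": 0, ... },
      *         { "name": "cpu",   "instance_id": 0, ... }
      *       ]
      *     }
      *
      * When should_send_vmdesc() is true it is appended after QEMU_VM_EOF as
      * a QEMU_VM_VMDESCRIPTION section: one type byte, a be32 length, then
      * the JSON text.
      */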
1466
1467int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
1468                                       bool inactivate_disks)
1469{
1470    int ret;
1471    Error *local_err = NULL;
1472    bool in_postcopy = migration_in_postcopy();
1473
1474    if (precopy_notify(PRECOPY_NOTIFY_COMPLETE, &local_err)) {
1475        error_report_err(local_err);
1476    }
1477
1478    trace_savevm_state_complete_precopy();
1479
1480    cpu_synchronize_all_states();
1481
1482    if (!in_postcopy || iterable_only) {
1483        ret = qemu_savevm_state_complete_precopy_iterable(f, in_postcopy);
1484        if (ret) {
1485            return ret;
1486        }
1487    }
1488
1489    if (iterable_only) {
1490        goto flush;
1491    }
1492
1493    ret = qemu_savevm_state_complete_precopy_non_iterable(f, in_postcopy,
1494                                                          inactivate_disks);
1495    if (ret) {
1496        return ret;
1497    }
1498
1499flush:
1500    qemu_fflush(f);
1501    return 0;
1502}
1503
 1504/* Give an estimate of the amount left to be transferred;
1505 * the result is split into the amount for units that can and
1506 * for units that can't do postcopy.
1507 */
1508void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
1509                               uint64_t *res_precopy_only,
1510                               uint64_t *res_compatible,
1511                               uint64_t *res_postcopy_only)
1512{
1513    SaveStateEntry *se;
1514
1515    *res_precopy_only = 0;
1516    *res_compatible = 0;
1517    *res_postcopy_only = 0;
1518
1519
1520    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1521        if (!se->ops || !se->ops->save_live_pending) {
1522            continue;
1523        }
1524        if (se->ops->is_active) {
1525            if (!se->ops->is_active(se->opaque)) {
1526                continue;
1527            }
1528        }
1529        se->ops->save_live_pending(f, se->opaque, threshold_size,
1530                                   res_precopy_only, res_compatible,
1531                                   res_postcopy_only);
1532    }
1533}
1534
1535void qemu_savevm_state_cleanup(void)
1536{
1537    SaveStateEntry *se;
1538    Error *local_err = NULL;
1539
1540    if (precopy_notify(PRECOPY_NOTIFY_CLEANUP, &local_err)) {
1541        error_report_err(local_err);
1542    }
1543
1544    trace_savevm_state_cleanup();
1545    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1546        if (se->ops && se->ops->save_cleanup) {
1547            se->ops->save_cleanup(se->opaque);
1548        }
1549    }
1550}
1551
1552static int qemu_savevm_state(QEMUFile *f, Error **errp)
1553{
1554    int ret;
1555    MigrationState *ms = migrate_get_current();
1556    MigrationStatus status;
1557
1558    if (migration_is_running(ms->state)) {
1559        error_setg(errp, QERR_MIGRATION_ACTIVE);
1560        return -EINVAL;
1561    }
1562
1563    if (migrate_use_block()) {
1564        error_setg(errp, "Block migration and snapshots are incompatible");
1565        return -EINVAL;
1566    }
1567
1568    migrate_init(ms);
1569    memset(&ram_counters, 0, sizeof(ram_counters));
1570    memset(&compression_counters, 0, sizeof(compression_counters));
1571    ms->to_dst_file = f;
1572
1573    qemu_mutex_unlock_iothread();
1574    qemu_savevm_state_header(f);
1575    qemu_savevm_state_setup(f);
1576    qemu_mutex_lock_iothread();
1577
1578    while (qemu_file_get_error(f) == 0) {
1579        if (qemu_savevm_state_iterate(f, false) > 0) {
1580            break;
1581        }
1582    }
1583
1584    ret = qemu_file_get_error(f);
1585    if (ret == 0) {
1586        qemu_savevm_state_complete_precopy(f, false, false);
1587        ret = qemu_file_get_error(f);
1588    }
1589    qemu_savevm_state_cleanup();
1590    if (ret != 0) {
1591        error_setg_errno(errp, -ret, "Error while writing VM state");
1592    }
1593
1594    if (ret != 0) {
1595        status = MIGRATION_STATUS_FAILED;
1596    } else {
1597        status = MIGRATION_STATUS_COMPLETED;
1598    }
1599    migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, status);
1600
 1601    /* f is an outer parameter; it should not stay in the global migration
 1602     * state after this function has finished */
1603    ms->to_dst_file = NULL;
1604
1605    return ret;
1606}
1607
1608void qemu_savevm_live_state(QEMUFile *f)
1609{
1610    /* save QEMU_VM_SECTION_END section */
1611    qemu_savevm_state_complete_precopy(f, true, false);
1612    qemu_put_byte(f, QEMU_VM_EOF);
1613}
1614
1615int qemu_save_device_state(QEMUFile *f)
1616{
1617    SaveStateEntry *se;
1618
1619    if (!migration_in_colo_state()) {
1620        qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1621        qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1622    }
1623    cpu_synchronize_all_states();
1624
1625    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1626        int ret;
1627
1628        if (se->is_ram) {
1629            continue;
1630        }
1631        if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1632            continue;
1633        }
1634        if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1635            continue;
1636        }
1637
1638        save_section_header(f, se, QEMU_VM_SECTION_FULL);
1639
1640        ret = vmstate_save(f, se, NULL);
1641        if (ret) {
1642            return ret;
1643        }
1644
1645        save_section_footer(f, se);
1646    }
1647
1648    qemu_put_byte(f, QEMU_VM_EOF);
1649
1650    return qemu_file_get_error(f);
1651}
1652
1653static SaveStateEntry *find_se(const char *idstr, uint32_t instance_id)
1654{
1655    SaveStateEntry *se;
1656
1657    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1658        if (!strcmp(se->idstr, idstr) &&
1659            (instance_id == se->instance_id ||
1660             instance_id == se->alias_id))
1661            return se;
1662        /* Migrating from an older version? */
1663        if (strstr(se->idstr, idstr) && se->compat) {
1664            if (!strcmp(se->compat->idstr, idstr) &&
1665                (instance_id == se->compat->instance_id ||
1666                 instance_id == se->alias_id))
1667                return se;
1668        }
1669    }
1670    return NULL;
1671}
1672
1673enum LoadVMExitCodes {
1674    /* Allow a command to quit all layers of nested loadvm loops */
1675    LOADVM_QUIT     =  1,
1676};
1677
1678/* ------ incoming postcopy messages ------ */
1679/* 'advise' arrives before any transfers just to tell us that a postcopy
1680 * *might* happen - it might be skipped if precopy transferred everything
1681 * quickly.
1682 */
1683static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
1684                                         uint16_t len)
1685{
1686    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
1687    uint64_t remote_pagesize_summary, local_pagesize_summary, remote_tps;
1688    Error *local_err = NULL;
1689
1690    trace_loadvm_postcopy_handle_advise();
1691    if (ps != POSTCOPY_INCOMING_NONE) {
1692        error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
1693        return -1;
1694    }
1695
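    /*
     * The advise payload, when present, is two big-endian 64-bit values
     * (the source's RAM page size summary and its target page size),
     * which are validated against the local values below.  A zero-length
     * advise is sent when RAM postcopy itself is disabled (e.g. a
     * dirty-bitmap-only postcopy).
     */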
1696    switch (len) {
1697    case 0:
1698        if (migrate_postcopy_ram()) {
1699            error_report("RAM postcopy is enabled but have 0 byte advise");
1700            return -EINVAL;
1701        }
1702        return 0;
1703    case 8 + 8:
1704        if (!migrate_postcopy_ram()) {
1705            error_report("RAM postcopy is disabled but have 16 byte advise");
1706            return -EINVAL;
1707        }
1708        break;
1709    default:
1710        error_report("CMD_POSTCOPY_ADVISE invalid length (%d)", len);
1711        return -EINVAL;
1712    }
1713
1714    if (!postcopy_ram_supported_by_host(mis)) {
1715        postcopy_state_set(POSTCOPY_INCOMING_NONE);
1716        return -1;
1717    }
1718
1719    remote_pagesize_summary = qemu_get_be64(mis->from_src_file);
1720    local_pagesize_summary = ram_pagesize_summary();
1721
1722    if (remote_pagesize_summary != local_pagesize_summary)  {
1723        /*
1724         * This detects two potential causes of mismatch:
1725         *   a) A mismatch in host page sizes
1726         *      Some combinations of mismatch are probably possible but it gets
1727         *      a bit more complicated.  In particular we need to place whole
1728         *      host pages on the dest at once, and we need to ensure that we
1729         *      handle dirtying to make sure we never end up sending part of
1730         *      a hostpage on its own.
1731         *   b) The use of different huge page sizes on source/destination.
1732         *      A finer-grained test is performed during RAM block migration,
1733         *      but this test here gives a nice early, clear failure, and it
1734         *      also fails when passed to an older qemu that doesn't
1735         *      support huge pages.
1736         */
1737        error_report("Postcopy needs matching RAM page sizes (s=%" PRIx64
1738                                                             " d=%" PRIx64 ")",
1739                     remote_pagesize_summary, local_pagesize_summary);
1740        return -1;
1741    }
1742
1743    remote_tps = qemu_get_be64(mis->from_src_file);
1744    if (remote_tps != qemu_target_page_size()) {
1745        /*
1746         * Again, some differences could be dealt with, but for now keep it
1747         * simple.
1748         */
1749        error_report("Postcopy needs matching target page sizes (s=%d d=%zd)",
1750                     (int)remote_tps, qemu_target_page_size());
1751        return -1;
1752    }
1753
1754    if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_ADVISE, &local_err)) {
1755        error_report_err(local_err);
1756        return -1;
1757    }
1758
1759    if (ram_postcopy_incoming_init(mis)) {
1760        return -1;
1761    }
1762
1763    return 0;
1764}
1765
1766/* After postcopy we will be told to throw some pages away since they're
1767 * dirty and will have to be demand fetched.  Must happen before CPU is
1768 * started.
1769 * There can be 0..many of these messages, each encoding multiple pages.
1770 */
1771static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
1772                                              uint16_t len)
1773{
1774    int tmp;
1775    char ramid[256];
1776    PostcopyState ps = postcopy_state_get();
1777
1778    trace_loadvm_postcopy_ram_handle_discard();
1779
1780    switch (ps) {
1781    case POSTCOPY_INCOMING_ADVISE:
1782        /* 1st discard */
1783        tmp = postcopy_ram_prepare_discard(mis);
1784        if (tmp) {
1785            return tmp;
1786        }
1787        break;
1788
1789    case POSTCOPY_INCOMING_DISCARD:
1790        /* Expected state */
1791        break;
1792
1793    default:
1794        error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
1795                     ps);
1796        return -1;
1797    }
1798    /* We're expecting:
1799     *    a version byte (0)
1800     *    a RAM ID string (length byte, name, 0 terminator)
1801     *    then at least one 16 byte chunk
1802     */
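    /*
     * As a sketch, the payload laid out on the wire is:
     *
     *   u8  version;                         -- postcopy_ram_discard_version
     *   u8  len; char name[len]; u8 nil;     -- RAMBlock ID string
     *   { be64 start_addr; be64 length; }    -- repeated N >= 1 times
     *
     * hence the minimum length of 1 + 1 + 1 + 1 + 2 * 8 bytes below.
     */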
1803    if (len < (1 + 1 + 1 + 1 + 2 * 8)) {
1804        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1805        return -1;
1806    }
1807
1808    tmp = qemu_get_byte(mis->from_src_file);
1809    if (tmp != postcopy_ram_discard_version) {
1810        error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
1811        return -1;
1812    }
1813
1814    if (!qemu_get_counted_string(mis->from_src_file, ramid)) {
1815        error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
1816        return -1;
1817    }
1818    tmp = qemu_get_byte(mis->from_src_file);
1819    if (tmp != 0) {
1820        error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp);
1821        return -1;
1822    }
1823
1824    len -= 3 + strlen(ramid);
1825    if (len % 16) {
1826        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1827        return -1;
1828    }
1829    trace_loadvm_postcopy_ram_handle_discard_header(ramid, len);
1830    while (len) {
1831        uint64_t start_addr, block_length;
1832        start_addr = qemu_get_be64(mis->from_src_file);
1833        block_length = qemu_get_be64(mis->from_src_file);
1834
1835        len -= 16;
1836        int ret = ram_discard_range(ramid, start_addr, block_length);
1837        if (ret) {
1838            return ret;
1839        }
1840    }
1841    trace_loadvm_postcopy_ram_handle_discard_end();
1842
1843    return 0;
1844}
1845
1846/*
1847 * Triggered by a postcopy_listen command; this thread takes over reading
1848 * the input stream, leaving the main thread free to carry on loading the rest
1849 * of the device state (from RAM).
1850 * (TODO: This could do with being in a postcopy file, but then again it's
1851 * just another input loop, not that postcopy specific)
1852 */
1853static void *postcopy_ram_listen_thread(void *opaque)
1854{
1855    MigrationIncomingState *mis = migration_incoming_get_current();
1856    QEMUFile *f = mis->from_src_file;
1857    int load_res;
1858    MigrationState *migr = migrate_get_current();
1859
1860    object_ref(OBJECT(migr));
1861
1862    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
1863                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);
1864    qemu_sem_post(&mis->listen_thread_sem);
1865    trace_postcopy_ram_listen_thread_start();
1866
1867    rcu_register_thread();
1868    /*
1869     * Because we're a thread and not a coroutine we can't yield
1870     * in qemu_file, and thus we must be blocking now.
1871     */
1872    qemu_file_set_blocking(f, true);
1873    load_res = qemu_loadvm_state_main(f, mis);
1874
1875    /*
1876     * This is tricky, but mis->from_src_file can change after
1877     * qemu_loadvm_state_main() returns, if a postcopy recovery happened.
1878     * In the future, we may want a wrapper for the QEMUFile handle.
1879     */
1880    f = mis->from_src_file;
1881
1882    /* And non-blocking again so we don't block in any cleanup */
1883    qemu_file_set_blocking(f, false);
1884
1885    trace_postcopy_ram_listen_thread_exit();
1886    if (load_res < 0) {
1887        qemu_file_set_error(f, load_res);
1888        dirty_bitmap_mig_cancel_incoming();
1889        if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
1890            !migrate_postcopy_ram() && migrate_dirty_bitmaps())
1891        {
1892            error_report("%s: loadvm failed during postcopy: %d. All states "
1893                         "are migrated except dirty bitmaps. Some dirty "
1894                         "bitmaps may be lost, but the dirty bitmaps that "
1895                         "were migrated are correct and valid.",
1896                         __func__, load_res);
1897            load_res = 0; /* prevent further exit() */
1898        } else {
1899            error_report("%s: loadvm failed: %d", __func__, load_res);
1900            migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1901                                           MIGRATION_STATUS_FAILED);
1902        }
1903    }
1904    if (load_res >= 0) {
1905        /*
1906         * This looks good, but it's possible that the device loading in the
1907         * main thread hasn't finished yet, and so we might not be in 'RUN'
1908         * state yet; wait for the end of the main thread.
1909         */
1910        qemu_event_wait(&mis->main_thread_load_event);
1911    }
1912    postcopy_ram_incoming_cleanup(mis);
1913
1914    if (load_res < 0) {
1915        /*
1916         * If something went wrong then we have a bad state so exit;
1917         * depending on how far we got, it might be possible at this point
1918         * to leave the guest running and fire MCEs for pages that never
1919         * arrived as a desperate recovery step.
1920         */
1921        rcu_unregister_thread();
1922        exit(EXIT_FAILURE);
1923    }
1924
1925    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1926                                   MIGRATION_STATUS_COMPLETED);
1927    /*
1928     * If everything has worked fine, then the main thread has waited
1929     * for us to start, and we're the last user of the mis.
1930     * (If something broke then qemu will have to exit anyway since it's
1931     * got a bad migration state).
1932     */
1933    migration_incoming_state_destroy();
1934    qemu_loadvm_state_cleanup();
1935
1936    rcu_unregister_thread();
1937    mis->have_listen_thread = false;
1938    postcopy_state_set(POSTCOPY_INCOMING_END);
1939
1940    object_unref(OBJECT(migr));
1941
1942    return NULL;
1943}
1944
1945/* After this message we must be able to immediately receive postcopy data */
1946static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
1947{
1948    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
1949    trace_loadvm_postcopy_handle_listen();
1950    Error *local_err = NULL;
1951
1952    if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
1953        error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
1954        return -1;
1955    }
1956    if (ps == POSTCOPY_INCOMING_ADVISE) {
1957        /*
1958         * A rare case, we entered listen without having to do any discards,
1959         * so do the setup that's normally done at the time of the 1st discard.
1960         */
1961        if (migrate_postcopy_ram()) {
1962            postcopy_ram_prepare_discard(mis);
1963        }
1964    }
1965
1966    /*
1967     * Sensitise RAM - it can now generate requests for blocks that don't
1968     * exist.  However, at this point the CPU shouldn't be running, and the
1969     * IO shouldn't be doing anything yet, so don't actually expect requests.
1970     */
1971    if (migrate_postcopy_ram()) {
1972        if (postcopy_ram_incoming_setup(mis)) {
1973            postcopy_ram_incoming_cleanup(mis);
1974            return -1;
1975        }
1976    }
1977
1978    if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_LISTEN, &local_err)) {
1979        error_report_err(local_err);
1980        return -1;
1981    }
1982
1983    mis->have_listen_thread = true;
1984    /* Start up the listening thread and wait for it to signal ready */
1985    qemu_sem_init(&mis->listen_thread_sem, 0);
1986    qemu_thread_create(&mis->listen_thread, "postcopy/listen",
1987                       postcopy_ram_listen_thread, NULL,
1988                       QEMU_THREAD_DETACHED);
1989    qemu_sem_wait(&mis->listen_thread_sem);
1990    qemu_sem_destroy(&mis->listen_thread_sem);
1991
1992    return 0;
1993}
1994
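/*
 * Bottom half scheduled by loadvm_postcopy_handle_run(); runs in the main
 * loop once the RUN command has been processed.  It synchronises the CPU
 * state loaded from the stream, announces the guest on the network,
 * revalidates the block layer's cached metadata and then either starts
 * the CPUs or leaves the VM paused for management to start it.
 */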
1995static void loadvm_postcopy_handle_run_bh(void *opaque)
1996{
1997    Error *local_err = NULL;
1998    MigrationIncomingState *mis = opaque;
1999
2000    /* TODO: we should move all of this lot into postcopy_ram.c or shared code
2001     * in migration.c
2002     */
2003    cpu_synchronize_all_post_init();
2004
2005    qemu_announce_self(&mis->announce_timer, migrate_announce_params());
2006
2007    /* Make sure all file formats flush their mutable metadata.
2008     * If we get an error here, just don't restart the VM yet. */
2009    bdrv_invalidate_cache_all(&local_err);
2010    if (local_err) {
2011        error_report_err(local_err);
2012        local_err = NULL;
2013        autostart = false;
2014    }
2015
2016    trace_loadvm_postcopy_handle_run_cpu_sync();
2017
2018    trace_loadvm_postcopy_handle_run_vmstart();
2019
2020    dirty_bitmap_mig_before_vm_start();
2021
2022    if (autostart) {
2023        /* Hold onto your hats, starting the CPU */
2024        vm_start();
2025    } else {
2026        /* leave it paused and let management decide when to start the CPU */
2027        runstate_set(RUN_STATE_PAUSED);
2028    }
2029
2030    qemu_bh_delete(mis->bh);
2031}
2032
2033/* After all discards we can start running and asking for pages */
2034static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
2035{
2036    PostcopyState ps = postcopy_state_get();
2037
2038    trace_loadvm_postcopy_handle_run();
2039    if (ps != POSTCOPY_INCOMING_LISTENING) {
2040        error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
2041        return -1;
2042    }
2043
2044    postcopy_state_set(POSTCOPY_INCOMING_RUNNING);
2045    mis->bh = qemu_bh_new(loadvm_postcopy_handle_run_bh, mis);
2046    qemu_bh_schedule(mis->bh);
2047
2048    /* We need to finish reading the stream from the package
2049     * and also stop reading anything more from the stream that loaded the
2050     * package (since it's now being read by the listener thread).
2051     * LOADVM_QUIT will quit all the layers of nested loadvm loops.
2052     */
2053    return LOADVM_QUIT;
2054}
2055
2056/* Must be called with the page_request_mutex held */
2057static gboolean postcopy_sync_page_req(gpointer key, gpointer value,
2058                                       gpointer data)
2059{
2060    MigrationIncomingState *mis = data;
2061    void *host_addr = (void *) key;
2062    ram_addr_t rb_offset;
2063    RAMBlock *rb;
2064    int ret;
2065
2066    rb = qemu_ram_block_from_host(host_addr, true, &rb_offset);
2067    if (!rb) {
2068        /*
2069         * This should _never_ happen.  However be nice for a migrating VM to
2070         * not crash/assert.  Post an error (note: intended to not use *_once
2071         * because we do want to see all the illegal addresses; and this can
2072         * never be triggered by the guest so we're safe) and move on.
2073         */
2074        error_report("%s: illegal host addr %p", __func__, host_addr);
2075        /* Try the next entry */
2076        return FALSE;
2077    }
2078
2079    ret = migrate_send_rp_message_req_pages(mis, rb, rb_offset);
2080    if (ret) {
2081        /* Please refer to above comment. */
2082        error_report("%s: send rp message failed for addr %p",
2083                     __func__, host_addr);
2084        return FALSE;
2085    }
2086
2087    trace_postcopy_page_req_sync(host_addr);
2088
2089    return FALSE;
2090}
2091
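/*
 * Re-send a page request to the source for every faulted address still
 * recorded in mis->page_requested.  The lock guard below takes the
 * page_request_mutex that postcopy_sync_page_req() expects to be held.
 */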
2092static void migrate_send_rp_req_pages_pending(MigrationIncomingState *mis)
2093{
2094    WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
2095        g_tree_foreach(mis->page_requested, postcopy_sync_page_req, mis);
2096    }
2097}
2098
2099static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis)
2100{
2101    if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
2102        error_report("%s: illegal resume received", __func__);
2103        /* Don't fail the whole load just because of this. */
2104        return 0;
2105    }
2106
2107    /*
2108     * Reset the last_rb before we resend any page req to source again, since
2109     * the source should have it reset already.
2110     */
2111    mis->last_rb = NULL;
2112
2113    /*
2114     * This means the source VM is ready to resume the postcopy migration.
2115     */
2116    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
2117                      MIGRATION_STATUS_POSTCOPY_ACTIVE);
2118
2119    trace_loadvm_postcopy_handle_resume();
2120
2121    /* Tell source that "we are ready" */
2122    migrate_send_rp_resume_ack(mis, MIGRATION_RESUME_ACK_VALUE);
2123
2124    /*
2125     * After a postcopy recovery, the source should have lost the postcopy
2126     * queue, or potentially the requested pages could have been lost during
2127     * the network down phase.  Let's re-sync with the source VM by re-sending
2128     * all the pending pages that we eagerly need, so these threads won't get
2129     * blocked too long due to the recovery.
2130     *
2131     * Without this procedure, the faulted destination VM threads (waiting for
2132     * page requests right before the postcopy was interrupted) can keep hanging
2133     * until the pages are sent by the source during the background copying of
2134     * pages, or until another thread accidentally faults on the same address.
2135     */
2136    migrate_send_rp_req_pages_pending(mis);
2137
2138    /*
2139     * It's time to switch state and release the fault thread to continue
2140     * servicing page faults.  Note that this must happen after the
2141     * above call to migrate_send_rp_req_pages_pending().  In short:
2142     * migrate_send_rp_message_req_pages() is not thread safe, yet.
2143     */
2144    qemu_sem_post(&mis->postcopy_pause_sem_fault);
2145
2146    return 0;
2147}
2148
2149/**
2150 * Immediately following this command is a blob of data containing an embedded
2151 * chunk of migration stream; read it and load it.
2152 *
2153 * @mis: Incoming state
2154 * (the length of the packaged data is read from the stream itself)
2155 *
2156 * Returns: Negative values on error, otherwise 0 or LOADVM_QUIT
2157 *
2158 */
2159static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
2160{
2161    int ret;
2162    size_t length;
2163    QIOChannelBuffer *bioc;
2164
2165    length = qemu_get_be32(mis->from_src_file);
2166    trace_loadvm_handle_cmd_packaged(length);
2167
2168    if (length > MAX_VM_CMD_PACKAGED_SIZE) {
2169        error_report("Unreasonably large packaged state: %zu", length);
2170        return -1;
2171    }
2172
2173    bioc = qio_channel_buffer_new(length);
2174    qio_channel_set_name(QIO_CHANNEL(bioc), "migration-loadvm-buffer");
2175    ret = qemu_get_buffer(mis->from_src_file,
2176                          bioc->data,
2177                          length);
2178    if (ret != length) {
2179        object_unref(OBJECT(bioc));
2180        error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%zu",
2181                     ret, length);
2182        return (ret < 0) ? ret : -EAGAIN;
2183    }
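    /*
     * The data was read straight into bioc->data, so bump the buffer's
     * usage so that the packaged stream is visible when the channel is
     * read back below.
     */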
2184    bioc->usage += length;
2185    trace_loadvm_handle_cmd_packaged_received(ret);
2186
2187    QEMUFile *packf = qemu_fopen_channel_input(QIO_CHANNEL(bioc));
2188
2189    ret = qemu_loadvm_state_main(packf, mis);
2190    trace_loadvm_handle_cmd_packaged_main(ret);
2191    qemu_fclose(packf);
2192    object_unref(OBJECT(bioc));
2193
2194    return ret;
2195}
2196
2197/*
2198 * Handle a request from the source for the recved_bitmap of a RAMBlock
2199 * on the destination. Payload format:
2200 *
2201 * len (1 byte) + ramblock_name (<255 bytes)
2202 */
2203static int loadvm_handle_recv_bitmap(MigrationIncomingState *mis,
2204                                     uint16_t len)
2205{
2206    QEMUFile *file = mis->from_src_file;
2207    RAMBlock *rb;
2208    char block_name[256];
2209    size_t cnt;
2210
2211    cnt = qemu_get_counted_string(file, block_name);
2212    if (!cnt) {
2213        error_report("%s: failed to read block name", __func__);
2214        return -EINVAL;
2215    }
2216
2217    /* Validate before using the data */
2218    if (qemu_file_get_error(file)) {
2219        return qemu_file_get_error(file);
2220    }
2221
2222    if (len != cnt + 1) {
2223        error_report("%s: invalid payload length (%d)", __func__, len);
2224        return -EINVAL;
2225    }
2226
2227    rb = qemu_ram_block_by_name(block_name);
2228    if (!rb) {
2229        error_report("%s: block '%s' not found", __func__, block_name);
2230        return -EINVAL;
2231    }
2232
2233    migrate_send_rp_recv_bitmap(mis, block_name);
2234
2235    trace_loadvm_handle_recv_bitmap(block_name);
2236
2237    return 0;
2238}
2239
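/*
 * Enable COLO on the incoming side and allocate the RAM cache used to
 * stage incoming pages; COLO is switched back off if the cache cannot
 * be initialised.
 */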
2240static int loadvm_process_enable_colo(MigrationIncomingState *mis)
2241{
2242    int ret = migration_incoming_enable_colo();
2243
2244    if (!ret) {
2245        ret = colo_init_ram_cache();
2246        if (ret) {
2247            migration_incoming_disable_colo();
2248        }
2249    }
2250    return ret;
2251}
2252
2253/*
2254 * Process an incoming 'QEMU_VM_COMMAND'.  Returns:
2255 *   0            just a normal return
2256 *   LOADVM_QUIT  all good, but exit the loop
2257 *   <0           error
2258 */
2259static int loadvm_process_command(QEMUFile *f)
2260{
2261    MigrationIncomingState *mis = migration_incoming_get_current();
2262    uint16_t cmd;
2263    uint16_t len;
2264    uint32_t tmp32;
2265
2266    cmd = qemu_get_be16(f);
2267    len = qemu_get_be16(f);
2268
2269    /* Check validity before continuing to process commands */
2270    if (qemu_file_get_error(f)) {
2271        return qemu_file_get_error(f);
2272    }
2273
2274    trace_loadvm_process_command(cmd, len);
2275    if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
2276        error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
2277        return -EINVAL;
2278    }
2279
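    /*
     * mig_cmd_args[] records the payload length expected for each command;
     * a recorded length of -1 means the command takes a variable-length
     * payload, which is validated by the handler itself.
     */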
2280    if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {
2281        error_report("%s received with bad length - expecting %zu, got %d",
2282                     mig_cmd_args[cmd].name,
2283                     (size_t)mig_cmd_args[cmd].len, len);
2284        return -ERANGE;
2285    }
2286
2287    switch (cmd) {
2288    case MIG_CMD_OPEN_RETURN_PATH:
2289        if (mis->to_src_file) {
2290            error_report("CMD_OPEN_RETURN_PATH called when RP already open");
2291            /* Not really a problem, so don't give up */
2292            return 0;
2293        }
2294        mis->to_src_file = qemu_file_get_return_path(f);
2295        if (!mis->to_src_file) {
2296            error_report("CMD_OPEN_RETURN_PATH failed");
2297            return -1;
2298        }
2299        break;
2300
2301    case MIG_CMD_PING:
2302        tmp32 = qemu_get_be32(f);
2303        trace_loadvm_process_command_ping(tmp32);
2304        if (!mis->to_src_file) {
2305            error_report("CMD_PING (0x%x) received with no return path",
2306                         tmp32);
2307            return -1;
2308        }
2309        migrate_send_rp_pong(mis, tmp32);
2310        break;
2311
2312    case MIG_CMD_PACKAGED:
2313        return loadvm_handle_cmd_packaged(mis);
2314
2315    case MIG_CMD_POSTCOPY_ADVISE:
2316        return loadvm_postcopy_handle_advise(mis, len);
2317
2318    case MIG_CMD_POSTCOPY_LISTEN:
2319        return loadvm_postcopy_handle_listen(mis);
2320
2321    case MIG_CMD_POSTCOPY_RUN:
2322        return loadvm_postcopy_handle_run(mis);
2323
2324    case MIG_CMD_POSTCOPY_RAM_DISCARD:
2325        return loadvm_postcopy_ram_handle_discard(mis, len);
2326
2327    case MIG_CMD_POSTCOPY_RESUME:
2328        return loadvm_postcopy_handle_resume(mis);
2329
2330    case MIG_CMD_RECV_BITMAP:
2331        return loadvm_handle_recv_bitmap(mis, len);
2332
2333    case MIG_CMD_ENABLE_COLO:
2334        return loadvm_process_enable_colo(mis);
2335    }
2336
2337    return 0;
2338}
2339
2340/*
2341 * Read a footer off the wire and check that it matches the expected section
2342 *
2343 * Returns: true if the footer was good
2344 *          false if there is a problem (and calls error_report to say why)
2345 */
2346static bool check_section_footer(QEMUFile *f, SaveStateEntry *se)
2347{
2348    int ret;
2349    uint8_t read_mark;
2350    uint32_t read_section_id;
2351
2352    if (!migrate_get_current()->send_section_footer) {
2353        /* No footer to check */
2354        return true;
2355    }
2356
2357    read_mark = qemu_get_byte(f);
2358
2359    ret = qemu_file_get_error(f);
2360    if (ret) {
2361        error_report("%s: Read section footer failed: %d",
2362                     __func__, ret);
2363        return false;
2364    }
2365
2366    if (read_mark != QEMU_VM_SECTION_FOOTER) {
2367        error_report("Missing section footer for %s", se->idstr);
2368        return false;
2369    }
2370
2371    read_section_id = qemu_get_be32(f);
2372    if (read_section_id != se->load_section_id) {
2373        error_report("Mismatched section id in footer for %s -"
2374                     " read 0x%x expected 0x%x",
2375                     se->idstr, read_section_id, se->load_section_id);
2376        return false;
2377    }
2378
2379    /* All good */
2380    return true;
2381}
2382
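/*
 * Load a SECTION_START or SECTION_FULL section.  The section header on
 * the wire is, as a sketch:
 *
 *   be32 section_id
 *   u8   len; char idstr[len]
 *   be32 instance_id
 *   be32 version_id
 *
 * followed by the vmstate data and, when the source sends them, a footer
 * that is validated by check_section_footer().
 */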
2383static int
2384qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis)
2385{
2386    uint32_t instance_id, version_id, section_id;
2387    SaveStateEntry *se;
2388    char idstr[256];
2389    int ret;
2390
2391    /* Read section start */
2392    section_id = qemu_get_be32(f);
2393    if (!qemu_get_counted_string(f, idstr)) {
2394        error_report("Unable to read ID string for section %u",
2395                     section_id);
2396        return -EINVAL;
2397    }
2398    instance_id = qemu_get_be32(f);
2399    version_id = qemu_get_be32(f);
2400
2401    ret = qemu_file_get_error(f);
2402    if (ret) {
2403        error_report("%s: Failed to read instance/version ID: %d",
2404                     __func__, ret);
2405        return ret;
2406    }
2407
2408    trace_qemu_loadvm_state_section_startfull(section_id, idstr,
2409            instance_id, version_id);
2410    /* Find savevm section */
2411    se = find_se(idstr, instance_id);
2412    if (se == NULL) {
2413        error_report("Unknown savevm section or instance '%s' %"PRIu32". "
2414                     "Make sure that your current VM setup matches your "
2415                     "saved VM setup, including any hotplugged devices",
2416                     idstr, instance_id);
2417        return -EINVAL;
2418    }
2419
2420    /* Validate version */
2421    if (version_id > se->version_id) {
2422        error_report("savevm: unsupported version %d for '%s' v%d",
2423                     version_id, idstr, se->version_id);
2424        return -EINVAL;
2425    }
2426    se->load_version_id = version_id;
2427    se->load_section_id = section_id;
2428
2429    /* On Xen the RAM state is loaded by the toolstack, so reject RAM here */
2430    if (xen_enabled() && se->is_ram) {
2431        error_report("loadvm: %s RAM loading not allowed on Xen", idstr);
2432        return -EINVAL;
2433    }
2434
2435    ret = vmstate_load(f, se);
2436    if (ret < 0) {
2437        error_report("error while loading state for instance 0x%"PRIx32" of"
2438                     " device '%s'", instance_id, idstr);
2439        return ret;
2440    }
2441    if (!check_section_footer(f, se)) {
2442        return -EINVAL;
2443    }
2444
2445    return 0;
2446}
2447
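/*
 * Load a SECTION_PART or SECTION_END section.  On the wire this is just a
 * be32 section id, which must match a section already started by a
 * SECTION_START, followed by more vmstate data and the optional footer.
 */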
2448static int
2449qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis)
2450{
2451    uint32_t section_id;
2452    SaveStateEntry *se;
2453    int ret;
2454
2455    section_id = qemu_get_be32(f);
2456
2457    ret = qemu_file_get_error(f);
2458    if (ret) {
2459        error_report("%s: Failed to read section ID: %d",
2460                     __func__, ret);
2461        return ret;
2462    }
2463
2464    trace_qemu_loadvm_state_section_partend(section_id);
2465    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2466        if (se->load_section_id == section_id) {
2467            break;
2468        }
2469    }
2470    if (se == NULL) {
2471        error_report("Unknown savevm section %d", section_id);
2472        return -EINVAL;
2473    }
2474
2475    ret = vmstate_load(f, se);
2476    if (ret < 0) {
2477        error_report("error while loading state section id %d(%s)",
2478                     section_id, se->idstr);
2479        return ret;
2480    }
2481    if (!check_section_footer(f, se)) {
2482        return -EINVAL;
2483    }
2484
2485    return 0;
2486}
2487
2488static int qemu_loadvm_state_header(QEMUFile *f)
2489{
2490    unsigned int v;
2491    int ret;
2492
2493    v = qemu_get_be32(f);
2494    if (v != QEMU_VM_FILE_MAGIC) {
2495        error_report("Not a migration stream");
2496        return -EINVAL;
2497    }
2498
2499    v = qemu_get_be32(f);
2500    if (v == QEMU_VM_FILE_VERSION_COMPAT) {
2501        error_report("SaveVM v2 format is obsolete and don't work anymore");
2502        return -ENOTSUP;
2503    }
2504    if (v != QEMU_VM_FILE_VERSION) {
2505        error_report("Unsupported migration stream version");
2506        return -ENOTSUP;
2507    }
2508
2509    if (migrate_get_current()->send_configuration) {
2510        if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
2511            error_report("Configuration section missing");
2512            qemu_loadvm_state_cleanup();
2513            return -EINVAL;
2514        }
2515        ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
2516
2517        if (ret) {
2518            qemu_loadvm_state_cleanup();
2519            return ret;
2520        }
2521    }
2522    return 0;
2523}
2524
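/*
 * Run the load_setup hook of every active handler so they can allocate
 * resources before the first section arrives.  A failure is recorded on
 * @f and aborts the load.
 */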
2525static int qemu_loadvm_state_setup(QEMUFile *f)
2526{
2527    SaveStateEntry *se;
2528    int ret;
2529
2530    trace_loadvm_state_setup();
2531    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2532        if (!se->ops || !se->ops->load_setup) {
2533            continue;
2534        }
2535        if (se->ops->is_active) {
2536            if (!se->ops->is_active(se->opaque)) {
2537                continue;
2538            }
2539        }
2540
2541        ret = se->ops->load_setup(f, se->opaque);
2542        if (ret < 0) {
2543            qemu_file_set_error(f, ret);
2544            error_report("Load state of device %s failed", se->idstr);
2545            return ret;
2546        }
2547    }
2548    return 0;
2549}
2550
2551void qemu_loadvm_state_cleanup(void)
2552{
2553    SaveStateEntry *se;
2554
2555    trace_loadvm_state_cleanup();
2556    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2557        if (se->ops && se->ops->load_cleanup) {
2558            se->ops->load_cleanup(se->opaque);
2559        }
2560    }
2561}
2562
2563/* Return true if we should continue the migration, false otherwise. */
2564static bool postcopy_pause_incoming(MigrationIncomingState *mis)
2565{
2566    trace_postcopy_pause_incoming();
2567
2568    assert(migrate_postcopy_ram());
2569
2570    /* Clear the triggered bit to allow one recovery */
2571    mis->postcopy_recover_triggered = false;
2572
2573    /*
2574     * Unregistering yank with either the from or the to src file would
2575     * work, since the ioc behind them is the same.
2576     */
2577    migration_ioc_unregister_yank_from_file(mis->from_src_file);
2578
2579    assert(mis->from_src_file);
2580    qemu_file_shutdown(mis->from_src_file);
2581    qemu_fclose(mis->from_src_file);
2582    mis->from_src_file = NULL;
2583
2584    assert(mis->to_src_file);
2585    qemu_file_shutdown(mis->to_src_file);
2586    qemu_mutex_lock(&mis->rp_mutex);
2587    qemu_fclose(mis->to_src_file);
2588    mis->to_src_file = NULL;
2589    qemu_mutex_unlock(&mis->rp_mutex);
2590
2591    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
2592                      MIGRATION_STATUS_POSTCOPY_PAUSED);
2593
2594    /* Notify the fault thread for the invalidated file handle */
2595    postcopy_fault_thread_notify(mis);
2596
2597    error_report("Detected IO failure for postcopy. "
2598                 "Migration paused.");
2599
2600    while (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
2601        qemu_sem_wait(&mis->postcopy_pause_sem_dst);
2602    }
2603
2604    trace_postcopy_pause_incoming_continued();
2605
2606    return true;
2607}
2608
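/*
 * Main incoming loop: read a section-type byte and dispatch on it until
 * QEMU_VM_EOF, an error, or a command returning LOADVM_QUIT.  If an error
 * hits while a RAM postcopy is running, the load pauses and, after a
 * successful recovery, retries on the newly established channel.
 */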
2609int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
2610{
2611    uint8_t section_type;
2612    int ret = 0;
2613
2614retry:
2615    while (true) {
2616        section_type = qemu_get_byte(f);
2617
2618        if (qemu_file_get_error(f)) {
2619            ret = qemu_file_get_error(f);
2620            break;
2621        }
2622
2623        trace_qemu_loadvm_state_section(section_type);
2624        switch (section_type) {
2625        case QEMU_VM_SECTION_START:
2626        case QEMU_VM_SECTION_FULL:
2627            ret = qemu_loadvm_section_start_full(f, mis);
2628            if (ret < 0) {
2629                goto out;
2630            }
2631            break;
2632        case QEMU_VM_SECTION_PART:
2633        case QEMU_VM_SECTION_END:
2634            ret = qemu_loadvm_section_part_end(f, mis);
2635            if (ret < 0) {
2636                goto out;
2637            }
2638            break;
2639        case QEMU_VM_COMMAND:
2640            ret = loadvm_process_command(f);
2641            trace_qemu_loadvm_state_section_command(ret);
2642            if ((ret < 0) || (ret == LOADVM_QUIT)) {
2643                goto out;
2644            }
2645            break;
2646        case QEMU_VM_EOF:
2647            /* This is the end of migration */
2648            goto out;
2649        default:
2650            error_report("Unknown savevm section type %d", section_type);
2651            ret = -EINVAL;
2652            goto out;
2653        }
2654    }
2655
2656out:
2657    if (ret < 0) {
2658        qemu_file_set_error(f, ret);
2659
2660        /* Cancel bitmaps incoming regardless of recovery */
2661        dirty_bitmap_mig_cancel_incoming();
2662
2663        /*
2664         * If we are in an active postcopy, then we pause instead of
2665         * bailing out, to at least keep the VM's dirty data.  Note
2666         * that the POSTCOPY_INCOMING_LISTENING stage is still not enough,
2667         * since during it we're still receiving device states and we
2668         * haven't yet started the VM on the destination.
2669         *
2670         * Only RAM postcopy supports recovery. Still, if RAM postcopy is
2671         * enabled, a canceled bitmaps postcopy will not affect RAM postcopy
2672         * recovery.
2673         */
2674        if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
2675            migrate_postcopy_ram() && postcopy_pause_incoming(mis)) {
2676            /* Reset f to point to the newly created channel */
2677            f = mis->from_src_file;
2678            goto retry;
2679        }
2680    }
2681    return ret;
2682}
2683
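/*
 * Load a complete VM state from @f: check migration blockers, validate
 * the stream header, run the per-device setup hooks, process all the
 * sections, and finally drain the optional VM description section so
 * that transports such as RDMA aren't left with unread data.
 */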
2684int qemu_loadvm_state(QEMUFile *f)
2685{
2686    MigrationIncomingState *mis = migration_incoming_get_current();
2687    Error *local_err = NULL;
2688    int ret;
2689
2690    if (qemu_savevm_state_blocked(&local_err)) {
2691        error_report_err(local_err);
2692        return -EINVAL;
2693    }
2694
2695    ret = qemu_loadvm_state_header(f);
2696    if (ret) {
2697        return ret;
2698    }
2699
2700    if (qemu_loadvm_state_setup(f) != 0) {
2701        return -EINVAL;
2702    }
2703
2704    cpu_synchronize_all_pre_loadvm();
2705
2706    ret = qemu_loadvm_state_main(f, mis);
2707    qemu_event_set(&mis->main_thread_load_event);
2708
2709    trace_qemu_loadvm_state_post_main(ret);
2710
2711    if (mis->have_listen_thread) {
2712        /* Listen thread still going, can't clean up yet */
2713        return ret;
2714    }
2715
2716    if (ret == 0) {
2717        ret = qemu_file_get_error(f);
2718    }
2719
2720    /*
2721     * Try to read in the VMDESC section as well, so that dumping tools that
2722     * intercept our migration stream have the chance to see it.
2723     */
2724
2725    /* We've got to be careful; if we don't read the data and just shut the fd
2726     * then the sender can get an error if we close while it's still sending.
2727     * We also mustn't read data that isn't there; some transports (RDMA)
2728     * will stall waiting for that data when the source has already closed.
2729     */
2730    if (ret == 0 && should_send_vmdesc()) {
2731        uint8_t *buf;
2732        uint32_t size;
2733        uint8_t  section_type = qemu_get_byte(f);
2734
2735        if (section_type != QEMU_VM_VMDESCRIPTION) {
2736            error_report("Expected vmdescription section, but got %d",
2737                         section_type);
2738            /*
2739             * It doesn't seem worth failing at this point since
2740             * we apparently have an otherwise valid VM state
2741             */
2742        } else {
2743            buf = g_malloc(0x1000);
2744            size = qemu_get_be32(f);
2745
2746            while (size > 0) {
2747                uint32_t read_chunk = MIN(size, 0x1000);
2748                qemu_get_buffer(f, buf, read_chunk);
2749                size -= read_chunk;
2750            }
2751            g_free(buf);
2752        }
2753    }
2754
2755    qemu_loadvm_state_cleanup();
2756    cpu_synchronize_all_post_init();
2757
2758    return ret;
2759}
2760
2761int qemu_load_device_state(QEMUFile *f)
2762{
2763    MigrationIncomingState *mis = migration_incoming_get_current();
2764    int ret;
2765
2766    /* Load QEMU_VM_SECTION_FULL section */
2767    ret = qemu_loadvm_state_main(f, mis);
2768    if (ret < 0) {
2769        error_report("Failed to load device state: %d", ret);
2770        return ret;
2771    }
2772
2773    cpu_synchronize_all_post_init();
2774    return 0;
2775}
2776
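/*
 * Take an internal snapshot: after checking for migration blockers and
 * snapshot support, the VM is stopped, its state is written through the
 * vmstate block device with qemu_savevm_state(), and a disk snapshot of
 * the same name is created on every selected block device; the VM is
 * restarted afterwards if it was running.
 */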
2777bool save_snapshot(const char *name, bool overwrite, const char *vmstate,
2778                  bool has_devices, strList *devices, Error **errp)
2779{
2780    BlockDriverState *bs;
2781    QEMUSnapshotInfo sn1, *sn = &sn1;
2782    int ret = -1, ret2;
2783    QEMUFile *f;
2784    int saved_vm_running;
2785    uint64_t vm_state_size;
2786    g_autoptr(GDateTime) now = g_date_time_new_now_local();
2787    AioContext *aio_context;
2788
2789    if (migration_is_blocked(errp)) {
2790        return false;
2791    }
2792
2793    if (!replay_can_snapshot()) {
2794        error_setg(errp, "Record/replay does not allow making snapshot "
2795                   "right now. Try once more later.");
2796        return false;
2797    }
2798
2799    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
2800        return false;
2801    }
2802
2803    /* Delete old snapshots of the same name */
2804    if (name) {
2805        if (overwrite) {
2806            if (bdrv_all_delete_snapshot(name, has_devices,
2807                                         devices, errp) < 0) {
2808                return false;
2809            }
2810        } else {
2811            ret2 = bdrv_all_has_snapshot(name, has_devices, devices, errp);
2812            if (ret2 < 0) {
2813                return false;
2814            }
2815            if (ret2 == 1) {
2816                error_setg(errp,
2817                           "Snapshot '%s' already exists in one or more devices",
2818                           name);
2819                return false;
2820            }
2821        }
2822    }
2823
2824    bs = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
2825    if (bs == NULL) {
2826        return false;
2827    }
2828    aio_context = bdrv_get_aio_context(bs);
2829
2830    saved_vm_running = runstate_is_running();
2831
2832    ret = global_state_store();
2833    if (ret) {
2834        error_setg(errp, "Error saving global state");
2835        return false;
2836    }
2837    vm_stop(RUN_STATE_SAVE_VM);
2838
2839    bdrv_drain_all_begin();
2840
2841    aio_context_acquire(aio_context);
2842
2843    memset(sn, 0, sizeof(*sn));
2844
2845    /* fill auxiliary fields */
2846    sn->date_sec = g_date_time_to_unix(now);
2847    sn->date_nsec = g_date_time_get_microsecond(now) * 1000;
2848    sn->vm_clock_nsec = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
2849    if (replay_mode != REPLAY_MODE_NONE) {
2850        sn->icount = replay_get_current_icount();
2851    } else {
2852        sn->icount = -1ULL;
2853    }
2854
2855    if (name) {
2856        pstrcpy(sn->name, sizeof(sn->name), name);
2857    } else {
2858        g_autofree char *autoname = g_date_time_format(now, "vm-%Y%m%d%H%M%S");
2859        pstrcpy(sn->name, sizeof(sn->name), autoname);
2860    }
2861
2862    /* save the VM state */
2863    f = qemu_fopen_bdrv(bs, 1);
2864    if (!f) {
2865        error_setg(errp, "Could not open VM state file");
2866        goto the_end;
2867    }
2868    ret = qemu_savevm_state(f, errp);
2869    vm_state_size = qemu_ftell(f);
2870    ret2 = qemu_fclose(f);
2871    if (ret < 0) {
2872        goto the_end;
2873    }
2874    if (ret2 < 0) {
2875        ret = ret2;
2876        goto the_end;
2877    }
2878
2879    /* The bdrv_all_create_snapshot() call that follows acquires the AioContext
2880     * for itself.  BDRV_POLL_WHILE() does not support nested locking because
2881     * it only releases the lock once.  Therefore synchronous I/O will deadlock
2882     * unless we release the AioContext before bdrv_all_create_snapshot().
2883     */
2884    aio_context_release(aio_context);
2885    aio_context = NULL;
2886
2887    ret = bdrv_all_create_snapshot(sn, bs, vm_state_size,
2888                                   has_devices, devices, errp);
2889    if (ret < 0) {
2890        bdrv_all_delete_snapshot(sn->name, has_devices, devices, NULL);
2891        goto the_end;
2892    }
2893
2894    ret = 0;
2895
2896 the_end:
2897    if (aio_context) {
2898        aio_context_release(aio_context);
2899    }
2900
2901    bdrv_drain_all_end();
2902
2903    if (saved_vm_running) {
2904        vm_start();
2905    }
2906    return ret == 0;
2907}
2908
2909void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live,
2910                                Error **errp)
2911{
2912    QEMUFile *f;
2913    QIOChannelFile *ioc;
2914    int saved_vm_running;
2915    int ret;
2916
2917    if (!has_live) {
2918        /* live defaults to true so old versions of the Xen tool stack can
2919         * have a successful live migration */
2920        live = true;
2921    }
2922
2923    saved_vm_running = runstate_is_running();
2924    vm_stop(RUN_STATE_SAVE_VM);
2925    global_state_store_running();
2926
2927    ioc = qio_channel_file_new_path(filename, O_WRONLY | O_CREAT | O_TRUNC,
2928                                    0660, errp);
2929    if (!ioc) {
2930        goto the_end;
2931    }
2932    qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-save-state");
2933    f = qemu_fopen_channel_output(QIO_CHANNEL(ioc));
2934    object_unref(OBJECT(ioc));
2935    ret = qemu_save_device_state(f);
2936    if (ret < 0 || qemu_fclose(f) < 0) {
2937        error_setg(errp, QERR_IO_ERROR);
2938    } else {
2939        /* libxl calls the QMP command "stop" before calling
2940         * "xen-save-devices-state" and in case of migration failure, libxl
2941         * would call "cont".
2942         * So call bdrv_inactivate_all (release locks) here to let the other
2943         * side of the migration take control of the images.
2944         */
2945        if (live && !saved_vm_running) {
2946            ret = bdrv_inactivate_all();
2947            if (ret) {
2948                error_setg(errp, "%s: bdrv_inactivate_all() failed (%d)",
2949                           __func__, ret);
2950            }
2951        }
2952    }
2953
2954 the_end:
2955    if (saved_vm_running) {
2956        vm_start();
2957    }
2958}
2959
2960void qmp_xen_load_devices_state(const char *filename, Error **errp)
2961{
2962    QEMUFile *f;
2963    QIOChannelFile *ioc;
2964    int ret;
2965
2966    /* Guest must be paused before loading the device state; the RAM state
2967     * will already have been loaded by xc
2968     */
2969    if (runstate_is_running()) {
2970        error_setg(errp, "Cannot update device state while vm is running");
2971        return;
2972    }
2973    vm_stop(RUN_STATE_RESTORE_VM);
2974
2975    ioc = qio_channel_file_new_path(filename, O_RDONLY | O_BINARY, 0, errp);
2976    if (!ioc) {
2977        return;
2978    }
2979    qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-load-state");
2980    f = qemu_fopen_channel_input(QIO_CHANNEL(ioc));
2981    object_unref(OBJECT(ioc));
2982
2983    ret = qemu_loadvm_state(f);
2984    qemu_fclose(f);
2985    if (ret < 0) {
2986        error_setg(errp, QERR_IO_ERROR);
2987    }
2988    migration_incoming_state_destroy();
2989}
2990
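/*
 * Revert to an internal snapshot: every selected block device is moved to
 * the snapshot, the machine is reset, and the saved VM state is read back
 * from the vmstate block device through the incoming migration machinery.
 */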
2991bool load_snapshot(const char *name, const char *vmstate,
2992                   bool has_devices, strList *devices, Error **errp)
2993{
2994    BlockDriverState *bs_vm_state;
2995    QEMUSnapshotInfo sn;
2996    QEMUFile *f;
2997    int ret;
2998    AioContext *aio_context;
2999    MigrationIncomingState *mis = migration_incoming_get_current();
3000
3001    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
3002        return false;
3003    }
3004    ret = bdrv_all_has_snapshot(name, has_devices, devices, errp);
3005    if (ret < 0) {
3006        return false;
3007    }
3008    if (ret == 0) {
3009        error_setg(errp, "Snapshot '%s' does not exist in one or more devices",
3010                   name);
3011        return false;
3012    }
3013
3014    bs_vm_state = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
3015    if (!bs_vm_state) {
3016        return false;
3017    }
3018    aio_context = bdrv_get_aio_context(bs_vm_state);
3019
3020    /* Don't even try to load empty VM states */
3021    aio_context_acquire(aio_context);
3022    ret = bdrv_snapshot_find(bs_vm_state, &sn, name);
3023    aio_context_release(aio_context);
3024    if (ret < 0) {
3025        return false;
3026    } else if (sn.vm_state_size == 0) {
3027        error_setg(errp, "This is a disk-only snapshot. Revert to it "
3028                   " offline using qemu-img");
3029        return false;
3030    }
3031
3032    /*
3033     * Flush the record/replay queue. The VM state is about to change,
3034     * so we don't need to preserve its consistency.
3035     */
3036    replay_flush_events();
3037
3038    /* Flush all IO requests so they don't interfere with the new state.  */
3039    bdrv_drain_all_begin();
3040
3041    ret = bdrv_all_goto_snapshot(name, has_devices, devices, errp);
3042    if (ret < 0) {
3043        goto err_drain;
3044    }
3045
3046    /* restore the VM state */
3047    f = qemu_fopen_bdrv(bs_vm_state, 0);
3048    if (!f) {
3049        error_setg(errp, "Could not open VM state file");
3050        goto err_drain;
3051    }
3052
3053    qemu_system_reset(SHUTDOWN_CAUSE_NONE);
3054    mis->from_src_file = f;
3055
3056    if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
3057        ret = -EINVAL;
3058        goto err_drain;
3059    }
3060    aio_context_acquire(aio_context);
3061    ret = qemu_loadvm_state(f);
3062    migration_incoming_state_destroy();
3063    aio_context_release(aio_context);
3064
3065    bdrv_drain_all_end();
3066
3067    if (ret < 0) {
3068        error_setg(errp, "Error %d while loading VM state", ret);
3069        return false;
3070    }
3071
3072    return true;
3073
3074err_drain:
3075    bdrv_drain_all_end();
3076    return false;
3077}
3078
3079bool delete_snapshot(const char *name, bool has_devices,
3080                     strList *devices, Error **errp)
3081{
3082    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
3083        return false;
3084    }
3085
3086    if (bdrv_all_delete_snapshot(name, has_devices, devices, errp) < 0) {
3087        return false;
3088    }
3089
3090    return true;
3091}
3092
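/*
 * Give @mr's RAMBlock a stable idstr (qualified by the owning device when
 * one is given) and mark it migratable, so the RAM migration code can
 * match the block between source and destination.
 */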
3093void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
3094{
3095    qemu_ram_set_idstr(mr->ram_block,
3096                       memory_region_name(mr), dev);
3097    qemu_ram_set_migratable(mr->ram_block);
3098}
3099
3100void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev)
3101{
3102    qemu_ram_unset_idstr(mr->ram_block);
3103    qemu_ram_unset_migratable(mr->ram_block);
3104}
3105
3106void vmstate_register_ram_global(MemoryRegion *mr)
3107{
3108    vmstate_register_ram(mr, NULL);
3109}
3110
3111bool vmstate_check_only_migratable(const VMStateDescription *vmsd)
3112{
3113    /* check needed if --only-migratable is specified */
3114    if (!only_migratable) {
3115        return true;
3116    }
3117
3118    return !(vmsd && vmsd->unmigratable);
3119}
3120
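/*
 * State shared between a snapshot QMP command, the coroutine that backs
 * the job, and the bottom half that performs the actual work in the main
 * loop context.
 */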
3121typedef struct SnapshotJob {
3122    Job common;
3123    char *tag;
3124    char *vmstate;
3125    strList *devices;
3126    Coroutine *co;
3127    Error **errp;
3128    bool ret;
3129} SnapshotJob;
3130
3131static void qmp_snapshot_job_free(SnapshotJob *s)
3132{
3133    g_free(s->tag);
3134    g_free(s->vmstate);
3135    qapi_free_strList(s->devices);
3136}
3137
3138
3139static void snapshot_load_job_bh(void *opaque)
3140{
3141    Job *job = opaque;
3142    SnapshotJob *s = container_of(job, SnapshotJob, common);
3143    int orig_vm_running;
3144
3145    job_progress_set_remaining(&s->common, 1);
3146
3147    orig_vm_running = runstate_is_running();
3148    vm_stop(RUN_STATE_RESTORE_VM);
3149
3150    s->ret = load_snapshot(s->tag, s->vmstate, true, s->devices, s->errp);
3151    if (s->ret && orig_vm_running) {
3152        vm_start();
3153    }
3154
3155    job_progress_update(&s->common, 1);
3156
3157    qmp_snapshot_job_free(s);
3158    aio_co_wake(s->co);
3159}
3160
3161static void snapshot_save_job_bh(void *opaque)
3162{
3163    Job *job = opaque;
3164    SnapshotJob *s = container_of(job, SnapshotJob, common);
3165
3166    job_progress_set_remaining(&s->common, 1);
3167    s->ret = save_snapshot(s->tag, false, s->vmstate,
3168                           true, s->devices, s->errp);
3169    job_progress_update(&s->common, 1);
3170
3171    qmp_snapshot_job_free(s);
3172    aio_co_wake(s->co);
3173}
3174
3175static void snapshot_delete_job_bh(void *opaque)
3176{
3177    Job *job = opaque;
3178    SnapshotJob *s = container_of(job, SnapshotJob, common);
3179
3180    job_progress_set_remaining(&s->common, 1);
3181    s->ret = delete_snapshot(s->tag, true, s->devices, s->errp);
3182    job_progress_update(&s->common, 1);
3183
3184    qmp_snapshot_job_free(s);
3185    aio_co_wake(s->co);
3186}
3187
3188static int coroutine_fn snapshot_save_job_run(Job *job, Error **errp)
3189{
3190    SnapshotJob *s = container_of(job, SnapshotJob, common);
3191    s->errp = errp;
3192    s->co = qemu_coroutine_self();
3193    aio_bh_schedule_oneshot(qemu_get_aio_context(),
3194                            snapshot_save_job_bh, job);
3195    qemu_coroutine_yield();
3196    return s->ret ? 0 : -1;
3197}
3198
3199static int coroutine_fn snapshot_load_job_run(Job *job, Error **errp)
3200{
3201    SnapshotJob *s = container_of(job, SnapshotJob, common);
3202    s->errp = errp;
3203    s->co = qemu_coroutine_self();
3204    aio_bh_schedule_oneshot(qemu_get_aio_context(),
3205                            snapshot_load_job_bh, job);
3206    qemu_coroutine_yield();
3207    return s->ret ? 0 : -1;
3208}
3209
3210static int coroutine_fn snapshot_delete_job_run(Job *job, Error **errp)
3211{
3212    SnapshotJob *s = container_of(job, SnapshotJob, common);
3213    s->errp = errp;
3214    s->co = qemu_coroutine_self();
3215    aio_bh_schedule_oneshot(qemu_get_aio_context(),
3216                            snapshot_delete_job_bh, job);
3217    qemu_coroutine_yield();
3218    return s->ret ? 0 : -1;
3219}
3220
3221
3222static const JobDriver snapshot_load_job_driver = {
3223    .instance_size = sizeof(SnapshotJob),
3224    .job_type      = JOB_TYPE_SNAPSHOT_LOAD,
3225    .run           = snapshot_load_job_run,
3226};
3227
3228static const JobDriver snapshot_save_job_driver = {
3229    .instance_size = sizeof(SnapshotJob),
3230    .job_type      = JOB_TYPE_SNAPSHOT_SAVE,
3231    .run           = snapshot_save_job_run,
3232};
3233
3234static const JobDriver snapshot_delete_job_driver = {
3235    .instance_size = sizeof(SnapshotJob),
3236    .job_type      = JOB_TYPE_SNAPSHOT_DELETE,
3237    .run           = snapshot_delete_job_run,
3238};
3239
3240
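/*
 * QMP entry points for the background snapshot jobs.  As a hypothetical
 * usage sketch (the job-id and names below are examples only):
 *
 *   { "execute": "snapshot-save",
 *     "arguments": { "job-id": "save0", "tag": "nightly",
 *                    "vmstate": "disk0", "devices": [ "disk0" ] } }
 *
 * Progress is tracked through the normal job infrastructure, and since
 * the jobs are created with JOB_MANUAL_DISMISS the caller must dismiss
 * them once they have finished.
 */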
3241void qmp_snapshot_save(const char *job_id,
3242                       const char *tag,
3243                       const char *vmstate,
3244                       strList *devices,
3245                       Error **errp)
3246{
3247    SnapshotJob *s;
3248
3249    s = job_create(job_id, &snapshot_save_job_driver, NULL,
3250                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3251                   NULL, NULL, errp);
3252    if (!s) {
3253        return;
3254    }
3255
3256    s->tag = g_strdup(tag);
3257    s->vmstate = g_strdup(vmstate);
3258    s->devices = QAPI_CLONE(strList, devices);
3259
3260    job_start(&s->common);
3261}
3262
3263void qmp_snapshot_load(const char *job_id,
3264                       const char *tag,
3265                       const char *vmstate,
3266                       strList *devices,
3267                       Error **errp)
3268{
3269    SnapshotJob *s;
3270
3271    s = job_create(job_id, &snapshot_load_job_driver, NULL,
3272                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3273                   NULL, NULL, errp);
3274    if (!s) {
3275        return;
3276    }
3277
3278    s->tag = g_strdup(tag);
3279    s->vmstate = g_strdup(vmstate);
3280    s->devices = QAPI_CLONE(strList, devices);
3281
3282    job_start(&s->common);
3283}
3284
3285void qmp_snapshot_delete(const char *job_id,
3286                         const char *tag,
3287                         strList *devices,
3288                         Error **errp)
3289{
3290    SnapshotJob *s;
3291
3292    s = job_create(job_id, &snapshot_delete_job_driver, NULL,
3293                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3294                   NULL, NULL, errp);
3295    if (!s) {
3296        return;
3297    }
3298
3299    s->tag = g_strdup(tag);
3300    s->devices = QAPI_CLONE(strList, devices);
3301
3302    job_start(&s->common);
3303}
3304