qemu/migration/savevm.c
/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2009-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "hw/boards.h"
#include "net/net.h"
#include "migration.h"
#include "migration/snapshot.h"
#include "migration/vmstate.h"
#include "migration/misc.h"
#include "migration/register.h"
#include "migration/global_state.h"
#include "ram.h"
#include "qemu-file-channel.h"
#include "qemu-file.h"
#include "savevm.h"
#include "postcopy-ram.h"
#include "qapi/error.h"
#include "qapi/qapi-commands-migration.h"
#include "qapi/qmp/json-writer.h"
#include "qapi/clone-visitor.h"
#include "qapi/qapi-builtin-visit.h"
#include "qapi/qmp/qerror.h"
#include "qemu/error-report.h"
#include "sysemu/cpus.h"
#include "exec/memory.h"
#include "exec/target_page.h"
#include "trace.h"
#include "qemu/iov.h"
#include "qemu/main-loop.h"
#include "block/snapshot.h"
#include "qemu/cutils.h"
#include "io/channel-buffer.h"
#include "io/channel-file.h"
#include "sysemu/replay.h"
#include "sysemu/runstate.h"
#include "sysemu/sysemu.h"
#include "sysemu/xen.h"
#include "migration/colo.h"
#include "qemu/bitmap.h"
#include "net/announce.h"
#include "qemu/yank.h"
#include "yank_functions.h"

const unsigned int postcopy_ram_discard_version;

/* Subcommands for QEMU_VM_COMMAND */
enum qemu_vm_cmd {
    MIG_CMD_INVALID = 0,   /* Must be 0 */
    MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
    MIG_CMD_PING,              /* Request a PONG on the RP */

    MIG_CMD_POSTCOPY_ADVISE,       /* Prior to any page transfers, just
                                      warn we might want to do postcopy */
    MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
                                      pages as it's running. */
    MIG_CMD_POSTCOPY_RUN,          /* Start execution */

    MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
                                      were previously sent during
                                      precopy but are dirty. */
    MIG_CMD_PACKAGED,          /* Send a wrapped stream within this stream */
    MIG_CMD_ENABLE_COLO,       /* Enable COLO */
    MIG_CMD_POSTCOPY_RESUME,   /* resume postcopy on dest */
    MIG_CMD_RECV_BITMAP,       /* Request the received bitmap on the dst */
    MIG_CMD_MAX
};

#define MAX_VM_CMD_PACKAGED_SIZE UINT32_MAX
static struct mig_cmd_args {
    ssize_t     len; /* -1 = variable */
    const char *name;
} mig_cmd_args[] = {
    [MIG_CMD_INVALID]          = { .len = -1, .name = "INVALID" },
    [MIG_CMD_OPEN_RETURN_PATH] = { .len =  0, .name = "OPEN_RETURN_PATH" },
    [MIG_CMD_PING]             = { .len = sizeof(uint32_t), .name = "PING" },
    [MIG_CMD_POSTCOPY_ADVISE]  = { .len = -1, .name = "POSTCOPY_ADVISE" },
    [MIG_CMD_POSTCOPY_LISTEN]  = { .len =  0, .name = "POSTCOPY_LISTEN" },
    [MIG_CMD_POSTCOPY_RUN]     = { .len =  0, .name = "POSTCOPY_RUN" },
    [MIG_CMD_POSTCOPY_RAM_DISCARD] = {
                                   .len = -1, .name = "POSTCOPY_RAM_DISCARD" },
    [MIG_CMD_POSTCOPY_RESUME]  = { .len =  0, .name = "POSTCOPY_RESUME" },
    [MIG_CMD_PACKAGED]         = { .len =  4, .name = "PACKAGED" },
    [MIG_CMD_RECV_BITMAP]      = { .len = -1, .name = "RECV_BITMAP" },
    [MIG_CMD_MAX]              = { .len = -1, .name = "MAX" },
};
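
/*
 * Wire encoding note: a sketch of what qemu_savevm_command_send() below
 * actually emits for every command (see that function for the
 * authoritative code):
 *
 *     byte  QEMU_VM_COMMAND
 *     be16  command             (one of enum qemu_vm_cmd)
 *     be16  len                 (length of the payload that follows)
 *     len bytes of payload
 *
 * mig_cmd_args[].len records the expected payload length per command;
 * -1 marks variable-length commands, and the load side uses these
 * values to sanity-check the received len.
 */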

/* Note for MIG_CMD_POSTCOPY_ADVISE:
 * The format of the arguments depends on the postcopy mode:
 * - postcopy RAM only
 *   uint64_t host page size
 *   uint64_t target page size
 *
 * - postcopy RAM and postcopy dirty bitmaps
 *   format is the same as for postcopy RAM only
 *
 * - postcopy dirty bitmaps only
 *   Nothing. Command length field is 0.
 *
 * Be careful: adding a new postcopy entity with some other parameters should
 * not break the format's self-description ability. A good way is to introduce
 * some generic extensible format with an exception for the two old entities.
 */
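
/*
 * Illustrative payload layouts implied by the note above (sketch only;
 * qemu_savevm_send_postcopy_advise() is the code that builds these):
 *
 *     RAM postcopy (len == 16):       be64 host_pagesize_summary
 *                                     be64 target_page_size
 *     dirty bitmaps only (len == 0):  no payload at all
 */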

/***********************************************************/
/* savevm/loadvm support */

static ssize_t block_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
                                   int64_t pos, Error **errp)
{
    int ret;
    QEMUIOVector qiov;

    qemu_iovec_init_external(&qiov, iov, iovcnt);
    ret = bdrv_writev_vmstate(opaque, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return qiov.size;
}

static ssize_t block_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
                                size_t size, Error **errp)
{
    return bdrv_load_vmstate(opaque, buf, pos, size);
}

static int bdrv_fclose(void *opaque, Error **errp)
{
    return bdrv_flush(opaque);
}

static const QEMUFileOps bdrv_read_ops = {
    .get_buffer = block_get_buffer,
    .close =      bdrv_fclose
};

static const QEMUFileOps bdrv_write_ops = {
    .writev_buffer  = block_writev_buffer,
    .close          = bdrv_fclose
};

static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
{
    if (is_writable) {
        return qemu_fopen_ops(bs, &bdrv_write_ops, false);
    }
    return qemu_fopen_ops(bs, &bdrv_read_ops, false);
}


/* QEMUFile timer support.
 * Not in qemu-file.c so as not to add qemu-timer.c as a dependency of
 * qemu-file.c.
 */

void timer_put(QEMUFile *f, QEMUTimer *ts)
{
    uint64_t expire_time;

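    /* timer_expire_time_ns() returns -1 when the timer is not pending */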
    expire_time = timer_expire_time_ns(ts);
    qemu_put_be64(f, expire_time);
}

void timer_get(QEMUFile *f, QEMUTimer *ts)
{
    uint64_t expire_time;

    expire_time = qemu_get_be64(f);
    if (expire_time != -1) {
        timer_mod_ns(ts, expire_time);
    } else {
        timer_del(ts);
    }
}


/* VMState timer support.
 * Not in vmstate.c so as not to add qemu-timer.c as a dependency of
 * vmstate.c.
 */

static int get_timer(QEMUFile *f, void *pv, size_t size,
                     const VMStateField *field)
{
    QEMUTimer *v = pv;
    timer_get(f, v);
    return 0;
}

static int put_timer(QEMUFile *f, void *pv, size_t size,
                     const VMStateField *field, JSONWriter *vmdesc)
{
    QEMUTimer *v = pv;
    timer_put(f, v);

    return 0;
}

const VMStateInfo vmstate_info_timer = {
    .name = "timer",
    .get  = get_timer,
    .put  = put_timer,
};
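
/*
 * vmstate_info_timer is normally used indirectly, through the
 * VMSTATE_TIMER()/VMSTATE_TIMER_PTR() field macros in
 * migration/vmstate.h. A minimal sketch, assuming a hypothetical
 * device state with an embedded timer pointer:
 *
 *     VMSTATE_TIMER_PTR(wakeup_timer, FooDeviceState),
 */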


typedef struct CompatEntry {
    char idstr[256];
    int instance_id;
} CompatEntry;

typedef struct SaveStateEntry {
    QTAILQ_ENTRY(SaveStateEntry) entry;
    char idstr[256];
    uint32_t instance_id;
    int alias_id;
    int version_id;
    /* version id read from the stream */
    int load_version_id;
    int section_id;
    /* section id read from the stream */
    int load_section_id;
    const SaveVMHandlers *ops;
    const VMStateDescription *vmsd;
    void *opaque;
    CompatEntry *compat;
    int is_ram;
} SaveStateEntry;

typedef struct SaveState {
    QTAILQ_HEAD(, SaveStateEntry) handlers;
    SaveStateEntry *handler_pri_head[MIG_PRI_MAX + 1];
    int global_section_id;
    uint32_t len;
    const char *name;
    uint32_t target_page_bits;
    uint32_t caps_count;
    MigrationCapability *capabilities;
    QemuUUID uuid;
} SaveState;

static SaveState savevm_state = {
    .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
    .handler_pri_head = { [MIG_PRI_DEFAULT ... MIG_PRI_MAX] = NULL },
    .global_section_id = 0,
};

static bool should_validate_capability(int capability)
{
    assert(capability >= 0 && capability < MIGRATION_CAPABILITY__MAX);
    /* Validate only new capabilities to keep compatibility. */
    switch (capability) {
    case MIGRATION_CAPABILITY_X_IGNORE_SHARED:
        return true;
    default:
        return false;
    }
}

static uint32_t get_validatable_capabilities_count(void)
{
    MigrationState *s = migrate_get_current();
    uint32_t result = 0;
    int i;
    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
        if (should_validate_capability(i) && s->enabled_capabilities[i]) {
            result++;
        }
    }
    return result;
}

static int configuration_pre_save(void *opaque)
{
    SaveState *state = opaque;
    const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
    MigrationState *s = migrate_get_current();
    int i, j;

    state->len = strlen(current_name);
    state->name = current_name;
    state->target_page_bits = qemu_target_page_bits();

    state->caps_count = get_validatable_capabilities_count();
    state->capabilities = g_renew(MigrationCapability, state->capabilities,
                                  state->caps_count);
    for (i = j = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
        if (should_validate_capability(i) && s->enabled_capabilities[i]) {
            state->capabilities[j++] = i;
        }
    }
    state->uuid = qemu_uuid;

    return 0;
}

static int configuration_post_save(void *opaque)
{
    SaveState *state = opaque;

    g_free(state->capabilities);
    state->capabilities = NULL;
    state->caps_count = 0;
    return 0;
}

static int configuration_pre_load(void *opaque)
{
    SaveState *state = opaque;

    /* If there is no target-page-bits subsection it means the source
     * predates the variable-target-page-bits support and is using the
     * minimum possible value for this CPU.
     */
    state->target_page_bits = qemu_target_page_bits_min();
    return 0;
}

static bool configuration_validate_capabilities(SaveState *state)
{
    bool ret = true;
    MigrationState *s = migrate_get_current();
    unsigned long *source_caps_bm;
    int i;

    source_caps_bm = bitmap_new(MIGRATION_CAPABILITY__MAX);
    for (i = 0; i < state->caps_count; i++) {
        MigrationCapability capability = state->capabilities[i];
        set_bit(capability, source_caps_bm);
    }

    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
        bool source_state, target_state;
        if (!should_validate_capability(i)) {
            continue;
        }
        source_state = test_bit(i, source_caps_bm);
        target_state = s->enabled_capabilities[i];
        if (source_state != target_state) {
            error_report("Capability %s is %s, but received capability is %s",
                         MigrationCapability_str(i),
                         target_state ? "on" : "off",
                         source_state ? "on" : "off");
            ret = false;
            /* Don't break here to report all failed capabilities */
        }
    }

    g_free(source_caps_bm);
    return ret;
}

static int configuration_post_load(void *opaque, int version_id)
{
    SaveState *state = opaque;
    const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
    int ret = 0;

    if (strncmp(state->name, current_name, state->len) != 0) {
        error_report("Machine type received is '%.*s' and local is '%s'",
                     (int) state->len, state->name, current_name);
        ret = -EINVAL;
        goto out;
    }

    if (state->target_page_bits != qemu_target_page_bits()) {
        error_report("Received TARGET_PAGE_BITS is %d but local is %d",
                     state->target_page_bits, qemu_target_page_bits());
        ret = -EINVAL;
        goto out;
    }

    if (!configuration_validate_capabilities(state)) {
        ret = -EINVAL;
        goto out;
    }

out:
    g_free((void *)state->name);
    state->name = NULL;
    state->len = 0;
    g_free(state->capabilities);
    state->capabilities = NULL;
    state->caps_count = 0;

    return ret;
}

static int get_capability(QEMUFile *f, void *pv, size_t size,
                          const VMStateField *field)
{
    MigrationCapability *capability = pv;
    char capability_str[UINT8_MAX + 1];
    uint8_t len;
    int i;

    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)capability_str, len);
    capability_str[len] = '\0';
    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
        if (!strcmp(MigrationCapability_str(i), capability_str)) {
            *capability = i;
            return 0;
        }
    }
    error_report("Received unknown capability %s", capability_str);
    return -EINVAL;
}

static int put_capability(QEMUFile *f, void *pv, size_t size,
                          const VMStateField *field, JSONWriter *vmdesc)
{
    MigrationCapability *capability = pv;
    const char *capability_str = MigrationCapability_str(*capability);
    size_t len = strlen(capability_str);
    assert(len <= UINT8_MAX);

    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)capability_str, len);
    return 0;
}

static const VMStateInfo vmstate_info_capability = {
    .name = "capability",
    .get  = get_capability,
    .put  = put_capability,
};

/* The target-page-bits subsection is present only if the
 * target page size is not the same as the default (i.e. the
 * minimum page size for a variable-page-size guest CPU).
 * If it is present then it contains the actual target page
 * bits for the machine, and migration will fail if the
 * two ends don't agree about it.
 */
static bool vmstate_target_page_bits_needed(void *opaque)
{
    return qemu_target_page_bits()
        > qemu_target_page_bits_min();
}

static const VMStateDescription vmstate_target_page_bits = {
    .name = "configuration/target-page-bits",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = vmstate_target_page_bits_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(target_page_bits, SaveState),
        VMSTATE_END_OF_LIST()
    }
};

static bool vmstate_capabilites_needed(void *opaque)
{
    return get_validatable_capabilities_count() > 0;
}

static const VMStateDescription vmstate_capabilites = {
    .name = "configuration/capabilities",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = vmstate_capabilites_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32_V(caps_count, SaveState, 1),
        VMSTATE_VARRAY_UINT32_ALLOC(capabilities, SaveState, caps_count, 1,
                                    vmstate_info_capability,
                                    MigrationCapability),
        VMSTATE_END_OF_LIST()
    }
};

static bool vmstate_uuid_needed(void *opaque)
{
    return qemu_uuid_set && migrate_validate_uuid();
}

static int vmstate_uuid_post_load(void *opaque, int version_id)
{
    SaveState *state = opaque;
    char uuid_src[UUID_FMT_LEN + 1];
    char uuid_dst[UUID_FMT_LEN + 1];

    if (!qemu_uuid_set) {
        /*
         * Only warn, because the user might not know the UUID in some
         * cases, e.g. when loading an old snapshot.
         */
        qemu_uuid_unparse(&state->uuid, uuid_src);
        warn_report("Received UUID %s, but local UUID isn't set", uuid_src);
        return 0;
    }
    if (!qemu_uuid_is_equal(&state->uuid, &qemu_uuid)) {
        qemu_uuid_unparse(&state->uuid, uuid_src);
        qemu_uuid_unparse(&qemu_uuid, uuid_dst);
        error_report("UUID received is %s and local is %s", uuid_src, uuid_dst);
        return -EINVAL;
    }
    return 0;
}

static const VMStateDescription vmstate_uuid = {
    .name = "configuration/uuid",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = vmstate_uuid_needed,
    .post_load = vmstate_uuid_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_UINT8_ARRAY_V(uuid.data, SaveState, sizeof(QemuUUID), 1),
        VMSTATE_END_OF_LIST()
    }
};

static const VMStateDescription vmstate_configuration = {
    .name = "configuration",
    .version_id = 1,
    .pre_load = configuration_pre_load,
    .post_load = configuration_post_load,
    .pre_save = configuration_pre_save,
    .post_save = configuration_post_save,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(len, SaveState),
        VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription *[]) {
        &vmstate_target_page_bits,
        &vmstate_capabilites,
        &vmstate_uuid,
        NULL
    }
};

static void dump_vmstate_vmsd(FILE *out_file,
                              const VMStateDescription *vmsd, int indent,
                              bool is_subsection);

static void dump_vmstate_vmsf(FILE *out_file, const VMStateField *field,
                              int indent)
{
    fprintf(out_file, "%*s{\n", indent, "");
    indent += 2;
    fprintf(out_file, "%*s\"field\": \"%s\",\n", indent, "", field->name);
    fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
            field->version_id);
    fprintf(out_file, "%*s\"field_exists\": %s,\n", indent, "",
            field->field_exists ? "true" : "false");
    fprintf(out_file, "%*s\"size\": %zu", indent, "", field->size);
    if (field->vmsd != NULL) {
        fprintf(out_file, ",\n");
        dump_vmstate_vmsd(out_file, field->vmsd, indent, false);
    }
    fprintf(out_file, "\n%*s}", indent - 2, "");
}

static void dump_vmstate_vmss(FILE *out_file,
                              const VMStateDescription **subsection,
                              int indent)
{
    if (*subsection != NULL) {
        dump_vmstate_vmsd(out_file, *subsection, indent, true);
    }
}

static void dump_vmstate_vmsd(FILE *out_file,
                              const VMStateDescription *vmsd, int indent,
                              bool is_subsection)
{
    if (is_subsection) {
        fprintf(out_file, "%*s{\n", indent, "");
    } else {
        fprintf(out_file, "%*s\"%s\": {\n", indent, "", "Description");
    }
    indent += 2;
    fprintf(out_file, "%*s\"name\": \"%s\",\n", indent, "", vmsd->name);
    fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
            vmsd->version_id);
    fprintf(out_file, "%*s\"minimum_version_id\": %d", indent, "",
            vmsd->minimum_version_id);
    if (vmsd->fields != NULL) {
        const VMStateField *field = vmsd->fields;
        bool first;

        fprintf(out_file, ",\n%*s\"Fields\": [\n", indent, "");
        first = true;
        while (field->name != NULL) {
            if (field->flags & VMS_MUST_EXIST) {
                /* Ignore VMSTATE_VALIDATE bits; these don't get migrated */
                field++;
                continue;
            }
            if (!first) {
                fprintf(out_file, ",\n");
            }
            dump_vmstate_vmsf(out_file, field, indent + 2);
            field++;
            first = false;
        }
        fprintf(out_file, "\n%*s]", indent, "");
    }
    if (vmsd->subsections != NULL) {
        const VMStateDescription **subsection = vmsd->subsections;
        bool first;

        fprintf(out_file, ",\n%*s\"Subsections\": [\n", indent, "");
        first = true;
        while (*subsection != NULL) {
            if (!first) {
                fprintf(out_file, ",\n");
            }
            dump_vmstate_vmss(out_file, subsection, indent + 2);
            subsection++;
            first = false;
        }
        fprintf(out_file, "\n%*s]", indent, "");
    }
    fprintf(out_file, "\n%*s}", indent - 2, "");
}

static void dump_machine_type(FILE *out_file)
{
    MachineClass *mc;

    mc = MACHINE_GET_CLASS(current_machine);

    fprintf(out_file, "  \"vmschkmachine\": {\n");
    fprintf(out_file, "    \"Name\": \"%s\"\n", mc->name);
    fprintf(out_file, "  },\n");
}

void dump_vmstate_json_to_file(FILE *out_file)
{
    GSList *list, *elt;
    bool first;

    fprintf(out_file, "{\n");
    dump_machine_type(out_file);

    first = true;
    list = object_class_get_list(TYPE_DEVICE, true);
    for (elt = list; elt; elt = elt->next) {
        DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data,
                                             TYPE_DEVICE);
        const char *name;
        int indent = 2;

        if (!dc->vmsd) {
            continue;
        }

        if (!first) {
            fprintf(out_file, ",\n");
        }
        name = object_class_get_name(OBJECT_CLASS(dc));
        fprintf(out_file, "%*s\"%s\": {\n", indent, "", name);
        indent += 2;
        fprintf(out_file, "%*s\"Name\": \"%s\",\n", indent, "", name);
        fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
                dc->vmsd->version_id);
        fprintf(out_file, "%*s\"minimum_version_id\": %d,\n", indent, "",
                dc->vmsd->minimum_version_id);

        dump_vmstate_vmsd(out_file, dc->vmsd, indent, false);

        fprintf(out_file, "\n%*s}", indent - 2, "");
        first = false;
    }
    fprintf(out_file, "\n}\n");
    fclose(out_file);
    g_slist_free(list);
}

static uint32_t calculate_new_instance_id(const char *idstr)
{
    SaveStateEntry *se;
    uint32_t instance_id = 0;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (strcmp(idstr, se->idstr) == 0
            && instance_id <= se->instance_id) {
            instance_id = se->instance_id + 1;
        }
    }
    /* Make sure we never wrap around without it being noticed */
    assert(instance_id != VMSTATE_INSTANCE_ID_ANY);
    return instance_id;
}

static int calculate_compat_instance_id(const char *idstr)
{
    SaveStateEntry *se;
    int instance_id = 0;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->compat) {
            continue;
        }

        if (strcmp(idstr, se->compat->idstr) == 0
            && instance_id <= se->compat->instance_id) {
            instance_id = se->compat->instance_id + 1;
        }
    }
    return instance_id;
}

static inline MigrationPriority save_state_priority(SaveStateEntry *se)
{
    if (se->vmsd) {
        return se->vmsd->priority;
    }
    return MIG_PRI_DEFAULT;
}

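/*
 * The handlers list is kept sorted by descending priority, and
 * handler_pri_head[p] caches the first entry of priority p (NULL if
 * there is none). savevm_state_handler_insert() below places a new
 * entry after all existing entries of the same or higher priority.
 */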
static void savevm_state_handler_insert(SaveStateEntry *nse)
{
    MigrationPriority priority = save_state_priority(nse);
    SaveStateEntry *se;
    int i;

    assert(priority <= MIG_PRI_MAX);

    for (i = priority - 1; i >= 0; i--) {
        se = savevm_state.handler_pri_head[i];
        if (se != NULL) {
            assert(save_state_priority(se) < priority);
            break;
        }
    }

    if (i >= 0) {
        QTAILQ_INSERT_BEFORE(se, nse, entry);
    } else {
        QTAILQ_INSERT_TAIL(&savevm_state.handlers, nse, entry);
    }

    if (savevm_state.handler_pri_head[priority] == NULL) {
        savevm_state.handler_pri_head[priority] = nse;
    }
}

static void savevm_state_handler_remove(SaveStateEntry *se)
{
    SaveStateEntry *next;
    MigrationPriority priority = save_state_priority(se);

    if (se == savevm_state.handler_pri_head[priority]) {
        next = QTAILQ_NEXT(se, entry);
        if (next != NULL && save_state_priority(next) == priority) {
            savevm_state.handler_pri_head[priority] = next;
        } else {
            savevm_state.handler_pri_head[priority] = NULL;
        }
    }
    QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
}

/* TODO: Individual devices generally have very little idea about the rest
   of the system, so instance_id should be removed/replaced.
   Meanwhile, pass -1 as the instance_id if you do not already have a
   clearly distinguishing id for all instances of your device class. */
int register_savevm_live(const char *idstr,
                         uint32_t instance_id,
                         int version_id,
                         const SaveVMHandlers *ops,
                         void *opaque)
{
    SaveStateEntry *se;

    se = g_new0(SaveStateEntry, 1);
    se->version_id = version_id;
    se->section_id = savevm_state.global_section_id++;
    se->ops = ops;
    se->opaque = opaque;
    se->vmsd = NULL;
    /* if this is a live_savevm handler then set is_ram */
    if (ops->save_setup != NULL) {
        se->is_ram = 1;
    }

    pstrcat(se->idstr, sizeof(se->idstr), idstr);

    if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
        se->instance_id = calculate_new_instance_id(se->idstr);
    } else {
        se->instance_id = instance_id;
    }
    assert(!se->compat || se->instance_id == 0);
    savevm_state_handler_insert(se);
    return 0;
}
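
/*
 * A minimal usage sketch, with hypothetical handlers and state that are
 * not part of this file:
 *
 *     static const SaveVMHandlers savevm_foo_handlers = {
 *         .save_setup = foo_save_setup,
 *         .save_live_iterate = foo_save_live_iterate,
 *         .load_state = foo_load_state,
 *     };
 *
 *     register_savevm_live("foo", VMSTATE_INSTANCE_ID_ANY, 1,
 *                          &savevm_foo_handlers, foo_state);
 */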

void unregister_savevm(VMStateIf *obj, const char *idstr, void *opaque)
{
    SaveStateEntry *se, *new_se;
    char id[256] = "";

    if (obj) {
        char *oid = vmstate_if_get_id(obj);
        if (oid) {
            pstrcpy(id, sizeof(id), oid);
            pstrcat(id, sizeof(id), "/");
            g_free(oid);
        }
    }
    pstrcat(id, sizeof(id), idstr);

    QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
        if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) {
            savevm_state_handler_remove(se);
            g_free(se->compat);
            g_free(se);
        }
    }
}

int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id,
                                   const VMStateDescription *vmsd,
                                   void *opaque, int alias_id,
                                   int required_for_version,
                                   Error **errp)
{
    SaveStateEntry *se;

    /* If this triggers, alias support can be dropped for the vmsd. */
    assert(alias_id == -1 || required_for_version >= vmsd->minimum_version_id);

    se = g_new0(SaveStateEntry, 1);
    se->version_id = vmsd->version_id;
    se->section_id = savevm_state.global_section_id++;
    se->opaque = opaque;
    se->vmsd = vmsd;
    se->alias_id = alias_id;

    if (obj) {
        char *id = vmstate_if_get_id(obj);
        if (id) {
            if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
                sizeof(se->idstr)) {
                error_setg(errp, "Path too long for VMState (%s)", id);
                g_free(id);
                g_free(se);

                return -1;
            }
            g_free(id);

            se->compat = g_new0(CompatEntry, 1);
            pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name);
            se->compat->instance_id = instance_id == VMSTATE_INSTANCE_ID_ANY ?
                         calculate_compat_instance_id(vmsd->name) : instance_id;
            instance_id = VMSTATE_INSTANCE_ID_ANY;
        }
    }
    pstrcat(se->idstr, sizeof(se->idstr), vmsd->name);

    if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
        se->instance_id = calculate_new_instance_id(se->idstr);
    } else {
        se->instance_id = instance_id;
    }
    assert(!se->compat || se->instance_id == 0);
    savevm_state_handler_insert(se);
    return 0;
}

void vmstate_unregister(VMStateIf *obj, const VMStateDescription *vmsd,
                        void *opaque)
{
    SaveStateEntry *se, *new_se;

    QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
        if (se->vmsd == vmsd && se->opaque == opaque) {
            savevm_state_handler_remove(se);
            g_free(se->compat);
            g_free(se);
        }
    }
}

static int vmstate_load(QEMUFile *f, SaveStateEntry *se)
{
    trace_vmstate_load(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
    if (!se->vmsd) {         /* Old style */
        return se->ops->load_state(f, se->opaque, se->load_version_id);
    }
    return vmstate_load_state(f, se->vmsd, se->opaque, se->load_version_id);
}

static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se,
                                   JSONWriter *vmdesc)
{
    int64_t old_offset, size;

    old_offset = qemu_ftell_fast(f);
    se->ops->save_state(f, se->opaque);
    size = qemu_ftell_fast(f) - old_offset;

    if (vmdesc) {
        json_writer_int64(vmdesc, "size", size);
        json_writer_start_array(vmdesc, "fields");
        json_writer_start_object(vmdesc, NULL);
        json_writer_str(vmdesc, "name", "data");
        json_writer_int64(vmdesc, "size", size);
        json_writer_str(vmdesc, "type", "buffer");
        json_writer_end_object(vmdesc);
        json_writer_end_array(vmdesc);
    }
}

static int vmstate_save(QEMUFile *f, SaveStateEntry *se,
                        JSONWriter *vmdesc)
{
    trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
    if (!se->vmsd) {
        vmstate_save_old_style(f, se, vmdesc);
        return 0;
    }
    return vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
}

/*
 * Write the header for a device section (QEMU_VM_SECTION_START/END/PART/FULL)
 */
static void save_section_header(QEMUFile *f, SaveStateEntry *se,
                                uint8_t section_type)
{
    qemu_put_byte(f, section_type);
    qemu_put_be32(f, se->section_id);

    if (section_type == QEMU_VM_SECTION_FULL ||
        section_type == QEMU_VM_SECTION_START) {
        /* ID string */
        size_t len = strlen(se->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)se->idstr, len);

        qemu_put_be32(f, se->instance_id);
        qemu_put_be32(f, se->version_id);
    }
}
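
/*
 * The resulting on-wire layout (a sketch of what the code above emits;
 * PART/END sections stop after section_id):
 *
 *     byte  section_type
 *     be32  section_id
 *     byte  len(idstr)          \
 *     len bytes of idstr         | FULL/START only
 *     be32  instance_id          |
 *     be32  version_id          /
 */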

/*
 * Write a footer onto device sections that catches cases of misformatted
 * device sections.
 */
static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
{
    if (migrate_get_current()->send_section_footer) {
        qemu_put_byte(f, QEMU_VM_SECTION_FOOTER);
        qemu_put_be32(f, se->section_id);
    }
}

/**
 * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
 *                           command and associated data.
 *
 * @f: File to send command on
 * @command: Command type to send
 * @len: Length of associated data
 * @data: Data associated with command.
 */
static void qemu_savevm_command_send(QEMUFile *f,
                                     enum qemu_vm_cmd command,
                                     uint16_t len,
                                     uint8_t *data)
{
    trace_savevm_command_send(command, len);
    qemu_put_byte(f, QEMU_VM_COMMAND);
    qemu_put_be16(f, (uint16_t)command);
    qemu_put_be16(f, len);
    qemu_put_buffer(f, data, len);
    qemu_fflush(f);
}

void qemu_savevm_send_colo_enable(QEMUFile *f)
{
    trace_savevm_send_colo_enable();
    qemu_savevm_command_send(f, MIG_CMD_ENABLE_COLO, 0, NULL);
}

void qemu_savevm_send_ping(QEMUFile *f, uint32_t value)
{
    uint32_t buf;

    trace_savevm_send_ping(value);
    buf = cpu_to_be32(value);
    qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf);
}

void qemu_savevm_send_open_return_path(QEMUFile *f)
{
    trace_savevm_send_open_return_path();
    qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL);
}

/* We have a buffer of data to send; we don't want it all to be loaded
 * by the command itself, so the command contains just the length of the
 * extra buffer that we then send straight after it.
 * TODO: Must be a better way to organise that
 *
 * Returns:
 *    0 on success
 *    -ve on error
 */
int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len)
{
    uint32_t tmp;

    if (len > MAX_VM_CMD_PACKAGED_SIZE) {
        error_report("%s: Unreasonably large packaged state: %zu",
                     __func__, len);
        return -1;
    }

    tmp = cpu_to_be32(len);

    trace_qemu_savevm_send_packaged();
    qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp);

    qemu_put_buffer(f, buf, len);

    return 0;
}
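
/*
 * On the wire the result is an ordinary command followed by raw data:
 *
 *     QEMU_VM_COMMAND  MIG_CMD_PACKAGED  len=4  be32 package_length
 *     package_length bytes of the embedded stream
 */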

/* Send prior to any postcopy transfer */
void qemu_savevm_send_postcopy_advise(QEMUFile *f)
{
    if (migrate_postcopy_ram()) {
        uint64_t tmp[2];
        tmp[0] = cpu_to_be64(ram_pagesize_summary());
        tmp[1] = cpu_to_be64(qemu_target_page_size());

        trace_qemu_savevm_send_postcopy_advise();
        qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE,
                                 16, (uint8_t *)tmp);
    } else {
        qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 0, NULL);
    }
}

/* Sent prior to starting the destination running in postcopy; tells the
 * destination to discard pages that have already been sent but were
 * redirtied on the source.
 * CMD_POSTCOPY_RAM_DISCARD consists of:
 *      byte   version (0)
 *      byte   Length of name field (not including 0)
 *  n x byte   RAM block name
 *      byte   0 terminator (just for safety)
 *  n x        Byte ranges within the named RAMBlock
 *      be64   Start of the range
 *      be64   Length
 *
 *  name:  RAMBlock name that these entries are part of
 *  len: Number of page entries
 *  start_list: 'len' addresses
 *  length_list: 'len' addresses
 */
void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
                                           uint16_t len,
                                           uint64_t *start_list,
                                           uint64_t *length_list)
{
    uint8_t *buf;
    uint16_t tmplen;
    uint16_t t;
    size_t name_len = strlen(name);

    trace_qemu_savevm_send_postcopy_ram_discard(name, len);
    assert(name_len < 256);
    buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len);
    buf[0] = postcopy_ram_discard_version;
    buf[1] = name_len;
    memcpy(buf + 2, name, name_len);
    tmplen = 2 + name_len;
    buf[tmplen++] = '\0';

    for (t = 0; t < len; t++) {
        stq_be_p(buf + tmplen, start_list[t]);
        tmplen += 8;
        stq_be_p(buf + tmplen, length_list[t]);
        tmplen += 8;
    }
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
    g_free(buf);
}
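
/*
 * Worked example with hypothetical values: discarding two ranges in a
 * RAMBlock named "pc.ram" encodes the payload as
 *
 *     00 06 'p' 'c' '.' 'r' 'a' 'm' 00
 *     be64 start0  be64 length0
 *     be64 start1  be64 length1
 *
 * i.e. tmplen == 2 + 6 + 1 + 2 * 16 == 41 bytes.
 */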

/* Get the destination into a state where it can receive postcopy data. */
void qemu_savevm_send_postcopy_listen(QEMUFile *f)
{
    trace_savevm_send_postcopy_listen();
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
}

/* Kick the destination into running */
void qemu_savevm_send_postcopy_run(QEMUFile *f)
{
    trace_savevm_send_postcopy_run();
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
}

void qemu_savevm_send_postcopy_resume(QEMUFile *f)
{
    trace_savevm_send_postcopy_resume();
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RESUME, 0, NULL);
}

void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name)
{
    size_t len;
    char buf[256];

    trace_savevm_send_recv_bitmap(block_name);

    buf[0] = len = strlen(block_name);
    memcpy(buf + 1, block_name, len);

    qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf);
}

bool qemu_savevm_state_blocked(Error **errp)
{
    SaveStateEntry *se;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (se->vmsd && se->vmsd->unmigratable) {
            error_setg(errp, "State blocked by non-migratable device '%s'",
                       se->idstr);
            return true;
        }
    }
    return false;
}

void qemu_savevm_non_migratable_list(strList **reasons)
{
    SaveStateEntry *se;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (se->vmsd && se->vmsd->unmigratable) {
            QAPI_LIST_PREPEND(*reasons,
                              g_strdup_printf("non-migratable device: %s",
                                              se->idstr));
        }
    }
}

void qemu_savevm_state_header(QEMUFile *f)
{
    trace_savevm_state_header();
    qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
    qemu_put_be32(f, QEMU_VM_FILE_VERSION);

    if (migrate_get_current()->send_configuration) {
        qemu_put_byte(f, QEMU_VM_CONFIGURATION);
        vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
    }
}
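
/*
 * The stream therefore starts with (a sketch; the configuration section
 * is present only when send_configuration is set):
 *
 *     be32  QEMU_VM_FILE_MAGIC
 *     be32  QEMU_VM_FILE_VERSION
 *     byte  QEMU_VM_CONFIGURATION
 *     ...   vmstate_configuration section data
 */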

bool qemu_savevm_state_guest_unplug_pending(void)
{
    SaveStateEntry *se;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (se->vmsd && se->vmsd->dev_unplug_pending &&
            se->vmsd->dev_unplug_pending(se->opaque)) {
            return true;
        }
    }

    return false;
}

void qemu_savevm_state_setup(QEMUFile *f)
{
    SaveStateEntry *se;
    Error *local_err = NULL;
    int ret;

    trace_savevm_state_setup();
    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->ops || !se->ops->save_setup) {
            continue;
        }
        if (se->ops->is_active) {
            if (!se->ops->is_active(se->opaque)) {
                continue;
            }
        }
        save_section_header(f, se, QEMU_VM_SECTION_START);

        ret = se->ops->save_setup(f, se->opaque);
        save_section_footer(f, se);
        if (ret < 0) {
            qemu_file_set_error(f, ret);
            break;
        }
    }

    if (precopy_notify(PRECOPY_NOTIFY_SETUP, &local_err)) {
        error_report_err(local_err);
    }
}

int qemu_savevm_state_resume_prepare(MigrationState *s)
{
    SaveStateEntry *se;
    int ret;

    trace_savevm_state_resume_prepare();

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->ops || !se->ops->resume_prepare) {
            continue;
        }
        if (se->ops->is_active) {
            if (!se->ops->is_active(se->opaque)) {
                continue;
            }
        }
        ret = se->ops->resume_prepare(s, se->opaque);
        if (ret < 0) {
            return ret;
        }
    }

    return 0;
}

/*
 * This function has three return values:
 *   negative: there was an error, and we have -errno.
 *   0 : we haven't finished, the caller has to go again
 *   1 : we have finished, we can go to the complete phase
 */
int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
{
    SaveStateEntry *se;
    int ret = 1;

    trace_savevm_state_iterate();
    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->ops || !se->ops->save_live_iterate) {
            continue;
        }
        if (se->ops->is_active &&
            !se->ops->is_active(se->opaque)) {
            continue;
        }
        if (se->ops->is_active_iterate &&
            !se->ops->is_active_iterate(se->opaque)) {
            continue;
        }
        /*
         * In the postcopy phase, any device that doesn't know how to
         * do postcopy should have saved its state in the _complete
         * call that's already run; it might get confused if we call
         * iterate afterwards.
         */
        if (postcopy &&
            !(se->ops->has_postcopy && se->ops->has_postcopy(se->opaque))) {
            continue;
        }
        if (qemu_file_rate_limit(f)) {
            return 0;
        }
        trace_savevm_section_start(se->idstr, se->section_id);

        save_section_header(f, se, QEMU_VM_SECTION_PART);

        ret = se->ops->save_live_iterate(f, se->opaque);
        trace_savevm_section_end(se->idstr, se->section_id, ret);
        save_section_footer(f, se);

        if (ret < 0) {
            error_report("failed to save SaveStateEntry with id(name): "
                         "%d(%s): %d",
                         se->section_id, se->idstr, ret);
            qemu_file_set_error(f, ret);
        }
        if (ret <= 0) {
            /* Do not proceed to the next vmstate before this one reported
               completion of the current stage. This serializes the migration
               and reduces the probability that a faster changing state is
               synchronized over and over again. */
            break;
        }
    }
    return ret;
}

static bool should_send_vmdesc(void)
{
    MachineState *machine = MACHINE(qdev_get_machine());
    bool in_postcopy = migration_in_postcopy();
    return !machine->suppress_vmdesc && !in_postcopy;
}

/*
 * Calls the save_live_complete_postcopy methods
 * causing the last few pages to be sent immediately and doing any associated
 * cleanup.
 * Note postcopy also calls qemu_savevm_state_complete_precopy to complete
 * all the other devices, but that happens at the point we switch to postcopy.
 */
void qemu_savevm_state_complete_postcopy(QEMUFile *f)
{
    SaveStateEntry *se;
    int ret;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->ops || !se->ops->save_live_complete_postcopy) {
            continue;
        }
        if (se->ops->is_active) {
            if (!se->ops->is_active(se->opaque)) {
                continue;
            }
        }
        trace_savevm_section_start(se->idstr, se->section_id);
        /* Section type */
        qemu_put_byte(f, QEMU_VM_SECTION_END);
        qemu_put_be32(f, se->section_id);

        ret = se->ops->save_live_complete_postcopy(f, se->opaque);
        trace_savevm_section_end(se->idstr, se->section_id, ret);
        save_section_footer(f, se);
        if (ret < 0) {
            qemu_file_set_error(f, ret);
            return;
        }
    }

    qemu_put_byte(f, QEMU_VM_EOF);
    qemu_fflush(f);
}

static
int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
{
    SaveStateEntry *se;
    int ret;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->ops ||
            (in_postcopy && se->ops->has_postcopy &&
             se->ops->has_postcopy(se->opaque)) ||
            !se->ops->save_live_complete_precopy) {
            continue;
        }

        if (se->ops->is_active) {
            if (!se->ops->is_active(se->opaque)) {
                continue;
            }
        }
        trace_savevm_section_start(se->idstr, se->section_id);

        save_section_header(f, se, QEMU_VM_SECTION_END);

        ret = se->ops->save_live_complete_precopy(f, se->opaque);
        trace_savevm_section_end(se->idstr, se->section_id, ret);
        save_section_footer(f, se);
        if (ret < 0) {
            qemu_file_set_error(f, ret);
            return -1;
        }
    }

    return 0;
}

int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
                                                    bool in_postcopy,
                                                    bool inactivate_disks)
{
    g_autoptr(JSONWriter) vmdesc = NULL;
    int vmdesc_len;
    SaveStateEntry *se;
    int ret;

    vmdesc = json_writer_new(false);
    json_writer_start_object(vmdesc, NULL);
    json_writer_int64(vmdesc, "page_size", qemu_target_page_size());
    json_writer_start_array(vmdesc, "devices");
    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {

        if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
            continue;
        }
        if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
            trace_savevm_section_skip(se->idstr, se->section_id);
            continue;
        }

        trace_savevm_section_start(se->idstr, se->section_id);

        json_writer_start_object(vmdesc, NULL);
        json_writer_str(vmdesc, "name", se->idstr);
        json_writer_int64(vmdesc, "instance_id", se->instance_id);

        save_section_header(f, se, QEMU_VM_SECTION_FULL);
        ret = vmstate_save(f, se, vmdesc);
        if (ret) {
            qemu_file_set_error(f, ret);
            return ret;
        }
        trace_savevm_section_end(se->idstr, se->section_id, 0);
        save_section_footer(f, se);

        json_writer_end_object(vmdesc);
    }

    if (inactivate_disks) {
        /* Inactivate before sending QEMU_VM_EOF so that the
         * bdrv_activate_all() on the other end won't fail. */
        ret = bdrv_inactivate_all();
        if (ret) {
            error_report("%s: bdrv_inactivate_all() failed (%d)",
                         __func__, ret);
            qemu_file_set_error(f, ret);
            return ret;
        }
    }
    if (!in_postcopy) {
        /* Postcopy stream will still be going */
        qemu_put_byte(f, QEMU_VM_EOF);
    }

    json_writer_end_array(vmdesc);
    json_writer_end_object(vmdesc);
    vmdesc_len = strlen(json_writer_get(vmdesc));

    if (should_send_vmdesc()) {
        qemu_put_byte(f, QEMU_VM_VMDESCRIPTION);
        qemu_put_be32(f, vmdesc_len);
        qemu_put_buffer(f, (uint8_t *)json_writer_get(vmdesc), vmdesc_len);
    }

    return 0;
}

int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
                                       bool inactivate_disks)
{
    int ret;
    Error *local_err = NULL;
    bool in_postcopy = migration_in_postcopy();

    if (precopy_notify(PRECOPY_NOTIFY_COMPLETE, &local_err)) {
        error_report_err(local_err);
    }

    trace_savevm_state_complete_precopy();

    cpu_synchronize_all_states();

    if (!in_postcopy || iterable_only) {
        ret = qemu_savevm_state_complete_precopy_iterable(f, in_postcopy);
        if (ret) {
            return ret;
        }
    }

    if (iterable_only) {
        goto flush;
    }

    ret = qemu_savevm_state_complete_precopy_non_iterable(f, in_postcopy,
                                                          inactivate_disks);
    if (ret) {
        return ret;
    }

flush:
    qemu_fflush(f);
    return 0;
}

/*
 * Give an estimate of the amount left to be transferred; the result is
 * split into the amounts for units that can and for units that can't do
 * postcopy.
 */
void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
                               uint64_t *res_precopy_only,
                               uint64_t *res_compatible,
                               uint64_t *res_postcopy_only)
{
    SaveStateEntry *se;

    *res_precopy_only = 0;
    *res_compatible = 0;
    *res_postcopy_only = 0;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->ops || !se->ops->save_live_pending) {
            continue;
        }
        if (se->ops->is_active) {
            if (!se->ops->is_active(se->opaque)) {
                continue;
            }
        }
        se->ops->save_live_pending(f, se->opaque, threshold_size,
                                   res_precopy_only, res_compatible,
                                   res_postcopy_only);
    }
}

void qemu_savevm_state_cleanup(void)
{
    SaveStateEntry *se;
    Error *local_err = NULL;

    if (precopy_notify(PRECOPY_NOTIFY_CLEANUP, &local_err)) {
        error_report_err(local_err);
    }

    trace_savevm_state_cleanup();
    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (se->ops && se->ops->save_cleanup) {
            se->ops->save_cleanup(se->opaque);
        }
    }
}

static int qemu_savevm_state(QEMUFile *f, Error **errp)
{
    int ret;
    MigrationState *ms = migrate_get_current();
    MigrationStatus status;

    if (migration_is_running(ms->state)) {
        error_setg(errp, QERR_MIGRATION_ACTIVE);
        return -EINVAL;
    }

    if (migrate_use_block()) {
        error_setg(errp, "Block migration and snapshots are incompatible");
        return -EINVAL;
    }

    migrate_init(ms);
    memset(&ram_counters, 0, sizeof(ram_counters));
    memset(&compression_counters, 0, sizeof(compression_counters));
    ms->to_dst_file = f;

    qemu_mutex_unlock_iothread();
    qemu_savevm_state_header(f);
    qemu_savevm_state_setup(f);
    qemu_mutex_lock_iothread();

    while (qemu_file_get_error(f) == 0) {
        if (qemu_savevm_state_iterate(f, false) > 0) {
            break;
        }
    }

    ret = qemu_file_get_error(f);
    if (ret == 0) {
        qemu_savevm_state_complete_precopy(f, false, false);
        ret = qemu_file_get_error(f);
    }
    qemu_savevm_state_cleanup();
    if (ret != 0) {
        error_setg_errno(errp, -ret, "Error while writing VM state");
    }

    if (ret != 0) {
        status = MIGRATION_STATUS_FAILED;
    } else {
        status = MIGRATION_STATUS_COMPLETED;
    }
    migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, status);

    /* f is an outer parameter; it should not stay in the global migration
     * state after this function finishes */
    ms->to_dst_file = NULL;

    return ret;
}

void qemu_savevm_live_state(QEMUFile *f)
{
    /* save QEMU_VM_SECTION_END section */
    qemu_savevm_state_complete_precopy(f, true, false);
    qemu_put_byte(f, QEMU_VM_EOF);
}

int qemu_save_device_state(QEMUFile *f)
{
    SaveStateEntry *se;

    if (!migration_in_colo_state()) {
        qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
        qemu_put_be32(f, QEMU_VM_FILE_VERSION);
    }
    cpu_synchronize_all_states();

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        int ret;

        if (se->is_ram) {
            continue;
        }
        if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
            continue;
        }
        if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
            continue;
        }

        save_section_header(f, se, QEMU_VM_SECTION_FULL);

        ret = vmstate_save(f, se, NULL);
        if (ret) {
            return ret;
        }

        save_section_footer(f, se);
    }

    qemu_put_byte(f, QEMU_VM_EOF);

    return qemu_file_get_error(f);
}

static SaveStateEntry *find_se(const char *idstr, uint32_t instance_id)
{
    SaveStateEntry *se;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!strcmp(se->idstr, idstr) &&
            (instance_id == se->instance_id ||
             instance_id == se->alias_id))
            return se;
        /* Migrating from an older version? */
        if (strstr(se->idstr, idstr) && se->compat) {
            if (!strcmp(se->compat->idstr, idstr) &&
                (instance_id == se->compat->instance_id ||
                 instance_id == se->alias_id))
                return se;
        }
    }
    return NULL;
}

enum LoadVMExitCodes {
    /* Allow a command to quit all layers of nested loadvm loops */
    LOADVM_QUIT     =  1,
};
1678
1679/* ------ incoming postcopy messages ------ */
1680/* 'advise' arrives before any transfers just to tell us that a postcopy
1681 * *might* happen - it might be skipped if precopy transferred everything
1682 * quickly.
1683 */
1684static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
1685                                         uint16_t len)
1686{
1687    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
1688    uint64_t remote_pagesize_summary, local_pagesize_summary, remote_tps;
1689    size_t page_size = qemu_target_page_size();
1690    Error *local_err = NULL;
1691
1692    trace_loadvm_postcopy_handle_advise();
1693    if (ps != POSTCOPY_INCOMING_NONE) {
1694        error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
1695        return -1;
1696    }
1697
1698    switch (len) {
1699    case 0:
1700        if (migrate_postcopy_ram()) {
1701            error_report("RAM postcopy is enabled but have 0 byte advise");
1702            return -EINVAL;
1703        }
1704        return 0;
1705    case 8 + 8:
1706        if (!migrate_postcopy_ram()) {
1707            error_report("RAM postcopy is disabled but have 16 byte advise");
1708            return -EINVAL;
1709        }
1710        break;
1711    default:
1712        error_report("CMD_POSTCOPY_ADVISE invalid length (%d)", len);
1713        return -EINVAL;
1714    }
1715
1716    if (!postcopy_ram_supported_by_host(mis)) {
1717        postcopy_state_set(POSTCOPY_INCOMING_NONE);
1718        return -1;
1719    }
1720
1721    remote_pagesize_summary = qemu_get_be64(mis->from_src_file);
1722    local_pagesize_summary = ram_pagesize_summary();
1723
1724    if (remote_pagesize_summary != local_pagesize_summary)  {
1725        /*
1726         * This detects two potential causes of mismatch:
1727         *   a) A mismatch in host page sizes
1728         *      Some combinations of mismatch are probably possible but it gets
1729         *      a bit more complicated.  In particular we need to place whole
1730         *      host pages on the dest at once, and we need to ensure that we
1731         *      handle dirtying to make sure we never end up sending part of
1732         *      a hostpage on its own.
1733         *   b) The use of different huge page sizes on source/destination.
1734         *      A finer-grained test is performed during RAM block migration,
1735         *      but this test here causes a nice early clear failure, and
1736         *      also fails when passed to an older qemu that doesn't
1737         *      do huge pages.
1738         */
1739        error_report("Postcopy needs matching RAM page sizes (s=%" PRIx64
1740                                                             " d=%" PRIx64 ")",
1741                     remote_pagesize_summary, local_pagesize_summary);
1742        return -1;
1743    }
1744
1745    remote_tps = qemu_get_be64(mis->from_src_file);
1746    if (remote_tps != page_size) {
1747        /*
1748         * Again, some differences could be dealt with, but for now keep it
1749         * simple.
1750         */
1751        error_report("Postcopy needs matching target page sizes (s=%d d=%zd)",
1752                     (int)remote_tps, page_size);
1753        return -1;
1754    }
1755
1756    if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_ADVISE, &local_err)) {
1757        error_report_err(local_err);
1758        return -1;
1759    }
1760
1761    if (ram_postcopy_incoming_init(mis)) {
1762        return -1;
1763    }
1764
1765    return 0;
1766}
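
/*
 * For illustration only (compiled out; the helper name is hypothetical):
 * how a source might serialize the 16-byte advise payload parsed above.
 * The field order mirrors the two qemu_get_be64() reads in
 * loadvm_postcopy_handle_advise().
 */
#if 0
static void example_put_postcopy_advise(QEMUFile *f)
{
    qemu_put_be64(f, ram_pagesize_summary());  /* host page size summary */
    qemu_put_be64(f, qemu_target_page_size()); /* target page size */
}
#endif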
1767
1768/* After postcopy we will be told to throw some pages away since they're
1769 * dirty and will have to be demand-fetched.  Must happen before the CPU
1770 * is started.
1771 * There can be 0..many of these messages, each encoding multiple pages.
1772 */
1773static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
1774                                              uint16_t len)
1775{
1776    int tmp;
1777    char ramid[256];
1778    PostcopyState ps = postcopy_state_get();
1779
1780    trace_loadvm_postcopy_ram_handle_discard();
1781
1782    switch (ps) {
1783    case POSTCOPY_INCOMING_ADVISE:
1784        /* 1st discard */
1785        tmp = postcopy_ram_prepare_discard(mis);
1786        if (tmp) {
1787            return tmp;
1788        }
1789        break;
1790
1791    case POSTCOPY_INCOMING_DISCARD:
1792        /* Expected state */
1793        break;
1794
1795    default:
1796        error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
1797                     ps);
1798        return -1;
1799    }
1800    /* We're expecting:
1801     *    a version byte (0)
1802     *    a RAM ID string (length byte, name, 0 term)
1803     *    then at least one 16-byte chunk
1804     */
1805    if (len < (1 + 1 + 1 + 1 + 2 * 8)) {
1806        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1807        return -1;
1808    }
1809
1810    tmp = qemu_get_byte(mis->from_src_file);
1811    if (tmp != postcopy_ram_discard_version) {
1812        error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
1813        return -1;
1814    }
1815
1816    if (!qemu_get_counted_string(mis->from_src_file, ramid)) {
1817        error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
1818        return -1;
1819    }
1820    tmp = qemu_get_byte(mis->from_src_file);
1821    if (tmp != 0) {
1822        error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp);
1823        return -1;
1824    }
1825
1826    len -= 3 + strlen(ramid);
1827    if (len % 16) {
1828        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1829        return -1;
1830    }
1831    trace_loadvm_postcopy_ram_handle_discard_header(ramid, len);
1832    while (len) {
1833        uint64_t start_addr, block_length;
1834        start_addr = qemu_get_be64(mis->from_src_file);
1835        block_length = qemu_get_be64(mis->from_src_file);
1836
1837        len -= 16;
1838        int ret = ram_discard_range(ramid, start_addr, block_length);
1839        if (ret) {
1840            return ret;
1841        }
1842    }
1843    trace_loadvm_postcopy_ram_handle_discard_end();
1844
1845    return 0;
1846}
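
/*
 * For illustration only (compiled out; hypothetical helper): serializing
 * one CMD_POSTCOPY_RAM_DISCARD payload with a single 16-byte chunk, in
 * exactly the order the handler above reads it back.
 */
#if 0
static void example_put_ram_discard(QEMUFile *f, const char *ramid,
                                    uint64_t start_addr,
                                    uint64_t block_length)
{
    qemu_put_byte(f, postcopy_ram_discard_version); /* version (0) */
    qemu_put_counted_string(f, ramid);              /* length byte + name */
    qemu_put_byte(f, 0);                            /* nul terminator */
    qemu_put_be64(f, start_addr);                   /* one 16-byte chunk */
    qemu_put_be64(f, block_length);
}
#endif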
1847
1848/*
1849 * Triggered by a postcopy_listen command; this thread takes over reading
1850 * the input stream, leaving the main thread free to carry on loading the rest
1851 * of the device state (from RAM).
1852 * (TODO: This could do with being in a postcopy file - but then again it's
1853 * just another input loop, not that postcopy-specific.)
1854 */
1855static void *postcopy_ram_listen_thread(void *opaque)
1856{
1857    MigrationIncomingState *mis = migration_incoming_get_current();
1858    QEMUFile *f = mis->from_src_file;
1859    int load_res;
1860    MigrationState *migr = migrate_get_current();
1861
1862    object_ref(OBJECT(migr));
1863
1864    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
1865                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);
1866    qemu_sem_post(&mis->thread_sync_sem);
1867    trace_postcopy_ram_listen_thread_start();
1868
1869    rcu_register_thread();
1870    /*
1871     * Because we're a thread and not a coroutine we can't yield
1872     * in qemu_file, so the file must be in blocking mode now.
1873     */
1874    qemu_file_set_blocking(f, true);
1875    load_res = qemu_loadvm_state_main(f, mis);
1876
1877    /*
1878     * This is tricky: mis->from_src_file can have changed by the time
1879     * qemu_loadvm_state_main() returns, if postcopy recovery happened.
1880     * In the future, we may want a wrapper for the QEMUFile handle.
1881     */
1882    f = mis->from_src_file;
1883
1884    /* And non-blocking again so we don't block in any cleanup */
1885    qemu_file_set_blocking(f, false);
1886
1887    trace_postcopy_ram_listen_thread_exit();
1888    if (load_res < 0) {
1889        qemu_file_set_error(f, load_res);
1890        dirty_bitmap_mig_cancel_incoming();
1891        if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
1892            !migrate_postcopy_ram() && migrate_dirty_bitmaps())
1893        {
1894            error_report("%s: loadvm failed during postcopy: %d. All states "
1895                         "are migrated except dirty bitmaps. Some dirty "
1896                         "bitmaps may be lost, and present migrated dirty "
1897                         "bitmaps are correctly migrated and valid.",
1898                         __func__, load_res);
1899            load_res = 0; /* prevent further exit() */
1900        } else {
1901            error_report("%s: loadvm failed: %d", __func__, load_res);
1902            migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1903                                           MIGRATION_STATUS_FAILED);
1904        }
1905    }
1906    if (load_res >= 0) {
1907        /*
1908         * This looks good, but it's possible that the device loading in the
1909         * main thread hasn't finished yet, and so we might not be in 'RUN'
1910         * state yet; wait for the end of the main thread.
1911         */
1912        qemu_event_wait(&mis->main_thread_load_event);
1913    }
1914    postcopy_ram_incoming_cleanup(mis);
1915
1916    if (load_res < 0) {
1917        /*
1918         * If something went wrong then we have a bad state so exit;
1919         * depending how far we got it might be possible at this point
1920         * to leave the guest running and fire MCEs for pages that never
1921         * arrived as a desperate recovery step.
1922         */
1923        rcu_unregister_thread();
1924        exit(EXIT_FAILURE);
1925    }
1926
1927    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1928                                   MIGRATION_STATUS_COMPLETED);
1929    /*
1930     * If everything has worked fine, then the main thread has waited
1931     * for us to start, and we're the last user of the mis.
1932     * (If something broke, then qemu will have to exit anyway since it
1933     * has a bad migration state.)
1934     */
1935    migration_incoming_state_destroy();
1936    qemu_loadvm_state_cleanup();
1937
1938    rcu_unregister_thread();
1939    mis->have_listen_thread = false;
1940    postcopy_state_set(POSTCOPY_INCOMING_END);
1941
1942    object_unref(OBJECT(migr));
1943
1944    return NULL;
1945}
1946
1947/* After this message we must be able to immediately receive postcopy data */
1948static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
1949{
1950    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
1951    Error *local_err = NULL;
1952
1953    trace_loadvm_postcopy_handle_listen("enter");
1954
1955    if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
1956        error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
1957        return -1;
1958    }
1959    if (ps == POSTCOPY_INCOMING_ADVISE) {
1960        /*
1961         * A rare case: we entered listen without having to do any discards,
1962         * so do the setup that's normally done at the time of the 1st discard.
1963         */
1964        if (migrate_postcopy_ram()) {
1965            postcopy_ram_prepare_discard(mis);
1966        }
1967    }
1968
1969    trace_loadvm_postcopy_handle_listen("after discard");
1970
1971    /*
1972     * Sensitise RAM - it can now generate requests for blocks that don't
1973     * exist.  However, at this point the CPU shouldn't be running, and IO
1974     * shouldn't be doing anything yet, so don't actually expect requests.
1975     */
1976    if (migrate_postcopy_ram()) {
1977        if (postcopy_ram_incoming_setup(mis)) {
1978            postcopy_ram_incoming_cleanup(mis);
1979            return -1;
1980        }
1981    }
1982
1983    trace_loadvm_postcopy_handle_listen("after uffd");
1984
1985    if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_LISTEN, &local_err)) {
1986        error_report_err(local_err);
1987        return -1;
1988    }
1989
1990    mis->have_listen_thread = true;
1991    postcopy_thread_create(mis, &mis->listen_thread, "postcopy/listen",
1992                           postcopy_ram_listen_thread, QEMU_THREAD_DETACHED);
1993    trace_loadvm_postcopy_handle_listen("return");
1994
1995    return 0;
1996}
1997
1998static void loadvm_postcopy_handle_run_bh(void *opaque)
1999{
2000    Error *local_err = NULL;
2001    MigrationIncomingState *mis = opaque;
2002
2003    trace_loadvm_postcopy_handle_run_bh("enter");
2004
2005    /* TODO: we should move all of this lot into postcopy_ram.c or into
2006     * shared code in migration.c
2007     */
2008    cpu_synchronize_all_post_init();
2009
2010    trace_loadvm_postcopy_handle_run_bh("after cpu sync");
2011
2012    qemu_announce_self(&mis->announce_timer, migrate_announce_params());
2013
2014    trace_loadvm_postcopy_handle_run_bh("after announce");
2015
2016    /* Make sure all file formats throw away their mutable metadata.
2017     * If we get an error here, just don't restart the VM yet. */
2018    bdrv_activate_all(&local_err);
2019    if (local_err) {
2020        error_report_err(local_err);
2021        local_err = NULL;
2022        autostart = false;
2023    }
2024
2025    trace_loadvm_postcopy_handle_run_bh("after invalidate cache");
2026
2027    dirty_bitmap_mig_before_vm_start();
2028
2029    if (autostart) {
2030        /* Hold onto your hats, starting the CPU */
2031        vm_start();
2032    } else {
2033        /* leave it paused and let management decide when to start the CPU */
2034        runstate_set(RUN_STATE_PAUSED);
2035    }
2036
2037    qemu_bh_delete(mis->bh);
2038
2039    trace_loadvm_postcopy_handle_run_bh("return");
2040}
2041
2042/* After all discards we can start running and asking for pages */
2043static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
2044{
2045    PostcopyState ps = postcopy_state_get();
2046
2047    trace_loadvm_postcopy_handle_run();
2048    if (ps != POSTCOPY_INCOMING_LISTENING) {
2049        error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
2050        return -1;
2051    }
2052
2053    postcopy_state_set(POSTCOPY_INCOMING_RUNNING);
2054    mis->bh = qemu_bh_new(loadvm_postcopy_handle_run_bh, mis);
2055    qemu_bh_schedule(mis->bh);
2056
2057    /* We need to finish reading the stream from the package
2058     * and also stop reading anything more from the stream that loaded the
2059     * package (since it's now being read by the listener thread).
2060     * LOADVM_QUIT will quit all the layers of nested loadvm loops.
2061     */
2062    return LOADVM_QUIT;
2063}
2064
2065/* Must be called with page_request_mutex held */
2066static gboolean postcopy_sync_page_req(gpointer key, gpointer value,
2067                                       gpointer data)
2068{
2069    MigrationIncomingState *mis = data;
2070    void *host_addr = (void *) key;
2071    ram_addr_t rb_offset;
2072    RAMBlock *rb;
2073    int ret;
2074
2075    rb = qemu_ram_block_from_host(host_addr, true, &rb_offset);
2076    if (!rb) {
2077        /*
2078         * This should _never_ happen.  However, be nice to a migrating VM
2079         * and don't crash/assert.  Post an error (note: deliberately not
2080         * *_once, because we do want to see all the illegal addresses; and
2081         * this can never be triggered by the guest, so we're safe) and move
2082         * on to the next entry.
2083        error_report("%s: illegal host addr %p", __func__, host_addr);
2084        /* Try the next entry */
2085        return FALSE;
2086    }
2087
2088    ret = migrate_send_rp_message_req_pages(mis, rb, rb_offset);
2089    if (ret) {
2090        /* Please refer to above comment. */
2091        error_report("%s: send rp message failed for addr %p",
2092                     __func__, host_addr);
2093        return FALSE;
2094    }
2095
2096    trace_postcopy_page_req_sync(host_addr);
2097
2098    return FALSE;
2099}
2100
2101static void migrate_send_rp_req_pages_pending(MigrationIncomingState *mis)
2102{
2103    WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
2104        g_tree_foreach(mis->page_requested, postcopy_sync_page_req, mis);
2105    }
2106}
2107
2108static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis)
2109{
2110    if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
2111        error_report("%s: illegal resume received", __func__);
2112        /* Don't fail the whole load just because of this. */
2113        return 0;
2114    }
2115
2116    /*
2117     * Reset the last_rb before we resend any page req to source again, since
2118     * the source should have it reset already.
2119     */
2120    mis->last_rb = NULL;
2121
2122    /*
2123     * This means the source VM is ready to resume the postcopy migration.
2124     */
2125    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
2126                      MIGRATION_STATUS_POSTCOPY_ACTIVE);
2127
2128    trace_loadvm_postcopy_handle_resume();
2129
2130    /* Tell source that "we are ready" */
2131    migrate_send_rp_resume_ack(mis, MIGRATION_RESUME_ACK_VALUE);
2132
2133    /*
2134     * After a postcopy recovery, the source should have lost the postcopy
2135     * queue, or potentially the requested pages could have been lost during
2136     * the network down phase.  Let's re-sync with the source VM by re-sending
2137     * all the pending pages that we eagerly need, so these threads won't get
2138     * blocked too long due to the recovery.
2139     *
2140     * Without this procedure, the faulted destination VM threads (waiting for
2141     * page requests right before the postcopy was interrupted) can hang until
2142     * the pages are sent by the source during the background copying of pages,
2143     * or until another thread happens to fault on the same address.
2144     */
2145    migrate_send_rp_req_pages_pending(mis);
2146
2147    /*
2148     * It's time to switch state and release the fault thread to continue
2149     * service page faults.  Note that this should be explicitly after the
2150     * above call to migrate_send_rp_req_pages_pending().  In short:
2151     * migrate_send_rp_message_req_pages() is not thread safe, yet.
2152     */
2153    qemu_sem_post(&mis->postcopy_pause_sem_fault);
2154
2155    return 0;
2156}
2157
2158/**
2159 * Immediately following this command is a blob of data containing an embedded
2160 * chunk of migration stream; read it and load it.
2161 *
2162 * @mis: Incoming state
2163 *
2164 * The length of the packaged data is read from the stream itself.
2165 *
2166 * Returns: Negative values on error
2167 */
2168static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
2169{
2170    int ret;
2171    size_t length;
2172    QIOChannelBuffer *bioc;
2173
2174    length = qemu_get_be32(mis->from_src_file);
2175    trace_loadvm_handle_cmd_packaged(length);
2176
2177    if (length > MAX_VM_CMD_PACKAGED_SIZE) {
2178        error_report("Unreasonably large packaged state: %zu", length);
2179        return -1;
2180    }
2181
2182    bioc = qio_channel_buffer_new(length);
2183    qio_channel_set_name(QIO_CHANNEL(bioc), "migration-loadvm-buffer");
2184    ret = qemu_get_buffer(mis->from_src_file,
2185                          bioc->data,
2186                          length);
2187    if (ret != length) {
2188        object_unref(OBJECT(bioc));
2189        error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%zu",
2190                     ret, length);
2191        return (ret < 0) ? ret : -EAGAIN;
2192    }
2193    bioc->usage += length;
2194    trace_loadvm_handle_cmd_packaged_received(ret);
2195
2196    QEMUFile *packf = qemu_fopen_channel_input(QIO_CHANNEL(bioc));
2197
2198    ret = qemu_loadvm_state_main(packf, mis);
2199    trace_loadvm_handle_cmd_packaged_main(ret);
2200    qemu_fclose(packf);
2201    object_unref(OBJECT(bioc));
2202
2203    return ret;
2204}
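
/*
 * For illustration only (compiled out; hypothetical helper): the
 * CMD_PACKAGED payload consumed above is simply a be32 byte count
 * followed by that many bytes of embedded migration stream, which the
 * receiver replays through qemu_loadvm_state_main().
 */
#if 0
static void example_put_packaged(QEMUFile *f, const uint8_t *blob,
                                 uint32_t blob_len)
{
    /* blob_len must not exceed MAX_VM_CMD_PACKAGED_SIZE */
    qemu_put_be32(f, blob_len);
    qemu_put_buffer(f, blob, blob_len);
}
#endif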
2205
2206/*
2207 * Handle a request from the source for the received bitmap
2208 * (recved_bitmap) on the destination. Payload format:
2209 *
2210 * len (1 byte) + ramblock_name (<255 bytes)
2211 */
2212static int loadvm_handle_recv_bitmap(MigrationIncomingState *mis,
2213                                     uint16_t len)
2214{
2215    QEMUFile *file = mis->from_src_file;
2216    RAMBlock *rb;
2217    char block_name[256];
2218    size_t cnt;
2219
2220    cnt = qemu_get_counted_string(file, block_name);
2221    if (!cnt) {
2222        error_report("%s: failed to read block name", __func__);
2223        return -EINVAL;
2224    }
2225
2226    /* Validate before using the data */
2227    if (qemu_file_get_error(file)) {
2228        return qemu_file_get_error(file);
2229    }
2230
2231    if (len != cnt + 1) {
2232        error_report("%s: invalid payload length (%d)", __func__, len);
2233        return -EINVAL;
2234    }
2235
2236    rb = qemu_ram_block_by_name(block_name);
2237    if (!rb) {
2238        error_report("%s: block '%s' not found", __func__, block_name);
2239        return -EINVAL;
2240    }
2241
2242    migrate_send_rp_recv_bitmap(mis, block_name);
2243
2244    trace_loadvm_handle_recv_bitmap(block_name);
2245
2246    return 0;
2247}
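
/*
 * For illustration only (compiled out; hypothetical helper): the
 * MIG_CMD_RECV_BITMAP payload parsed above is just the counted RAMBlock
 * name, so len on the wire is strlen(name) plus 1 for the length byte.
 */
#if 0
static void example_put_recv_bitmap(QEMUFile *f, const char *block_name)
{
    qemu_put_counted_string(f, block_name);
}
#endif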
2248
2249static int loadvm_process_enable_colo(MigrationIncomingState *mis)
2250{
2251    int ret = migration_incoming_enable_colo();
2252
2253    if (!ret) {
2254        ret = colo_init_ram_cache();
2255        if (ret) {
2256            migration_incoming_disable_colo();
2257        }
2258    }
2259    return ret;
2260}
2261
2262/*
2263 * Process an incoming 'QEMU_VM_COMMAND'
2264 * 0           just a normal return
2265 * LOADVM_QUIT All good, but exit the loop
2266 * <0          Error
2267 */
2268static int loadvm_process_command(QEMUFile *f)
2269{
2270    MigrationIncomingState *mis = migration_incoming_get_current();
2271    uint16_t cmd;
2272    uint16_t len;
2273    uint32_t tmp32;
2274
2275    cmd = qemu_get_be16(f);
2276    len = qemu_get_be16(f);
2277
2278    /* Check validity before continuing to process the command */
2279    if (qemu_file_get_error(f)) {
2280        return qemu_file_get_error(f);
2281    }
2282
2283    if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
2284        error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
2285        return -EINVAL;
2286    }
2287
2288    trace_loadvm_process_command(mig_cmd_args[cmd].name, len);
2289
2290    if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {
2291        error_report("%s received with bad length - expecting %zu, got %d",
2292                     mig_cmd_args[cmd].name,
2293                     (size_t)mig_cmd_args[cmd].len, len);
2294        return -ERANGE;
2295    }
2296
2297    switch (cmd) {
2298    case MIG_CMD_OPEN_RETURN_PATH:
2299        if (mis->to_src_file) {
2300            error_report("CMD_OPEN_RETURN_PATH called when RP already open");
2301            /* Not really a problem, so don't give up */
2302            return 0;
2303        }
2304        mis->to_src_file = qemu_file_get_return_path(f);
2305        if (!mis->to_src_file) {
2306            error_report("CMD_OPEN_RETURN_PATH failed");
2307            return -1;
2308        }
2309        break;
2310
2311    case MIG_CMD_PING:
2312        tmp32 = qemu_get_be32(f);
2313        trace_loadvm_process_command_ping(tmp32);
2314        if (!mis->to_src_file) {
2315            error_report("CMD_PING (0x%x) received with no return path",
2316                         tmp32);
2317            return -1;
2318        }
2319        migrate_send_rp_pong(mis, tmp32);
2320        break;
2321
2322    case MIG_CMD_PACKAGED:
2323        return loadvm_handle_cmd_packaged(mis);
2324
2325    case MIG_CMD_POSTCOPY_ADVISE:
2326        return loadvm_postcopy_handle_advise(mis, len);
2327
2328    case MIG_CMD_POSTCOPY_LISTEN:
2329        return loadvm_postcopy_handle_listen(mis);
2330
2331    case MIG_CMD_POSTCOPY_RUN:
2332        return loadvm_postcopy_handle_run(mis);
2333
2334    case MIG_CMD_POSTCOPY_RAM_DISCARD:
2335        return loadvm_postcopy_ram_handle_discard(mis, len);
2336
2337    case MIG_CMD_POSTCOPY_RESUME:
2338        return loadvm_postcopy_handle_resume(mis);
2339
2340    case MIG_CMD_RECV_BITMAP:
2341        return loadvm_handle_recv_bitmap(mis, len);
2342
2343    case MIG_CMD_ENABLE_COLO:
2344        return loadvm_process_enable_colo(mis);
2345    }
2346
2347    return 0;
2348}
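
/*
 * For illustration only (compiled out; hypothetical helper): the
 * QEMU_VM_COMMAND envelope decoded above - a be16 subcommand from
 * enum qemu_vm_cmd, a be16 payload length, then the payload bytes.
 */
#if 0
static void example_put_command(QEMUFile *f, uint16_t cmd,
                                const uint8_t *payload, uint16_t len)
{
    qemu_put_byte(f, QEMU_VM_COMMAND);
    qemu_put_be16(f, cmd);
    qemu_put_be16(f, len);
    qemu_put_buffer(f, payload, len);
}
#endif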
2349
2350/*
2351 * Read a footer off the wire and check that it matches the expected section
2352 *
2353 * Returns: true if the footer was good
2354 *          false if there is a problem (and calls error_report to say why)
2355 */
2356static bool check_section_footer(QEMUFile *f, SaveStateEntry *se)
2357{
2358    int ret;
2359    uint8_t read_mark;
2360    uint32_t read_section_id;
2361
2362    if (!migrate_get_current()->send_section_footer) {
2363        /* No footer to check */
2364        return true;
2365    }
2366
2367    read_mark = qemu_get_byte(f);
2368
2369    ret = qemu_file_get_error(f);
2370    if (ret) {
2371        error_report("%s: Read section footer failed: %d",
2372                     __func__, ret);
2373        return false;
2374    }
2375
2376    if (read_mark != QEMU_VM_SECTION_FOOTER) {
2377        error_report("Missing section footer for %s", se->idstr);
2378        return false;
2379    }
2380
2381    read_section_id = qemu_get_be32(f);
2382    if (read_section_id != se->load_section_id) {
2383        error_report("Mismatched section id in footer for %s -"
2384                     " read 0x%x expected 0x%x",
2385                     se->idstr, read_section_id, se->load_section_id);
2386        return false;
2387    }
2388
2389    /* All good */
2390    return true;
2391}
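
/*
 * For illustration only (compiled out; hypothetical helper): the footer
 * layout verified above - a QEMU_VM_SECTION_FOOTER marker byte followed
 * by the be32 id of the section it closes.
 */
#if 0
static void example_put_section_footer(QEMUFile *f, SaveStateEntry *se)
{
    qemu_put_byte(f, QEMU_VM_SECTION_FOOTER);
    qemu_put_be32(f, se->section_id);
}
#endif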
2392
2393static int
2394qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis)
2395{
2396    uint32_t instance_id, version_id, section_id;
2397    SaveStateEntry *se;
2398    char idstr[256];
2399    int ret;
2400
2401    /* Read section start */
2402    section_id = qemu_get_be32(f);
2403    if (!qemu_get_counted_string(f, idstr)) {
2404        error_report("Unable to read ID string for section %u",
2405                     section_id);
2406        return -EINVAL;
2407    }
2408    instance_id = qemu_get_be32(f);
2409    version_id = qemu_get_be32(f);
2410
2411    ret = qemu_file_get_error(f);
2412    if (ret) {
2413        error_report("%s: Failed to read instance/version ID: %d",
2414                     __func__, ret);
2415        return ret;
2416    }
2417
2418    trace_qemu_loadvm_state_section_startfull(section_id, idstr,
2419            instance_id, version_id);
2420    /* Find savevm section */
2421    se = find_se(idstr, instance_id);
2422    if (se == NULL) {
2423        error_report("Unknown savevm section or instance '%s' %"PRIu32". "
2424                     "Make sure that your current VM setup matches your "
2425                     "saved VM setup, including any hotplugged devices",
2426                     idstr, instance_id);
2427        return -EINVAL;
2428    }
2429
2430    /* Validate version */
2431    if (version_id > se->version_id) {
2432        error_report("savevm: unsupported version %d for '%s' v%d",
2433                     version_id, idstr, se->version_id);
2434        return -EINVAL;
2435    }
2436    se->load_version_id = version_id;
2437    se->load_section_id = section_id;
2438
2439    /* Under Xen, RAM is restored by the toolstack, not via this stream */
2440    if (xen_enabled() && se->is_ram) {
2441        error_report("loadvm: %s RAM loading not allowed on Xen", idstr);
2442        return -EINVAL;
2443    }
2444
2445    ret = vmstate_load(f, se);
2446    if (ret < 0) {
2447        error_report("error while loading state for instance 0x%"PRIx32" of"
2448                     " device '%s'", instance_id, idstr);
2449        return ret;
2450    }
2451    if (!check_section_footer(f, se)) {
2452        return -EINVAL;
2453    }
2454
2455    return 0;
2456}
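
/*
 * For illustration only (compiled out; hypothetical helper): the section
 * start header read above, in wire order - section type byte, be32
 * section id, counted idstr, be32 instance id, be32 version id.
 */
#if 0
static void example_put_section_start(QEMUFile *f, SaveStateEntry *se)
{
    qemu_put_byte(f, QEMU_VM_SECTION_FULL);
    qemu_put_be32(f, se->section_id);
    qemu_put_counted_string(f, se->idstr);
    qemu_put_be32(f, se->instance_id);
    qemu_put_be32(f, se->version_id);
}
#endif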
2457
2458static int
2459qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis)
2460{
2461    uint32_t section_id;
2462    SaveStateEntry *se;
2463    int ret;
2464
2465    section_id = qemu_get_be32(f);
2466
2467    ret = qemu_file_get_error(f);
2468    if (ret) {
2469        error_report("%s: Failed to read section ID: %d",
2470                     __func__, ret);
2471        return ret;
2472    }
2473
2474    trace_qemu_loadvm_state_section_partend(section_id);
2475    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2476        if (se->load_section_id == section_id) {
2477            break;
2478        }
2479    }
2480    if (se == NULL) {
2481        error_report("Unknown savevm section %d", section_id);
2482        return -EINVAL;
2483    }
2484
2485    ret = vmstate_load(f, se);
2486    if (ret < 0) {
2487        error_report("error while loading state section id %d(%s)",
2488                     section_id, se->idstr);
2489        return ret;
2490    }
2491    if (!check_section_footer(f, se)) {
2492        return -EINVAL;
2493    }
2494
2495    return 0;
2496}
2497
2498static int qemu_loadvm_state_header(QEMUFile *f)
2499{
2500    unsigned int v;
2501    int ret;
2502
2503    v = qemu_get_be32(f);
2504    if (v != QEMU_VM_FILE_MAGIC) {
2505        error_report("Not a migration stream");
2506        return -EINVAL;
2507    }
2508
2509    v = qemu_get_be32(f);
2510    if (v == QEMU_VM_FILE_VERSION_COMPAT) {
2511        error_report("SaveVM v2 format is obsolete and don't work anymore");
2512        return -ENOTSUP;
2513    }
2514    if (v != QEMU_VM_FILE_VERSION) {
2515        error_report("Unsupported migration stream version");
2516        return -ENOTSUP;
2517    }
2518
2519    if (migrate_get_current()->send_configuration) {
2520        if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
2521            error_report("Configuration section missing");
2522            qemu_loadvm_state_cleanup();
2523            return -EINVAL;
2524        }
2525        ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
2526
2527        if (ret) {
2528            qemu_loadvm_state_cleanup();
2529            return ret;
2530        }
2531    }
2532    return 0;
2533}
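
/*
 * For illustration only (compiled out; hypothetical helper): the stream
 * header validated above starts with the file magic and version; when
 * send_configuration is on, a QEMU_VM_CONFIGURATION section follows.
 */
#if 0
static void example_put_state_header(QEMUFile *f)
{
    qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
    qemu_put_be32(f, QEMU_VM_FILE_VERSION);
}
#endif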
2534
2535static int qemu_loadvm_state_setup(QEMUFile *f)
2536{
2537    SaveStateEntry *se;
2538    int ret;
2539
2540    trace_loadvm_state_setup();
2541    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2542        if (!se->ops || !se->ops->load_setup) {
2543            continue;
2544        }
2545        if (se->ops->is_active) {
2546            if (!se->ops->is_active(se->opaque)) {
2547                continue;
2548            }
2549        }
2550
2551        ret = se->ops->load_setup(f, se->opaque);
2552        if (ret < 0) {
2553            qemu_file_set_error(f, ret);
2554            error_report("Load state of device %s failed", se->idstr);
2555            return ret;
2556        }
2557    }
2558    return 0;
2559}
2560
2561void qemu_loadvm_state_cleanup(void)
2562{
2563    SaveStateEntry *se;
2564
2565    trace_loadvm_state_cleanup();
2566    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2567        if (se->ops && se->ops->load_cleanup) {
2568            se->ops->load_cleanup(se->opaque);
2569        }
2570    }
2571}
2572
2573/* Return true if we should continue the migration, or false. */
2574static bool postcopy_pause_incoming(MigrationIncomingState *mis)
2575{
2576    int i;
2577
2578    /*
2579     * If the network is interrupted, any temp pages we received will be
2580     * useless because we didn't mark them as "received" in receivedmap.
2581     * After a proper recovery later (which will sync the src dirty bitmap
2582     * with receivedmap on dest) these cached small pages will be resent.
2583     */
2584    for (i = 0; i < mis->postcopy_channels; i++) {
2585        postcopy_temp_page_reset(&mis->postcopy_tmp_pages[i]);
2586    }
2587
2588    trace_postcopy_pause_incoming();
2589
2590    assert(migrate_postcopy_ram());
2591
2592    /* Clear the triggered bit to allow one recovery */
2593    mis->postcopy_recover_triggered = false;
2594
2595    /*
2596     * Unregistering yank via either the from-src or to-src file would work,
2597     * since the ioc behind them is the same.
2598     */
2599    migration_ioc_unregister_yank_from_file(mis->from_src_file);
2600
2601    assert(mis->from_src_file);
2602    qemu_file_shutdown(mis->from_src_file);
2603    qemu_fclose(mis->from_src_file);
2604    mis->from_src_file = NULL;
2605
2606    assert(mis->to_src_file);
2607    qemu_file_shutdown(mis->to_src_file);
2608    qemu_mutex_lock(&mis->rp_mutex);
2609    qemu_fclose(mis->to_src_file);
2610    mis->to_src_file = NULL;
2611    qemu_mutex_unlock(&mis->rp_mutex);
2612
2613    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
2614                      MIGRATION_STATUS_POSTCOPY_PAUSED);
2615
2616    /* Notify the fault thread for the invalidated file handle */
2617    postcopy_fault_thread_notify(mis);
2618
2619    error_report("Detected IO failure for postcopy. "
2620                 "Migration paused.");
2621
2622    while (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
2623        qemu_sem_wait(&mis->postcopy_pause_sem_dst);
2624    }
2625
2626    trace_postcopy_pause_incoming_continued();
2627
2628    return true;
2629}
2630
2631int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
2632{
2633    uint8_t section_type;
2634    int ret = 0;
2635
2636retry:
2637    while (true) {
2638        section_type = qemu_get_byte(f);
2639
2640        if (qemu_file_get_error(f)) {
2641            ret = qemu_file_get_error(f);
2642            break;
2643        }
2644
2645        trace_qemu_loadvm_state_section(section_type);
2646        switch (section_type) {
2647        case QEMU_VM_SECTION_START:
2648        case QEMU_VM_SECTION_FULL:
2649            ret = qemu_loadvm_section_start_full(f, mis);
2650            if (ret < 0) {
2651                goto out;
2652            }
2653            break;
2654        case QEMU_VM_SECTION_PART:
2655        case QEMU_VM_SECTION_END:
2656            ret = qemu_loadvm_section_part_end(f, mis);
2657            if (ret < 0) {
2658                goto out;
2659            }
2660            break;
2661        case QEMU_VM_COMMAND:
2662            ret = loadvm_process_command(f);
2663            trace_qemu_loadvm_state_section_command(ret);
2664            if ((ret < 0) || (ret == LOADVM_QUIT)) {
2665                goto out;
2666            }
2667            break;
2668        case QEMU_VM_EOF:
2669            /* This is the end of migration */
2670            goto out;
2671        default:
2672            error_report("Unknown savevm section type %d", section_type);
2673            ret = -EINVAL;
2674            goto out;
2675        }
2676    }
2677
2678out:
2679    if (ret < 0) {
2680        qemu_file_set_error(f, ret);
2681
2682        /* Cancel bitmaps incoming regardless of recovery */
2683        dirty_bitmap_mig_cancel_incoming();
2684
2685        /*
2686         * If we are in an active postcopy, then we pause instead of
2687         * bailing out, to at least keep the VM's dirty data.  Note
2688         * that the POSTCOPY_INCOMING_LISTENING stage is still not enough:
2689         * during it we're still receiving device state and we still
2690         * haven't started the VM on the destination.
2691         *
2692         * Only RAM postcopy supports recovery.  Still, if RAM postcopy is
2693         * enabled, a cancelled dirty-bitmap postcopy will not affect RAM
2694         * postcopy recovery.
2695         */
2696        if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
2697            migrate_postcopy_ram() && postcopy_pause_incoming(mis)) {
2698            /* Reset f to point to the newly created channel */
2699            f = mis->from_src_file;
2700            goto retry;
2701        }
2702    }
2703    return ret;
2704}
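
/*
 * Informal sketch of the stream grammar accepted by the loop above
 * (derived from the switch cases; not a formal specification):
 *
 *   stream  := { section | command } QEMU_VM_EOF
 *   section := (START|FULL) full-header data [footer]
 *            | (PART|END)   be32-section-id data [footer]
 *   command := QEMU_VM_COMMAND be16-cmd be16-len payload
 *
 * A minimal stream the loop accepts is therefore just the EOF marker:
 */
#if 0
static void example_put_empty_stream(QEMUFile *f)
{
    qemu_put_byte(f, QEMU_VM_EOF);
}
#endif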
2705
2706int qemu_loadvm_state(QEMUFile *f)
2707{
2708    MigrationIncomingState *mis = migration_incoming_get_current();
2709    Error *local_err = NULL;
2710    int ret;
2711
2712    if (qemu_savevm_state_blocked(&local_err)) {
2713        error_report_err(local_err);
2714        return -EINVAL;
2715    }
2716
2717    ret = qemu_loadvm_state_header(f);
2718    if (ret) {
2719        return ret;
2720    }
2721
2722    if (qemu_loadvm_state_setup(f) != 0) {
2723        return -EINVAL;
2724    }
2725
2726    cpu_synchronize_all_pre_loadvm();
2727
2728    ret = qemu_loadvm_state_main(f, mis);
2729    qemu_event_set(&mis->main_thread_load_event);
2730
2731    trace_qemu_loadvm_state_post_main(ret);
2732
2733    if (mis->have_listen_thread) {
2734        /* Listen thread still going, can't clean up yet */
2735        return ret;
2736    }
2737
2738    if (ret == 0) {
2739        ret = qemu_file_get_error(f);
2740    }
2741
2742    /*
2743     * Try to read in the VMDESC section as well, so that dumping tools that
2744     * intercept our migration stream have the chance to see it.
2745     */
2746
2747    /* We've got to be careful; if we don't read the data and just shut the fd
2748     * then the sender can error if we close while it's still sending.
2749     * We also mustn't read data that isn't there; some transports (RDMA)
2750     * will stall waiting for that data when the source has already closed.
2751     */
2752    if (ret == 0 && should_send_vmdesc()) {
2753        uint8_t *buf;
2754        uint32_t size;
2755        uint8_t  section_type = qemu_get_byte(f);
2756
2757        if (section_type != QEMU_VM_VMDESCRIPTION) {
2758            error_report("Expected vmdescription section, but got %d",
2759                         section_type);
2760            /*
2761             * It doesn't seem worth failing at this point since
2762             * we apparently have an otherwise valid VM state
2763             */
2764        } else {
2765            buf = g_malloc(0x1000);
2766            size = qemu_get_be32(f);
2767
2768            while (size > 0) {
2769                uint32_t read_chunk = MIN(size, 0x1000);
2770                qemu_get_buffer(f, buf, read_chunk);
2771                size -= read_chunk;
2772            }
2773            g_free(buf);
2774        }
2775    }
2776
2777    qemu_loadvm_state_cleanup();
2778    cpu_synchronize_all_post_init();
2779
2780    return ret;
2781}
2782
2783int qemu_load_device_state(QEMUFile *f)
2784{
2785    MigrationIncomingState *mis = migration_incoming_get_current();
2786    int ret;
2787
2788    /* Load QEMU_VM_SECTION_FULL section */
2789    ret = qemu_loadvm_state_main(f, mis);
2790    if (ret < 0) {
2791        error_report("Failed to load device state: %d", ret);
2792        return ret;
2793    }
2794
2795    cpu_synchronize_all_post_init();
2796    return 0;
2797}
2798
2799bool save_snapshot(const char *name, bool overwrite, const char *vmstate,
2800                  bool has_devices, strList *devices, Error **errp)
2801{
2802    BlockDriverState *bs;
2803    QEMUSnapshotInfo sn1, *sn = &sn1;
2804    int ret = -1, ret2;
2805    QEMUFile *f;
2806    int saved_vm_running;
2807    uint64_t vm_state_size;
2808    g_autoptr(GDateTime) now = g_date_time_new_now_local();
2809    AioContext *aio_context;
2810
2811    GLOBAL_STATE_CODE();
2812
2813    if (migration_is_blocked(errp)) {
2814        return false;
2815    }
2816
2817    if (!replay_can_snapshot()) {
2818        error_setg(errp, "Record/replay does not allow making snapshot "
2819                   "right now. Try once more later.");
2820        return false;
2821    }
2822
2823    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
2824        return false;
2825    }
2826
2827    /* Delete old snapshots of the same name */
2828    if (name) {
2829        if (overwrite) {
2830            if (bdrv_all_delete_snapshot(name, has_devices,
2831                                         devices, errp) < 0) {
2832                return false;
2833            }
2834        } else {
2835            ret2 = bdrv_all_has_snapshot(name, has_devices, devices, errp);
2836            if (ret2 < 0) {
2837                return false;
2838            }
2839            if (ret2 == 1) {
2840                error_setg(errp,
2841                           "Snapshot '%s' already exists in one or more devices",
2842                           name);
2843                return false;
2844            }
2845        }
2846    }
2847
2848    bs = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
2849    if (bs == NULL) {
2850        return false;
2851    }
2852    aio_context = bdrv_get_aio_context(bs);
2853
2854    saved_vm_running = runstate_is_running();
2855
2856    ret = global_state_store();
2857    if (ret) {
2858        error_setg(errp, "Error saving global state");
2859        return false;
2860    }
2861    vm_stop(RUN_STATE_SAVE_VM);
2862
2863    bdrv_drain_all_begin();
2864
2865    aio_context_acquire(aio_context);
2866
2867    memset(sn, 0, sizeof(*sn));
2868
2869    /* fill auxiliary fields */
2870    sn->date_sec = g_date_time_to_unix(now);
2871    sn->date_nsec = g_date_time_get_microsecond(now) * 1000;
2872    sn->vm_clock_nsec = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
2873    if (replay_mode != REPLAY_MODE_NONE) {
2874        sn->icount = replay_get_current_icount();
2875    } else {
2876        sn->icount = -1ULL;
2877    }
2878
2879    if (name) {
2880        pstrcpy(sn->name, sizeof(sn->name), name);
2881    } else {
2882        g_autofree char *autoname = g_date_time_format(now,  "vm-%Y%m%d%H%M%S");
2883        pstrcpy(sn->name, sizeof(sn->name), autoname);
2884    }
2885
2886    /* save the VM state */
2887    f = qemu_fopen_bdrv(bs, 1);
2888    if (!f) {
2889        error_setg(errp, "Could not open VM state file");
2890        goto the_end;
2891    }
2892    ret = qemu_savevm_state(f, errp);
2893    vm_state_size = qemu_ftell(f);
2894    ret2 = qemu_fclose(f);
2895    if (ret < 0) {
2896        goto the_end;
2897    }
2898    if (ret2 < 0) {
2899        ret = ret2;
2900        goto the_end;
2901    }
2902
2903    /* The bdrv_all_create_snapshot() call that follows acquires the AioContext
2904     * for itself.  BDRV_POLL_WHILE() does not support nested locking because
2905     * it only releases the lock once.  Therefore synchronous I/O will deadlock
2906     * unless we release the AioContext before bdrv_all_create_snapshot().
2907     */
2908    aio_context_release(aio_context);
2909    aio_context = NULL;
2910
2911    ret = bdrv_all_create_snapshot(sn, bs, vm_state_size,
2912                                   has_devices, devices, errp);
2913    if (ret < 0) {
2914        bdrv_all_delete_snapshot(sn->name, has_devices, devices, NULL);
2915        goto the_end;
2916    }
2917
2918    ret = 0;
2919
2920 the_end:
2921    if (aio_context) {
2922        aio_context_release(aio_context);
2923    }
2924
2925    bdrv_drain_all_end();
2926
2927    if (saved_vm_running) {
2928        vm_start();
2929    }
2930    return ret == 0;
2931}
2932
2933void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live,
2934                                Error **errp)
2935{
2936    QEMUFile *f;
2937    QIOChannelFile *ioc;
2938    int saved_vm_running;
2939    int ret;
2940
2941    if (!has_live) {
2942        /* live defaults to true so old versions of the Xen tool stack can
2943         * have a successful live migration */
2944        live = true;
2945    }
2946
2947    saved_vm_running = runstate_is_running();
2948    vm_stop(RUN_STATE_SAVE_VM);
2949    global_state_store_running();
2950
2951    ioc = qio_channel_file_new_path(filename, O_WRONLY | O_CREAT | O_TRUNC,
2952                                    0660, errp);
2953    if (!ioc) {
2954        goto the_end;
2955    }
2956    qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-save-state");
2957    f = qemu_fopen_channel_output(QIO_CHANNEL(ioc));
2958    object_unref(OBJECT(ioc));
2959    ret = qemu_save_device_state(f);
2960    if (ret < 0 || qemu_fclose(f) < 0) {
2961        error_setg(errp, QERR_IO_ERROR);
2962    } else {
2963        /* libxl calls the QMP command "stop" before calling
2964         * "xen-save-devices-state" and in case of migration failure, libxl
2965         * would call "cont".
2966         * So call bdrv_inactivate_all (release locks) here to let the other
2967         * side of the migration take control of the images.
2968         */
2969        if (live && !saved_vm_running) {
2970            ret = bdrv_inactivate_all();
2971            if (ret) {
2972                error_setg(errp, "%s: bdrv_inactivate_all() failed (%d)",
2973                           __func__, ret);
2974            }
2975        }
2976    }
2977
2978 the_end:
2979    if (saved_vm_running) {
2980        vm_start();
2981    }
2982}
2983
2984void qmp_xen_load_devices_state(const char *filename, Error **errp)
2985{
2986    QEMUFile *f;
2987    QIOChannelFile *ioc;
2988    int ret;
2989
2990    /* Guest must be paused before loading the device state; the RAM state
2991     * will already have been loaded by xc
2992     */
2993    if (runstate_is_running()) {
2994        error_setg(errp, "Cannot update device state while vm is running");
2995        return;
2996    }
2997    vm_stop(RUN_STATE_RESTORE_VM);
2998
2999    ioc = qio_channel_file_new_path(filename, O_RDONLY | O_BINARY, 0, errp);
3000    if (!ioc) {
3001        return;
3002    }
3003    qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-load-state");
3004    f = qemu_fopen_channel_input(QIO_CHANNEL(ioc));
3005    object_unref(OBJECT(ioc));
3006
3007    ret = qemu_loadvm_state(f);
3008    qemu_fclose(f);
3009    if (ret < 0) {
3010        error_setg(errp, QERR_IO_ERROR);
3011    }
3012    migration_incoming_state_destroy();
3013}
3014
3015bool load_snapshot(const char *name, const char *vmstate,
3016                   bool has_devices, strList *devices, Error **errp)
3017{
3018    BlockDriverState *bs_vm_state;
3019    QEMUSnapshotInfo sn;
3020    QEMUFile *f;
3021    int ret;
3022    AioContext *aio_context;
3023    MigrationIncomingState *mis = migration_incoming_get_current();
3024
3025    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
3026        return false;
3027    }
3028    ret = bdrv_all_has_snapshot(name, has_devices, devices, errp);
3029    if (ret < 0) {
3030        return false;
3031    }
3032    if (ret == 0) {
3033        error_setg(errp, "Snapshot '%s' does not exist in one or more devices",
3034                   name);
3035        return false;
3036    }
3037
3038    bs_vm_state = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
3039    if (!bs_vm_state) {
3040        return false;
3041    }
3042    aio_context = bdrv_get_aio_context(bs_vm_state);
3043
3044    /* Don't even try to load empty VM states */
3045    aio_context_acquire(aio_context);
3046    ret = bdrv_snapshot_find(bs_vm_state, &sn, name);
3047    aio_context_release(aio_context);
3048    if (ret < 0) {
3049        return false;
3050    } else if (sn.vm_state_size == 0) {
3051        error_setg(errp, "This is a disk-only snapshot. Revert to it "
3052                   " offline using qemu-img");
3053        return false;
3054    }
3055
3056    /*
3057     * Flush the record/replay queue. The VM state is about to change,
3058     * so we don't need to preserve its consistency.
3059     */
3060    replay_flush_events();
3061
3062    /* Flush all IO requests so they don't interfere with the new state.  */
3063    bdrv_drain_all_begin();
3064
3065    ret = bdrv_all_goto_snapshot(name, has_devices, devices, errp);
3066    if (ret < 0) {
3067        goto err_drain;
3068    }
3069
3070    /* restore the VM state */
3071    f = qemu_fopen_bdrv(bs_vm_state, 0);
3072    if (!f) {
3073        error_setg(errp, "Could not open VM state file");
3074        goto err_drain;
3075    }
3076
3077    qemu_system_reset(SHUTDOWN_CAUSE_NONE);
3078    mis->from_src_file = f;
3079
3080    if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
3081        ret = -EINVAL;
3082        goto err_drain;
3083    }
3084    aio_context_acquire(aio_context);
3085    ret = qemu_loadvm_state(f);
3086    migration_incoming_state_destroy();
3087    aio_context_release(aio_context);
3088
3089    bdrv_drain_all_end();
3090
3091    if (ret < 0) {
3092        error_setg(errp, "Error %d while loading VM state", ret);
3093        return false;
3094    }
3095
3096    return true;
3097
3098err_drain:
3099    bdrv_drain_all_end();
3100    return false;
3101}
3102
3103bool delete_snapshot(const char *name, bool has_devices,
3104                     strList *devices, Error **errp)
3105{
3106    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
3107        return false;
3108    }
3109
3110    if (bdrv_all_delete_snapshot(name, has_devices, devices, errp) < 0) {
3111        return false;
3112    }
3113
3114    return true;
3115}
3116
3117void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
3118{
3119    qemu_ram_set_idstr(mr->ram_block,
3120                       memory_region_name(mr), dev);
3121    qemu_ram_set_migratable(mr->ram_block);
3122}
3123
3124void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev)
3125{
3126    qemu_ram_unset_idstr(mr->ram_block);
3127    qemu_ram_unset_migratable(mr->ram_block);
3128}
3129
3130void vmstate_register_ram_global(MemoryRegion *mr)
3131{
3132    vmstate_register_ram(mr, NULL);
3133}
3134
3135bool vmstate_check_only_migratable(const VMStateDescription *vmsd)
3136{
3137    /* check needed if --only-migratable is specified */
3138    if (!only_migratable) {
3139        return true;
3140    }
3141
3142    return !(vmsd && vmsd->unmigratable);
3143}
3144
3145typedef struct SnapshotJob {
3146    Job common;
3147    char *tag;
3148    char *vmstate;
3149    strList *devices;
3150    Coroutine *co;
3151    Error **errp;
3152    bool ret;
3153} SnapshotJob;
3154
3155static void qmp_snapshot_job_free(SnapshotJob *s)
3156{
3157    g_free(s->tag);
3158    g_free(s->vmstate);
3159    qapi_free_strList(s->devices);
3160}
3161
3162
3163static void snapshot_load_job_bh(void *opaque)
3164{
3165    Job *job = opaque;
3166    SnapshotJob *s = container_of(job, SnapshotJob, common);
3167    int orig_vm_running;
3168
3169    job_progress_set_remaining(&s->common, 1);
3170
3171    orig_vm_running = runstate_is_running();
3172    vm_stop(RUN_STATE_RESTORE_VM);
3173
3174    s->ret = load_snapshot(s->tag, s->vmstate, true, s->devices, s->errp);
3175    if (s->ret && orig_vm_running) {
3176        vm_start();
3177    }
3178
3179    job_progress_update(&s->common, 1);
3180
3181    qmp_snapshot_job_free(s);
3182    aio_co_wake(s->co);
3183}
3184
3185static void snapshot_save_job_bh(void *opaque)
3186{
3187    Job *job = opaque;
3188    SnapshotJob *s = container_of(job, SnapshotJob, common);
3189
3190    job_progress_set_remaining(&s->common, 1);
3191    s->ret = save_snapshot(s->tag, false, s->vmstate,
3192                           true, s->devices, s->errp);
3193    job_progress_update(&s->common, 1);
3194
3195    qmp_snapshot_job_free(s);
3196    aio_co_wake(s->co);
3197}
3198
3199static void snapshot_delete_job_bh(void *opaque)
3200{
3201    Job *job = opaque;
3202    SnapshotJob *s = container_of(job, SnapshotJob, common);
3203
3204    job_progress_set_remaining(&s->common, 1);
3205    s->ret = delete_snapshot(s->tag, true, s->devices, s->errp);
3206    job_progress_update(&s->common, 1);
3207
3208    qmp_snapshot_job_free(s);
3209    aio_co_wake(s->co);
3210}
3211
3212static int coroutine_fn snapshot_save_job_run(Job *job, Error **errp)
3213{
3214    SnapshotJob *s = container_of(job, SnapshotJob, common);
3215    s->errp = errp;
3216    s->co = qemu_coroutine_self();
3217    aio_bh_schedule_oneshot(qemu_get_aio_context(),
3218                            snapshot_save_job_bh, job);
3219    qemu_coroutine_yield();
3220    return s->ret ? 0 : -1;
3221}
3222
3223static int coroutine_fn snapshot_load_job_run(Job *job, Error **errp)
3224{
3225    SnapshotJob *s = container_of(job, SnapshotJob, common);
3226    s->errp = errp;
3227    s->co = qemu_coroutine_self();
3228    aio_bh_schedule_oneshot(qemu_get_aio_context(),
3229                            snapshot_load_job_bh, job);
3230    qemu_coroutine_yield();
3231    return s->ret ? 0 : -1;
3232}
3233
3234static int coroutine_fn snapshot_delete_job_run(Job *job, Error **errp)
3235{
3236    SnapshotJob *s = container_of(job, SnapshotJob, common);
3237    s->errp = errp;
3238    s->co = qemu_coroutine_self();
3239    aio_bh_schedule_oneshot(qemu_get_aio_context(),
3240                            snapshot_delete_job_bh, job);
3241    qemu_coroutine_yield();
3242    return s->ret ? 0 : -1;
3243}
3244
3245
3246static const JobDriver snapshot_load_job_driver = {
3247    .instance_size = sizeof(SnapshotJob),
3248    .job_type      = JOB_TYPE_SNAPSHOT_LOAD,
3249    .run           = snapshot_load_job_run,
3250};
3251
3252static const JobDriver snapshot_save_job_driver = {
3253    .instance_size = sizeof(SnapshotJob),
3254    .job_type      = JOB_TYPE_SNAPSHOT_SAVE,
3255    .run           = snapshot_save_job_run,
3256};
3257
3258static const JobDriver snapshot_delete_job_driver = {
3259    .instance_size = sizeof(SnapshotJob),
3260    .job_type      = JOB_TYPE_SNAPSHOT_DELETE,
3261    .run           = snapshot_delete_job_run,
3262};
3263
3264
3265void qmp_snapshot_save(const char *job_id,
3266                       const char *tag,
3267                       const char *vmstate,
3268                       strList *devices,
3269                       Error **errp)
3270{
3271    SnapshotJob *s;
3272
3273    s = job_create(job_id, &snapshot_save_job_driver, NULL,
3274                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3275                   NULL, NULL, errp);
3276    if (!s) {
3277        return;
3278    }
3279
3280    s->tag = g_strdup(tag);
3281    s->vmstate = g_strdup(vmstate);
3282    s->devices = QAPI_CLONE(strList, devices);
3283
3284    job_start(&s->common);
3285}
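
/*
 * Example QMP usage (the job id, tag, and node names are placeholders):
 *
 *   { "execute": "snapshot-save",
 *     "arguments": { "job-id": "snapsave0", "tag": "my-snap",
 *                    "vmstate": "disk0", "devices": [ "disk0" ] } }
 *
 * The job is created with JOB_MANUAL_DISMISS, so the caller must issue
 * job-dismiss once it has completed.
 */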
3286
3287void qmp_snapshot_load(const char *job_id,
3288                       const char *tag,
3289                       const char *vmstate,
3290                       strList *devices,
3291                       Error **errp)
3292{
3293    SnapshotJob *s;
3294
3295    s = job_create(job_id, &snapshot_load_job_driver, NULL,
3296                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3297                   NULL, NULL, errp);
3298    if (!s) {
3299        return;
3300    }
3301
3302    s->tag = g_strdup(tag);
3303    s->vmstate = g_strdup(vmstate);
3304    s->devices = QAPI_CLONE(strList, devices);
3305
3306    job_start(&s->common);
3307}
3308
3309void qmp_snapshot_delete(const char *job_id,
3310                         const char *tag,
3311                         strList *devices,
3312                         Error **errp)
3313{
3314    SnapshotJob *s;
3315
3316    s = job_create(job_id, &snapshot_delete_job_driver, NULL,
3317                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3318                   NULL, NULL, errp);
3319    if (!s) {
3320        return;
3321    }
3322
3323    s->tag = g_strdup(tag);
3324    s->devices = QAPI_CLONE(strList, devices);
3325
3326    job_start(&s->common);
3327}
3328