qemu/migration/savevm.c
<<
>>
Prefs
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2009-2015 Red Hat Inc
   6 *
   7 * Authors:
   8 *  Juan Quintela <quintela@redhat.com>
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28
  29#include "qemu/osdep.h"
  30#include "hw/boards.h"
  31#include "net/net.h"
  32#include "migration.h"
  33#include "migration/snapshot.h"
  34#include "migration/vmstate.h"
  35#include "migration/misc.h"
  36#include "migration/register.h"
  37#include "migration/global_state.h"
  38#include "ram.h"
  39#include "qemu-file-channel.h"
  40#include "qemu-file.h"
  41#include "savevm.h"
  42#include "postcopy-ram.h"
  43#include "qapi/error.h"
  44#include "qapi/qapi-commands-migration.h"
  45#include "qapi/qmp/json-writer.h"
  46#include "qapi/clone-visitor.h"
  47#include "qapi/qapi-builtin-visit.h"
  48#include "qapi/qmp/qerror.h"
  49#include "qemu/error-report.h"
  50#include "sysemu/cpus.h"
  51#include "exec/memory.h"
  52#include "exec/target_page.h"
  53#include "trace.h"
  54#include "qemu/iov.h"
  55#include "qemu/main-loop.h"
  56#include "block/snapshot.h"
  57#include "qemu/cutils.h"
  58#include "io/channel-buffer.h"
  59#include "io/channel-file.h"
  60#include "sysemu/replay.h"
  61#include "sysemu/runstate.h"
  62#include "sysemu/sysemu.h"
  63#include "sysemu/xen.h"
  64#include "migration/colo.h"
  65#include "qemu/bitmap.h"
  66#include "net/announce.h"
  67#include "qemu/yank.h"
  68#include "yank_functions.h"
  69
  70const unsigned int postcopy_ram_discard_version;
  71
  72/* Subcommands for QEMU_VM_COMMAND */
  73enum qemu_vm_cmd {
  74    MIG_CMD_INVALID = 0,   /* Must be 0 */
  75    MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
  76    MIG_CMD_PING,              /* Request a PONG on the RP */
  77
  78    MIG_CMD_POSTCOPY_ADVISE,       /* Prior to any page transfers, just
  79                                      warn we might want to do PC */
  80    MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
  81                                      pages as it's running. */
  82    MIG_CMD_POSTCOPY_RUN,          /* Start execution */
  83
  84    MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
  85                                      were previously sent during
  86                                      precopy but are dirty. */
  87    MIG_CMD_PACKAGED,          /* Send a wrapped stream within this stream */
  88    MIG_CMD_ENABLE_COLO,       /* Enable COLO */
  89    MIG_CMD_POSTCOPY_RESUME,   /* resume postcopy on dest */
  90    MIG_CMD_RECV_BITMAP,       /* Request for recved bitmap on dst */
  91    MIG_CMD_MAX
  92};
  93
  94#define MAX_VM_CMD_PACKAGED_SIZE UINT32_MAX
  95static struct mig_cmd_args {
  96    ssize_t     len; /* -1 = variable */
  97    const char *name;
  98} mig_cmd_args[] = {
  99    [MIG_CMD_INVALID]          = { .len = -1, .name = "INVALID" },
 100    [MIG_CMD_OPEN_RETURN_PATH] = { .len =  0, .name = "OPEN_RETURN_PATH" },
 101    [MIG_CMD_PING]             = { .len = sizeof(uint32_t), .name = "PING" },
 102    [MIG_CMD_POSTCOPY_ADVISE]  = { .len = -1, .name = "POSTCOPY_ADVISE" },
 103    [MIG_CMD_POSTCOPY_LISTEN]  = { .len =  0, .name = "POSTCOPY_LISTEN" },
 104    [MIG_CMD_POSTCOPY_RUN]     = { .len =  0, .name = "POSTCOPY_RUN" },
 105    [MIG_CMD_POSTCOPY_RAM_DISCARD] = {
 106                                   .len = -1, .name = "POSTCOPY_RAM_DISCARD" },
 107    [MIG_CMD_POSTCOPY_RESUME]  = { .len =  0, .name = "POSTCOPY_RESUME" },
 108    [MIG_CMD_PACKAGED]         = { .len =  4, .name = "PACKAGED" },
 109    [MIG_CMD_RECV_BITMAP]      = { .len = -1, .name = "RECV_BITMAP" },
 110    [MIG_CMD_MAX]              = { .len = -1, .name = "MAX" },
 111};
 112
/* Note for MIG_CMD_POSTCOPY_ADVISE:
 * The format of the arguments depends on the postcopy mode:
 * - postcopy RAM only
 *   uint64_t host page size
 *   uint64_t target page size
 *
 * - postcopy RAM and postcopy dirty bitmaps
 *   format is the same as for postcopy RAM only
 *
 * - postcopy dirty bitmaps only
 *   Nothing. Command length field is 0.
 *
 * Be careful: adding a new postcopy entity with some other parameters should
 * not break the format's self-description ability. A good way is to introduce
 * a generic extensible format, with an exception for the two old entities.
 */
 129
 130/***********************************************************/
 131/* savevm/loadvm support */
 132
 133static ssize_t block_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
 134                                   int64_t pos, Error **errp)
 135{
 136    int ret;
 137    QEMUIOVector qiov;
 138
 139    qemu_iovec_init_external(&qiov, iov, iovcnt);
 140    ret = bdrv_writev_vmstate(opaque, &qiov, pos);
 141    if (ret < 0) {
 142        return ret;
 143    }
 144
 145    return qiov.size;
 146}
 147
 148static ssize_t block_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
 149                                size_t size, Error **errp)
 150{
 151    return bdrv_load_vmstate(opaque, buf, pos, size);
 152}
 153
 154static int bdrv_fclose(void *opaque, Error **errp)
 155{
 156    return bdrv_flush(opaque);
 157}
 158
 159static const QEMUFileOps bdrv_read_ops = {
 160    .get_buffer = block_get_buffer,
 161    .close =      bdrv_fclose
 162};
 163
 164static const QEMUFileOps bdrv_write_ops = {
 165    .writev_buffer  = block_writev_buffer,
 166    .close          = bdrv_fclose
 167};
 168
 169static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
 170{
 171    if (is_writable) {
 172        return qemu_fopen_ops(bs, &bdrv_write_ops, false);
 173    }
 174    return qemu_fopen_ops(bs, &bdrv_read_ops, false);
 175}
 176
 177
 178/* QEMUFile timer support.
 179 * Not in qemu-file.c to not add qemu-timer.c as dependency to qemu-file.c
 180 */
 181
 182void timer_put(QEMUFile *f, QEMUTimer *ts)
 183{
 184    uint64_t expire_time;
 185
 186    expire_time = timer_expire_time_ns(ts);
 187    qemu_put_be64(f, expire_time);
 188}
 189
 190void timer_get(QEMUFile *f, QEMUTimer *ts)
 191{
 192    uint64_t expire_time;
 193
 194    expire_time = qemu_get_be64(f);
 195    if (expire_time != -1) {
 196        timer_mod_ns(ts, expire_time);
 197    } else {
 198        timer_del(ts);
 199    }
 200}
 201
 202
 203/* VMState timer support.
 204 * Not in vmstate.c to not add qemu-timer.c as dependency to vmstate.c
 205 */
 206
 207static int get_timer(QEMUFile *f, void *pv, size_t size,
 208                     const VMStateField *field)
 209{
 210    QEMUTimer *v = pv;
 211    timer_get(f, v);
 212    return 0;
 213}
 214
 215static int put_timer(QEMUFile *f, void *pv, size_t size,
 216                     const VMStateField *field, JSONWriter *vmdesc)
 217{
 218    QEMUTimer *v = pv;
 219    timer_put(f, v);
 220
 221    return 0;
 222}
 223
 224const VMStateInfo vmstate_info_timer = {
 225    .name = "timer",
 226    .get  = get_timer,
 227    .put  = put_timer,
 228};
 229
 230
 231typedef struct CompatEntry {
 232    char idstr[256];
 233    int instance_id;
 234} CompatEntry;
 235
 236typedef struct SaveStateEntry {
 237    QTAILQ_ENTRY(SaveStateEntry) entry;
 238    char idstr[256];
 239    uint32_t instance_id;
 240    int alias_id;
 241    int version_id;
 242    /* version id read from the stream */
 243    int load_version_id;
 244    int section_id;
 245    /* section id read from the stream */
 246    int load_section_id;
 247    const SaveVMHandlers *ops;
 248    const VMStateDescription *vmsd;
 249    void *opaque;
 250    CompatEntry *compat;
 251    int is_ram;
 252} SaveStateEntry;
 253
 254typedef struct SaveState {
 255    QTAILQ_HEAD(, SaveStateEntry) handlers;
 256    SaveStateEntry *handler_pri_head[MIG_PRI_MAX + 1];
 257    int global_section_id;
 258    uint32_t len;
 259    const char *name;
 260    uint32_t target_page_bits;
 261    uint32_t caps_count;
 262    MigrationCapability *capabilities;
 263    QemuUUID uuid;
 264} SaveState;
 265
 266static SaveState savevm_state = {
 267    .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
 268    .handler_pri_head = { [MIG_PRI_DEFAULT ... MIG_PRI_MAX] = NULL },
 269    .global_section_id = 0,
 270};
 271
 272static bool should_validate_capability(int capability)
 273{
 274    assert(capability >= 0 && capability < MIGRATION_CAPABILITY__MAX);
 275    /* Validate only new capabilities to keep compatibility. */
 276    switch (capability) {
 277    case MIGRATION_CAPABILITY_X_IGNORE_SHARED:
 278        return true;
 279    default:
 280        return false;
 281    }
 282}
 283
 284static uint32_t get_validatable_capabilities_count(void)
 285{
 286    MigrationState *s = migrate_get_current();
 287    uint32_t result = 0;
 288    int i;
 289    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
 290        if (should_validate_capability(i) && s->enabled_capabilities[i]) {
 291            result++;
 292        }
 293    }
 294    return result;
 295}
 296
 297static int configuration_pre_save(void *opaque)
 298{
 299    SaveState *state = opaque;
 300    const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
 301    MigrationState *s = migrate_get_current();
 302    int i, j;
 303
 304    state->len = strlen(current_name);
 305    state->name = current_name;
 306    state->target_page_bits = qemu_target_page_bits();
 307
 308    state->caps_count = get_validatable_capabilities_count();
 309    state->capabilities = g_renew(MigrationCapability, state->capabilities,
 310                                  state->caps_count);
 311    for (i = j = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
 312        if (should_validate_capability(i) && s->enabled_capabilities[i]) {
 313            state->capabilities[j++] = i;
 314        }
 315    }
 316    state->uuid = qemu_uuid;
 317
 318    return 0;
 319}
 320
 321static int configuration_post_save(void *opaque)
 322{
 323    SaveState *state = opaque;
 324
 325    g_free(state->capabilities);
 326    state->capabilities = NULL;
 327    state->caps_count = 0;
 328    return 0;
 329}
 330
 331static int configuration_pre_load(void *opaque)
 332{
 333    SaveState *state = opaque;
 334
 335    /* If there is no target-page-bits subsection it means the source
 336     * predates the variable-target-page-bits support and is using the
 337     * minimum possible value for this CPU.
 338     */
 339    state->target_page_bits = qemu_target_page_bits_min();
 340    return 0;
 341}
 342
 343static bool configuration_validate_capabilities(SaveState *state)
 344{
 345    bool ret = true;
 346    MigrationState *s = migrate_get_current();
 347    unsigned long *source_caps_bm;
 348    int i;
 349
 350    source_caps_bm = bitmap_new(MIGRATION_CAPABILITY__MAX);
 351    for (i = 0; i < state->caps_count; i++) {
 352        MigrationCapability capability = state->capabilities[i];
 353        set_bit(capability, source_caps_bm);
 354    }
 355
 356    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
 357        bool source_state, target_state;
 358        if (!should_validate_capability(i)) {
 359            continue;
 360        }
 361        source_state = test_bit(i, source_caps_bm);
 362        target_state = s->enabled_capabilities[i];
 363        if (source_state != target_state) {
 364            error_report("Capability %s is %s, but received capability is %s",
 365                         MigrationCapability_str(i),
 366                         target_state ? "on" : "off",
 367                         source_state ? "on" : "off");
 368            ret = false;
 369            /* Don't break here to report all failed capabilities */
 370        }
 371    }
 372
 373    g_free(source_caps_bm);
 374    return ret;
 375}
 376
 377static int configuration_post_load(void *opaque, int version_id)
 378{
 379    SaveState *state = opaque;
 380    const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
 381    int ret = 0;
 382
 383    if (strncmp(state->name, current_name, state->len) != 0) {
 384        error_report("Machine type received is '%.*s' and local is '%s'",
 385                     (int) state->len, state->name, current_name);
 386        ret = -EINVAL;
 387        goto out;
 388    }
 389
 390    if (state->target_page_bits != qemu_target_page_bits()) {
 391        error_report("Received TARGET_PAGE_BITS is %d but local is %d",
 392                     state->target_page_bits, qemu_target_page_bits());
 393        ret = -EINVAL;
 394        goto out;
 395    }
 396
 397    if (!configuration_validate_capabilities(state)) {
 398        ret = -EINVAL;
 399        goto out;
 400    }
 401
 402out:
 403    g_free((void *)state->name);
 404    state->name = NULL;
 405    state->len = 0;
 406    g_free(state->capabilities);
 407    state->capabilities = NULL;
 408    state->caps_count = 0;
 409
 410    return ret;
 411}
 412
 413static int get_capability(QEMUFile *f, void *pv, size_t size,
 414                          const VMStateField *field)
 415{
 416    MigrationCapability *capability = pv;
 417    char capability_str[UINT8_MAX + 1];
 418    uint8_t len;
 419    int i;
 420
 421    len = qemu_get_byte(f);
 422    qemu_get_buffer(f, (uint8_t *)capability_str, len);
 423    capability_str[len] = '\0';
 424    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
 425        if (!strcmp(MigrationCapability_str(i), capability_str)) {
 426            *capability = i;
 427            return 0;
 428        }
 429    }
 430    error_report("Received unknown capability %s", capability_str);
 431    return -EINVAL;
 432}
 433
 434static int put_capability(QEMUFile *f, void *pv, size_t size,
 435                          const VMStateField *field, JSONWriter *vmdesc)
 436{
 437    MigrationCapability *capability = pv;
 438    const char *capability_str = MigrationCapability_str(*capability);
 439    size_t len = strlen(capability_str);
 440    assert(len <= UINT8_MAX);
 441
 442    qemu_put_byte(f, len);
 443    qemu_put_buffer(f, (uint8_t *)capability_str, len);
 444    return 0;
 445}
 446
 447static const VMStateInfo vmstate_info_capability = {
 448    .name = "capability",
 449    .get  = get_capability,
 450    .put  = put_capability,
 451};
 452
 453/* The target-page-bits subsection is present only if the
 454 * target page size is not the same as the default (ie the
 455 * minimum page size for a variable-page-size guest CPU).
 456 * If it is present then it contains the actual target page
 457 * bits for the machine, and migration will fail if the
 458 * two ends don't agree about it.
 459 */
 460static bool vmstate_target_page_bits_needed(void *opaque)
 461{
 462    return qemu_target_page_bits()
 463        > qemu_target_page_bits_min();
 464}
 465
 466static const VMStateDescription vmstate_target_page_bits = {
 467    .name = "configuration/target-page-bits",
 468    .version_id = 1,
 469    .minimum_version_id = 1,
 470    .needed = vmstate_target_page_bits_needed,
 471    .fields = (VMStateField[]) {
 472        VMSTATE_UINT32(target_page_bits, SaveState),
 473        VMSTATE_END_OF_LIST()
 474    }
 475};
 476
 477static bool vmstate_capabilites_needed(void *opaque)
 478{
 479    return get_validatable_capabilities_count() > 0;
 480}
 481
 482static const VMStateDescription vmstate_capabilites = {
 483    .name = "configuration/capabilities",
 484    .version_id = 1,
 485    .minimum_version_id = 1,
 486    .needed = vmstate_capabilites_needed,
 487    .fields = (VMStateField[]) {
 488        VMSTATE_UINT32_V(caps_count, SaveState, 1),
 489        VMSTATE_VARRAY_UINT32_ALLOC(capabilities, SaveState, caps_count, 1,
 490                                    vmstate_info_capability,
 491                                    MigrationCapability),
 492        VMSTATE_END_OF_LIST()
 493    }
 494};
 495
 496static bool vmstate_uuid_needed(void *opaque)
 497{
 498    return qemu_uuid_set && migrate_validate_uuid();
 499}
 500
 501static int vmstate_uuid_post_load(void *opaque, int version_id)
 502{
 503    SaveState *state = opaque;
 504    char uuid_src[UUID_FMT_LEN + 1];
 505    char uuid_dst[UUID_FMT_LEN + 1];
 506
 507    if (!qemu_uuid_set) {
 508        /*
 509         * It's warning because user might not know UUID in some cases,
 510         * e.g. load an old snapshot
 511         */
 512        qemu_uuid_unparse(&state->uuid, uuid_src);
 513        warn_report("UUID is received %s, but local uuid isn't set",
 514                     uuid_src);
 515        return 0;
 516    }
 517    if (!qemu_uuid_is_equal(&state->uuid, &qemu_uuid)) {
 518        qemu_uuid_unparse(&state->uuid, uuid_src);
 519        qemu_uuid_unparse(&qemu_uuid, uuid_dst);
 520        error_report("UUID received is %s and local is %s", uuid_src, uuid_dst);
 521        return -EINVAL;
 522    }
 523    return 0;
 524}
 525
 526static const VMStateDescription vmstate_uuid = {
 527    .name = "configuration/uuid",
 528    .version_id = 1,
 529    .minimum_version_id = 1,
 530    .needed = vmstate_uuid_needed,
 531    .post_load = vmstate_uuid_post_load,
 532    .fields = (VMStateField[]) {
 533        VMSTATE_UINT8_ARRAY_V(uuid.data, SaveState, sizeof(QemuUUID), 1),
 534        VMSTATE_END_OF_LIST()
 535    }
 536};
 537
 538static const VMStateDescription vmstate_configuration = {
 539    .name = "configuration",
 540    .version_id = 1,
 541    .pre_load = configuration_pre_load,
 542    .post_load = configuration_post_load,
 543    .pre_save = configuration_pre_save,
 544    .post_save = configuration_post_save,
 545    .fields = (VMStateField[]) {
 546        VMSTATE_UINT32(len, SaveState),
 547        VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len),
 548        VMSTATE_END_OF_LIST()
 549    },
 550    .subsections = (const VMStateDescription *[]) {
 551        &vmstate_target_page_bits,
 552        &vmstate_capabilites,
 553        &vmstate_uuid,
 554        NULL
 555    }
 556};
 557
 558static void dump_vmstate_vmsd(FILE *out_file,
 559                              const VMStateDescription *vmsd, int indent,
 560                              bool is_subsection);
 561
 562static void dump_vmstate_vmsf(FILE *out_file, const VMStateField *field,
 563                              int indent)
 564{
 565    fprintf(out_file, "%*s{\n", indent, "");
 566    indent += 2;
 567    fprintf(out_file, "%*s\"field\": \"%s\",\n", indent, "", field->name);
 568    fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
 569            field->version_id);
 570    fprintf(out_file, "%*s\"field_exists\": %s,\n", indent, "",
 571            field->field_exists ? "true" : "false");
 572    fprintf(out_file, "%*s\"size\": %zu", indent, "", field->size);
 573    if (field->vmsd != NULL) {
 574        fprintf(out_file, ",\n");
 575        dump_vmstate_vmsd(out_file, field->vmsd, indent, false);
 576    }
 577    fprintf(out_file, "\n%*s}", indent - 2, "");
 578}
 579
 580static void dump_vmstate_vmss(FILE *out_file,
 581                              const VMStateDescription **subsection,
 582                              int indent)
 583{
 584    if (*subsection != NULL) {
 585        dump_vmstate_vmsd(out_file, *subsection, indent, true);
 586    }
 587}
 588
 589static void dump_vmstate_vmsd(FILE *out_file,
 590                              const VMStateDescription *vmsd, int indent,
 591                              bool is_subsection)
 592{
 593    if (is_subsection) {
 594        fprintf(out_file, "%*s{\n", indent, "");
 595    } else {
 596        fprintf(out_file, "%*s\"%s\": {\n", indent, "", "Description");
 597    }
 598    indent += 2;
 599    fprintf(out_file, "%*s\"name\": \"%s\",\n", indent, "", vmsd->name);
 600    fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
 601            vmsd->version_id);
 602    fprintf(out_file, "%*s\"minimum_version_id\": %d", indent, "",
 603            vmsd->minimum_version_id);
 604    if (vmsd->fields != NULL) {
 605        const VMStateField *field = vmsd->fields;
 606        bool first;
 607
 608        fprintf(out_file, ",\n%*s\"Fields\": [\n", indent, "");
 609        first = true;
 610        while (field->name != NULL) {
 611            if (field->flags & VMS_MUST_EXIST) {
 612                /* Ignore VMSTATE_VALIDATE bits; these don't get migrated */
 613                field++;
 614                continue;
 615            }
 616            if (!first) {
 617                fprintf(out_file, ",\n");
 618            }
 619            dump_vmstate_vmsf(out_file, field, indent + 2);
 620            field++;
 621            first = false;
 622        }
 623        fprintf(out_file, "\n%*s]", indent, "");
 624    }
 625    if (vmsd->subsections != NULL) {
 626        const VMStateDescription **subsection = vmsd->subsections;
 627        bool first;
 628
 629        fprintf(out_file, ",\n%*s\"Subsections\": [\n", indent, "");
 630        first = true;
 631        while (*subsection != NULL) {
 632            if (!first) {
 633                fprintf(out_file, ",\n");
 634            }
 635            dump_vmstate_vmss(out_file, subsection, indent + 2);
 636            subsection++;
 637            first = false;
 638        }
 639        fprintf(out_file, "\n%*s]", indent, "");
 640    }
 641    fprintf(out_file, "\n%*s}", indent - 2, "");
 642}
 643
 644static void dump_machine_type(FILE *out_file)
 645{
 646    MachineClass *mc;
 647
 648    mc = MACHINE_GET_CLASS(current_machine);
 649
 650    fprintf(out_file, "  \"vmschkmachine\": {\n");
 651    fprintf(out_file, "    \"Name\": \"%s\"\n", mc->name);
 652    fprintf(out_file, "  },\n");
 653}
 654
 655void dump_vmstate_json_to_file(FILE *out_file)
 656{
 657    GSList *list, *elt;
 658    bool first;
 659
 660    fprintf(out_file, "{\n");
 661    dump_machine_type(out_file);
 662
 663    first = true;
 664    list = object_class_get_list(TYPE_DEVICE, true);
 665    for (elt = list; elt; elt = elt->next) {
 666        DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data,
 667                                             TYPE_DEVICE);
 668        const char *name;
 669        int indent = 2;
 670
 671        if (!dc->vmsd) {
 672            continue;
 673        }
 674
 675        if (!first) {
 676            fprintf(out_file, ",\n");
 677        }
 678        name = object_class_get_name(OBJECT_CLASS(dc));
 679        fprintf(out_file, "%*s\"%s\": {\n", indent, "", name);
 680        indent += 2;
 681        fprintf(out_file, "%*s\"Name\": \"%s\",\n", indent, "", name);
 682        fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
 683                dc->vmsd->version_id);
 684        fprintf(out_file, "%*s\"minimum_version_id\": %d,\n", indent, "",
 685                dc->vmsd->minimum_version_id);
 686
 687        dump_vmstate_vmsd(out_file, dc->vmsd, indent, false);
 688
 689        fprintf(out_file, "\n%*s}", indent - 2, "");
 690        first = false;
 691    }
 692    fprintf(out_file, "\n}\n");
 693    fclose(out_file);
 694    g_slist_free(list);
 695}
 696
 697static uint32_t calculate_new_instance_id(const char *idstr)
 698{
 699    SaveStateEntry *se;
 700    uint32_t instance_id = 0;
 701
 702    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
 703        if (strcmp(idstr, se->idstr) == 0
 704            && instance_id <= se->instance_id) {
 705            instance_id = se->instance_id + 1;
 706        }
 707    }
 708    /* Make sure we never loop over without being noticed */
 709    assert(instance_id != VMSTATE_INSTANCE_ID_ANY);
 710    return instance_id;
 711}
 712
 713static int calculate_compat_instance_id(const char *idstr)
 714{
 715    SaveStateEntry *se;
 716    int instance_id = 0;
 717
 718    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
 719        if (!se->compat) {
 720            continue;
 721        }
 722
 723        if (strcmp(idstr, se->compat->idstr) == 0
 724            && instance_id <= se->compat->instance_id) {
 725            instance_id = se->compat->instance_id + 1;
 726        }
 727    }
 728    return instance_id;
 729}
 730
 731static inline MigrationPriority save_state_priority(SaveStateEntry *se)
 732{
 733    if (se->vmsd) {
 734        return se->vmsd->priority;
 735    }
 736    return MIG_PRI_DEFAULT;
 737}
 738
 739static void savevm_state_handler_insert(SaveStateEntry *nse)
 740{
 741    MigrationPriority priority = save_state_priority(nse);
 742    SaveStateEntry *se;
 743    int i;
 744
 745    assert(priority <= MIG_PRI_MAX);
 746
 747    for (i = priority - 1; i >= 0; i--) {
 748        se = savevm_state.handler_pri_head[i];
 749        if (se != NULL) {
 750            assert(save_state_priority(se) < priority);
 751            break;
 752        }
 753    }
 754
 755    if (i >= 0) {
 756        QTAILQ_INSERT_BEFORE(se, nse, entry);
 757    } else {
 758        QTAILQ_INSERT_TAIL(&savevm_state.handlers, nse, entry);
 759    }
 760
 761    if (savevm_state.handler_pri_head[priority] == NULL) {
 762        savevm_state.handler_pri_head[priority] = nse;
 763    }
 764}
 765
 766static void savevm_state_handler_remove(SaveStateEntry *se)
 767{
 768    SaveStateEntry *next;
 769    MigrationPriority priority = save_state_priority(se);
 770
 771    if (se == savevm_state.handler_pri_head[priority]) {
 772        next = QTAILQ_NEXT(se, entry);
 773        if (next != NULL && save_state_priority(next) == priority) {
 774            savevm_state.handler_pri_head[priority] = next;
 775        } else {
 776            savevm_state.handler_pri_head[priority] = NULL;
 777        }
 778    }
 779    QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
 780}
 781
 782/* TODO: Individual devices generally have very little idea about the rest
 783   of the system, so instance_id should be removed/replaced.
 784   Meanwhile pass -1 as instance_id if you do not already have a clearly
 785   distinguishing id for all instances of your device class. */
 786int register_savevm_live(const char *idstr,
 787                         uint32_t instance_id,
 788                         int version_id,
 789                         const SaveVMHandlers *ops,
 790                         void *opaque)
 791{
 792    SaveStateEntry *se;
 793
 794    se = g_new0(SaveStateEntry, 1);
 795    se->version_id = version_id;
 796    se->section_id = savevm_state.global_section_id++;
 797    se->ops = ops;
 798    se->opaque = opaque;
 799    se->vmsd = NULL;
 800    /* if this is a live_savem then set is_ram */
 801    if (ops->save_setup != NULL) {
 802        se->is_ram = 1;
 803    }
 804
 805    pstrcat(se->idstr, sizeof(se->idstr), idstr);
 806
 807    if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
 808        se->instance_id = calculate_new_instance_id(se->idstr);
 809    } else {
 810        se->instance_id = instance_id;
 811    }
 812    assert(!se->compat || se->instance_id == 0);
 813    savevm_state_handler_insert(se);
 814    return 0;
 815}
 816
 817void unregister_savevm(VMStateIf *obj, const char *idstr, void *opaque)
 818{
 819    SaveStateEntry *se, *new_se;
 820    char id[256] = "";
 821
 822    if (obj) {
 823        char *oid = vmstate_if_get_id(obj);
 824        if (oid) {
 825            pstrcpy(id, sizeof(id), oid);
 826            pstrcat(id, sizeof(id), "/");
 827            g_free(oid);
 828        }
 829    }
 830    pstrcat(id, sizeof(id), idstr);
 831
 832    QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
 833        if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) {
 834            savevm_state_handler_remove(se);
 835            g_free(se->compat);
 836            g_free(se);
 837        }
 838    }
 839}
 840
 841int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id,
 842                                   const VMStateDescription *vmsd,
 843                                   void *opaque, int alias_id,
 844                                   int required_for_version,
 845                                   Error **errp)
 846{
 847    SaveStateEntry *se;
 848
 849    /* If this triggers, alias support can be dropped for the vmsd. */
 850    assert(alias_id == -1 || required_for_version >= vmsd->minimum_version_id);
 851
 852    se = g_new0(SaveStateEntry, 1);
 853    se->version_id = vmsd->version_id;
 854    se->section_id = savevm_state.global_section_id++;
 855    se->opaque = opaque;
 856    se->vmsd = vmsd;
 857    se->alias_id = alias_id;
 858
 859    if (obj) {
 860        char *id = vmstate_if_get_id(obj);
 861        if (id) {
 862            if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
 863                sizeof(se->idstr)) {
 864                error_setg(errp, "Path too long for VMState (%s)", id);
 865                g_free(id);
 866                g_free(se);
 867
 868                return -1;
 869            }
 870            g_free(id);
 871
 872            se->compat = g_new0(CompatEntry, 1);
 873            pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name);
 874            se->compat->instance_id = instance_id == VMSTATE_INSTANCE_ID_ANY ?
 875                         calculate_compat_instance_id(vmsd->name) : instance_id;
 876            instance_id = VMSTATE_INSTANCE_ID_ANY;
 877        }
 878    }
 879    pstrcat(se->idstr, sizeof(se->idstr), vmsd->name);
 880
 881    if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
 882        se->instance_id = calculate_new_instance_id(se->idstr);
 883    } else {
 884        se->instance_id = instance_id;
 885    }
 886    assert(!se->compat || se->instance_id == 0);
 887    savevm_state_handler_insert(se);
 888    return 0;
 889}
 890
 891void vmstate_unregister(VMStateIf *obj, const VMStateDescription *vmsd,
 892                        void *opaque)
 893{
 894    SaveStateEntry *se, *new_se;
 895
 896    QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
 897        if (se->vmsd == vmsd && se->opaque == opaque) {
 898            savevm_state_handler_remove(se);
 899            g_free(se->compat);
 900            g_free(se);
 901        }
 902    }
 903}
 904
 905static int vmstate_load(QEMUFile *f, SaveStateEntry *se)
 906{
 907    trace_vmstate_load(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
 908    if (!se->vmsd) {         /* Old style */
 909        return se->ops->load_state(f, se->opaque, se->load_version_id);
 910    }
 911    return vmstate_load_state(f, se->vmsd, se->opaque, se->load_version_id);
 912}
 913
 914static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se,
 915                                   JSONWriter *vmdesc)
 916{
 917    int64_t old_offset, size;
 918
 919    old_offset = qemu_ftell_fast(f);
 920    se->ops->save_state(f, se->opaque);
 921    size = qemu_ftell_fast(f) - old_offset;
 922
 923    if (vmdesc) {
 924        json_writer_int64(vmdesc, "size", size);
 925        json_writer_start_array(vmdesc, "fields");
 926        json_writer_start_object(vmdesc, NULL);
 927        json_writer_str(vmdesc, "name", "data");
 928        json_writer_int64(vmdesc, "size", size);
 929        json_writer_str(vmdesc, "type", "buffer");
 930        json_writer_end_object(vmdesc);
 931        json_writer_end_array(vmdesc);
 932    }
 933}
 934
 935static int vmstate_save(QEMUFile *f, SaveStateEntry *se,
 936                        JSONWriter *vmdesc)
 937{
 938    trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
 939    if (!se->vmsd) {
 940        vmstate_save_old_style(f, se, vmdesc);
 941        return 0;
 942    }
 943    return vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
 944}
 945
 946/*
 947 * Write the header for device section (QEMU_VM_SECTION START/END/PART/FULL)
 948 */
 949static void save_section_header(QEMUFile *f, SaveStateEntry *se,
 950                                uint8_t section_type)
 951{
 952    qemu_put_byte(f, section_type);
 953    qemu_put_be32(f, se->section_id);
 954
 955    if (section_type == QEMU_VM_SECTION_FULL ||
 956        section_type == QEMU_VM_SECTION_START) {
 957        /* ID string */
 958        size_t len = strlen(se->idstr);
 959        qemu_put_byte(f, len);
 960        qemu_put_buffer(f, (uint8_t *)se->idstr, len);
 961
 962        qemu_put_be32(f, se->instance_id);
 963        qemu_put_be32(f, se->version_id);
 964    }
 965}
 966
 967/*
 968 * Write a footer onto device sections that catches cases misformatted device
 969 * sections.
 970 */
 971static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
 972{
 973    if (migrate_get_current()->send_section_footer) {
 974        qemu_put_byte(f, QEMU_VM_SECTION_FOOTER);
 975        qemu_put_be32(f, se->section_id);
 976    }
 977}
 978
 979/**
 980 * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
 981 *                           command and associated data.
 982 *
 983 * @f: File to send command on
 984 * @command: Command type to send
 985 * @len: Length of associated data
 986 * @data: Data associated with command.
 987 */
 988static void qemu_savevm_command_send(QEMUFile *f,
 989                                     enum qemu_vm_cmd command,
 990                                     uint16_t len,
 991                                     uint8_t *data)
 992{
 993    trace_savevm_command_send(command, len);
 994    qemu_put_byte(f, QEMU_VM_COMMAND);
 995    qemu_put_be16(f, (uint16_t)command);
 996    qemu_put_be16(f, len);
 997    qemu_put_buffer(f, data, len);
 998    qemu_fflush(f);
 999}
1000
1001void qemu_savevm_send_colo_enable(QEMUFile *f)
1002{
1003    trace_savevm_send_colo_enable();
1004    qemu_savevm_command_send(f, MIG_CMD_ENABLE_COLO, 0, NULL);
1005}
1006
1007void qemu_savevm_send_ping(QEMUFile *f, uint32_t value)
1008{
1009    uint32_t buf;
1010
1011    trace_savevm_send_ping(value);
1012    buf = cpu_to_be32(value);
1013    qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf);
1014}
1015
1016void qemu_savevm_send_open_return_path(QEMUFile *f)
1017{
1018    trace_savevm_send_open_return_path();
1019    qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL);
1020}
1021
1022/* We have a buffer of data to send; we don't want that all to be loaded
1023 * by the command itself, so the command contains just the length of the
1024 * extra buffer that we then send straight after it.
1025 * TODO: Must be a better way to organise that
1026 *
1027 * Returns:
1028 *    0 on success
1029 *    -ve on error
1030 */
1031int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len)
1032{
1033    uint32_t tmp;
1034
1035    if (len > MAX_VM_CMD_PACKAGED_SIZE) {
1036        error_report("%s: Unreasonably large packaged state: %zu",
1037                     __func__, len);
1038        return -1;
1039    }
1040
1041    tmp = cpu_to_be32(len);
1042
1043    trace_qemu_savevm_send_packaged();
1044    qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp);
1045
1046    qemu_put_buffer(f, buf, len);
1047
1048    return 0;
1049}
1050
1051/* Send prior to any postcopy transfer */
1052void qemu_savevm_send_postcopy_advise(QEMUFile *f)
1053{
1054    if (migrate_postcopy_ram()) {
1055        uint64_t tmp[2];
1056        tmp[0] = cpu_to_be64(ram_pagesize_summary());
1057        tmp[1] = cpu_to_be64(qemu_target_page_size());
1058
1059        trace_qemu_savevm_send_postcopy_advise();
1060        qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE,
1061                                 16, (uint8_t *)tmp);
1062    } else {
1063        qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 0, NULL);
1064    }
1065}
1066
1067/* Sent prior to starting the destination running in postcopy, discard pages
1068 * that have already been sent but redirtied on the source.
1069 * CMD_POSTCOPY_RAM_DISCARD consist of:
1070 *      byte   version (0)
1071 *      byte   Length of name field (not including 0)
1072 *  n x byte   RAM block name
1073 *      byte   0 terminator (just for safety)
1074 *  n x        Byte ranges within the named RAMBlock
1075 *      be64   Start of the range
1076 *      be64   Length
1077 *
1078 *  name:  RAMBlock name that these entries are part of
1079 *  len: Number of page entries
1080 *  start_list: 'len' addresses
1081 *  length_list: 'len' addresses
1082 *
1083 */
1084void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
1085                                           uint16_t len,
1086                                           uint64_t *start_list,
1087                                           uint64_t *length_list)
1088{
1089    uint8_t *buf;
1090    uint16_t tmplen;
1091    uint16_t t;
1092    size_t name_len = strlen(name);
1093
1094    trace_qemu_savevm_send_postcopy_ram_discard(name, len);
1095    assert(name_len < 256);
1096    buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len);
1097    buf[0] = postcopy_ram_discard_version;
1098    buf[1] = name_len;
1099    memcpy(buf + 2, name, name_len);
1100    tmplen = 2 + name_len;
1101    buf[tmplen++] = '\0';
1102
1103    for (t = 0; t < len; t++) {
1104        stq_be_p(buf + tmplen, start_list[t]);
1105        tmplen += 8;
1106        stq_be_p(buf + tmplen, length_list[t]);
1107        tmplen += 8;
1108    }
1109    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
1110    g_free(buf);
1111}
1112
1113/* Get the destination into a state where it can receive postcopy data. */
1114void qemu_savevm_send_postcopy_listen(QEMUFile *f)
1115{
1116    trace_savevm_send_postcopy_listen();
1117    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
1118}
1119
1120/* Kick the destination into running */
1121void qemu_savevm_send_postcopy_run(QEMUFile *f)
1122{
1123    trace_savevm_send_postcopy_run();
1124    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
1125}
1126
1127void qemu_savevm_send_postcopy_resume(QEMUFile *f)
1128{
1129    trace_savevm_send_postcopy_resume();
1130    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RESUME, 0, NULL);
1131}
1132
1133void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name)
1134{
1135    size_t len;
1136    char buf[256];
1137
1138    trace_savevm_send_recv_bitmap(block_name);
1139
1140    buf[0] = len = strlen(block_name);
1141    memcpy(buf + 1, block_name, len);
1142
1143    qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf);
1144}
1145
1146bool qemu_savevm_state_blocked(Error **errp)
1147{
1148    SaveStateEntry *se;
1149
1150    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1151        if (se->vmsd && se->vmsd->unmigratable) {
1152            error_setg(errp, "State blocked by non-migratable device '%s'",
1153                       se->idstr);
1154            return true;
1155        }
1156    }
1157    return false;
1158}
1159
1160void qemu_savevm_non_migratable_list(strList **reasons)
1161{
1162    SaveStateEntry *se;
1163
1164    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1165        if (se->vmsd && se->vmsd->unmigratable) {
1166            QAPI_LIST_PREPEND(*reasons,
1167                              g_strdup_printf("non-migratable device: %s",
1168                                              se->idstr));
1169        }
1170    }
1171}
1172
1173void qemu_savevm_state_header(QEMUFile *f)
1174{
1175    trace_savevm_state_header();
1176    qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1177    qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1178
1179    if (migrate_get_current()->send_configuration) {
1180        qemu_put_byte(f, QEMU_VM_CONFIGURATION);
1181        vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
1182    }
1183}
1184
1185bool qemu_savevm_state_guest_unplug_pending(void)
1186{
1187    SaveStateEntry *se;
1188
1189    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1190        if (se->vmsd && se->vmsd->dev_unplug_pending &&
1191            se->vmsd->dev_unplug_pending(se->opaque)) {
1192            return true;
1193        }
1194    }
1195
1196    return false;
1197}
1198
1199void qemu_savevm_state_setup(QEMUFile *f)
1200{
1201    SaveStateEntry *se;
1202    Error *local_err = NULL;
1203    int ret;
1204
1205    trace_savevm_state_setup();
1206    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1207        if (!se->ops || !se->ops->save_setup) {
1208            continue;
1209        }
1210        if (se->ops->is_active) {
1211            if (!se->ops->is_active(se->opaque)) {
1212                continue;
1213            }
1214        }
1215        save_section_header(f, se, QEMU_VM_SECTION_START);
1216
1217        ret = se->ops->save_setup(f, se->opaque);
1218        save_section_footer(f, se);
1219        if (ret < 0) {
1220            qemu_file_set_error(f, ret);
1221            break;
1222        }
1223    }
1224
1225    if (precopy_notify(PRECOPY_NOTIFY_SETUP, &local_err)) {
1226        error_report_err(local_err);
1227    }
1228}
1229
1230int qemu_savevm_state_resume_prepare(MigrationState *s)
1231{
1232    SaveStateEntry *se;
1233    int ret;
1234
1235    trace_savevm_state_resume_prepare();
1236
1237    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1238        if (!se->ops || !se->ops->resume_prepare) {
1239            continue;
1240        }
1241        if (se->ops->is_active) {
1242            if (!se->ops->is_active(se->opaque)) {
1243                continue;
1244            }
1245        }
1246        ret = se->ops->resume_prepare(s, se->opaque);
1247        if (ret < 0) {
1248            return ret;
1249        }
1250    }
1251
1252    return 0;
1253}
1254
1255/*
1256 * this function has three return values:
1257 *   negative: there was one error, and we have -errno.
1258 *   0 : We haven't finished, caller have to go again
1259 *   1 : We have finished, we can go to complete phase
1260 */
1261int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
1262{
1263    SaveStateEntry *se;
1264    int ret = 1;
1265
1266    trace_savevm_state_iterate();
1267    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1268        if (!se->ops || !se->ops->save_live_iterate) {
1269            continue;
1270        }
1271        if (se->ops->is_active &&
1272            !se->ops->is_active(se->opaque)) {
1273            continue;
1274        }
1275        if (se->ops->is_active_iterate &&
1276            !se->ops->is_active_iterate(se->opaque)) {
1277            continue;
1278        }
1279        /*
1280         * In the postcopy phase, any device that doesn't know how to
1281         * do postcopy should have saved it's state in the _complete
1282         * call that's already run, it might get confused if we call
1283         * iterate afterwards.
1284         */
1285        if (postcopy &&
1286            !(se->ops->has_postcopy && se->ops->has_postcopy(se->opaque))) {
1287            continue;
1288        }
1289        if (qemu_file_rate_limit(f)) {
1290            return 0;
1291        }
1292        trace_savevm_section_start(se->idstr, se->section_id);
1293
1294        save_section_header(f, se, QEMU_VM_SECTION_PART);
1295
1296        ret = se->ops->save_live_iterate(f, se->opaque);
1297        trace_savevm_section_end(se->idstr, se->section_id, ret);
1298        save_section_footer(f, se);
1299
1300        if (ret < 0) {
1301            error_report("failed to save SaveStateEntry with id(name): %d(%s)",
1302                         se->section_id, se->idstr);
1303            qemu_file_set_error(f, ret);
1304        }
1305        if (ret <= 0) {
1306            /* Do not proceed to the next vmstate before this one reported
1307               completion of the current stage. This serializes the migration
1308               and reduces the probability that a faster changing state is
1309               synchronized over and over again. */
1310            break;
1311        }
1312    }
1313    return ret;
1314}
1315
1316static bool should_send_vmdesc(void)
1317{
1318    MachineState *machine = MACHINE(qdev_get_machine());
1319    bool in_postcopy = migration_in_postcopy();
1320    return !machine->suppress_vmdesc && !in_postcopy;
1321}
1322
1323/*
1324 * Calls the save_live_complete_postcopy methods
1325 * causing the last few pages to be sent immediately and doing any associated
1326 * cleanup.
1327 * Note postcopy also calls qemu_savevm_state_complete_precopy to complete
1328 * all the other devices, but that happens at the point we switch to postcopy.
1329 */
1330void qemu_savevm_state_complete_postcopy(QEMUFile *f)
1331{
1332    SaveStateEntry *se;
1333    int ret;
1334
1335    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1336        if (!se->ops || !se->ops->save_live_complete_postcopy) {
1337            continue;
1338        }
1339        if (se->ops->is_active) {
1340            if (!se->ops->is_active(se->opaque)) {
1341                continue;
1342            }
1343        }
1344        trace_savevm_section_start(se->idstr, se->section_id);
1345        /* Section type */
1346        qemu_put_byte(f, QEMU_VM_SECTION_END);
1347        qemu_put_be32(f, se->section_id);
1348
1349        ret = se->ops->save_live_complete_postcopy(f, se->opaque);
1350        trace_savevm_section_end(se->idstr, se->section_id, ret);
1351        save_section_footer(f, se);
1352        if (ret < 0) {
1353            qemu_file_set_error(f, ret);
1354            return;
1355        }
1356    }
1357
1358    qemu_put_byte(f, QEMU_VM_EOF);
1359    qemu_fflush(f);
1360}
1361
1362static
1363int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
1364{
1365    SaveStateEntry *se;
1366    int ret;
1367
1368    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1369        if (!se->ops ||
1370            (in_postcopy && se->ops->has_postcopy &&
1371             se->ops->has_postcopy(se->opaque)) ||
1372            !se->ops->save_live_complete_precopy) {
1373            continue;
1374        }
1375
1376        if (se->ops->is_active) {
1377            if (!se->ops->is_active(se->opaque)) {
1378                continue;
1379            }
1380        }
1381        trace_savevm_section_start(se->idstr, se->section_id);
1382
1383        save_section_header(f, se, QEMU_VM_SECTION_END);
1384
1385        ret = se->ops->save_live_complete_precopy(f, se->opaque);
1386        trace_savevm_section_end(se->idstr, se->section_id, ret);
1387        save_section_footer(f, se);
1388        if (ret < 0) {
1389            qemu_file_set_error(f, ret);
1390            return -1;
1391        }
1392    }
1393
1394    return 0;
1395}
1396
1397int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
1398                                                    bool in_postcopy,
1399                                                    bool inactivate_disks)
1400{
1401    g_autoptr(JSONWriter) vmdesc = NULL;
1402    int vmdesc_len;
1403    SaveStateEntry *se;
1404    int ret;
1405
1406    vmdesc = json_writer_new(false);
1407    json_writer_start_object(vmdesc, NULL);
1408    json_writer_int64(vmdesc, "page_size", qemu_target_page_size());
1409    json_writer_start_array(vmdesc, "devices");
1410    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1411
1412        if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1413            continue;
1414        }
1415        if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1416            trace_savevm_section_skip(se->idstr, se->section_id);
1417            continue;
1418        }
1419
1420        trace_savevm_section_start(se->idstr, se->section_id);
1421
1422        json_writer_start_object(vmdesc, NULL);
1423        json_writer_str(vmdesc, "name", se->idstr);
1424        json_writer_int64(vmdesc, "instance_id", se->instance_id);
1425
1426        save_section_header(f, se, QEMU_VM_SECTION_FULL);
1427        ret = vmstate_save(f, se, vmdesc);
1428        if (ret) {
1429            qemu_file_set_error(f, ret);
1430            return ret;
1431        }
1432        trace_savevm_section_end(se->idstr, se->section_id, 0);
1433        save_section_footer(f, se);
1434
1435        json_writer_end_object(vmdesc);
1436    }
1437
1438    if (inactivate_disks) {
1439        /* Inactivate before sending QEMU_VM_EOF so that the
1440         * bdrv_invalidate_cache_all() on the other end won't fail. */
1441        ret = bdrv_inactivate_all();
1442        if (ret) {
1443            error_report("%s: bdrv_inactivate_all() failed (%d)",
1444                         __func__, ret);
1445            qemu_file_set_error(f, ret);
1446            return ret;
1447        }
1448    }
1449    if (!in_postcopy) {
1450        /* Postcopy stream will still be going */
1451        qemu_put_byte(f, QEMU_VM_EOF);
1452    }
1453
1454    json_writer_end_array(vmdesc);
1455    json_writer_end_object(vmdesc);
1456    vmdesc_len = strlen(json_writer_get(vmdesc));
1457
1458    if (should_send_vmdesc()) {
1459        qemu_put_byte(f, QEMU_VM_VMDESCRIPTION);
1460        qemu_put_be32(f, vmdesc_len);
1461        qemu_put_buffer(f, (uint8_t *)json_writer_get(vmdesc), vmdesc_len);
1462    }
1463
1464    return 0;
1465}
1466
1467int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
1468                                       bool inactivate_disks)
1469{
1470    int ret;
1471    Error *local_err = NULL;
1472    bool in_postcopy = migration_in_postcopy();
1473
1474    if (precopy_notify(PRECOPY_NOTIFY_COMPLETE, &local_err)) {
1475        error_report_err(local_err);
1476    }
1477
1478    trace_savevm_state_complete_precopy();
1479
1480    cpu_synchronize_all_states();
1481
1482    if (!in_postcopy || iterable_only) {
1483        ret = qemu_savevm_state_complete_precopy_iterable(f, in_postcopy);
1484        if (ret) {
1485            return ret;
1486        }
1487    }
1488
1489    if (iterable_only) {
1490        goto flush;
1491    }
1492
1493    ret = qemu_savevm_state_complete_precopy_non_iterable(f, in_postcopy,
1494                                                          inactivate_disks);
1495    if (ret) {
1496        return ret;
1497    }
1498
1499flush:
1500    qemu_fflush(f);
1501    return 0;
1502}
1503
1504/* Give an estimate of the amount left to be transferred,
1505 * the result is split into the amount for units that can and
1506 * for units that can't do postcopy.
1507 */
1508void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
1509                               uint64_t *res_precopy_only,
1510                               uint64_t *res_compatible,
1511                               uint64_t *res_postcopy_only)
1512{
1513    SaveStateEntry *se;
1514
1515    *res_precopy_only = 0;
1516    *res_compatible = 0;
1517    *res_postcopy_only = 0;
1518
1519
1520    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1521        if (!se->ops || !se->ops->save_live_pending) {
1522            continue;
1523        }
1524        if (se->ops->is_active) {
1525            if (!se->ops->is_active(se->opaque)) {
1526                continue;
1527            }
1528        }
1529        se->ops->save_live_pending(f, se->opaque, threshold_size,
1530                                   res_precopy_only, res_compatible,
1531                                   res_postcopy_only);
1532    }
1533}
1534
1535void qemu_savevm_state_cleanup(void)
1536{
1537    SaveStateEntry *se;
1538    Error *local_err = NULL;
1539
1540    if (precopy_notify(PRECOPY_NOTIFY_CLEANUP, &local_err)) {
1541        error_report_err(local_err);
1542    }
1543
1544    trace_savevm_state_cleanup();
1545    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1546        if (se->ops && se->ops->save_cleanup) {
1547            se->ops->save_cleanup(se->opaque);
1548        }
1549    }
1550}
1551
1552static int qemu_savevm_state(QEMUFile *f, Error **errp)
1553{
1554    int ret;
1555    MigrationState *ms = migrate_get_current();
1556    MigrationStatus status;
1557
1558    if (migration_is_running(ms->state)) {
1559        error_setg(errp, QERR_MIGRATION_ACTIVE);
1560        return -EINVAL;
1561    }
1562
1563    if (migrate_use_block()) {
1564        error_setg(errp, "Block migration and snapshots are incompatible");
1565        return -EINVAL;
1566    }
1567
1568    migrate_init(ms);
1569    memset(&ram_counters, 0, sizeof(ram_counters));
1570    ms->to_dst_file = f;
1571
1572    qemu_mutex_unlock_iothread();
1573    qemu_savevm_state_header(f);
1574    qemu_savevm_state_setup(f);
1575    qemu_mutex_lock_iothread();
1576
1577    while (qemu_file_get_error(f) == 0) {
1578        if (qemu_savevm_state_iterate(f, false) > 0) {
1579            break;
1580        }
1581    }
1582
1583    ret = qemu_file_get_error(f);
1584    if (ret == 0) {
1585        qemu_savevm_state_complete_precopy(f, false, false);
1586        ret = qemu_file_get_error(f);
1587    }
1588    qemu_savevm_state_cleanup();
1589    if (ret != 0) {
1590        error_setg_errno(errp, -ret, "Error while writing VM state");
1591    }
1592
1593    if (ret != 0) {
1594        status = MIGRATION_STATUS_FAILED;
1595    } else {
1596        status = MIGRATION_STATUS_COMPLETED;
1597    }
1598    migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, status);
1599
1600    /* f is outer parameter, it should not stay in global migration state after
1601     * this function finished */
1602    ms->to_dst_file = NULL;
1603
1604    return ret;
1605}
1606
1607void qemu_savevm_live_state(QEMUFile *f)
1608{
1609    /* save QEMU_VM_SECTION_END section */
1610    qemu_savevm_state_complete_precopy(f, true, false);
1611    qemu_put_byte(f, QEMU_VM_EOF);
1612}
1613
1614int qemu_save_device_state(QEMUFile *f)
1615{
1616    SaveStateEntry *se;
1617
1618    if (!migration_in_colo_state()) {
1619        qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1620        qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1621    }
1622    cpu_synchronize_all_states();
1623
1624    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1625        int ret;
1626
1627        if (se->is_ram) {
1628            continue;
1629        }
1630        if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1631            continue;
1632        }
1633        if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1634            continue;
1635        }
1636
1637        save_section_header(f, se, QEMU_VM_SECTION_FULL);
1638
1639        ret = vmstate_save(f, se, NULL);
1640        if (ret) {
1641            return ret;
1642        }
1643
1644        save_section_footer(f, se);
1645    }
1646
1647    qemu_put_byte(f, QEMU_VM_EOF);
1648
1649    return qemu_file_get_error(f);
1650}
1651
1652static SaveStateEntry *find_se(const char *idstr, uint32_t instance_id)
1653{
1654    SaveStateEntry *se;
1655
1656    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1657        if (!strcmp(se->idstr, idstr) &&
1658            (instance_id == se->instance_id ||
1659             instance_id == se->alias_id))
1660            return se;
1661        /* Migrating from an older version? */
1662        if (strstr(se->idstr, idstr) && se->compat) {
1663            if (!strcmp(se->compat->idstr, idstr) &&
1664                (instance_id == se->compat->instance_id ||
1665                 instance_id == se->alias_id))
1666                return se;
1667        }
1668    }
1669    return NULL;
1670}
1671
/* Positive exit codes used by the loadvm loops. */
enum LoadVMExitCodes {
    /* Allow a command to quit all layers of nested loadvm loops */
    LOADVM_QUIT = 1,
};
1676
1677/* ------ incoming postcopy messages ------ */
1678/* 'advise' arrives before any transfers just to tell us that a postcopy
1679 * *might* happen - it might be skipped if precopy transferred everything
1680 * quickly.
1681 */
1682static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
1683                                         uint16_t len)
1684{
1685    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
1686    uint64_t remote_pagesize_summary, local_pagesize_summary, remote_tps;
1687    Error *local_err = NULL;
1688
1689    trace_loadvm_postcopy_handle_advise();
1690    if (ps != POSTCOPY_INCOMING_NONE) {
1691        error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
1692        return -1;
1693    }
1694
1695    switch (len) {
1696    case 0:
1697        if (migrate_postcopy_ram()) {
1698            error_report("RAM postcopy is enabled but have 0 byte advise");
1699            return -EINVAL;
1700        }
1701        return 0;
1702    case 8 + 8:
1703        if (!migrate_postcopy_ram()) {
1704            error_report("RAM postcopy is disabled but have 16 byte advise");
1705            return -EINVAL;
1706        }
1707        break;
1708    default:
1709        error_report("CMD_POSTCOPY_ADVISE invalid length (%d)", len);
1710        return -EINVAL;
1711    }
1712
1713    if (!postcopy_ram_supported_by_host(mis)) {
1714        postcopy_state_set(POSTCOPY_INCOMING_NONE);
1715        return -1;
1716    }
1717
1718    remote_pagesize_summary = qemu_get_be64(mis->from_src_file);
1719    local_pagesize_summary = ram_pagesize_summary();
1720
1721    if (remote_pagesize_summary != local_pagesize_summary)  {
1722        /*
1723         * This detects two potential causes of mismatch:
1724         *   a) A mismatch in host page sizes
1725         *      Some combinations of mismatch are probably possible but it gets
1726         *      a bit more complicated.  In particular we need to place whole
1727         *      host pages on the dest at once, and we need to ensure that we
1728         *      handle dirtying to make sure we never end up sending part of
1729         *      a hostpage on it's own.
1730         *   b) The use of different huge page sizes on source/destination
1731         *      a more fine grain test is performed during RAM block migration
1732         *      but this test here causes a nice early clear failure, and
1733         *      also fails when passed to an older qemu that doesn't
1734         *      do huge pages.
1735         */
1736        error_report("Postcopy needs matching RAM page sizes (s=%" PRIx64
1737                                                             " d=%" PRIx64 ")",
1738                     remote_pagesize_summary, local_pagesize_summary);
1739        return -1;
1740    }
1741
1742    remote_tps = qemu_get_be64(mis->from_src_file);
1743    if (remote_tps != qemu_target_page_size()) {
1744        /*
1745         * Again, some differences could be dealt with, but for now keep it
1746         * simple.
1747         */
1748        error_report("Postcopy needs matching target page sizes (s=%d d=%zd)",
1749                     (int)remote_tps, qemu_target_page_size());
1750        return -1;
1751    }
1752
1753    if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_ADVISE, &local_err)) {
1754        error_report_err(local_err);
1755        return -1;
1756    }
1757
1758    if (ram_postcopy_incoming_init(mis)) {
1759        return -1;
1760    }
1761
1762    return 0;
1763}
1764
1765/* After postcopy we will be told to throw some pages away since they're
1766 * dirty and will have to be demand fetched.  Must happen before CPU is
1767 * started.
1768 * There can be 0..many of these messages, each encoding multiple pages.
1769 */
1770static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
1771                                              uint16_t len)
1772{
1773    int tmp;
1774    char ramid[256];
1775    PostcopyState ps = postcopy_state_get();
1776
1777    trace_loadvm_postcopy_ram_handle_discard();
1778
1779    switch (ps) {
1780    case POSTCOPY_INCOMING_ADVISE:
1781        /* 1st discard */
1782        tmp = postcopy_ram_prepare_discard(mis);
1783        if (tmp) {
1784            return tmp;
1785        }
1786        break;
1787
1788    case POSTCOPY_INCOMING_DISCARD:
1789        /* Expected state */
1790        break;
1791
1792    default:
1793        error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
1794                     ps);
1795        return -1;
1796    }
1797    /* We're expecting a
1798     *    Version (0)
1799     *    a RAM ID string (length byte, name, 0 term)
1800     *    then at least 1 16 byte chunk
1801    */
1802    if (len < (1 + 1 + 1 + 1 + 2 * 8)) {
1803        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1804        return -1;
1805    }
1806
1807    tmp = qemu_get_byte(mis->from_src_file);
1808    if (tmp != postcopy_ram_discard_version) {
1809        error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
1810        return -1;
1811    }
1812
1813    if (!qemu_get_counted_string(mis->from_src_file, ramid)) {
1814        error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
1815        return -1;
1816    }
1817    tmp = qemu_get_byte(mis->from_src_file);
1818    if (tmp != 0) {
1819        error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp);
1820        return -1;
1821    }
1822
1823    len -= 3 + strlen(ramid);
1824    if (len % 16) {
1825        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1826        return -1;
1827    }
1828    trace_loadvm_postcopy_ram_handle_discard_header(ramid, len);
1829    while (len) {
1830        uint64_t start_addr, block_length;
1831        start_addr = qemu_get_be64(mis->from_src_file);
1832        block_length = qemu_get_be64(mis->from_src_file);
1833
1834        len -= 16;
1835        int ret = ram_discard_range(ramid, start_addr, block_length);
1836        if (ret) {
1837            return ret;
1838        }
1839    }
1840    trace_loadvm_postcopy_ram_handle_discard_end();
1841
1842    return 0;
1843}
1844
1845/*
1846 * Triggered by a postcopy_listen command; this thread takes over reading
1847 * the input stream, leaving the main thread free to carry on loading the rest
1848 * of the device state (from RAM).
1849 * (TODO:This could do with being in a postcopy file - but there again it's
1850 * just another input loop, not that postcopy specific)
1851 */
1852static void *postcopy_ram_listen_thread(void *opaque)
1853{
1854    MigrationIncomingState *mis = migration_incoming_get_current();
1855    QEMUFile *f = mis->from_src_file;
1856    int load_res;
1857    MigrationState *migr = migrate_get_current();
1858
1859    object_ref(OBJECT(migr));
1860
1861    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
1862                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);
1863    qemu_sem_post(&mis->listen_thread_sem);
1864    trace_postcopy_ram_listen_thread_start();
1865
1866    rcu_register_thread();
1867    /*
1868     * Because we're a thread and not a coroutine we can't yield
1869     * in qemu_file, and thus we must be blocking now.
1870     */
1871    qemu_file_set_blocking(f, true);
1872    load_res = qemu_loadvm_state_main(f, mis);
1873
1874    /*
1875     * This is tricky, but, mis->from_src_file can change after it
1876     * returns, when postcopy recovery happened. In the future, we may
1877     * want a wrapper for the QEMUFile handle.
1878     */
1879    f = mis->from_src_file;
1880
1881    /* And non-blocking again so we don't block in any cleanup */
1882    qemu_file_set_blocking(f, false);
1883
1884    trace_postcopy_ram_listen_thread_exit();
1885    if (load_res < 0) {
1886        qemu_file_set_error(f, load_res);
1887        dirty_bitmap_mig_cancel_incoming();
1888        if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
1889            !migrate_postcopy_ram() && migrate_dirty_bitmaps())
1890        {
1891            error_report("%s: loadvm failed during postcopy: %d. All states "
1892                         "are migrated except dirty bitmaps. Some dirty "
1893                         "bitmaps may be lost, and present migrated dirty "
1894                         "bitmaps are correctly migrated and valid.",
1895                         __func__, load_res);
1896            load_res = 0; /* prevent further exit() */
1897        } else {
1898            error_report("%s: loadvm failed: %d", __func__, load_res);
1899            migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1900                                           MIGRATION_STATUS_FAILED);
1901        }
1902    }
1903    if (load_res >= 0) {
1904        /*
1905         * This looks good, but it's possible that the device loading in the
1906         * main thread hasn't finished yet, and so we might not be in 'RUN'
1907         * state yet; wait for the end of the main thread.
1908         */
1909        qemu_event_wait(&mis->main_thread_load_event);
1910    }
1911    postcopy_ram_incoming_cleanup(mis);
1912
1913    if (load_res < 0) {
1914        /*
1915         * If something went wrong then we have a bad state so exit;
1916         * depending how far we got it might be possible at this point
1917         * to leave the guest running and fire MCEs for pages that never
1918         * arrived as a desperate recovery step.
1919         */
1920        rcu_unregister_thread();
1921        exit(EXIT_FAILURE);
1922    }
1923
1924    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1925                                   MIGRATION_STATUS_COMPLETED);
1926    /*
1927     * If everything has worked fine, then the main thread has waited
1928     * for us to start, and we're the last use of the mis.
1929     * (If something broke then qemu will have to exit anyway since it's
1930     * got a bad migration state).
1931     */
1932    migration_incoming_state_destroy();
1933    qemu_loadvm_state_cleanup();
1934
1935    rcu_unregister_thread();
1936    mis->have_listen_thread = false;
1937    postcopy_state_set(POSTCOPY_INCOMING_END);
1938
1939    object_unref(OBJECT(migr));
1940
1941    return NULL;
1942}
1943
1944/* After this message we must be able to immediately receive postcopy data */
1945static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
1946{
1947    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
1948    trace_loadvm_postcopy_handle_listen();
1949    Error *local_err = NULL;
1950
1951    if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
1952        error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
1953        return -1;
1954    }
1955    if (ps == POSTCOPY_INCOMING_ADVISE) {
1956        /*
1957         * A rare case, we entered listen without having to do any discards,
1958         * so do the setup that's normally done at the time of the 1st discard.
1959         */
1960        if (migrate_postcopy_ram()) {
1961            postcopy_ram_prepare_discard(mis);
1962        }
1963    }
1964
1965    /*
1966     * Sensitise RAM - can now generate requests for blocks that don't exist
1967     * However, at this point the CPU shouldn't be running, and the IO
1968     * shouldn't be doing anything yet so don't actually expect requests
1969     */
1970    if (migrate_postcopy_ram()) {
1971        if (postcopy_ram_incoming_setup(mis)) {
1972            postcopy_ram_incoming_cleanup(mis);
1973            return -1;
1974        }
1975    }
1976
1977    if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_LISTEN, &local_err)) {
1978        error_report_err(local_err);
1979        return -1;
1980    }
1981
1982    mis->have_listen_thread = true;
1983    /* Start up the listening thread and wait for it to signal ready */
1984    qemu_sem_init(&mis->listen_thread_sem, 0);
1985    qemu_thread_create(&mis->listen_thread, "postcopy/listen",
1986                       postcopy_ram_listen_thread, NULL,
1987                       QEMU_THREAD_DETACHED);
1988    qemu_sem_wait(&mis->listen_thread_sem);
1989    qemu_sem_destroy(&mis->listen_thread_sem);
1990
1991    return 0;
1992}
1993
1994static void loadvm_postcopy_handle_run_bh(void *opaque)
1995{
1996    Error *local_err = NULL;
1997    MigrationIncomingState *mis = opaque;
1998
1999    /* TODO we should move all of this lot into postcopy_ram.c or a shared code
2000     * in migration.c
2001     */
2002    cpu_synchronize_all_post_init();
2003
2004    qemu_announce_self(&mis->announce_timer, migrate_announce_params());
2005
2006    /* Make sure all file formats flush their mutable metadata.
2007     * If we get an error here, just don't restart the VM yet. */
2008    bdrv_invalidate_cache_all(&local_err);
2009    if (local_err) {
2010        error_report_err(local_err);
2011        local_err = NULL;
2012        autostart = false;
2013    }
2014
2015    trace_loadvm_postcopy_handle_run_cpu_sync();
2016
2017    trace_loadvm_postcopy_handle_run_vmstart();
2018
2019    dirty_bitmap_mig_before_vm_start();
2020
2021    if (autostart) {
2022        /* Hold onto your hats, starting the CPU */
2023        vm_start();
2024    } else {
2025        /* leave it paused and let management decide when to start the CPU */
2026        runstate_set(RUN_STATE_PAUSED);
2027    }
2028
2029    qemu_bh_delete(mis->bh);
2030}
2031
2032/* After all discards we can start running and asking for pages */
2033static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
2034{
2035    PostcopyState ps = postcopy_state_get();
2036
2037    trace_loadvm_postcopy_handle_run();
2038    if (ps != POSTCOPY_INCOMING_LISTENING) {
2039        error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
2040        return -1;
2041    }
2042
2043    postcopy_state_set(POSTCOPY_INCOMING_RUNNING);
2044    mis->bh = qemu_bh_new(loadvm_postcopy_handle_run_bh, mis);
2045    qemu_bh_schedule(mis->bh);
2046
2047    /* We need to finish reading the stream from the package
2048     * and also stop reading anything more from the stream that loaded the
2049     * package (since it's now being read by the listener thread).
2050     * LOADVM_QUIT will quit all the layers of nested loadvm loops.
2051     */
2052    return LOADVM_QUIT;
2053}
2054
2055/* We must be with page_request_mutex held */
2056static gboolean postcopy_sync_page_req(gpointer key, gpointer value,
2057                                       gpointer data)
2058{
2059    MigrationIncomingState *mis = data;
2060    void *host_addr = (void *) key;
2061    ram_addr_t rb_offset;
2062    RAMBlock *rb;
2063    int ret;
2064
2065    rb = qemu_ram_block_from_host(host_addr, true, &rb_offset);
2066    if (!rb) {
2067        /*
2068         * This should _never_ happen.  However be nice for a migrating VM to
2069         * not crash/assert.  Post an error (note: intended to not use *_once
2070         * because we do want to see all the illegal addresses; and this can
2071         * never be triggered by the guest so we're safe) and move on next.
2072         */
2073        error_report("%s: illegal host addr %p", __func__, host_addr);
2074        /* Try the next entry */
2075        return FALSE;
2076    }
2077
2078    ret = migrate_send_rp_message_req_pages(mis, rb, rb_offset);
2079    if (ret) {
2080        /* Please refer to above comment. */
2081        error_report("%s: send rp message failed for addr %p",
2082                     __func__, host_addr);
2083        return FALSE;
2084    }
2085
2086    trace_postcopy_page_req_sync(host_addr);
2087
2088    return FALSE;
2089}
2090
2091static void migrate_send_rp_req_pages_pending(MigrationIncomingState *mis)
2092{
2093    WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
2094        g_tree_foreach(mis->page_requested, postcopy_sync_page_req, mis);
2095    }
2096}
2097
2098static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis)
2099{
2100    if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
2101        error_report("%s: illegal resume received", __func__);
2102        /* Don't fail the load, only for this. */
2103        return 0;
2104    }
2105
2106    /*
2107     * Reset the last_rb before we resend any page req to source again, since
2108     * the source should have it reset already.
2109     */
2110    mis->last_rb = NULL;
2111
2112    /*
2113     * This means source VM is ready to resume the postcopy migration.
2114     */
2115    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
2116                      MIGRATION_STATUS_POSTCOPY_ACTIVE);
2117
2118    trace_loadvm_postcopy_handle_resume();
2119
2120    /* Tell source that "we are ready" */
2121    migrate_send_rp_resume_ack(mis, MIGRATION_RESUME_ACK_VALUE);
2122
2123    /*
2124     * After a postcopy recovery, the source should have lost the postcopy
2125     * queue, or potentially the requested pages could have been lost during
2126     * the network down phase.  Let's re-sync with the source VM by re-sending
2127     * all the pending pages that we eagerly need, so these threads won't get
2128     * blocked too long due to the recovery.
2129     *
2130     * Without this procedure, the faulted destination VM threads (waiting for
2131     * page requests right before the postcopy is interrupted) can keep hanging
2132     * until the pages are sent by the source during the background copying of
2133     * pages, or another thread faulted on the same address accidentally.
2134     */
2135    migrate_send_rp_req_pages_pending(mis);
2136
2137    /*
2138     * It's time to switch state and release the fault thread to continue
2139     * service page faults.  Note that this should be explicitly after the
2140     * above call to migrate_send_rp_req_pages_pending().  In short:
2141     * migrate_send_rp_message_req_pages() is not thread safe, yet.
2142     */
2143    qemu_sem_post(&mis->postcopy_pause_sem_fault);
2144
2145    return 0;
2146}
2147
2148/**
2149 * Immediately following this command is a blob of data containing an embedded
2150 * chunk of migration stream; read it and load it.
2151 *
2152 * @mis: Incoming state
2153 * @length: Length of packaged data to read
2154 *
2155 * Returns: Negative values on error
2156 *
2157 */
2158static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
2159{
2160    int ret;
2161    size_t length;
2162    QIOChannelBuffer *bioc;
2163
2164    length = qemu_get_be32(mis->from_src_file);
2165    trace_loadvm_handle_cmd_packaged(length);
2166
2167    if (length > MAX_VM_CMD_PACKAGED_SIZE) {
2168        error_report("Unreasonably large packaged state: %zu", length);
2169        return -1;
2170    }
2171
2172    bioc = qio_channel_buffer_new(length);
2173    qio_channel_set_name(QIO_CHANNEL(bioc), "migration-loadvm-buffer");
2174    ret = qemu_get_buffer(mis->from_src_file,
2175                          bioc->data,
2176                          length);
2177    if (ret != length) {
2178        object_unref(OBJECT(bioc));
2179        error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%zu",
2180                     ret, length);
2181        return (ret < 0) ? ret : -EAGAIN;
2182    }
2183    bioc->usage += length;
2184    trace_loadvm_handle_cmd_packaged_received(ret);
2185
2186    QEMUFile *packf = qemu_fopen_channel_input(QIO_CHANNEL(bioc));
2187
2188    ret = qemu_loadvm_state_main(packf, mis);
2189    trace_loadvm_handle_cmd_packaged_main(ret);
2190    qemu_fclose(packf);
2191    object_unref(OBJECT(bioc));
2192
2193    return ret;
2194}
2195
2196/*
2197 * Handle request that source requests for recved_bitmap on
2198 * destination. Payload format:
2199 *
2200 * len (1 byte) + ramblock_name (<255 bytes)
2201 */
2202static int loadvm_handle_recv_bitmap(MigrationIncomingState *mis,
2203                                     uint16_t len)
2204{
2205    QEMUFile *file = mis->from_src_file;
2206    RAMBlock *rb;
2207    char block_name[256];
2208    size_t cnt;
2209
2210    cnt = qemu_get_counted_string(file, block_name);
2211    if (!cnt) {
2212        error_report("%s: failed to read block name", __func__);
2213        return -EINVAL;
2214    }
2215
2216    /* Validate before using the data */
2217    if (qemu_file_get_error(file)) {
2218        return qemu_file_get_error(file);
2219    }
2220
2221    if (len != cnt + 1) {
2222        error_report("%s: invalid payload length (%d)", __func__, len);
2223        return -EINVAL;
2224    }
2225
2226    rb = qemu_ram_block_by_name(block_name);
2227    if (!rb) {
2228        error_report("%s: block '%s' not found", __func__, block_name);
2229        return -EINVAL;
2230    }
2231
2232    migrate_send_rp_recv_bitmap(mis, block_name);
2233
2234    trace_loadvm_handle_recv_bitmap(block_name);
2235
2236    return 0;
2237}
2238
2239static int loadvm_process_enable_colo(MigrationIncomingState *mis)
2240{
2241    int ret = migration_incoming_enable_colo();
2242
2243    if (!ret) {
2244        ret = colo_init_ram_cache();
2245        if (ret) {
2246            migration_incoming_disable_colo();
2247        }
2248    }
2249    return ret;
2250}
2251
2252/*
2253 * Process an incoming 'QEMU_VM_COMMAND'
2254 * 0           just a normal return
2255 * LOADVM_QUIT All good, but exit the loop
2256 * <0          Error
2257 */
2258static int loadvm_process_command(QEMUFile *f)
2259{
2260    MigrationIncomingState *mis = migration_incoming_get_current();
2261    uint16_t cmd;
2262    uint16_t len;
2263    uint32_t tmp32;
2264
2265    cmd = qemu_get_be16(f);
2266    len = qemu_get_be16(f);
2267
2268    /* Check validity before continue processing of cmds */
2269    if (qemu_file_get_error(f)) {
2270        return qemu_file_get_error(f);
2271    }
2272
2273    trace_loadvm_process_command(cmd, len);
2274    if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
2275        error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
2276        return -EINVAL;
2277    }
2278
2279    if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {
2280        error_report("%s received with bad length - expecting %zu, got %d",
2281                     mig_cmd_args[cmd].name,
2282                     (size_t)mig_cmd_args[cmd].len, len);
2283        return -ERANGE;
2284    }
2285
2286    switch (cmd) {
2287    case MIG_CMD_OPEN_RETURN_PATH:
2288        if (mis->to_src_file) {
2289            error_report("CMD_OPEN_RETURN_PATH called when RP already open");
2290            /* Not really a problem, so don't give up */
2291            return 0;
2292        }
2293        mis->to_src_file = qemu_file_get_return_path(f);
2294        if (!mis->to_src_file) {
2295            error_report("CMD_OPEN_RETURN_PATH failed");
2296            return -1;
2297        }
2298        break;
2299
2300    case MIG_CMD_PING:
2301        tmp32 = qemu_get_be32(f);
2302        trace_loadvm_process_command_ping(tmp32);
2303        if (!mis->to_src_file) {
2304            error_report("CMD_PING (0x%x) received with no return path",
2305                         tmp32);
2306            return -1;
2307        }
2308        migrate_send_rp_pong(mis, tmp32);
2309        break;
2310
2311    case MIG_CMD_PACKAGED:
2312        return loadvm_handle_cmd_packaged(mis);
2313
2314    case MIG_CMD_POSTCOPY_ADVISE:
2315        return loadvm_postcopy_handle_advise(mis, len);
2316
2317    case MIG_CMD_POSTCOPY_LISTEN:
2318        return loadvm_postcopy_handle_listen(mis);
2319
2320    case MIG_CMD_POSTCOPY_RUN:
2321        return loadvm_postcopy_handle_run(mis);
2322
2323    case MIG_CMD_POSTCOPY_RAM_DISCARD:
2324        return loadvm_postcopy_ram_handle_discard(mis, len);
2325
2326    case MIG_CMD_POSTCOPY_RESUME:
2327        return loadvm_postcopy_handle_resume(mis);
2328
2329    case MIG_CMD_RECV_BITMAP:
2330        return loadvm_handle_recv_bitmap(mis, len);
2331
2332    case MIG_CMD_ENABLE_COLO:
2333        return loadvm_process_enable_colo(mis);
2334    }
2335
2336    return 0;
2337}
2338
2339/*
2340 * Read a footer off the wire and check that it matches the expected section
2341 *
2342 * Returns: true if the footer was good
2343 *          false if there is a problem (and calls error_report to say why)
2344 */
2345static bool check_section_footer(QEMUFile *f, SaveStateEntry *se)
2346{
2347    int ret;
2348    uint8_t read_mark;
2349    uint32_t read_section_id;
2350
2351    if (!migrate_get_current()->send_section_footer) {
2352        /* No footer to check */
2353        return true;
2354    }
2355
2356    read_mark = qemu_get_byte(f);
2357
2358    ret = qemu_file_get_error(f);
2359    if (ret) {
2360        error_report("%s: Read section footer failed: %d",
2361                     __func__, ret);
2362        return false;
2363    }
2364
2365    if (read_mark != QEMU_VM_SECTION_FOOTER) {
2366        error_report("Missing section footer for %s", se->idstr);
2367        return false;
2368    }
2369
2370    read_section_id = qemu_get_be32(f);
2371    if (read_section_id != se->load_section_id) {
2372        error_report("Mismatched section id in footer for %s -"
2373                     " read 0x%x expected 0x%x",
2374                     se->idstr, read_section_id, se->load_section_id);
2375        return false;
2376    }
2377
2378    /* All good */
2379    return true;
2380}
2381
2382static int
2383qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis)
2384{
2385    uint32_t instance_id, version_id, section_id;
2386    SaveStateEntry *se;
2387    char idstr[256];
2388    int ret;
2389
2390    /* Read section start */
2391    section_id = qemu_get_be32(f);
2392    if (!qemu_get_counted_string(f, idstr)) {
2393        error_report("Unable to read ID string for section %u",
2394                     section_id);
2395        return -EINVAL;
2396    }
2397    instance_id = qemu_get_be32(f);
2398    version_id = qemu_get_be32(f);
2399
2400    ret = qemu_file_get_error(f);
2401    if (ret) {
2402        error_report("%s: Failed to read instance/version ID: %d",
2403                     __func__, ret);
2404        return ret;
2405    }
2406
2407    trace_qemu_loadvm_state_section_startfull(section_id, idstr,
2408            instance_id, version_id);
2409    /* Find savevm section */
2410    se = find_se(idstr, instance_id);
2411    if (se == NULL) {
2412        error_report("Unknown savevm section or instance '%s' %"PRIu32". "
2413                     "Make sure that your current VM setup matches your "
2414                     "saved VM setup, including any hotplugged devices",
2415                     idstr, instance_id);
2416        return -EINVAL;
2417    }
2418
2419    /* Validate version */
2420    if (version_id > se->version_id) {
2421        error_report("savevm: unsupported version %d for '%s' v%d",
2422                     version_id, idstr, se->version_id);
2423        return -EINVAL;
2424    }
2425    se->load_version_id = version_id;
2426    se->load_section_id = section_id;
2427
2428    /* Validate if it is a device's state */
2429    if (xen_enabled() && se->is_ram) {
2430        error_report("loadvm: %s RAM loading not allowed on Xen", idstr);
2431        return -EINVAL;
2432    }
2433
2434    ret = vmstate_load(f, se);
2435    if (ret < 0) {
2436        error_report("error while loading state for instance 0x%"PRIx32" of"
2437                     " device '%s'", instance_id, idstr);
2438        return ret;
2439    }
2440    if (!check_section_footer(f, se)) {
2441        return -EINVAL;
2442    }
2443
2444    return 0;
2445}
2446
2447static int
2448qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis)
2449{
2450    uint32_t section_id;
2451    SaveStateEntry *se;
2452    int ret;
2453
2454    section_id = qemu_get_be32(f);
2455
2456    ret = qemu_file_get_error(f);
2457    if (ret) {
2458        error_report("%s: Failed to read section ID: %d",
2459                     __func__, ret);
2460        return ret;
2461    }
2462
2463    trace_qemu_loadvm_state_section_partend(section_id);
2464    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2465        if (se->load_section_id == section_id) {
2466            break;
2467        }
2468    }
2469    if (se == NULL) {
2470        error_report("Unknown savevm section %d", section_id);
2471        return -EINVAL;
2472    }
2473
2474    ret = vmstate_load(f, se);
2475    if (ret < 0) {
2476        error_report("error while loading state section id %d(%s)",
2477                     section_id, se->idstr);
2478        return ret;
2479    }
2480    if (!check_section_footer(f, se)) {
2481        return -EINVAL;
2482    }
2483
2484    return 0;
2485}
2486
2487static int qemu_loadvm_state_header(QEMUFile *f)
2488{
2489    unsigned int v;
2490    int ret;
2491
2492    v = qemu_get_be32(f);
2493    if (v != QEMU_VM_FILE_MAGIC) {
2494        error_report("Not a migration stream");
2495        return -EINVAL;
2496    }
2497
2498    v = qemu_get_be32(f);
2499    if (v == QEMU_VM_FILE_VERSION_COMPAT) {
2500        error_report("SaveVM v2 format is obsolete and don't work anymore");
2501        return -ENOTSUP;
2502    }
2503    if (v != QEMU_VM_FILE_VERSION) {
2504        error_report("Unsupported migration stream version");
2505        return -ENOTSUP;
2506    }
2507
2508    if (migrate_get_current()->send_configuration) {
2509        if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
2510            error_report("Configuration section missing");
2511            qemu_loadvm_state_cleanup();
2512            return -EINVAL;
2513        }
2514        ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
2515
2516        if (ret) {
2517            qemu_loadvm_state_cleanup();
2518            return ret;
2519        }
2520    }
2521    return 0;
2522}
2523
2524static int qemu_loadvm_state_setup(QEMUFile *f)
2525{
2526    SaveStateEntry *se;
2527    int ret;
2528
2529    trace_loadvm_state_setup();
2530    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2531        if (!se->ops || !se->ops->load_setup) {
2532            continue;
2533        }
2534        if (se->ops->is_active) {
2535            if (!se->ops->is_active(se->opaque)) {
2536                continue;
2537            }
2538        }
2539
2540        ret = se->ops->load_setup(f, se->opaque);
2541        if (ret < 0) {
2542            qemu_file_set_error(f, ret);
2543            error_report("Load state of device %s failed", se->idstr);
2544            return ret;
2545        }
2546    }
2547    return 0;
2548}
2549
2550void qemu_loadvm_state_cleanup(void)
2551{
2552    SaveStateEntry *se;
2553
2554    trace_loadvm_state_cleanup();
2555    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2556        if (se->ops && se->ops->load_cleanup) {
2557            se->ops->load_cleanup(se->opaque);
2558        }
2559    }
2560}
2561
2562/* Return true if we should continue the migration, or false. */
2563static bool postcopy_pause_incoming(MigrationIncomingState *mis)
2564{
2565    trace_postcopy_pause_incoming();
2566
2567    assert(migrate_postcopy_ram());
2568
2569    /* Clear the triggered bit to allow one recovery */
2570    mis->postcopy_recover_triggered = false;
2571
2572    /*
2573     * Unregister yank with either from/to src would work, since ioc behind it
2574     * is the same
2575     */
2576    migration_ioc_unregister_yank_from_file(mis->from_src_file);
2577
2578    assert(mis->from_src_file);
2579    qemu_file_shutdown(mis->from_src_file);
2580    qemu_fclose(mis->from_src_file);
2581    mis->from_src_file = NULL;
2582
2583    assert(mis->to_src_file);
2584    qemu_file_shutdown(mis->to_src_file);
2585    qemu_mutex_lock(&mis->rp_mutex);
2586    qemu_fclose(mis->to_src_file);
2587    mis->to_src_file = NULL;
2588    qemu_mutex_unlock(&mis->rp_mutex);
2589
2590    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
2591                      MIGRATION_STATUS_POSTCOPY_PAUSED);
2592
2593    /* Notify the fault thread for the invalidated file handle */
2594    postcopy_fault_thread_notify(mis);
2595
2596    error_report("Detected IO failure for postcopy. "
2597                 "Migration paused.");
2598
2599    while (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
2600        qemu_sem_wait(&mis->postcopy_pause_sem_dst);
2601    }
2602
2603    trace_postcopy_pause_incoming_continued();
2604
2605    return true;
2606}
2607
2608int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
2609{
2610    uint8_t section_type;
2611    int ret = 0;
2612
2613retry:
2614    while (true) {
2615        section_type = qemu_get_byte(f);
2616
2617        if (qemu_file_get_error(f)) {
2618            ret = qemu_file_get_error(f);
2619            break;
2620        }
2621
2622        trace_qemu_loadvm_state_section(section_type);
2623        switch (section_type) {
2624        case QEMU_VM_SECTION_START:
2625        case QEMU_VM_SECTION_FULL:
2626            ret = qemu_loadvm_section_start_full(f, mis);
2627            if (ret < 0) {
2628                goto out;
2629            }
2630            break;
2631        case QEMU_VM_SECTION_PART:
2632        case QEMU_VM_SECTION_END:
2633            ret = qemu_loadvm_section_part_end(f, mis);
2634            if (ret < 0) {
2635                goto out;
2636            }
2637            break;
2638        case QEMU_VM_COMMAND:
2639            ret = loadvm_process_command(f);
2640            trace_qemu_loadvm_state_section_command(ret);
2641            if ((ret < 0) || (ret == LOADVM_QUIT)) {
2642                goto out;
2643            }
2644            break;
2645        case QEMU_VM_EOF:
2646            /* This is the end of migration */
2647            goto out;
2648        default:
2649            error_report("Unknown savevm section type %d", section_type);
2650            ret = -EINVAL;
2651            goto out;
2652        }
2653    }
2654
2655out:
2656    if (ret < 0) {
2657        qemu_file_set_error(f, ret);
2658
2659        /* Cancel bitmaps incoming regardless of recovery */
2660        dirty_bitmap_mig_cancel_incoming();
2661
2662        /*
2663         * If we are during an active postcopy, then we pause instead
2664         * of bail out to at least keep the VM's dirty data.  Note
2665         * that POSTCOPY_INCOMING_LISTENING stage is still not enough,
2666         * during which we're still receiving device states and we
2667         * still haven't yet started the VM on destination.
2668         *
2669         * Only RAM postcopy supports recovery. Still, if RAM postcopy is
2670         * enabled, canceled bitmaps postcopy will not affect RAM postcopy
2671         * recovering.
2672         */
2673        if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
2674            migrate_postcopy_ram() && postcopy_pause_incoming(mis)) {
2675            /* Reset f to point to the newly created channel */
2676            f = mis->from_src_file;
2677            goto retry;
2678        }
2679    }
2680    return ret;
2681}
2682
2683int qemu_loadvm_state(QEMUFile *f)
2684{
2685    MigrationIncomingState *mis = migration_incoming_get_current();
2686    Error *local_err = NULL;
2687    int ret;
2688
2689    if (qemu_savevm_state_blocked(&local_err)) {
2690        error_report_err(local_err);
2691        return -EINVAL;
2692    }
2693
2694    ret = qemu_loadvm_state_header(f);
2695    if (ret) {
2696        return ret;
2697    }
2698
2699    if (qemu_loadvm_state_setup(f) != 0) {
2700        return -EINVAL;
2701    }
2702
2703    cpu_synchronize_all_pre_loadvm();
2704
2705    ret = qemu_loadvm_state_main(f, mis);
2706    qemu_event_set(&mis->main_thread_load_event);
2707
2708    trace_qemu_loadvm_state_post_main(ret);
2709
2710    if (mis->have_listen_thread) {
2711        /* Listen thread still going, can't clean up yet */
2712        return ret;
2713    }
2714
2715    if (ret == 0) {
2716        ret = qemu_file_get_error(f);
2717    }
2718
2719    /*
2720     * Try to read in the VMDESC section as well, so that dumping tools that
2721     * intercept our migration stream have the chance to see it.
2722     */
2723
2724    /* We've got to be careful; if we don't read the data and just shut the fd
2725     * then the sender can error if we close while it's still sending.
2726     * We also mustn't read data that isn't there; some transports (RDMA)
2727     * will stall waiting for that data when the source has already closed.
2728     */
2729    if (ret == 0 && should_send_vmdesc()) {
2730        uint8_t *buf;
2731        uint32_t size;
2732        uint8_t  section_type = qemu_get_byte(f);
2733
2734        if (section_type != QEMU_VM_VMDESCRIPTION) {
2735            error_report("Expected vmdescription section, but got %d",
2736                         section_type);
2737            /*
2738             * It doesn't seem worth failing at this point since
2739             * we apparently have an otherwise valid VM state
2740             */
2741        } else {
2742            buf = g_malloc(0x1000);
2743            size = qemu_get_be32(f);
2744
2745            while (size > 0) {
2746                uint32_t read_chunk = MIN(size, 0x1000);
2747                qemu_get_buffer(f, buf, read_chunk);
2748                size -= read_chunk;
2749            }
2750            g_free(buf);
2751        }
2752    }
2753
2754    qemu_loadvm_state_cleanup();
2755    cpu_synchronize_all_post_init();
2756
2757    return ret;
2758}
2759
2760int qemu_load_device_state(QEMUFile *f)
2761{
2762    MigrationIncomingState *mis = migration_incoming_get_current();
2763    int ret;
2764
2765    /* Load QEMU_VM_SECTION_FULL section */
2766    ret = qemu_loadvm_state_main(f, mis);
2767    if (ret < 0) {
2768        error_report("Failed to load device state: %d", ret);
2769        return ret;
2770    }
2771
2772    cpu_synchronize_all_post_init();
2773    return 0;
2774}
2775
2776bool save_snapshot(const char *name, bool overwrite, const char *vmstate,
2777                  bool has_devices, strList *devices, Error **errp)
2778{
2779    BlockDriverState *bs;
2780    QEMUSnapshotInfo sn1, *sn = &sn1;
2781    int ret = -1, ret2;
2782    QEMUFile *f;
2783    int saved_vm_running;
2784    uint64_t vm_state_size;
2785    g_autoptr(GDateTime) now = g_date_time_new_now_local();
2786    AioContext *aio_context;
2787
2788    if (migration_is_blocked(errp)) {
2789        return false;
2790    }
2791
2792    if (!replay_can_snapshot()) {
2793        error_setg(errp, "Record/replay does not allow making snapshot "
2794                   "right now. Try once more later.");
2795        return false;
2796    }
2797
2798    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
2799        return false;
2800    }
2801
2802    /* Delete old snapshots of the same name */
2803    if (name) {
2804        if (overwrite) {
2805            if (bdrv_all_delete_snapshot(name, has_devices,
2806                                         devices, errp) < 0) {
2807                return false;
2808            }
2809        } else {
2810            ret2 = bdrv_all_has_snapshot(name, has_devices, devices, errp);
2811            if (ret2 < 0) {
2812                return false;
2813            }
2814            if (ret2 == 1) {
2815                error_setg(errp,
2816                           "Snapshot '%s' already exists in one or more devices",
2817                           name);
2818                return false;
2819            }
2820        }
2821    }
2822
2823    bs = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
2824    if (bs == NULL) {
2825        return false;
2826    }
2827    aio_context = bdrv_get_aio_context(bs);
2828
2829    saved_vm_running = runstate_is_running();
2830
2831    ret = global_state_store();
2832    if (ret) {
2833        error_setg(errp, "Error saving global state");
2834        return false;
2835    }
2836    vm_stop(RUN_STATE_SAVE_VM);
2837
2838    bdrv_drain_all_begin();
2839
2840    aio_context_acquire(aio_context);
2841
2842    memset(sn, 0, sizeof(*sn));
2843
2844    /* fill auxiliary fields */
2845    sn->date_sec = g_date_time_to_unix(now);
2846    sn->date_nsec = g_date_time_get_microsecond(now) * 1000;
2847    sn->vm_clock_nsec = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
2848    if (replay_mode != REPLAY_MODE_NONE) {
2849        sn->icount = replay_get_current_icount();
2850    } else {
2851        sn->icount = -1ULL;
2852    }
2853
2854    if (name) {
2855        pstrcpy(sn->name, sizeof(sn->name), name);
2856    } else {
2857        g_autofree char *autoname = g_date_time_format(now,  "vm-%Y%m%d%H%M%S");
2858        pstrcpy(sn->name, sizeof(sn->name), autoname);
2859    }
2860
2861    /* save the VM state */
2862    f = qemu_fopen_bdrv(bs, 1);
2863    if (!f) {
2864        error_setg(errp, "Could not open VM state file");
2865        goto the_end;
2866    }
2867    ret = qemu_savevm_state(f, errp);
2868    vm_state_size = qemu_ftell(f);
2869    ret2 = qemu_fclose(f);
2870    if (ret < 0) {
2871        goto the_end;
2872    }
2873    if (ret2 < 0) {
2874        ret = ret2;
2875        goto the_end;
2876    }
2877
2878    /* The bdrv_all_create_snapshot() call that follows acquires the AioContext
2879     * for itself.  BDRV_POLL_WHILE() does not support nested locking because
2880     * it only releases the lock once.  Therefore synchronous I/O will deadlock
2881     * unless we release the AioContext before bdrv_all_create_snapshot().
2882     */
2883    aio_context_release(aio_context);
2884    aio_context = NULL;
2885
2886    ret = bdrv_all_create_snapshot(sn, bs, vm_state_size,
2887                                   has_devices, devices, errp);
2888    if (ret < 0) {
2889        bdrv_all_delete_snapshot(sn->name, has_devices, devices, NULL);
2890        goto the_end;
2891    }
2892
2893    ret = 0;
2894
2895 the_end:
2896    if (aio_context) {
2897        aio_context_release(aio_context);
2898    }
2899
2900    bdrv_drain_all_end();
2901
2902    if (saved_vm_running) {
2903        vm_start();
2904    }
2905    return ret == 0;
2906}
2907
2908void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live,
2909                                Error **errp)
2910{
2911    QEMUFile *f;
2912    QIOChannelFile *ioc;
2913    int saved_vm_running;
2914    int ret;
2915
2916    if (!has_live) {
2917        /* live default to true so old version of Xen tool stack can have a
2918         * successful live migration */
2919        live = true;
2920    }
2921
2922    saved_vm_running = runstate_is_running();
2923    vm_stop(RUN_STATE_SAVE_VM);
2924    global_state_store_running();
2925
2926    ioc = qio_channel_file_new_path(filename, O_WRONLY | O_CREAT | O_TRUNC,
2927                                    0660, errp);
2928    if (!ioc) {
2929        goto the_end;
2930    }
2931    qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-save-state");
2932    f = qemu_fopen_channel_output(QIO_CHANNEL(ioc));
2933    object_unref(OBJECT(ioc));
2934    ret = qemu_save_device_state(f);
2935    if (ret < 0 || qemu_fclose(f) < 0) {
2936        error_setg(errp, QERR_IO_ERROR);
2937    } else {
2938        /* libxl calls the QMP command "stop" before calling
2939         * "xen-save-devices-state" and in case of migration failure, libxl
2940         * would call "cont".
2941         * So call bdrv_inactivate_all (release locks) here to let the other
2942         * side of the migration take control of the images.
2943         */
2944        if (live && !saved_vm_running) {
2945            ret = bdrv_inactivate_all();
2946            if (ret) {
2947                error_setg(errp, "%s: bdrv_inactivate_all() failed (%d)",
2948                           __func__, ret);
2949            }
2950        }
2951    }
2952
2953 the_end:
2954    if (saved_vm_running) {
2955        vm_start();
2956    }
2957}
2958
2959void qmp_xen_load_devices_state(const char *filename, Error **errp)
2960{
2961    QEMUFile *f;
2962    QIOChannelFile *ioc;
2963    int ret;
2964
2965    /* Guest must be paused before loading the device state; the RAM state
2966     * will already have been loaded by xc
2967     */
2968    if (runstate_is_running()) {
2969        error_setg(errp, "Cannot update device state while vm is running");
2970        return;
2971    }
2972    vm_stop(RUN_STATE_RESTORE_VM);
2973
2974    ioc = qio_channel_file_new_path(filename, O_RDONLY | O_BINARY, 0, errp);
2975    if (!ioc) {
2976        return;
2977    }
2978    qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-load-state");
2979    f = qemu_fopen_channel_input(QIO_CHANNEL(ioc));
2980    object_unref(OBJECT(ioc));
2981
2982    ret = qemu_loadvm_state(f);
2983    qemu_fclose(f);
2984    if (ret < 0) {
2985        error_setg(errp, QERR_IO_ERROR);
2986    }
2987    migration_incoming_state_destroy();
2988}
2989
2990bool load_snapshot(const char *name, const char *vmstate,
2991                   bool has_devices, strList *devices, Error **errp)
2992{
2993    BlockDriverState *bs_vm_state;
2994    QEMUSnapshotInfo sn;
2995    QEMUFile *f;
2996    int ret;
2997    AioContext *aio_context;
2998    MigrationIncomingState *mis = migration_incoming_get_current();
2999
3000    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
3001        return false;
3002    }
3003    ret = bdrv_all_has_snapshot(name, has_devices, devices, errp);
3004    if (ret < 0) {
3005        return false;
3006    }
3007    if (ret == 0) {
3008        error_setg(errp, "Snapshot '%s' does not exist in one or more devices",
3009                   name);
3010        return false;
3011    }
3012
3013    bs_vm_state = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
3014    if (!bs_vm_state) {
3015        return false;
3016    }
3017    aio_context = bdrv_get_aio_context(bs_vm_state);
3018
3019    /* Don't even try to load empty VM states */
3020    aio_context_acquire(aio_context);
3021    ret = bdrv_snapshot_find(bs_vm_state, &sn, name);
3022    aio_context_release(aio_context);
3023    if (ret < 0) {
3024        return false;
3025    } else if (sn.vm_state_size == 0) {
3026        error_setg(errp, "This is a disk-only snapshot. Revert to it "
3027                   " offline using qemu-img");
3028        return false;
3029    }
3030
3031    /*
3032     * Flush the record/replay queue. Now the VM state is going
3033     * to change. Therefore we don't need to preserve its consistency
3034     */
3035    replay_flush_events();
3036
3037    /* Flush all IO requests so they don't interfere with the new state.  */
3038    bdrv_drain_all_begin();
3039
3040    ret = bdrv_all_goto_snapshot(name, has_devices, devices, errp);
3041    if (ret < 0) {
3042        goto err_drain;
3043    }
3044
3045    /* restore the VM state */
3046    f = qemu_fopen_bdrv(bs_vm_state, 0);
3047    if (!f) {
3048        error_setg(errp, "Could not open VM state file");
3049        goto err_drain;
3050    }
3051
3052    qemu_system_reset(SHUTDOWN_CAUSE_NONE);
3053    mis->from_src_file = f;
3054
3055    if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
3056        ret = -EINVAL;
3057        goto err_drain;
3058    }
3059    aio_context_acquire(aio_context);
3060    ret = qemu_loadvm_state(f);
3061    migration_incoming_state_destroy();
3062    aio_context_release(aio_context);
3063
3064    bdrv_drain_all_end();
3065
3066    if (ret < 0) {
3067        error_setg(errp, "Error %d while loading VM state", ret);
3068        return false;
3069    }
3070
3071    return true;
3072
3073err_drain:
3074    bdrv_drain_all_end();
3075    return false;
3076}
3077
3078bool delete_snapshot(const char *name, bool has_devices,
3079                     strList *devices, Error **errp)
3080{
3081    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
3082        return false;
3083    }
3084
3085    if (bdrv_all_delete_snapshot(name, has_devices, devices, errp) < 0) {
3086        return false;
3087    }
3088
3089    return true;
3090}
3091
3092void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
3093{
3094    qemu_ram_set_idstr(mr->ram_block,
3095                       memory_region_name(mr), dev);
3096    qemu_ram_set_migratable(mr->ram_block);
3097}
3098
3099void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev)
3100{
3101    qemu_ram_unset_idstr(mr->ram_block);
3102    qemu_ram_unset_migratable(mr->ram_block);
3103}
3104
3105void vmstate_register_ram_global(MemoryRegion *mr)
3106{
3107    vmstate_register_ram(mr, NULL);
3108}
3109
3110bool vmstate_check_only_migratable(const VMStateDescription *vmsd)
3111{
3112    /* check needed if --only-migratable is specified */
3113    if (!only_migratable) {
3114        return true;
3115    }
3116
3117    return !(vmsd && vmsd->unmigratable);
3118}
3119
3120typedef struct SnapshotJob {
3121    Job common;
3122    char *tag;
3123    char *vmstate;
3124    strList *devices;
3125    Coroutine *co;
3126    Error **errp;
3127    bool ret;
3128} SnapshotJob;
3129
3130static void qmp_snapshot_job_free(SnapshotJob *s)
3131{
3132    g_free(s->tag);
3133    g_free(s->vmstate);
3134    qapi_free_strList(s->devices);
3135}
3136
3137
3138static void snapshot_load_job_bh(void *opaque)
3139{
3140    Job *job = opaque;
3141    SnapshotJob *s = container_of(job, SnapshotJob, common);
3142    int orig_vm_running;
3143
3144    job_progress_set_remaining(&s->common, 1);
3145
3146    orig_vm_running = runstate_is_running();
3147    vm_stop(RUN_STATE_RESTORE_VM);
3148
3149    s->ret = load_snapshot(s->tag, s->vmstate, true, s->devices, s->errp);
3150    if (s->ret && orig_vm_running) {
3151        vm_start();
3152    }
3153
3154    job_progress_update(&s->common, 1);
3155
3156    qmp_snapshot_job_free(s);
3157    aio_co_wake(s->co);
3158}
3159
3160static void snapshot_save_job_bh(void *opaque)
3161{
3162    Job *job = opaque;
3163    SnapshotJob *s = container_of(job, SnapshotJob, common);
3164
3165    job_progress_set_remaining(&s->common, 1);
3166    s->ret = save_snapshot(s->tag, false, s->vmstate,
3167                           true, s->devices, s->errp);
3168    job_progress_update(&s->common, 1);
3169
3170    qmp_snapshot_job_free(s);
3171    aio_co_wake(s->co);
3172}
3173
3174static void snapshot_delete_job_bh(void *opaque)
3175{
3176    Job *job = opaque;
3177    SnapshotJob *s = container_of(job, SnapshotJob, common);
3178
3179    job_progress_set_remaining(&s->common, 1);
3180    s->ret = delete_snapshot(s->tag, true, s->devices, s->errp);
3181    job_progress_update(&s->common, 1);
3182
3183    qmp_snapshot_job_free(s);
3184    aio_co_wake(s->co);
3185}
3186
3187static int coroutine_fn snapshot_save_job_run(Job *job, Error **errp)
3188{
3189    SnapshotJob *s = container_of(job, SnapshotJob, common);
3190    s->errp = errp;
3191    s->co = qemu_coroutine_self();
3192    aio_bh_schedule_oneshot(qemu_get_aio_context(),
3193                            snapshot_save_job_bh, job);
3194    qemu_coroutine_yield();
3195    return s->ret ? 0 : -1;
3196}
3197
3198static int coroutine_fn snapshot_load_job_run(Job *job, Error **errp)
3199{
3200    SnapshotJob *s = container_of(job, SnapshotJob, common);
3201    s->errp = errp;
3202    s->co = qemu_coroutine_self();
3203    aio_bh_schedule_oneshot(qemu_get_aio_context(),
3204                            snapshot_load_job_bh, job);
3205    qemu_coroutine_yield();
3206    return s->ret ? 0 : -1;
3207}
3208
3209static int coroutine_fn snapshot_delete_job_run(Job *job, Error **errp)
3210{
3211    SnapshotJob *s = container_of(job, SnapshotJob, common);
3212    s->errp = errp;
3213    s->co = qemu_coroutine_self();
3214    aio_bh_schedule_oneshot(qemu_get_aio_context(),
3215                            snapshot_delete_job_bh, job);
3216    qemu_coroutine_yield();
3217    return s->ret ? 0 : -1;
3218}
3219
3220
3221static const JobDriver snapshot_load_job_driver = {
3222    .instance_size = sizeof(SnapshotJob),
3223    .job_type      = JOB_TYPE_SNAPSHOT_LOAD,
3224    .run           = snapshot_load_job_run,
3225};
3226
3227static const JobDriver snapshot_save_job_driver = {
3228    .instance_size = sizeof(SnapshotJob),
3229    .job_type      = JOB_TYPE_SNAPSHOT_SAVE,
3230    .run           = snapshot_save_job_run,
3231};
3232
3233static const JobDriver snapshot_delete_job_driver = {
3234    .instance_size = sizeof(SnapshotJob),
3235    .job_type      = JOB_TYPE_SNAPSHOT_DELETE,
3236    .run           = snapshot_delete_job_run,
3237};
3238
3239
3240void qmp_snapshot_save(const char *job_id,
3241                       const char *tag,
3242                       const char *vmstate,
3243                       strList *devices,
3244                       Error **errp)
3245{
3246    SnapshotJob *s;
3247
3248    s = job_create(job_id, &snapshot_save_job_driver, NULL,
3249                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3250                   NULL, NULL, errp);
3251    if (!s) {
3252        return;
3253    }
3254
3255    s->tag = g_strdup(tag);
3256    s->vmstate = g_strdup(vmstate);
3257    s->devices = QAPI_CLONE(strList, devices);
3258
3259    job_start(&s->common);
3260}
3261
3262void qmp_snapshot_load(const char *job_id,
3263                       const char *tag,
3264                       const char *vmstate,
3265                       strList *devices,
3266                       Error **errp)
3267{
3268    SnapshotJob *s;
3269
3270    s = job_create(job_id, &snapshot_load_job_driver, NULL,
3271                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3272                   NULL, NULL, errp);
3273    if (!s) {
3274        return;
3275    }
3276
3277    s->tag = g_strdup(tag);
3278    s->vmstate = g_strdup(vmstate);
3279    s->devices = QAPI_CLONE(strList, devices);
3280
3281    job_start(&s->common);
3282}
3283
3284void qmp_snapshot_delete(const char *job_id,
3285                         const char *tag,
3286                         strList *devices,
3287                         Error **errp)
3288{
3289    SnapshotJob *s;
3290
3291    s = job_create(job_id, &snapshot_delete_job_driver, NULL,
3292                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3293                   NULL, NULL, errp);
3294    if (!s) {
3295        return;
3296    }
3297
3298    s->tag = g_strdup(tag);
3299    s->devices = QAPI_CLONE(strList, devices);
3300
3301    job_start(&s->common);
3302}
3303