qemu/migration/savevm.c
<<
>>
Prefs
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2009-2015 Red Hat Inc
   6 *
   7 * Authors:
   8 *  Juan Quintela <quintela@redhat.com>
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28
  29#include "qemu/osdep.h"
  30#include "hw/boards.h"
  31#include "net/net.h"
  32#include "migration.h"
  33#include "migration/snapshot.h"
  34#include "migration/vmstate.h"
  35#include "migration/misc.h"
  36#include "migration/register.h"
  37#include "migration/global_state.h"
  38#include "migration/channel-block.h"
  39#include "ram.h"
  40#include "qemu-file.h"
  41#include "savevm.h"
  42#include "postcopy-ram.h"
  43#include "qapi/error.h"
  44#include "qapi/qapi-commands-migration.h"
  45#include "qapi/qmp/json-writer.h"
  46#include "qapi/clone-visitor.h"
  47#include "qapi/qapi-builtin-visit.h"
  48#include "qapi/qmp/qerror.h"
  49#include "qemu/error-report.h"
  50#include "sysemu/cpus.h"
  51#include "exec/memory.h"
  52#include "exec/target_page.h"
  53#include "trace.h"
  54#include "qemu/iov.h"
  55#include "qemu/main-loop.h"
  56#include "block/snapshot.h"
  57#include "qemu/cutils.h"
  58#include "io/channel-buffer.h"
  59#include "io/channel-file.h"
  60#include "sysemu/replay.h"
  61#include "sysemu/runstate.h"
  62#include "sysemu/sysemu.h"
  63#include "sysemu/xen.h"
  64#include "migration/colo.h"
  65#include "qemu/bitmap.h"
  66#include "net/announce.h"
  67#include "qemu/yank.h"
  68#include "yank_functions.h"
  69
/*
 * File-scope constant with no initializer, so its value is zero.
 * NOTE(review): presumably the format version used with
 * MIG_CMD_POSTCOPY_RAM_DISCARD - confirm against its users.
 */
const unsigned int postcopy_ram_discard_version;
  71
  72/* Subcommands for QEMU_VM_COMMAND */
  73enum qemu_vm_cmd {
  74    MIG_CMD_INVALID = 0,   /* Must be 0 */
  75    MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
  76    MIG_CMD_PING,              /* Request a PONG on the RP */
  77
  78    MIG_CMD_POSTCOPY_ADVISE,       /* Prior to any page transfers, just
  79                                      warn we might want to do PC */
  80    MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
  81                                      pages as it's running. */
  82    MIG_CMD_POSTCOPY_RUN,          /* Start execution */
  83
  84    MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
  85                                      were previously sent during
  86                                      precopy but are dirty. */
  87    MIG_CMD_PACKAGED,          /* Send a wrapped stream within this stream */
  88    MIG_CMD_ENABLE_COLO,       /* Enable COLO */
  89    MIG_CMD_POSTCOPY_RESUME,   /* resume postcopy on dest */
  90    MIG_CMD_RECV_BITMAP,       /* Request for recved bitmap on dst */
  91    MIG_CMD_MAX
  92};
  93
  94#define MAX_VM_CMD_PACKAGED_SIZE UINT32_MAX
  95static struct mig_cmd_args {
  96    ssize_t     len; /* -1 = variable */
  97    const char *name;
  98} mig_cmd_args[] = {
  99    [MIG_CMD_INVALID]          = { .len = -1, .name = "INVALID" },
 100    [MIG_CMD_OPEN_RETURN_PATH] = { .len =  0, .name = "OPEN_RETURN_PATH" },
 101    [MIG_CMD_PING]             = { .len = sizeof(uint32_t), .name = "PING" },
 102    [MIG_CMD_POSTCOPY_ADVISE]  = { .len = -1, .name = "POSTCOPY_ADVISE" },
 103    [MIG_CMD_POSTCOPY_LISTEN]  = { .len =  0, .name = "POSTCOPY_LISTEN" },
 104    [MIG_CMD_POSTCOPY_RUN]     = { .len =  0, .name = "POSTCOPY_RUN" },
 105    [MIG_CMD_POSTCOPY_RAM_DISCARD] = {
 106                                   .len = -1, .name = "POSTCOPY_RAM_DISCARD" },
 107    [MIG_CMD_POSTCOPY_RESUME]  = { .len =  0, .name = "POSTCOPY_RESUME" },
 108    [MIG_CMD_PACKAGED]         = { .len =  4, .name = "PACKAGED" },
 109    [MIG_CMD_RECV_BITMAP]      = { .len = -1, .name = "RECV_BITMAP" },
 110    [MIG_CMD_MAX]              = { .len = -1, .name = "MAX" },
 111};
 112
 113/* Note for MIG_CMD_POSTCOPY_ADVISE:
 114 * The format of arguments is depending on postcopy mode:
 115 * - postcopy RAM only
 116 *   uint64_t host page size
  117 *   uint64_t target page size
 118 *
 119 * - postcopy RAM and postcopy dirty bitmaps
 120 *   format is the same as for postcopy RAM only
 121 *
 122 * - postcopy dirty bitmaps only
 123 *   Nothing. Command length field is 0.
 124 *
 125 * Be careful: adding a new postcopy entity with some other parameters should
 126 * not break format self-description ability. Good way is to introduce some
 127 * generic extendable format with an exception for two old entities.
 128 */
 129
 130/***********************************************************/
 131/* savevm/loadvm support */
 132
 133static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
 134{
 135    if (is_writable) {
 136        return qemu_file_new_output(QIO_CHANNEL(qio_channel_block_new(bs)));
 137    } else {
 138        return qemu_file_new_input(QIO_CHANNEL(qio_channel_block_new(bs)));
 139    }
 140}
 141
 142
 143/* QEMUFile timer support.
 144 * Not in qemu-file.c to not add qemu-timer.c as dependency to qemu-file.c
 145 */
 146
 147void timer_put(QEMUFile *f, QEMUTimer *ts)
 148{
 149    uint64_t expire_time;
 150
 151    expire_time = timer_expire_time_ns(ts);
 152    qemu_put_be64(f, expire_time);
 153}
 154
 155void timer_get(QEMUFile *f, QEMUTimer *ts)
 156{
 157    uint64_t expire_time;
 158
 159    expire_time = qemu_get_be64(f);
 160    if (expire_time != -1) {
 161        timer_mod_ns(ts, expire_time);
 162    } else {
 163        timer_del(ts);
 164    }
 165}
 166
 167
 168/* VMState timer support.
 169 * Not in vmstate.c to not add qemu-timer.c as dependency to vmstate.c
 170 */
 171
 172static int get_timer(QEMUFile *f, void *pv, size_t size,
 173                     const VMStateField *field)
 174{
 175    QEMUTimer *v = pv;
 176    timer_get(f, v);
 177    return 0;
 178}
 179
 180static int put_timer(QEMUFile *f, void *pv, size_t size,
 181                     const VMStateField *field, JSONWriter *vmdesc)
 182{
 183    QEMUTimer *v = pv;
 184    timer_put(f, v);
 185
 186    return 0;
 187}
 188
/* VMStateInfo named "timer", wiring get_timer()/put_timer() as load/save. */
const VMStateInfo vmstate_info_timer = {
    .name = "timer",
    .get  = get_timer,
    .put  = put_timer,
};
 194
 195
/*
 * Secondary identity of a SaveStateEntry: filled in by
 * vmstate_register_with_alias_id() with the bare vmsd name and an instance
 * id, when the entry's main idstr carries an object-id prefix.
 */
typedef struct CompatEntry {
    char idstr[256];
    int instance_id;
} CompatEntry;
 200
/* One registered save/load handler, linked into savevm_state.handlers. */
typedef struct SaveStateEntry {
    /* linkage in savevm_state.handlers */
    QTAILQ_ENTRY(SaveStateEntry) entry;
    /* "<object id>/" prefix (when available) followed by the section name */
    char idstr[256];
    /* caller-supplied, or picked by calculate_new_instance_id() */
    uint32_t instance_id;
    int alias_id;
    int version_id;
    /* version id read from the stream */
    int load_version_id;
    /* taken from savevm_state.global_section_id at registration time */
    int section_id;
    /* section id read from the stream */
    int load_section_id;
    /* old-style handlers; used when vmsd is NULL */
    const SaveVMHandlers *ops;
    const VMStateDescription *vmsd;
    /* handed back to the ops/vmsd callbacks */
    void *opaque;
    /* only allocated by vmstate_register_with_alias_id() for objects with an id */
    CompatEntry *compat;
    /* set to 1 by register_savevm_live() when ops->save_setup is non-NULL */
    int is_ram;
} SaveStateEntry;
 218
/*
 * Global registry of save/load handlers plus the payload of the
 * "configuration" section.
 */
typedef struct SaveState {
    /* all registered entries */
    QTAILQ_HEAD(, SaveStateEntry) handlers;
    /* first entry in @handlers for each priority, NULL if there is none */
    SaveStateEntry *handler_pri_head[MIG_PRI_MAX + 1];
    /* next section id to hand out */
    int global_section_id;
    /* machine type name: @len bytes at @name */
    uint32_t len;
    const char *name;
    uint32_t target_page_bits;
    /* @caps_count entries at @capabilities */
    uint32_t caps_count;
    MigrationCapability *capabilities;
    QemuUUID uuid;
} SaveState;
 230
/*
 * The single SaveState instance: empty handler list, no per-priority heads,
 * section ids starting at 0.
 */
static SaveState savevm_state = {
    .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
    .handler_pri_head = { [MIG_PRI_DEFAULT ... MIG_PRI_MAX] = NULL },
    .global_section_id = 0,
};
 236
 237static bool should_validate_capability(int capability)
 238{
 239    assert(capability >= 0 && capability < MIGRATION_CAPABILITY__MAX);
 240    /* Validate only new capabilities to keep compatibility. */
 241    switch (capability) {
 242    case MIGRATION_CAPABILITY_X_IGNORE_SHARED:
 243        return true;
 244    default:
 245        return false;
 246    }
 247}
 248
 249static uint32_t get_validatable_capabilities_count(void)
 250{
 251    MigrationState *s = migrate_get_current();
 252    uint32_t result = 0;
 253    int i;
 254    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
 255        if (should_validate_capability(i) && s->enabled_capabilities[i]) {
 256            result++;
 257        }
 258    }
 259    return result;
 260}
 261
 262static int configuration_pre_save(void *opaque)
 263{
 264    SaveState *state = opaque;
 265    const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
 266    MigrationState *s = migrate_get_current();
 267    int i, j;
 268
 269    state->len = strlen(current_name);
 270    state->name = current_name;
 271    state->target_page_bits = qemu_target_page_bits();
 272
 273    state->caps_count = get_validatable_capabilities_count();
 274    state->capabilities = g_renew(MigrationCapability, state->capabilities,
 275                                  state->caps_count);
 276    for (i = j = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
 277        if (should_validate_capability(i) && s->enabled_capabilities[i]) {
 278            state->capabilities[j++] = i;
 279        }
 280    }
 281    state->uuid = qemu_uuid;
 282
 283    return 0;
 284}
 285
 286static int configuration_post_save(void *opaque)
 287{
 288    SaveState *state = opaque;
 289
 290    g_free(state->capabilities);
 291    state->capabilities = NULL;
 292    state->caps_count = 0;
 293    return 0;
 294}
 295
 296static int configuration_pre_load(void *opaque)
 297{
 298    SaveState *state = opaque;
 299
 300    /* If there is no target-page-bits subsection it means the source
 301     * predates the variable-target-page-bits support and is using the
 302     * minimum possible value for this CPU.
 303     */
 304    state->target_page_bits = qemu_target_page_bits_min();
 305    return 0;
 306}
 307
 308static bool configuration_validate_capabilities(SaveState *state)
 309{
 310    bool ret = true;
 311    MigrationState *s = migrate_get_current();
 312    unsigned long *source_caps_bm;
 313    int i;
 314
 315    source_caps_bm = bitmap_new(MIGRATION_CAPABILITY__MAX);
 316    for (i = 0; i < state->caps_count; i++) {
 317        MigrationCapability capability = state->capabilities[i];
 318        set_bit(capability, source_caps_bm);
 319    }
 320
 321    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
 322        bool source_state, target_state;
 323        if (!should_validate_capability(i)) {
 324            continue;
 325        }
 326        source_state = test_bit(i, source_caps_bm);
 327        target_state = s->enabled_capabilities[i];
 328        if (source_state != target_state) {
 329            error_report("Capability %s is %s, but received capability is %s",
 330                         MigrationCapability_str(i),
 331                         target_state ? "on" : "off",
 332                         source_state ? "on" : "off");
 333            ret = false;
 334            /* Don't break here to report all failed capabilities */
 335        }
 336    }
 337
 338    g_free(source_caps_bm);
 339    return ret;
 340}
 341
 342static int configuration_post_load(void *opaque, int version_id)
 343{
 344    SaveState *state = opaque;
 345    const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
 346    int ret = 0;
 347
 348    if (strncmp(state->name, current_name, state->len) != 0) {
 349        error_report("Machine type received is '%.*s' and local is '%s'",
 350                     (int) state->len, state->name, current_name);
 351        ret = -EINVAL;
 352        goto out;
 353    }
 354
 355    if (state->target_page_bits != qemu_target_page_bits()) {
 356        error_report("Received TARGET_PAGE_BITS is %d but local is %d",
 357                     state->target_page_bits, qemu_target_page_bits());
 358        ret = -EINVAL;
 359        goto out;
 360    }
 361
 362    if (!configuration_validate_capabilities(state)) {
 363        ret = -EINVAL;
 364        goto out;
 365    }
 366
 367out:
 368    g_free((void *)state->name);
 369    state->name = NULL;
 370    state->len = 0;
 371    g_free(state->capabilities);
 372    state->capabilities = NULL;
 373    state->caps_count = 0;
 374
 375    return ret;
 376}
 377
 378static int get_capability(QEMUFile *f, void *pv, size_t size,
 379                          const VMStateField *field)
 380{
 381    MigrationCapability *capability = pv;
 382    char capability_str[UINT8_MAX + 1];
 383    uint8_t len;
 384    int i;
 385
 386    len = qemu_get_byte(f);
 387    qemu_get_buffer(f, (uint8_t *)capability_str, len);
 388    capability_str[len] = '\0';
 389    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
 390        if (!strcmp(MigrationCapability_str(i), capability_str)) {
 391            *capability = i;
 392            return 0;
 393        }
 394    }
 395    error_report("Received unknown capability %s", capability_str);
 396    return -EINVAL;
 397}
 398
 399static int put_capability(QEMUFile *f, void *pv, size_t size,
 400                          const VMStateField *field, JSONWriter *vmdesc)
 401{
 402    MigrationCapability *capability = pv;
 403    const char *capability_str = MigrationCapability_str(*capability);
 404    size_t len = strlen(capability_str);
 405    assert(len <= UINT8_MAX);
 406
 407    qemu_put_byte(f, len);
 408    qemu_put_buffer(f, (uint8_t *)capability_str, len);
 409    return 0;
 410}
 411
/* VMStateInfo named "capability", using get_capability()/put_capability(). */
static const VMStateInfo vmstate_info_capability = {
    .name = "capability",
    .get  = get_capability,
    .put  = put_capability,
};
 417
 418/* The target-page-bits subsection is present only if the
 419 * target page size is not the same as the default (ie the
 420 * minimum page size for a variable-page-size guest CPU).
 421 * If it is present then it contains the actual target page
 422 * bits for the machine, and migration will fail if the
 423 * two ends don't agree about it.
 424 */
 425static bool vmstate_target_page_bits_needed(void *opaque)
 426{
 427    return qemu_target_page_bits()
 428        > qemu_target_page_bits_min();
 429}
 430
/*
 * "configuration/target-page-bits" subsection: carries
 * SaveState.target_page_bits, emitted only when
 * vmstate_target_page_bits_needed() says so.
 */
static const VMStateDescription vmstate_target_page_bits = {
    .name = "configuration/target-page-bits",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = vmstate_target_page_bits_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(target_page_bits, SaveState),
        VMSTATE_END_OF_LIST()
    }
};
 441
 442static bool vmstate_capabilites_needed(void *opaque)
 443{
 444    return get_validatable_capabilities_count() > 0;
 445}
 446
/*
 * "configuration/capabilities" subsection: a 32-bit count followed by that
 * many capabilities, each encoded through vmstate_info_capability.  Emitted
 * only when vmstate_capabilites_needed() returns true.
 */
static const VMStateDescription vmstate_capabilites = {
    .name = "configuration/capabilities",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = vmstate_capabilites_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32_V(caps_count, SaveState, 1),
        VMSTATE_VARRAY_UINT32_ALLOC(capabilities, SaveState, caps_count, 1,
                                    vmstate_info_capability,
                                    MigrationCapability),
        VMSTATE_END_OF_LIST()
    }
};
 460
 461static bool vmstate_uuid_needed(void *opaque)
 462{
 463    return qemu_uuid_set && migrate_validate_uuid();
 464}
 465
 466static int vmstate_uuid_post_load(void *opaque, int version_id)
 467{
 468    SaveState *state = opaque;
 469    char uuid_src[UUID_FMT_LEN + 1];
 470    char uuid_dst[UUID_FMT_LEN + 1];
 471
 472    if (!qemu_uuid_set) {
 473        /*
 474         * It's warning because user might not know UUID in some cases,
 475         * e.g. load an old snapshot
 476         */
 477        qemu_uuid_unparse(&state->uuid, uuid_src);
 478        warn_report("UUID is received %s, but local uuid isn't set",
 479                     uuid_src);
 480        return 0;
 481    }
 482    if (!qemu_uuid_is_equal(&state->uuid, &qemu_uuid)) {
 483        qemu_uuid_unparse(&state->uuid, uuid_src);
 484        qemu_uuid_unparse(&qemu_uuid, uuid_dst);
 485        error_report("UUID received is %s and local is %s", uuid_src, uuid_dst);
 486        return -EINVAL;
 487    }
 488    return 0;
 489}
 490
/*
 * "configuration/uuid" subsection: the raw bytes of SaveState.uuid, sent
 * when vmstate_uuid_needed() is true and checked on load by
 * vmstate_uuid_post_load().
 */
static const VMStateDescription vmstate_uuid = {
    .name = "configuration/uuid",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = vmstate_uuid_needed,
    .post_load = vmstate_uuid_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_UINT8_ARRAY_V(uuid.data, SaveState, sizeof(QemuUUID), 1),
        VMSTATE_END_OF_LIST()
    }
};
 502
/*
 * "configuration" section: the machine type name as a 32-bit length plus a
 * buffer of that many bytes, followed by the optional target-page-bits,
 * capabilities and uuid subsections.
 */
static const VMStateDescription vmstate_configuration = {
    .name = "configuration",
    .version_id = 1,
    .pre_load = configuration_pre_load,
    .post_load = configuration_post_load,
    .pre_save = configuration_pre_save,
    .post_save = configuration_post_save,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(len, SaveState),
        VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription *[]) {
        &vmstate_target_page_bits,
        &vmstate_capabilites,
        &vmstate_uuid,
        NULL
    }
};
 522
 523static void dump_vmstate_vmsd(FILE *out_file,
 524                              const VMStateDescription *vmsd, int indent,
 525                              bool is_subsection);
 526
 527static void dump_vmstate_vmsf(FILE *out_file, const VMStateField *field,
 528                              int indent)
 529{
 530    fprintf(out_file, "%*s{\n", indent, "");
 531    indent += 2;
 532    fprintf(out_file, "%*s\"field\": \"%s\",\n", indent, "", field->name);
 533    fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
 534            field->version_id);
 535    fprintf(out_file, "%*s\"field_exists\": %s,\n", indent, "",
 536            field->field_exists ? "true" : "false");
 537    fprintf(out_file, "%*s\"size\": %zu", indent, "", field->size);
 538    if (field->vmsd != NULL) {
 539        fprintf(out_file, ",\n");
 540        dump_vmstate_vmsd(out_file, field->vmsd, indent, false);
 541    }
 542    fprintf(out_file, "\n%*s}", indent - 2, "");
 543}
 544
 545static void dump_vmstate_vmss(FILE *out_file,
 546                              const VMStateDescription **subsection,
 547                              int indent)
 548{
 549    if (*subsection != NULL) {
 550        dump_vmstate_vmsd(out_file, *subsection, indent, true);
 551    }
 552}
 553
 554static void dump_vmstate_vmsd(FILE *out_file,
 555                              const VMStateDescription *vmsd, int indent,
 556                              bool is_subsection)
 557{
 558    if (is_subsection) {
 559        fprintf(out_file, "%*s{\n", indent, "");
 560    } else {
 561        fprintf(out_file, "%*s\"%s\": {\n", indent, "", "Description");
 562    }
 563    indent += 2;
 564    fprintf(out_file, "%*s\"name\": \"%s\",\n", indent, "", vmsd->name);
 565    fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
 566            vmsd->version_id);
 567    fprintf(out_file, "%*s\"minimum_version_id\": %d", indent, "",
 568            vmsd->minimum_version_id);
 569    if (vmsd->fields != NULL) {
 570        const VMStateField *field = vmsd->fields;
 571        bool first;
 572
 573        fprintf(out_file, ",\n%*s\"Fields\": [\n", indent, "");
 574        first = true;
 575        while (field->name != NULL) {
 576            if (field->flags & VMS_MUST_EXIST) {
 577                /* Ignore VMSTATE_VALIDATE bits; these don't get migrated */
 578                field++;
 579                continue;
 580            }
 581            if (!first) {
 582                fprintf(out_file, ",\n");
 583            }
 584            dump_vmstate_vmsf(out_file, field, indent + 2);
 585            field++;
 586            first = false;
 587        }
 588        fprintf(out_file, "\n%*s]", indent, "");
 589    }
 590    if (vmsd->subsections != NULL) {
 591        const VMStateDescription **subsection = vmsd->subsections;
 592        bool first;
 593
 594        fprintf(out_file, ",\n%*s\"Subsections\": [\n", indent, "");
 595        first = true;
 596        while (*subsection != NULL) {
 597            if (!first) {
 598                fprintf(out_file, ",\n");
 599            }
 600            dump_vmstate_vmss(out_file, subsection, indent + 2);
 601            subsection++;
 602            first = false;
 603        }
 604        fprintf(out_file, "\n%*s]", indent, "");
 605    }
 606    fprintf(out_file, "\n%*s}", indent - 2, "");
 607}
 608
 609static void dump_machine_type(FILE *out_file)
 610{
 611    MachineClass *mc;
 612
 613    mc = MACHINE_GET_CLASS(current_machine);
 614
 615    fprintf(out_file, "  \"vmschkmachine\": {\n");
 616    fprintf(out_file, "    \"Name\": \"%s\"\n", mc->name);
 617    fprintf(out_file, "  },\n");
 618}
 619
 620void dump_vmstate_json_to_file(FILE *out_file)
 621{
 622    GSList *list, *elt;
 623    bool first;
 624
 625    fprintf(out_file, "{\n");
 626    dump_machine_type(out_file);
 627
 628    first = true;
 629    list = object_class_get_list(TYPE_DEVICE, true);
 630    for (elt = list; elt; elt = elt->next) {
 631        DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data,
 632                                             TYPE_DEVICE);
 633        const char *name;
 634        int indent = 2;
 635
 636        if (!dc->vmsd) {
 637            continue;
 638        }
 639
 640        if (!first) {
 641            fprintf(out_file, ",\n");
 642        }
 643        name = object_class_get_name(OBJECT_CLASS(dc));
 644        fprintf(out_file, "%*s\"%s\": {\n", indent, "", name);
 645        indent += 2;
 646        fprintf(out_file, "%*s\"Name\": \"%s\",\n", indent, "", name);
 647        fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
 648                dc->vmsd->version_id);
 649        fprintf(out_file, "%*s\"minimum_version_id\": %d,\n", indent, "",
 650                dc->vmsd->minimum_version_id);
 651
 652        dump_vmstate_vmsd(out_file, dc->vmsd, indent, false);
 653
 654        fprintf(out_file, "\n%*s}", indent - 2, "");
 655        first = false;
 656    }
 657    fprintf(out_file, "\n}\n");
 658    fclose(out_file);
 659    g_slist_free(list);
 660}
 661
 662static uint32_t calculate_new_instance_id(const char *idstr)
 663{
 664    SaveStateEntry *se;
 665    uint32_t instance_id = 0;
 666
 667    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
 668        if (strcmp(idstr, se->idstr) == 0
 669            && instance_id <= se->instance_id) {
 670            instance_id = se->instance_id + 1;
 671        }
 672    }
 673    /* Make sure we never loop over without being noticed */
 674    assert(instance_id != VMSTATE_INSTANCE_ID_ANY);
 675    return instance_id;
 676}
 677
 678static int calculate_compat_instance_id(const char *idstr)
 679{
 680    SaveStateEntry *se;
 681    int instance_id = 0;
 682
 683    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
 684        if (!se->compat) {
 685            continue;
 686        }
 687
 688        if (strcmp(idstr, se->compat->idstr) == 0
 689            && instance_id <= se->compat->instance_id) {
 690            instance_id = se->compat->instance_id + 1;
 691        }
 692    }
 693    return instance_id;
 694}
 695
 696static inline MigrationPriority save_state_priority(SaveStateEntry *se)
 697{
 698    if (se->vmsd) {
 699        return se->vmsd->priority;
 700    }
 701    return MIG_PRI_DEFAULT;
 702}
 703
/*
 * Link @nse into savevm_state.handlers according to its priority and keep
 * the per-priority head table up to date.
 *
 * The new entry goes right before the head of the nearest lower priority
 * that has entries, or at the tail of the list when no lower priority is
 * populated.
 */
static void savevm_state_handler_insert(SaveStateEntry *nse)
{
    MigrationPriority priority = save_state_priority(nse);
    SaveStateEntry *se;
    int i;

    assert(priority <= MIG_PRI_MAX);

    /* Search downwards for the closest lower priority that has a head. */
    for (i = priority - 1; i >= 0; i--) {
        se = savevm_state.handler_pri_head[i];
        if (se != NULL) {
            assert(save_state_priority(se) < priority);
            break;
        }
    }

    /* i >= 0 means the loop stopped on a head, so 'se' is valid here. */
    if (i >= 0) {
        QTAILQ_INSERT_BEFORE(se, nse, entry);
    } else {
        QTAILQ_INSERT_TAIL(&savevm_state.handlers, nse, entry);
    }

    /* First entry of its priority: it becomes that priority's head. */
    if (savevm_state.handler_pri_head[priority] == NULL) {
        savevm_state.handler_pri_head[priority] = nse;
    }
}
 730
 731static void savevm_state_handler_remove(SaveStateEntry *se)
 732{
 733    SaveStateEntry *next;
 734    MigrationPriority priority = save_state_priority(se);
 735
 736    if (se == savevm_state.handler_pri_head[priority]) {
 737        next = QTAILQ_NEXT(se, entry);
 738        if (next != NULL && save_state_priority(next) == priority) {
 739            savevm_state.handler_pri_head[priority] = next;
 740        } else {
 741            savevm_state.handler_pri_head[priority] = NULL;
 742        }
 743    }
 744    QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
 745}
 746
 747/* TODO: Individual devices generally have very little idea about the rest
 748   of the system, so instance_id should be removed/replaced.
 749   Meanwhile pass -1 as instance_id if you do not already have a clearly
 750   distinguishing id for all instances of your device class. */
 751int register_savevm_live(const char *idstr,
 752                         uint32_t instance_id,
 753                         int version_id,
 754                         const SaveVMHandlers *ops,
 755                         void *opaque)
 756{
 757    SaveStateEntry *se;
 758
 759    se = g_new0(SaveStateEntry, 1);
 760    se->version_id = version_id;
 761    se->section_id = savevm_state.global_section_id++;
 762    se->ops = ops;
 763    se->opaque = opaque;
 764    se->vmsd = NULL;
 765    /* if this is a live_savem then set is_ram */
 766    if (ops->save_setup != NULL) {
 767        se->is_ram = 1;
 768    }
 769
 770    pstrcat(se->idstr, sizeof(se->idstr), idstr);
 771
 772    if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
 773        se->instance_id = calculate_new_instance_id(se->idstr);
 774    } else {
 775        se->instance_id = instance_id;
 776    }
 777    assert(!se->compat || se->instance_id == 0);
 778    savevm_state_handler_insert(se);
 779    return 0;
 780}
 781
 782void unregister_savevm(VMStateIf *obj, const char *idstr, void *opaque)
 783{
 784    SaveStateEntry *se, *new_se;
 785    char id[256] = "";
 786
 787    if (obj) {
 788        char *oid = vmstate_if_get_id(obj);
 789        if (oid) {
 790            pstrcpy(id, sizeof(id), oid);
 791            pstrcat(id, sizeof(id), "/");
 792            g_free(oid);
 793        }
 794    }
 795    pstrcat(id, sizeof(id), idstr);
 796
 797    QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
 798        if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) {
 799            savevm_state_handler_remove(se);
 800            g_free(se->compat);
 801            g_free(se);
 802        }
 803    }
 804}
 805
/*
 * Register @vmsd with @opaque as a save/load handler.
 *
 * When @obj is given and has an id, the entry's idstr becomes
 * "<id>/<vmsd name>", a compat record carrying the bare vmsd name is
 * attached, and the entry's own instance id is computed afresh.  Otherwise
 * the idstr is just the vmsd name and @instance_id is used as given (or
 * computed when it is VMSTATE_INSTANCE_ID_ANY).
 *
 * Returns 0 on success; -1 with @errp set when the object id does not fit
 * in the idstr buffer.
 */
int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id,
                                   const VMStateDescription *vmsd,
                                   void *opaque, int alias_id,
                                   int required_for_version,
                                   Error **errp)
{
    SaveStateEntry *se;

    /* If this triggers, alias support can be dropped for the vmsd. */
    assert(alias_id == -1 || required_for_version >= vmsd->minimum_version_id);

    se = g_new0(SaveStateEntry, 1);
    se->version_id = vmsd->version_id;
    se->section_id = savevm_state.global_section_id++;
    se->opaque = opaque;
    se->vmsd = vmsd;
    se->alias_id = alias_id;

    if (obj) {
        char *id = vmstate_if_get_id(obj);
        if (id) {
            /* Start idstr with "<id>/"; bail out if that got truncated. */
            if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
                sizeof(se->idstr)) {
                error_setg(errp, "Path too long for VMState (%s)", id);
                g_free(id);
                g_free(se);

                return -1;
            }
            g_free(id);

            /* The compat record keeps the unprefixed name and its own id. */
            se->compat = g_new0(CompatEntry, 1);
            pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name);
            se->compat->instance_id = instance_id == VMSTATE_INSTANCE_ID_ANY ?
                         calculate_compat_instance_id(vmsd->name) : instance_id;
            /* Force the prefixed entry's instance id to be computed below. */
            instance_id = VMSTATE_INSTANCE_ID_ANY;
        }
    }
    pstrcat(se->idstr, sizeof(se->idstr), vmsd->name);

    if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
        se->instance_id = calculate_new_instance_id(se->idstr);
    } else {
        se->instance_id = instance_id;
    }
    /* An entry with a compat record is expected to get instance id 0. */
    assert(!se->compat || se->instance_id == 0);
    savevm_state_handler_insert(se);
    return 0;
}
 855
 856void vmstate_unregister(VMStateIf *obj, const VMStateDescription *vmsd,
 857                        void *opaque)
 858{
 859    SaveStateEntry *se, *new_se;
 860
 861    QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
 862        if (se->vmsd == vmsd && se->opaque == opaque) {
 863            savevm_state_handler_remove(se);
 864            g_free(se->compat);
 865            g_free(se);
 866        }
 867    }
 868}
 869
 870static int vmstate_load(QEMUFile *f, SaveStateEntry *se)
 871{
 872    trace_vmstate_load(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
 873    if (!se->vmsd) {         /* Old style */
 874        return se->ops->load_state(f, se->opaque, se->load_version_id);
 875    }
 876    return vmstate_load_state(f, se->vmsd, se->opaque, se->load_version_id);
 877}
 878
 879static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se,
 880                                   JSONWriter *vmdesc)
 881{
 882    int64_t old_offset, size;
 883
 884    old_offset = qemu_file_total_transferred_fast(f);
 885    se->ops->save_state(f, se->opaque);
 886    size = qemu_file_total_transferred_fast(f) - old_offset;
 887
 888    if (vmdesc) {
 889        json_writer_int64(vmdesc, "size", size);
 890        json_writer_start_array(vmdesc, "fields");
 891        json_writer_start_object(vmdesc, NULL);
 892        json_writer_str(vmdesc, "name", "data");
 893        json_writer_int64(vmdesc, "size", size);
 894        json_writer_str(vmdesc, "type", "buffer");
 895        json_writer_end_object(vmdesc);
 896        json_writer_end_array(vmdesc);
 897    }
 898}
 899
 900static int vmstate_save(QEMUFile *f, SaveStateEntry *se,
 901                        JSONWriter *vmdesc)
 902{
 903    trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
 904    if (!se->vmsd) {
 905        vmstate_save_old_style(f, se, vmdesc);
 906        return 0;
 907    }
 908    return vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
 909}
 910
 911/*
 912 * Write the header for device section (QEMU_VM_SECTION START/END/PART/FULL)
 913 */
 914static void save_section_header(QEMUFile *f, SaveStateEntry *se,
 915                                uint8_t section_type)
 916{
 917    qemu_put_byte(f, section_type);
 918    qemu_put_be32(f, se->section_id);
 919
 920    if (section_type == QEMU_VM_SECTION_FULL ||
 921        section_type == QEMU_VM_SECTION_START) {
 922        /* ID string */
 923        size_t len = strlen(se->idstr);
 924        qemu_put_byte(f, len);
 925        qemu_put_buffer(f, (uint8_t *)se->idstr, len);
 926
 927        qemu_put_be32(f, se->instance_id);
 928        qemu_put_be32(f, se->version_id);
 929    }
 930}
 931
 932/*
 933 * Write a footer onto device sections that catches cases misformatted device
 934 * sections.
 935 */
 936static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
 937{
 938    if (migrate_get_current()->send_section_footer) {
 939        qemu_put_byte(f, QEMU_VM_SECTION_FOOTER);
 940        qemu_put_be32(f, se->section_id);
 941    }
 942}
 943
 944/**
 945 * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
 946 *                           command and associated data.
 947 *
 948 * @f: File to send command on
 949 * @command: Command type to send
 950 * @len: Length of associated data
 951 * @data: Data associated with command.
 952 */
 953static void qemu_savevm_command_send(QEMUFile *f,
 954                                     enum qemu_vm_cmd command,
 955                                     uint16_t len,
 956                                     uint8_t *data)
 957{
 958    trace_savevm_command_send(command, len);
 959    qemu_put_byte(f, QEMU_VM_COMMAND);
 960    qemu_put_be16(f, (uint16_t)command);
 961    qemu_put_be16(f, len);
 962    qemu_put_buffer(f, data, len);
 963    qemu_fflush(f);
 964}
 965
 966void qemu_savevm_send_colo_enable(QEMUFile *f)
 967{
 968    trace_savevm_send_colo_enable();
 969    qemu_savevm_command_send(f, MIG_CMD_ENABLE_COLO, 0, NULL);
 970}
 971
 972void qemu_savevm_send_ping(QEMUFile *f, uint32_t value)
 973{
 974    uint32_t buf;
 975
 976    trace_savevm_send_ping(value);
 977    buf = cpu_to_be32(value);
 978    qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf);
 979}
 980
 981void qemu_savevm_send_open_return_path(QEMUFile *f)
 982{
 983    trace_savevm_send_open_return_path();
 984    qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL);
 985}
 986
 987/* We have a buffer of data to send; we don't want that all to be loaded
 988 * by the command itself, so the command contains just the length of the
 989 * extra buffer that we then send straight after it.
 990 * TODO: Must be a better way to organise that
 991 *
 992 * Returns:
 993 *    0 on success
 994 *    -ve on error
 995 */
 996int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len)
 997{
 998    uint32_t tmp;
 999
1000    if (len > MAX_VM_CMD_PACKAGED_SIZE) {
1001        error_report("%s: Unreasonably large packaged state: %zu",
1002                     __func__, len);
1003        return -1;
1004    }
1005
1006    tmp = cpu_to_be32(len);
1007
1008    trace_qemu_savevm_send_packaged();
1009    qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp);
1010
1011    qemu_put_buffer(f, buf, len);
1012
1013    return 0;
1014}
1015
1016/* Send prior to any postcopy transfer */
1017void qemu_savevm_send_postcopy_advise(QEMUFile *f)
1018{
1019    if (migrate_postcopy_ram()) {
1020        uint64_t tmp[2];
1021        tmp[0] = cpu_to_be64(ram_pagesize_summary());
1022        tmp[1] = cpu_to_be64(qemu_target_page_size());
1023
1024        trace_qemu_savevm_send_postcopy_advise();
1025        qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE,
1026                                 16, (uint8_t *)tmp);
1027    } else {
1028        qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 0, NULL);
1029    }
1030}
1031
1032/* Sent prior to starting the destination running in postcopy, discard pages
1033 * that have already been sent but redirtied on the source.
1034 * CMD_POSTCOPY_RAM_DISCARD consist of:
1035 *      byte   version (0)
1036 *      byte   Length of name field (not including 0)
1037 *  n x byte   RAM block name
1038 *      byte   0 terminator (just for safety)
1039 *  n x        Byte ranges within the named RAMBlock
1040 *      be64   Start of the range
1041 *      be64   Length
1042 *
1043 *  name:  RAMBlock name that these entries are part of
1044 *  len: Number of page entries
1045 *  start_list: 'len' addresses
1046 *  length_list: 'len' addresses
1047 *
1048 */
1049void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
1050                                           uint16_t len,
1051                                           uint64_t *start_list,
1052                                           uint64_t *length_list)
1053{
1054    uint8_t *buf;
1055    uint16_t tmplen;
1056    uint16_t t;
1057    size_t name_len = strlen(name);
1058
1059    trace_qemu_savevm_send_postcopy_ram_discard(name, len);
1060    assert(name_len < 256);
1061    buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len);
1062    buf[0] = postcopy_ram_discard_version;
1063    buf[1] = name_len;
1064    memcpy(buf + 2, name, name_len);
1065    tmplen = 2 + name_len;
1066    buf[tmplen++] = '\0';
1067
1068    for (t = 0; t < len; t++) {
1069        stq_be_p(buf + tmplen, start_list[t]);
1070        tmplen += 8;
1071        stq_be_p(buf + tmplen, length_list[t]);
1072        tmplen += 8;
1073    }
1074    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
1075    g_free(buf);
1076}
1077
1078/* Get the destination into a state where it can receive postcopy data. */
1079void qemu_savevm_send_postcopy_listen(QEMUFile *f)
1080{
1081    trace_savevm_send_postcopy_listen();
1082    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
1083}
1084
1085/* Kick the destination into running */
1086void qemu_savevm_send_postcopy_run(QEMUFile *f)
1087{
1088    trace_savevm_send_postcopy_run();
1089    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
1090}
1091
1092void qemu_savevm_send_postcopy_resume(QEMUFile *f)
1093{
1094    trace_savevm_send_postcopy_resume();
1095    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RESUME, 0, NULL);
1096}
1097
1098void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name)
1099{
1100    size_t len;
1101    char buf[256];
1102
1103    trace_savevm_send_recv_bitmap(block_name);
1104
1105    buf[0] = len = strlen(block_name);
1106    memcpy(buf + 1, block_name, len);
1107
1108    qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf);
1109}
1110
1111bool qemu_savevm_state_blocked(Error **errp)
1112{
1113    SaveStateEntry *se;
1114
1115    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1116        if (se->vmsd && se->vmsd->unmigratable) {
1117            error_setg(errp, "State blocked by non-migratable device '%s'",
1118                       se->idstr);
1119            return true;
1120        }
1121    }
1122    return false;
1123}
1124
1125void qemu_savevm_non_migratable_list(strList **reasons)
1126{
1127    SaveStateEntry *se;
1128
1129    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1130        if (se->vmsd && se->vmsd->unmigratable) {
1131            QAPI_LIST_PREPEND(*reasons,
1132                              g_strdup_printf("non-migratable device: %s",
1133                                              se->idstr));
1134        }
1135    }
1136}
1137
1138void qemu_savevm_state_header(QEMUFile *f)
1139{
1140    trace_savevm_state_header();
1141    qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1142    qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1143
1144    if (migrate_get_current()->send_configuration) {
1145        qemu_put_byte(f, QEMU_VM_CONFIGURATION);
1146        vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
1147    }
1148}
1149
1150bool qemu_savevm_state_guest_unplug_pending(void)
1151{
1152    SaveStateEntry *se;
1153
1154    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1155        if (se->vmsd && se->vmsd->dev_unplug_pending &&
1156            se->vmsd->dev_unplug_pending(se->opaque)) {
1157            return true;
1158        }
1159    }
1160
1161    return false;
1162}
1163
1164void qemu_savevm_state_setup(QEMUFile *f)
1165{
1166    SaveStateEntry *se;
1167    Error *local_err = NULL;
1168    int ret;
1169
1170    trace_savevm_state_setup();
1171    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1172        if (!se->ops || !se->ops->save_setup) {
1173            continue;
1174        }
1175        if (se->ops->is_active) {
1176            if (!se->ops->is_active(se->opaque)) {
1177                continue;
1178            }
1179        }
1180        save_section_header(f, se, QEMU_VM_SECTION_START);
1181
1182        ret = se->ops->save_setup(f, se->opaque);
1183        save_section_footer(f, se);
1184        if (ret < 0) {
1185            qemu_file_set_error(f, ret);
1186            break;
1187        }
1188    }
1189
1190    if (precopy_notify(PRECOPY_NOTIFY_SETUP, &local_err)) {
1191        error_report_err(local_err);
1192    }
1193}
1194
1195int qemu_savevm_state_resume_prepare(MigrationState *s)
1196{
1197    SaveStateEntry *se;
1198    int ret;
1199
1200    trace_savevm_state_resume_prepare();
1201
1202    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1203        if (!se->ops || !se->ops->resume_prepare) {
1204            continue;
1205        }
1206        if (se->ops->is_active) {
1207            if (!se->ops->is_active(se->opaque)) {
1208                continue;
1209            }
1210        }
1211        ret = se->ops->resume_prepare(s, se->opaque);
1212        if (ret < 0) {
1213            return ret;
1214        }
1215    }
1216
1217    return 0;
1218}
1219
1220/*
1221 * this function has three return values:
1222 *   negative: there was one error, and we have -errno.
1223 *   0 : We haven't finished, caller have to go again
1224 *   1 : We have finished, we can go to complete phase
1225 */
1226int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
1227{
1228    SaveStateEntry *se;
1229    int ret = 1;
1230
1231    trace_savevm_state_iterate();
1232    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1233        if (!se->ops || !se->ops->save_live_iterate) {
1234            continue;
1235        }
1236        if (se->ops->is_active &&
1237            !se->ops->is_active(se->opaque)) {
1238            continue;
1239        }
1240        if (se->ops->is_active_iterate &&
1241            !se->ops->is_active_iterate(se->opaque)) {
1242            continue;
1243        }
1244        /*
1245         * In the postcopy phase, any device that doesn't know how to
1246         * do postcopy should have saved it's state in the _complete
1247         * call that's already run, it might get confused if we call
1248         * iterate afterwards.
1249         */
1250        if (postcopy &&
1251            !(se->ops->has_postcopy && se->ops->has_postcopy(se->opaque))) {
1252            continue;
1253        }
1254        if (qemu_file_rate_limit(f)) {
1255            return 0;
1256        }
1257        trace_savevm_section_start(se->idstr, se->section_id);
1258
1259        save_section_header(f, se, QEMU_VM_SECTION_PART);
1260
1261        ret = se->ops->save_live_iterate(f, se->opaque);
1262        trace_savevm_section_end(se->idstr, se->section_id, ret);
1263        save_section_footer(f, se);
1264
1265        if (ret < 0) {
1266            error_report("failed to save SaveStateEntry with id(name): "
1267                         "%d(%s): %d",
1268                         se->section_id, se->idstr, ret);
1269            qemu_file_set_error(f, ret);
1270        }
1271        if (ret <= 0) {
1272            /* Do not proceed to the next vmstate before this one reported
1273               completion of the current stage. This serializes the migration
1274               and reduces the probability that a faster changing state is
1275               synchronized over and over again. */
1276            break;
1277        }
1278    }
1279    return ret;
1280}
1281
1282static bool should_send_vmdesc(void)
1283{
1284    MachineState *machine = MACHINE(qdev_get_machine());
1285    bool in_postcopy = migration_in_postcopy();
1286    return !machine->suppress_vmdesc && !in_postcopy;
1287}
1288
1289/*
1290 * Calls the save_live_complete_postcopy methods
1291 * causing the last few pages to be sent immediately and doing any associated
1292 * cleanup.
1293 * Note postcopy also calls qemu_savevm_state_complete_precopy to complete
1294 * all the other devices, but that happens at the point we switch to postcopy.
1295 */
1296void qemu_savevm_state_complete_postcopy(QEMUFile *f)
1297{
1298    SaveStateEntry *se;
1299    int ret;
1300
1301    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1302        if (!se->ops || !se->ops->save_live_complete_postcopy) {
1303            continue;
1304        }
1305        if (se->ops->is_active) {
1306            if (!se->ops->is_active(se->opaque)) {
1307                continue;
1308            }
1309        }
1310        trace_savevm_section_start(se->idstr, se->section_id);
1311        /* Section type */
1312        qemu_put_byte(f, QEMU_VM_SECTION_END);
1313        qemu_put_be32(f, se->section_id);
1314
1315        ret = se->ops->save_live_complete_postcopy(f, se->opaque);
1316        trace_savevm_section_end(se->idstr, se->section_id, ret);
1317        save_section_footer(f, se);
1318        if (ret < 0) {
1319            qemu_file_set_error(f, ret);
1320            return;
1321        }
1322    }
1323
1324    qemu_put_byte(f, QEMU_VM_EOF);
1325    qemu_fflush(f);
1326}
1327
1328static
1329int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
1330{
1331    SaveStateEntry *se;
1332    int ret;
1333
1334    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1335        if (!se->ops ||
1336            (in_postcopy && se->ops->has_postcopy &&
1337             se->ops->has_postcopy(se->opaque)) ||
1338            !se->ops->save_live_complete_precopy) {
1339            continue;
1340        }
1341
1342        if (se->ops->is_active) {
1343            if (!se->ops->is_active(se->opaque)) {
1344                continue;
1345            }
1346        }
1347        trace_savevm_section_start(se->idstr, se->section_id);
1348
1349        save_section_header(f, se, QEMU_VM_SECTION_END);
1350
1351        ret = se->ops->save_live_complete_precopy(f, se->opaque);
1352        trace_savevm_section_end(se->idstr, se->section_id, ret);
1353        save_section_footer(f, se);
1354        if (ret < 0) {
1355            qemu_file_set_error(f, ret);
1356            return -1;
1357        }
1358    }
1359
1360    return 0;
1361}
1362
1363int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
1364                                                    bool in_postcopy,
1365                                                    bool inactivate_disks)
1366{
1367    g_autoptr(JSONWriter) vmdesc = NULL;
1368    int vmdesc_len;
1369    SaveStateEntry *se;
1370    int ret;
1371
1372    vmdesc = json_writer_new(false);
1373    json_writer_start_object(vmdesc, NULL);
1374    json_writer_int64(vmdesc, "page_size", qemu_target_page_size());
1375    json_writer_start_array(vmdesc, "devices");
1376    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1377
1378        if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1379            continue;
1380        }
1381        if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1382            trace_savevm_section_skip(se->idstr, se->section_id);
1383            continue;
1384        }
1385
1386        trace_savevm_section_start(se->idstr, se->section_id);
1387
1388        json_writer_start_object(vmdesc, NULL);
1389        json_writer_str(vmdesc, "name", se->idstr);
1390        json_writer_int64(vmdesc, "instance_id", se->instance_id);
1391
1392        save_section_header(f, se, QEMU_VM_SECTION_FULL);
1393        ret = vmstate_save(f, se, vmdesc);
1394        if (ret) {
1395            qemu_file_set_error(f, ret);
1396            return ret;
1397        }
1398        trace_savevm_section_end(se->idstr, se->section_id, 0);
1399        save_section_footer(f, se);
1400
1401        json_writer_end_object(vmdesc);
1402    }
1403
1404    if (inactivate_disks) {
1405        /* Inactivate before sending QEMU_VM_EOF so that the
1406         * bdrv_activate_all() on the other end won't fail. */
1407        ret = bdrv_inactivate_all();
1408        if (ret) {
1409            error_report("%s: bdrv_inactivate_all() failed (%d)",
1410                         __func__, ret);
1411            qemu_file_set_error(f, ret);
1412            return ret;
1413        }
1414    }
1415    if (!in_postcopy) {
1416        /* Postcopy stream will still be going */
1417        qemu_put_byte(f, QEMU_VM_EOF);
1418    }
1419
1420    json_writer_end_array(vmdesc);
1421    json_writer_end_object(vmdesc);
1422    vmdesc_len = strlen(json_writer_get(vmdesc));
1423
1424    if (should_send_vmdesc()) {
1425        qemu_put_byte(f, QEMU_VM_VMDESCRIPTION);
1426        qemu_put_be32(f, vmdesc_len);
1427        qemu_put_buffer(f, (uint8_t *)json_writer_get(vmdesc), vmdesc_len);
1428    }
1429
1430    return 0;
1431}
1432
1433int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
1434                                       bool inactivate_disks)
1435{
1436    int ret;
1437    Error *local_err = NULL;
1438    bool in_postcopy = migration_in_postcopy();
1439
1440    if (precopy_notify(PRECOPY_NOTIFY_COMPLETE, &local_err)) {
1441        error_report_err(local_err);
1442    }
1443
1444    trace_savevm_state_complete_precopy();
1445
1446    cpu_synchronize_all_states();
1447
1448    if (!in_postcopy || iterable_only) {
1449        ret = qemu_savevm_state_complete_precopy_iterable(f, in_postcopy);
1450        if (ret) {
1451            return ret;
1452        }
1453    }
1454
1455    if (iterable_only) {
1456        goto flush;
1457    }
1458
1459    ret = qemu_savevm_state_complete_precopy_non_iterable(f, in_postcopy,
1460                                                          inactivate_disks);
1461    if (ret) {
1462        return ret;
1463    }
1464
1465flush:
1466    qemu_fflush(f);
1467    return 0;
1468}
1469
1470/* Give an estimate of the amount left to be transferred,
1471 * the result is split into the amount for units that can and
1472 * for units that can't do postcopy.
1473 */
1474void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
1475                               uint64_t *res_precopy_only,
1476                               uint64_t *res_compatible,
1477                               uint64_t *res_postcopy_only)
1478{
1479    SaveStateEntry *se;
1480
1481    *res_precopy_only = 0;
1482    *res_compatible = 0;
1483    *res_postcopy_only = 0;
1484
1485
1486    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1487        if (!se->ops || !se->ops->save_live_pending) {
1488            continue;
1489        }
1490        if (se->ops->is_active) {
1491            if (!se->ops->is_active(se->opaque)) {
1492                continue;
1493            }
1494        }
1495        se->ops->save_live_pending(f, se->opaque, threshold_size,
1496                                   res_precopy_only, res_compatible,
1497                                   res_postcopy_only);
1498    }
1499}
1500
1501void qemu_savevm_state_cleanup(void)
1502{
1503    SaveStateEntry *se;
1504    Error *local_err = NULL;
1505
1506    if (precopy_notify(PRECOPY_NOTIFY_CLEANUP, &local_err)) {
1507        error_report_err(local_err);
1508    }
1509
1510    trace_savevm_state_cleanup();
1511    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1512        if (se->ops && se->ops->save_cleanup) {
1513            se->ops->save_cleanup(se->opaque);
1514        }
1515    }
1516}
1517
1518static int qemu_savevm_state(QEMUFile *f, Error **errp)
1519{
1520    int ret;
1521    MigrationState *ms = migrate_get_current();
1522    MigrationStatus status;
1523
1524    if (migration_is_running(ms->state)) {
1525        error_setg(errp, QERR_MIGRATION_ACTIVE);
1526        return -EINVAL;
1527    }
1528
1529    if (migrate_use_block()) {
1530        error_setg(errp, "Block migration and snapshots are incompatible");
1531        return -EINVAL;
1532    }
1533
1534    migrate_init(ms);
1535    memset(&ram_counters, 0, sizeof(ram_counters));
1536    memset(&compression_counters, 0, sizeof(compression_counters));
1537    ms->to_dst_file = f;
1538
1539    qemu_mutex_unlock_iothread();
1540    qemu_savevm_state_header(f);
1541    qemu_savevm_state_setup(f);
1542    qemu_mutex_lock_iothread();
1543
1544    while (qemu_file_get_error(f) == 0) {
1545        if (qemu_savevm_state_iterate(f, false) > 0) {
1546            break;
1547        }
1548    }
1549
1550    ret = qemu_file_get_error(f);
1551    if (ret == 0) {
1552        qemu_savevm_state_complete_precopy(f, false, false);
1553        ret = qemu_file_get_error(f);
1554    }
1555    qemu_savevm_state_cleanup();
1556    if (ret != 0) {
1557        error_setg_errno(errp, -ret, "Error while writing VM state");
1558    }
1559
1560    if (ret != 0) {
1561        status = MIGRATION_STATUS_FAILED;
1562    } else {
1563        status = MIGRATION_STATUS_COMPLETED;
1564    }
1565    migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, status);
1566
1567    /* f is outer parameter, it should not stay in global migration state after
1568     * this function finished */
1569    ms->to_dst_file = NULL;
1570
1571    return ret;
1572}
1573
1574void qemu_savevm_live_state(QEMUFile *f)
1575{
1576    /* save QEMU_VM_SECTION_END section */
1577    qemu_savevm_state_complete_precopy(f, true, false);
1578    qemu_put_byte(f, QEMU_VM_EOF);
1579}
1580
1581int qemu_save_device_state(QEMUFile *f)
1582{
1583    SaveStateEntry *se;
1584
1585    if (!migration_in_colo_state()) {
1586        qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1587        qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1588    }
1589    cpu_synchronize_all_states();
1590
1591    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1592        int ret;
1593
1594        if (se->is_ram) {
1595            continue;
1596        }
1597        if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1598            continue;
1599        }
1600        if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1601            continue;
1602        }
1603
1604        save_section_header(f, se, QEMU_VM_SECTION_FULL);
1605
1606        ret = vmstate_save(f, se, NULL);
1607        if (ret) {
1608            return ret;
1609        }
1610
1611        save_section_footer(f, se);
1612    }
1613
1614    qemu_put_byte(f, QEMU_VM_EOF);
1615
1616    return qemu_file_get_error(f);
1617}
1618
1619static SaveStateEntry *find_se(const char *idstr, uint32_t instance_id)
1620{
1621    SaveStateEntry *se;
1622
1623    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1624        if (!strcmp(se->idstr, idstr) &&
1625            (instance_id == se->instance_id ||
1626             instance_id == se->alias_id))
1627            return se;
1628        /* Migrating from an older version? */
1629        if (strstr(se->idstr, idstr) && se->compat) {
1630            if (!strcmp(se->compat->idstr, idstr) &&
1631                (instance_id == se->compat->instance_id ||
1632                 instance_id == se->alias_id))
1633                return se;
1634        }
1635    }
1636    return NULL;
1637}
1638
/* Positive exit codes used by the loadvm command handling */
enum LoadVMExitCodes {
    /* Allow a command to quit all layers of nested loadvm loops */
    LOADVM_QUIT = 1,
};
1643
1644/* ------ incoming postcopy messages ------ */
1645/* 'advise' arrives before any transfers just to tell us that a postcopy
1646 * *might* happen - it might be skipped if precopy transferred everything
1647 * quickly.
1648 */
1649static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
1650                                         uint16_t len)
1651{
1652    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
1653    uint64_t remote_pagesize_summary, local_pagesize_summary, remote_tps;
1654    size_t page_size = qemu_target_page_size();
1655    Error *local_err = NULL;
1656
1657    trace_loadvm_postcopy_handle_advise();
1658    if (ps != POSTCOPY_INCOMING_NONE) {
1659        error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
1660        return -1;
1661    }
1662
1663    switch (len) {
1664    case 0:
1665        if (migrate_postcopy_ram()) {
1666            error_report("RAM postcopy is enabled but have 0 byte advise");
1667            return -EINVAL;
1668        }
1669        return 0;
1670    case 8 + 8:
1671        if (!migrate_postcopy_ram()) {
1672            error_report("RAM postcopy is disabled but have 16 byte advise");
1673            return -EINVAL;
1674        }
1675        break;
1676    default:
1677        error_report("CMD_POSTCOPY_ADVISE invalid length (%d)", len);
1678        return -EINVAL;
1679    }
1680
1681    if (!postcopy_ram_supported_by_host(mis)) {
1682        postcopy_state_set(POSTCOPY_INCOMING_NONE);
1683        return -1;
1684    }
1685
1686    remote_pagesize_summary = qemu_get_be64(mis->from_src_file);
1687    local_pagesize_summary = ram_pagesize_summary();
1688
1689    if (remote_pagesize_summary != local_pagesize_summary)  {
1690        /*
1691         * This detects two potential causes of mismatch:
1692         *   a) A mismatch in host page sizes
1693         *      Some combinations of mismatch are probably possible but it gets
1694         *      a bit more complicated.  In particular we need to place whole
1695         *      host pages on the dest at once, and we need to ensure that we
1696         *      handle dirtying to make sure we never end up sending part of
1697         *      a hostpage on it's own.
1698         *   b) The use of different huge page sizes on source/destination
1699         *      a more fine grain test is performed during RAM block migration
1700         *      but this test here causes a nice early clear failure, and
1701         *      also fails when passed to an older qemu that doesn't
1702         *      do huge pages.
1703         */
1704        error_report("Postcopy needs matching RAM page sizes (s=%" PRIx64
1705                                                             " d=%" PRIx64 ")",
1706                     remote_pagesize_summary, local_pagesize_summary);
1707        return -1;
1708    }
1709
1710    remote_tps = qemu_get_be64(mis->from_src_file);
1711    if (remote_tps != page_size) {
1712        /*
1713         * Again, some differences could be dealt with, but for now keep it
1714         * simple.
1715         */
1716        error_report("Postcopy needs matching target page sizes (s=%d d=%zd)",
1717                     (int)remote_tps, page_size);
1718        return -1;
1719    }
1720
1721    if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_ADVISE, &local_err)) {
1722        error_report_err(local_err);
1723        return -1;
1724    }
1725
1726    if (ram_postcopy_incoming_init(mis)) {
1727        return -1;
1728    }
1729
1730    return 0;
1731}
1732
1733/* After postcopy we will be told to throw some pages away since they're
1734 * dirty and will have to be demand fetched.  Must happen before CPU is
1735 * started.
1736 * There can be 0..many of these messages, each encoding multiple pages.
1737 */
1738static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
1739                                              uint16_t len)
1740{
1741    int tmp;
1742    char ramid[256];
1743    PostcopyState ps = postcopy_state_get();
1744
1745    trace_loadvm_postcopy_ram_handle_discard();
1746
1747    switch (ps) {
1748    case POSTCOPY_INCOMING_ADVISE:
1749        /* 1st discard */
1750        tmp = postcopy_ram_prepare_discard(mis);
1751        if (tmp) {
1752            return tmp;
1753        }
1754        break;
1755
1756    case POSTCOPY_INCOMING_DISCARD:
1757        /* Expected state */
1758        break;
1759
1760    default:
1761        error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
1762                     ps);
1763        return -1;
1764    }
1765    /* We're expecting a
1766     *    Version (0)
1767     *    a RAM ID string (length byte, name, 0 term)
1768     *    then at least 1 16 byte chunk
1769    */
1770    if (len < (1 + 1 + 1 + 1 + 2 * 8)) {
1771        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1772        return -1;
1773    }
1774
1775    tmp = qemu_get_byte(mis->from_src_file);
1776    if (tmp != postcopy_ram_discard_version) {
1777        error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
1778        return -1;
1779    }
1780
1781    if (!qemu_get_counted_string(mis->from_src_file, ramid)) {
1782        error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
1783        return -1;
1784    }
1785    tmp = qemu_get_byte(mis->from_src_file);
1786    if (tmp != 0) {
1787        error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp);
1788        return -1;
1789    }
1790
1791    len -= 3 + strlen(ramid);
1792    if (len % 16) {
1793        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1794        return -1;
1795    }
1796    trace_loadvm_postcopy_ram_handle_discard_header(ramid, len);
1797    while (len) {
1798        uint64_t start_addr, block_length;
1799        start_addr = qemu_get_be64(mis->from_src_file);
1800        block_length = qemu_get_be64(mis->from_src_file);
1801
1802        len -= 16;
1803        int ret = ram_discard_range(ramid, start_addr, block_length);
1804        if (ret) {
1805            return ret;
1806        }
1807    }
1808    trace_loadvm_postcopy_ram_handle_discard_end();
1809
1810    return 0;
1811}
1812
1813/*
1814 * Triggered by a postcopy_listen command; this thread takes over reading
1815 * the input stream, leaving the main thread free to carry on loading the rest
1816 * of the device state (from RAM).
1817 * (TODO:This could do with being in a postcopy file - but there again it's
1818 * just another input loop, not that postcopy specific)
1819 */
1820static void *postcopy_ram_listen_thread(void *opaque)
1821{
1822    MigrationIncomingState *mis = migration_incoming_get_current();
1823    QEMUFile *f = mis->from_src_file;
1824    int load_res;
1825    MigrationState *migr = migrate_get_current();
1826
1827    object_ref(OBJECT(migr));
1828
1829    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
1830                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);
1831    qemu_sem_post(&mis->thread_sync_sem);
1832    trace_postcopy_ram_listen_thread_start();
1833
1834    rcu_register_thread();
1835    /*
1836     * Because we're a thread and not a coroutine we can't yield
1837     * in qemu_file, and thus we must be blocking now.
1838     */
1839    qemu_file_set_blocking(f, true);
1840    load_res = qemu_loadvm_state_main(f, mis);
1841
1842    /*
1843     * This is tricky, but, mis->from_src_file can change after it
1844     * returns, when postcopy recovery happened. In the future, we may
1845     * want a wrapper for the QEMUFile handle.
1846     */
1847    f = mis->from_src_file;
1848
1849    /* And non-blocking again so we don't block in any cleanup */
1850    qemu_file_set_blocking(f, false);
1851
1852    trace_postcopy_ram_listen_thread_exit();
1853    if (load_res < 0) {
1854        qemu_file_set_error(f, load_res);
1855        dirty_bitmap_mig_cancel_incoming();
1856        if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
1857            !migrate_postcopy_ram() && migrate_dirty_bitmaps())
1858        {
1859            error_report("%s: loadvm failed during postcopy: %d. All states "
1860                         "are migrated except dirty bitmaps. Some dirty "
1861                         "bitmaps may be lost, and present migrated dirty "
1862                         "bitmaps are correctly migrated and valid.",
1863                         __func__, load_res);
1864            load_res = 0; /* prevent further exit() */
1865        } else {
1866            error_report("%s: loadvm failed: %d", __func__, load_res);
1867            migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1868                                           MIGRATION_STATUS_FAILED);
1869        }
1870    }
1871    if (load_res >= 0) {
1872        /*
1873         * This looks good, but it's possible that the device loading in the
1874         * main thread hasn't finished yet, and so we might not be in 'RUN'
1875         * state yet; wait for the end of the main thread.
1876         */
1877        qemu_event_wait(&mis->main_thread_load_event);
1878    }
1879    postcopy_ram_incoming_cleanup(mis);
1880
1881    if (load_res < 0) {
1882        /*
1883         * If something went wrong then we have a bad state so exit;
1884         * depending how far we got it might be possible at this point
1885         * to leave the guest running and fire MCEs for pages that never
1886         * arrived as a desperate recovery step.
1887         */
1888        rcu_unregister_thread();
1889        exit(EXIT_FAILURE);
1890    }
1891
1892    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1893                                   MIGRATION_STATUS_COMPLETED);
1894    /*
1895     * If everything has worked fine, then the main thread has waited
1896     * for us to start, and we're the last use of the mis.
1897     * (If something broke then qemu will have to exit anyway since it's
1898     * got a bad migration state).
1899     */
1900    migration_incoming_state_destroy();
1901    qemu_loadvm_state_cleanup();
1902
1903    rcu_unregister_thread();
1904    mis->have_listen_thread = false;
1905    postcopy_state_set(POSTCOPY_INCOMING_END);
1906
1907    object_unref(OBJECT(migr));
1908
1909    return NULL;
1910}
1911
1912/* After this message we must be able to immediately receive postcopy data */
1913static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
1914{
1915    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
1916    Error *local_err = NULL;
1917
1918    trace_loadvm_postcopy_handle_listen("enter");
1919
1920    if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
1921        error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
1922        return -1;
1923    }
1924    if (ps == POSTCOPY_INCOMING_ADVISE) {
1925        /*
1926         * A rare case, we entered listen without having to do any discards,
1927         * so do the setup that's normally done at the time of the 1st discard.
1928         */
1929        if (migrate_postcopy_ram()) {
1930            postcopy_ram_prepare_discard(mis);
1931        }
1932    }
1933
1934    trace_loadvm_postcopy_handle_listen("after discard");
1935
1936    /*
1937     * Sensitise RAM - can now generate requests for blocks that don't exist
1938     * However, at this point the CPU shouldn't be running, and the IO
1939     * shouldn't be doing anything yet so don't actually expect requests
1940     */
1941    if (migrate_postcopy_ram()) {
1942        if (postcopy_ram_incoming_setup(mis)) {
1943            postcopy_ram_incoming_cleanup(mis);
1944            return -1;
1945        }
1946    }
1947
1948    trace_loadvm_postcopy_handle_listen("after uffd");
1949
1950    if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_LISTEN, &local_err)) {
1951        error_report_err(local_err);
1952        return -1;
1953    }
1954
1955    mis->have_listen_thread = true;
1956    postcopy_thread_create(mis, &mis->listen_thread, "postcopy/listen",
1957                           postcopy_ram_listen_thread, QEMU_THREAD_DETACHED);
1958    trace_loadvm_postcopy_handle_listen("return");
1959
1960    return 0;
1961}
1962
1963static void loadvm_postcopy_handle_run_bh(void *opaque)
1964{
1965    Error *local_err = NULL;
1966    MigrationIncomingState *mis = opaque;
1967
1968    trace_loadvm_postcopy_handle_run_bh("enter");
1969
1970    /* TODO we should move all of this lot into postcopy_ram.c or a shared code
1971     * in migration.c
1972     */
1973    cpu_synchronize_all_post_init();
1974
1975    trace_loadvm_postcopy_handle_run_bh("after cpu sync");
1976
1977    qemu_announce_self(&mis->announce_timer, migrate_announce_params());
1978
1979    trace_loadvm_postcopy_handle_run_bh("after announce");
1980
1981    /* Make sure all file formats throw away their mutable metadata.
1982     * If we get an error here, just don't restart the VM yet. */
1983    bdrv_activate_all(&local_err);
1984    if (local_err) {
1985        error_report_err(local_err);
1986        local_err = NULL;
1987        autostart = false;
1988    }
1989
1990    trace_loadvm_postcopy_handle_run_bh("after invalidate cache");
1991
1992    dirty_bitmap_mig_before_vm_start();
1993
1994    if (autostart) {
1995        /* Hold onto your hats, starting the CPU */
1996        vm_start();
1997    } else {
1998        /* leave it paused and let management decide when to start the CPU */
1999        runstate_set(RUN_STATE_PAUSED);
2000    }
2001
2002    qemu_bh_delete(mis->bh);
2003
2004    trace_loadvm_postcopy_handle_run_bh("return");
2005}
2006
2007/* After all discards we can start running and asking for pages */
2008static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
2009{
2010    PostcopyState ps = postcopy_state_get();
2011
2012    trace_loadvm_postcopy_handle_run();
2013    if (ps != POSTCOPY_INCOMING_LISTENING) {
2014        error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
2015        return -1;
2016    }
2017
2018    postcopy_state_set(POSTCOPY_INCOMING_RUNNING);
2019    mis->bh = qemu_bh_new(loadvm_postcopy_handle_run_bh, mis);
2020    qemu_bh_schedule(mis->bh);
2021
2022    /* We need to finish reading the stream from the package
2023     * and also stop reading anything more from the stream that loaded the
2024     * package (since it's now being read by the listener thread).
2025     * LOADVM_QUIT will quit all the layers of nested loadvm loops.
2026     */
2027    return LOADVM_QUIT;
2028}
2029
2030/* We must be with page_request_mutex held */
2031static gboolean postcopy_sync_page_req(gpointer key, gpointer value,
2032                                       gpointer data)
2033{
2034    MigrationIncomingState *mis = data;
2035    void *host_addr = (void *) key;
2036    ram_addr_t rb_offset;
2037    RAMBlock *rb;
2038    int ret;
2039
2040    rb = qemu_ram_block_from_host(host_addr, true, &rb_offset);
2041    if (!rb) {
2042        /*
2043         * This should _never_ happen.  However be nice for a migrating VM to
2044         * not crash/assert.  Post an error (note: intended to not use *_once
2045         * because we do want to see all the illegal addresses; and this can
2046         * never be triggered by the guest so we're safe) and move on next.
2047         */
2048        error_report("%s: illegal host addr %p", __func__, host_addr);
2049        /* Try the next entry */
2050        return FALSE;
2051    }
2052
2053    ret = migrate_send_rp_message_req_pages(mis, rb, rb_offset);
2054    if (ret) {
2055        /* Please refer to above comment. */
2056        error_report("%s: send rp message failed for addr %p",
2057                     __func__, host_addr);
2058        return FALSE;
2059    }
2060
2061    trace_postcopy_page_req_sync(host_addr);
2062
2063    return FALSE;
2064}
2065
2066static void migrate_send_rp_req_pages_pending(MigrationIncomingState *mis)
2067{
2068    WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
2069        g_tree_foreach(mis->page_requested, postcopy_sync_page_req, mis);
2070    }
2071}
2072
2073static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis)
2074{
2075    if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
2076        error_report("%s: illegal resume received", __func__);
2077        /* Don't fail the load, only for this. */
2078        return 0;
2079    }
2080
2081    /*
2082     * Reset the last_rb before we resend any page req to source again, since
2083     * the source should have it reset already.
2084     */
2085    mis->last_rb = NULL;
2086
2087    /*
2088     * This means source VM is ready to resume the postcopy migration.
2089     */
2090    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
2091                      MIGRATION_STATUS_POSTCOPY_ACTIVE);
2092
2093    trace_loadvm_postcopy_handle_resume();
2094
2095    /* Tell source that "we are ready" */
2096    migrate_send_rp_resume_ack(mis, MIGRATION_RESUME_ACK_VALUE);
2097
2098    /*
2099     * After a postcopy recovery, the source should have lost the postcopy
2100     * queue, or potentially the requested pages could have been lost during
2101     * the network down phase.  Let's re-sync with the source VM by re-sending
2102     * all the pending pages that we eagerly need, so these threads won't get
2103     * blocked too long due to the recovery.
2104     *
2105     * Without this procedure, the faulted destination VM threads (waiting for
2106     * page requests right before the postcopy is interrupted) can keep hanging
2107     * until the pages are sent by the source during the background copying of
2108     * pages, or another thread faulted on the same address accidentally.
2109     */
2110    migrate_send_rp_req_pages_pending(mis);
2111
2112    /*
2113     * It's time to switch state and release the fault thread to continue
2114     * service page faults.  Note that this should be explicitly after the
2115     * above call to migrate_send_rp_req_pages_pending().  In short:
2116     * migrate_send_rp_message_req_pages() is not thread safe, yet.
2117     */
2118    qemu_sem_post(&mis->postcopy_pause_sem_fault);
2119
2120    if (migrate_postcopy_preempt()) {
2121        /* The channel should already be setup again; make sure of it */
2122        assert(mis->postcopy_qemufile_dst);
2123        /* Kick the fast ram load thread too */
2124        qemu_sem_post(&mis->postcopy_pause_sem_fast_load);
2125    }
2126
2127    return 0;
2128}
2129
2130/**
2131 * Immediately following this command is a blob of data containing an embedded
2132 * chunk of migration stream; read it and load it.
2133 *
2134 * @mis: Incoming state
2135 * @length: Length of packaged data to read
2136 *
2137 * Returns: Negative values on error
2138 *
2139 */
2140static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
2141{
2142    int ret;
2143    size_t length;
2144    QIOChannelBuffer *bioc;
2145
2146    length = qemu_get_be32(mis->from_src_file);
2147    trace_loadvm_handle_cmd_packaged(length);
2148
2149    if (length > MAX_VM_CMD_PACKAGED_SIZE) {
2150        error_report("Unreasonably large packaged state: %zu", length);
2151        return -1;
2152    }
2153
2154    bioc = qio_channel_buffer_new(length);
2155    qio_channel_set_name(QIO_CHANNEL(bioc), "migration-loadvm-buffer");
2156    ret = qemu_get_buffer(mis->from_src_file,
2157                          bioc->data,
2158                          length);
2159    if (ret != length) {
2160        object_unref(OBJECT(bioc));
2161        error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%zu",
2162                     ret, length);
2163        return (ret < 0) ? ret : -EAGAIN;
2164    }
2165    bioc->usage += length;
2166    trace_loadvm_handle_cmd_packaged_received(ret);
2167
2168    QEMUFile *packf = qemu_file_new_input(QIO_CHANNEL(bioc));
2169
2170    ret = qemu_loadvm_state_main(packf, mis);
2171    trace_loadvm_handle_cmd_packaged_main(ret);
2172    qemu_fclose(packf);
2173    object_unref(OBJECT(bioc));
2174
2175    return ret;
2176}
2177
2178/*
2179 * Handle request that source requests for recved_bitmap on
2180 * destination. Payload format:
2181 *
2182 * len (1 byte) + ramblock_name (<255 bytes)
2183 */
2184static int loadvm_handle_recv_bitmap(MigrationIncomingState *mis,
2185                                     uint16_t len)
2186{
2187    QEMUFile *file = mis->from_src_file;
2188    RAMBlock *rb;
2189    char block_name[256];
2190    size_t cnt;
2191
2192    cnt = qemu_get_counted_string(file, block_name);
2193    if (!cnt) {
2194        error_report("%s: failed to read block name", __func__);
2195        return -EINVAL;
2196    }
2197
2198    /* Validate before using the data */
2199    if (qemu_file_get_error(file)) {
2200        return qemu_file_get_error(file);
2201    }
2202
2203    if (len != cnt + 1) {
2204        error_report("%s: invalid payload length (%d)", __func__, len);
2205        return -EINVAL;
2206    }
2207
2208    rb = qemu_ram_block_by_name(block_name);
2209    if (!rb) {
2210        error_report("%s: block '%s' not found", __func__, block_name);
2211        return -EINVAL;
2212    }
2213
2214    migrate_send_rp_recv_bitmap(mis, block_name);
2215
2216    trace_loadvm_handle_recv_bitmap(block_name);
2217
2218    return 0;
2219}
2220
2221static int loadvm_process_enable_colo(MigrationIncomingState *mis)
2222{
2223    int ret = migration_incoming_enable_colo();
2224
2225    if (!ret) {
2226        ret = colo_init_ram_cache();
2227        if (ret) {
2228            migration_incoming_disable_colo();
2229        }
2230    }
2231    return ret;
2232}
2233
2234/*
2235 * Process an incoming 'QEMU_VM_COMMAND'
2236 * 0           just a normal return
2237 * LOADVM_QUIT All good, but exit the loop
2238 * <0          Error
2239 */
2240static int loadvm_process_command(QEMUFile *f)
2241{
2242    MigrationIncomingState *mis = migration_incoming_get_current();
2243    uint16_t cmd;
2244    uint16_t len;
2245    uint32_t tmp32;
2246
2247    cmd = qemu_get_be16(f);
2248    len = qemu_get_be16(f);
2249
2250    /* Check validity before continue processing of cmds */
2251    if (qemu_file_get_error(f)) {
2252        return qemu_file_get_error(f);
2253    }
2254
2255    if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
2256        error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
2257        return -EINVAL;
2258    }
2259
2260    trace_loadvm_process_command(mig_cmd_args[cmd].name, len);
2261
2262    if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {
2263        error_report("%s received with bad length - expecting %zu, got %d",
2264                     mig_cmd_args[cmd].name,
2265                     (size_t)mig_cmd_args[cmd].len, len);
2266        return -ERANGE;
2267    }
2268
2269    switch (cmd) {
2270    case MIG_CMD_OPEN_RETURN_PATH:
2271        if (mis->to_src_file) {
2272            error_report("CMD_OPEN_RETURN_PATH called when RP already open");
2273            /* Not really a problem, so don't give up */
2274            return 0;
2275        }
2276        mis->to_src_file = qemu_file_get_return_path(f);
2277        if (!mis->to_src_file) {
2278            error_report("CMD_OPEN_RETURN_PATH failed");
2279            return -1;
2280        }
2281        break;
2282
2283    case MIG_CMD_PING:
2284        tmp32 = qemu_get_be32(f);
2285        trace_loadvm_process_command_ping(tmp32);
2286        if (!mis->to_src_file) {
2287            error_report("CMD_PING (0x%x) received with no return path",
2288                         tmp32);
2289            return -1;
2290        }
2291        migrate_send_rp_pong(mis, tmp32);
2292        break;
2293
2294    case MIG_CMD_PACKAGED:
2295        return loadvm_handle_cmd_packaged(mis);
2296
2297    case MIG_CMD_POSTCOPY_ADVISE:
2298        return loadvm_postcopy_handle_advise(mis, len);
2299
2300    case MIG_CMD_POSTCOPY_LISTEN:
2301        return loadvm_postcopy_handle_listen(mis);
2302
2303    case MIG_CMD_POSTCOPY_RUN:
2304        return loadvm_postcopy_handle_run(mis);
2305
2306    case MIG_CMD_POSTCOPY_RAM_DISCARD:
2307        return loadvm_postcopy_ram_handle_discard(mis, len);
2308
2309    case MIG_CMD_POSTCOPY_RESUME:
2310        return loadvm_postcopy_handle_resume(mis);
2311
2312    case MIG_CMD_RECV_BITMAP:
2313        return loadvm_handle_recv_bitmap(mis, len);
2314
2315    case MIG_CMD_ENABLE_COLO:
2316        return loadvm_process_enable_colo(mis);
2317    }
2318
2319    return 0;
2320}
2321
2322/*
2323 * Read a footer off the wire and check that it matches the expected section
2324 *
2325 * Returns: true if the footer was good
2326 *          false if there is a problem (and calls error_report to say why)
2327 */
2328static bool check_section_footer(QEMUFile *f, SaveStateEntry *se)
2329{
2330    int ret;
2331    uint8_t read_mark;
2332    uint32_t read_section_id;
2333
2334    if (!migrate_get_current()->send_section_footer) {
2335        /* No footer to check */
2336        return true;
2337    }
2338
2339    read_mark = qemu_get_byte(f);
2340
2341    ret = qemu_file_get_error(f);
2342    if (ret) {
2343        error_report("%s: Read section footer failed: %d",
2344                     __func__, ret);
2345        return false;
2346    }
2347
2348    if (read_mark != QEMU_VM_SECTION_FOOTER) {
2349        error_report("Missing section footer for %s", se->idstr);
2350        return false;
2351    }
2352
2353    read_section_id = qemu_get_be32(f);
2354    if (read_section_id != se->load_section_id) {
2355        error_report("Mismatched section id in footer for %s -"
2356                     " read 0x%x expected 0x%x",
2357                     se->idstr, read_section_id, se->load_section_id);
2358        return false;
2359    }
2360
2361    /* All good */
2362    return true;
2363}
2364
2365static int
2366qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis)
2367{
2368    uint32_t instance_id, version_id, section_id;
2369    SaveStateEntry *se;
2370    char idstr[256];
2371    int ret;
2372
2373    /* Read section start */
2374    section_id = qemu_get_be32(f);
2375    if (!qemu_get_counted_string(f, idstr)) {
2376        error_report("Unable to read ID string for section %u",
2377                     section_id);
2378        return -EINVAL;
2379    }
2380    instance_id = qemu_get_be32(f);
2381    version_id = qemu_get_be32(f);
2382
2383    ret = qemu_file_get_error(f);
2384    if (ret) {
2385        error_report("%s: Failed to read instance/version ID: %d",
2386                     __func__, ret);
2387        return ret;
2388    }
2389
2390    trace_qemu_loadvm_state_section_startfull(section_id, idstr,
2391            instance_id, version_id);
2392    /* Find savevm section */
2393    se = find_se(idstr, instance_id);
2394    if (se == NULL) {
2395        error_report("Unknown savevm section or instance '%s' %"PRIu32". "
2396                     "Make sure that your current VM setup matches your "
2397                     "saved VM setup, including any hotplugged devices",
2398                     idstr, instance_id);
2399        return -EINVAL;
2400    }
2401
2402    /* Validate version */
2403    if (version_id > se->version_id) {
2404        error_report("savevm: unsupported version %d for '%s' v%d",
2405                     version_id, idstr, se->version_id);
2406        return -EINVAL;
2407    }
2408    se->load_version_id = version_id;
2409    se->load_section_id = section_id;
2410
2411    /* Validate if it is a device's state */
2412    if (xen_enabled() && se->is_ram) {
2413        error_report("loadvm: %s RAM loading not allowed on Xen", idstr);
2414        return -EINVAL;
2415    }
2416
2417    ret = vmstate_load(f, se);
2418    if (ret < 0) {
2419        error_report("error while loading state for instance 0x%"PRIx32" of"
2420                     " device '%s'", instance_id, idstr);
2421        return ret;
2422    }
2423    if (!check_section_footer(f, se)) {
2424        return -EINVAL;
2425    }
2426
2427    return 0;
2428}
2429
2430static int
2431qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis)
2432{
2433    uint32_t section_id;
2434    SaveStateEntry *se;
2435    int ret;
2436
2437    section_id = qemu_get_be32(f);
2438
2439    ret = qemu_file_get_error(f);
2440    if (ret) {
2441        error_report("%s: Failed to read section ID: %d",
2442                     __func__, ret);
2443        return ret;
2444    }
2445
2446    trace_qemu_loadvm_state_section_partend(section_id);
2447    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2448        if (se->load_section_id == section_id) {
2449            break;
2450        }
2451    }
2452    if (se == NULL) {
2453        error_report("Unknown savevm section %d", section_id);
2454        return -EINVAL;
2455    }
2456
2457    ret = vmstate_load(f, se);
2458    if (ret < 0) {
2459        error_report("error while loading state section id %d(%s)",
2460                     section_id, se->idstr);
2461        return ret;
2462    }
2463    if (!check_section_footer(f, se)) {
2464        return -EINVAL;
2465    }
2466
2467    return 0;
2468}
2469
2470static int qemu_loadvm_state_header(QEMUFile *f)
2471{
2472    unsigned int v;
2473    int ret;
2474
2475    v = qemu_get_be32(f);
2476    if (v != QEMU_VM_FILE_MAGIC) {
2477        error_report("Not a migration stream");
2478        return -EINVAL;
2479    }
2480
2481    v = qemu_get_be32(f);
2482    if (v == QEMU_VM_FILE_VERSION_COMPAT) {
2483        error_report("SaveVM v2 format is obsolete and don't work anymore");
2484        return -ENOTSUP;
2485    }
2486    if (v != QEMU_VM_FILE_VERSION) {
2487        error_report("Unsupported migration stream version");
2488        return -ENOTSUP;
2489    }
2490
2491    if (migrate_get_current()->send_configuration) {
2492        if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
2493            error_report("Configuration section missing");
2494            qemu_loadvm_state_cleanup();
2495            return -EINVAL;
2496        }
2497        ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
2498
2499        if (ret) {
2500            qemu_loadvm_state_cleanup();
2501            return ret;
2502        }
2503    }
2504    return 0;
2505}
2506
2507static int qemu_loadvm_state_setup(QEMUFile *f)
2508{
2509    SaveStateEntry *se;
2510    int ret;
2511
2512    trace_loadvm_state_setup();
2513    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2514        if (!se->ops || !se->ops->load_setup) {
2515            continue;
2516        }
2517        if (se->ops->is_active) {
2518            if (!se->ops->is_active(se->opaque)) {
2519                continue;
2520            }
2521        }
2522
2523        ret = se->ops->load_setup(f, se->opaque);
2524        if (ret < 0) {
2525            qemu_file_set_error(f, ret);
2526            error_report("Load state of device %s failed", se->idstr);
2527            return ret;
2528        }
2529    }
2530    return 0;
2531}
2532
2533void qemu_loadvm_state_cleanup(void)
2534{
2535    SaveStateEntry *se;
2536
2537    trace_loadvm_state_cleanup();
2538    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2539        if (se->ops && se->ops->load_cleanup) {
2540            se->ops->load_cleanup(se->opaque);
2541        }
2542    }
2543}
2544
2545/* Return true if we should continue the migration, or false. */
2546static bool postcopy_pause_incoming(MigrationIncomingState *mis)
2547{
2548    int i;
2549
2550    trace_postcopy_pause_incoming();
2551
2552    assert(migrate_postcopy_ram());
2553
2554    /*
2555     * Unregister yank with either from/to src would work, since ioc behind it
2556     * is the same
2557     */
2558    migration_ioc_unregister_yank_from_file(mis->from_src_file);
2559
2560    assert(mis->from_src_file);
2561    qemu_file_shutdown(mis->from_src_file);
2562    qemu_fclose(mis->from_src_file);
2563    mis->from_src_file = NULL;
2564
2565    assert(mis->to_src_file);
2566    qemu_file_shutdown(mis->to_src_file);
2567    qemu_mutex_lock(&mis->rp_mutex);
2568    qemu_fclose(mis->to_src_file);
2569    mis->to_src_file = NULL;
2570    qemu_mutex_unlock(&mis->rp_mutex);
2571
2572    /*
2573     * NOTE: this must happen before reset the PostcopyTmpPages below,
2574     * otherwise it's racy to reset those fields when the fast load thread
2575     * can be accessing it in parallel.
2576     */
2577    if (mis->postcopy_qemufile_dst) {
2578        qemu_file_shutdown(mis->postcopy_qemufile_dst);
2579        /* Take the mutex to make sure the fast ram load thread halted */
2580        qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
2581        migration_ioc_unregister_yank_from_file(mis->postcopy_qemufile_dst);
2582        qemu_fclose(mis->postcopy_qemufile_dst);
2583        mis->postcopy_qemufile_dst = NULL;
2584        qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);
2585    }
2586
2587    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
2588                      MIGRATION_STATUS_POSTCOPY_PAUSED);
2589
2590    /* Notify the fault thread for the invalidated file handle */
2591    postcopy_fault_thread_notify(mis);
2592
2593    /*
2594     * If network is interrupted, any temp page we received will be useless
2595     * because we didn't mark them as "received" in receivedmap.  After a
2596     * proper recovery later (which will sync src dirty bitmap with receivedmap
2597     * on dest) these cached small pages will be resent again.
2598     */
2599    for (i = 0; i < mis->postcopy_channels; i++) {
2600        postcopy_temp_page_reset(&mis->postcopy_tmp_pages[i]);
2601    }
2602
2603    error_report("Detected IO failure for postcopy. "
2604                 "Migration paused.");
2605
2606    while (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
2607        qemu_sem_wait(&mis->postcopy_pause_sem_dst);
2608    }
2609
2610    trace_postcopy_pause_incoming_continued();
2611
2612    return true;
2613}
2614
2615int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
2616{
2617    uint8_t section_type;
2618    int ret = 0;
2619
2620retry:
2621    while (true) {
2622        section_type = qemu_get_byte(f);
2623
2624        ret = qemu_file_get_error_obj_any(f, mis->postcopy_qemufile_dst, NULL);
2625        if (ret) {
2626            break;
2627        }
2628
2629        trace_qemu_loadvm_state_section(section_type);
2630        switch (section_type) {
2631        case QEMU_VM_SECTION_START:
2632        case QEMU_VM_SECTION_FULL:
2633            ret = qemu_loadvm_section_start_full(f, mis);
2634            if (ret < 0) {
2635                goto out;
2636            }
2637            break;
2638        case QEMU_VM_SECTION_PART:
2639        case QEMU_VM_SECTION_END:
2640            ret = qemu_loadvm_section_part_end(f, mis);
2641            if (ret < 0) {
2642                goto out;
2643            }
2644            break;
2645        case QEMU_VM_COMMAND:
2646            ret = loadvm_process_command(f);
2647            trace_qemu_loadvm_state_section_command(ret);
2648            if ((ret < 0) || (ret == LOADVM_QUIT)) {
2649                goto out;
2650            }
2651            break;
2652        case QEMU_VM_EOF:
2653            /* This is the end of migration */
2654            goto out;
2655        default:
2656            error_report("Unknown savevm section type %d", section_type);
2657            ret = -EINVAL;
2658            goto out;
2659        }
2660    }
2661
2662out:
2663    if (ret < 0) {
2664        qemu_file_set_error(f, ret);
2665
2666        /* Cancel bitmaps incoming regardless of recovery */
2667        dirty_bitmap_mig_cancel_incoming();
2668
2669        /*
2670         * If we are during an active postcopy, then we pause instead
2671         * of bail out to at least keep the VM's dirty data.  Note
2672         * that POSTCOPY_INCOMING_LISTENING stage is still not enough,
2673         * during which we're still receiving device states and we
2674         * still haven't yet started the VM on destination.
2675         *
2676         * Only RAM postcopy supports recovery. Still, if RAM postcopy is
2677         * enabled, canceled bitmaps postcopy will not affect RAM postcopy
2678         * recovering.
2679         */
2680        if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
2681            migrate_postcopy_ram() && postcopy_pause_incoming(mis)) {
2682            /* Reset f to point to the newly created channel */
2683            f = mis->from_src_file;
2684            goto retry;
2685        }
2686    }
2687    return ret;
2688}
2689
2690int qemu_loadvm_state(QEMUFile *f)
2691{
2692    MigrationIncomingState *mis = migration_incoming_get_current();
2693    Error *local_err = NULL;
2694    int ret;
2695
2696    if (qemu_savevm_state_blocked(&local_err)) {
2697        error_report_err(local_err);
2698        return -EINVAL;
2699    }
2700
2701    ret = qemu_loadvm_state_header(f);
2702    if (ret) {
2703        return ret;
2704    }
2705
2706    if (qemu_loadvm_state_setup(f) != 0) {
2707        return -EINVAL;
2708    }
2709
2710    cpu_synchronize_all_pre_loadvm();
2711
2712    ret = qemu_loadvm_state_main(f, mis);
2713    qemu_event_set(&mis->main_thread_load_event);
2714
2715    trace_qemu_loadvm_state_post_main(ret);
2716
2717    if (mis->have_listen_thread) {
2718        /* Listen thread still going, can't clean up yet */
2719        return ret;
2720    }
2721
2722    if (ret == 0) {
2723        ret = qemu_file_get_error(f);
2724    }
2725
2726    /*
2727     * Try to read in the VMDESC section as well, so that dumping tools that
2728     * intercept our migration stream have the chance to see it.
2729     */
2730
2731    /* We've got to be careful; if we don't read the data and just shut the fd
2732     * then the sender can error if we close while it's still sending.
2733     * We also mustn't read data that isn't there; some transports (RDMA)
2734     * will stall waiting for that data when the source has already closed.
2735     */
2736    if (ret == 0 && should_send_vmdesc()) {
2737        uint8_t *buf;
2738        uint32_t size;
2739        uint8_t  section_type = qemu_get_byte(f);
2740
2741        if (section_type != QEMU_VM_VMDESCRIPTION) {
2742            error_report("Expected vmdescription section, but got %d",
2743                         section_type);
2744            /*
2745             * It doesn't seem worth failing at this point since
2746             * we apparently have an otherwise valid VM state
2747             */
2748        } else {
2749            buf = g_malloc(0x1000);
2750            size = qemu_get_be32(f);
2751
2752            while (size > 0) {
2753                uint32_t read_chunk = MIN(size, 0x1000);
2754                qemu_get_buffer(f, buf, read_chunk);
2755                size -= read_chunk;
2756            }
2757            g_free(buf);
2758        }
2759    }
2760
2761    qemu_loadvm_state_cleanup();
2762    cpu_synchronize_all_post_init();
2763
2764    return ret;
2765}
2766
2767int qemu_load_device_state(QEMUFile *f)
2768{
2769    MigrationIncomingState *mis = migration_incoming_get_current();
2770    int ret;
2771
2772    /* Load QEMU_VM_SECTION_FULL section */
2773    ret = qemu_loadvm_state_main(f, mis);
2774    if (ret < 0) {
2775        error_report("Failed to load device state: %d", ret);
2776        return ret;
2777    }
2778
2779    cpu_synchronize_all_post_init();
2780    return 0;
2781}
2782
2783bool save_snapshot(const char *name, bool overwrite, const char *vmstate,
2784                  bool has_devices, strList *devices, Error **errp)
2785{
2786    BlockDriverState *bs;
2787    QEMUSnapshotInfo sn1, *sn = &sn1;
2788    int ret = -1, ret2;
2789    QEMUFile *f;
2790    int saved_vm_running;
2791    uint64_t vm_state_size;
2792    g_autoptr(GDateTime) now = g_date_time_new_now_local();
2793    AioContext *aio_context;
2794
2795    GLOBAL_STATE_CODE();
2796
2797    if (migration_is_blocked(errp)) {
2798        return false;
2799    }
2800
2801    if (!replay_can_snapshot()) {
2802        error_setg(errp, "Record/replay does not allow making snapshot "
2803                   "right now. Try once more later.");
2804        return false;
2805    }
2806
2807    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
2808        return false;
2809    }
2810
2811    /* Delete old snapshots of the same name */
2812    if (name) {
2813        if (overwrite) {
2814            if (bdrv_all_delete_snapshot(name, has_devices,
2815                                         devices, errp) < 0) {
2816                return false;
2817            }
2818        } else {
2819            ret2 = bdrv_all_has_snapshot(name, has_devices, devices, errp);
2820            if (ret2 < 0) {
2821                return false;
2822            }
2823            if (ret2 == 1) {
2824                error_setg(errp,
2825                           "Snapshot '%s' already exists in one or more devices",
2826                           name);
2827                return false;
2828            }
2829        }
2830    }
2831
2832    bs = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
2833    if (bs == NULL) {
2834        return false;
2835    }
2836    aio_context = bdrv_get_aio_context(bs);
2837
2838    saved_vm_running = runstate_is_running();
2839
2840    ret = global_state_store();
2841    if (ret) {
2842        error_setg(errp, "Error saving global state");
2843        return false;
2844    }
2845    vm_stop(RUN_STATE_SAVE_VM);
2846
2847    bdrv_drain_all_begin();
2848
2849    aio_context_acquire(aio_context);
2850
2851    memset(sn, 0, sizeof(*sn));
2852
2853    /* fill auxiliary fields */
2854    sn->date_sec = g_date_time_to_unix(now);
2855    sn->date_nsec = g_date_time_get_microsecond(now) * 1000;
2856    sn->vm_clock_nsec = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
2857    if (replay_mode != REPLAY_MODE_NONE) {
2858        sn->icount = replay_get_current_icount();
2859    } else {
2860        sn->icount = -1ULL;
2861    }
2862
2863    if (name) {
2864        pstrcpy(sn->name, sizeof(sn->name), name);
2865    } else {
2866        g_autofree char *autoname = g_date_time_format(now,  "vm-%Y%m%d%H%M%S");
2867        pstrcpy(sn->name, sizeof(sn->name), autoname);
2868    }
2869
2870    /* save the VM state */
2871    f = qemu_fopen_bdrv(bs, 1);
2872    if (!f) {
2873        error_setg(errp, "Could not open VM state file");
2874        goto the_end;
2875    }
2876    ret = qemu_savevm_state(f, errp);
2877    vm_state_size = qemu_file_total_transferred(f);
2878    ret2 = qemu_fclose(f);
2879    if (ret < 0) {
2880        goto the_end;
2881    }
2882    if (ret2 < 0) {
2883        ret = ret2;
2884        goto the_end;
2885    }
2886
2887    /* The bdrv_all_create_snapshot() call that follows acquires the AioContext
2888     * for itself.  BDRV_POLL_WHILE() does not support nested locking because
2889     * it only releases the lock once.  Therefore synchronous I/O will deadlock
2890     * unless we release the AioContext before bdrv_all_create_snapshot().
2891     */
2892    aio_context_release(aio_context);
2893    aio_context = NULL;
2894
2895    ret = bdrv_all_create_snapshot(sn, bs, vm_state_size,
2896                                   has_devices, devices, errp);
2897    if (ret < 0) {
2898        bdrv_all_delete_snapshot(sn->name, has_devices, devices, NULL);
2899        goto the_end;
2900    }
2901
2902    ret = 0;
2903
2904 the_end:
2905    if (aio_context) {
2906        aio_context_release(aio_context);
2907    }
2908
2909    bdrv_drain_all_end();
2910
2911    if (saved_vm_running) {
2912        vm_start();
2913    }
2914    return ret == 0;
2915}
2916
2917void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live,
2918                                Error **errp)
2919{
2920    QEMUFile *f;
2921    QIOChannelFile *ioc;
2922    int saved_vm_running;
2923    int ret;
2924
2925    if (!has_live) {
2926        /* live default to true so old version of Xen tool stack can have a
2927         * successful live migration */
2928        live = true;
2929    }
2930
2931    saved_vm_running = runstate_is_running();
2932    vm_stop(RUN_STATE_SAVE_VM);
2933    global_state_store_running();
2934
2935    ioc = qio_channel_file_new_path(filename, O_WRONLY | O_CREAT | O_TRUNC,
2936                                    0660, errp);
2937    if (!ioc) {
2938        goto the_end;
2939    }
2940    qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-save-state");
2941    f = qemu_file_new_output(QIO_CHANNEL(ioc));
2942    object_unref(OBJECT(ioc));
2943    ret = qemu_save_device_state(f);
2944    if (ret < 0 || qemu_fclose(f) < 0) {
2945        error_setg(errp, QERR_IO_ERROR);
2946    } else {
2947        /* libxl calls the QMP command "stop" before calling
2948         * "xen-save-devices-state" and in case of migration failure, libxl
2949         * would call "cont".
2950         * So call bdrv_inactivate_all (release locks) here to let the other
2951         * side of the migration take control of the images.
2952         */
2953        if (live && !saved_vm_running) {
2954            ret = bdrv_inactivate_all();
2955            if (ret) {
2956                error_setg(errp, "%s: bdrv_inactivate_all() failed (%d)",
2957                           __func__, ret);
2958            }
2959        }
2960    }
2961
2962 the_end:
2963    if (saved_vm_running) {
2964        vm_start();
2965    }
2966}
2967
2968void qmp_xen_load_devices_state(const char *filename, Error **errp)
2969{
2970    QEMUFile *f;
2971    QIOChannelFile *ioc;
2972    int ret;
2973
2974    /* Guest must be paused before loading the device state; the RAM state
2975     * will already have been loaded by xc
2976     */
2977    if (runstate_is_running()) {
2978        error_setg(errp, "Cannot update device state while vm is running");
2979        return;
2980    }
2981    vm_stop(RUN_STATE_RESTORE_VM);
2982
2983    ioc = qio_channel_file_new_path(filename, O_RDONLY | O_BINARY, 0, errp);
2984    if (!ioc) {
2985        return;
2986    }
2987    qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-load-state");
2988    f = qemu_file_new_input(QIO_CHANNEL(ioc));
2989    object_unref(OBJECT(ioc));
2990
2991    ret = qemu_loadvm_state(f);
2992    qemu_fclose(f);
2993    if (ret < 0) {
2994        error_setg(errp, QERR_IO_ERROR);
2995    }
2996    migration_incoming_state_destroy();
2997}
2998
2999bool load_snapshot(const char *name, const char *vmstate,
3000                   bool has_devices, strList *devices, Error **errp)
3001{
3002    BlockDriverState *bs_vm_state;
3003    QEMUSnapshotInfo sn;
3004    QEMUFile *f;
3005    int ret;
3006    AioContext *aio_context;
3007    MigrationIncomingState *mis = migration_incoming_get_current();
3008
3009    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
3010        return false;
3011    }
3012    ret = bdrv_all_has_snapshot(name, has_devices, devices, errp);
3013    if (ret < 0) {
3014        return false;
3015    }
3016    if (ret == 0) {
3017        error_setg(errp, "Snapshot '%s' does not exist in one or more devices",
3018                   name);
3019        return false;
3020    }
3021
3022    bs_vm_state = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
3023    if (!bs_vm_state) {
3024        return false;
3025    }
3026    aio_context = bdrv_get_aio_context(bs_vm_state);
3027
3028    /* Don't even try to load empty VM states */
3029    aio_context_acquire(aio_context);
3030    ret = bdrv_snapshot_find(bs_vm_state, &sn, name);
3031    aio_context_release(aio_context);
3032    if (ret < 0) {
3033        return false;
3034    } else if (sn.vm_state_size == 0) {
3035        error_setg(errp, "This is a disk-only snapshot. Revert to it "
3036                   " offline using qemu-img");
3037        return false;
3038    }
3039
3040    /*
3041     * Flush the record/replay queue. Now the VM state is going
3042     * to change. Therefore we don't need to preserve its consistency
3043     */
3044    replay_flush_events();
3045
3046    /* Flush all IO requests so they don't interfere with the new state.  */
3047    bdrv_drain_all_begin();
3048
3049    ret = bdrv_all_goto_snapshot(name, has_devices, devices, errp);
3050    if (ret < 0) {
3051        goto err_drain;
3052    }
3053
3054    /* restore the VM state */
3055    f = qemu_fopen_bdrv(bs_vm_state, 0);
3056    if (!f) {
3057        error_setg(errp, "Could not open VM state file");
3058        goto err_drain;
3059    }
3060
3061    qemu_system_reset(SHUTDOWN_CAUSE_NONE);
3062    mis->from_src_file = f;
3063
3064    if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
3065        ret = -EINVAL;
3066        goto err_drain;
3067    }
3068    aio_context_acquire(aio_context);
3069    ret = qemu_loadvm_state(f);
3070    migration_incoming_state_destroy();
3071    aio_context_release(aio_context);
3072
3073    bdrv_drain_all_end();
3074
3075    if (ret < 0) {
3076        error_setg(errp, "Error %d while loading VM state", ret);
3077        return false;
3078    }
3079
3080    return true;
3081
3082err_drain:
3083    bdrv_drain_all_end();
3084    return false;
3085}
3086
3087bool delete_snapshot(const char *name, bool has_devices,
3088                     strList *devices, Error **errp)
3089{
3090    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
3091        return false;
3092    }
3093
3094    if (bdrv_all_delete_snapshot(name, has_devices, devices, errp) < 0) {
3095        return false;
3096    }
3097
3098    return true;
3099}
3100
3101void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
3102{
3103    qemu_ram_set_idstr(mr->ram_block,
3104                       memory_region_name(mr), dev);
3105    qemu_ram_set_migratable(mr->ram_block);
3106}
3107
3108void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev)
3109{
3110    qemu_ram_unset_idstr(mr->ram_block);
3111    qemu_ram_unset_migratable(mr->ram_block);
3112}
3113
3114void vmstate_register_ram_global(MemoryRegion *mr)
3115{
3116    vmstate_register_ram(mr, NULL);
3117}
3118
3119bool vmstate_check_only_migratable(const VMStateDescription *vmsd)
3120{
3121    /* check needed if --only-migratable is specified */
3122    if (!only_migratable) {
3123        return true;
3124    }
3125
3126    return !(vmsd && vmsd->unmigratable);
3127}
3128
3129typedef struct SnapshotJob {
3130    Job common;
3131    char *tag;
3132    char *vmstate;
3133    strList *devices;
3134    Coroutine *co;
3135    Error **errp;
3136    bool ret;
3137} SnapshotJob;
3138
3139static void qmp_snapshot_job_free(SnapshotJob *s)
3140{
3141    g_free(s->tag);
3142    g_free(s->vmstate);
3143    qapi_free_strList(s->devices);
3144}
3145
3146
3147static void snapshot_load_job_bh(void *opaque)
3148{
3149    Job *job = opaque;
3150    SnapshotJob *s = container_of(job, SnapshotJob, common);
3151    int orig_vm_running;
3152
3153    job_progress_set_remaining(&s->common, 1);
3154
3155    orig_vm_running = runstate_is_running();
3156    vm_stop(RUN_STATE_RESTORE_VM);
3157
3158    s->ret = load_snapshot(s->tag, s->vmstate, true, s->devices, s->errp);
3159    if (s->ret && orig_vm_running) {
3160        vm_start();
3161    }
3162
3163    job_progress_update(&s->common, 1);
3164
3165    qmp_snapshot_job_free(s);
3166    aio_co_wake(s->co);
3167}
3168
3169static void snapshot_save_job_bh(void *opaque)
3170{
3171    Job *job = opaque;
3172    SnapshotJob *s = container_of(job, SnapshotJob, common);
3173
3174    job_progress_set_remaining(&s->common, 1);
3175    s->ret = save_snapshot(s->tag, false, s->vmstate,
3176                           true, s->devices, s->errp);
3177    job_progress_update(&s->common, 1);
3178
3179    qmp_snapshot_job_free(s);
3180    aio_co_wake(s->co);
3181}
3182
3183static void snapshot_delete_job_bh(void *opaque)
3184{
3185    Job *job = opaque;
3186    SnapshotJob *s = container_of(job, SnapshotJob, common);
3187
3188    job_progress_set_remaining(&s->common, 1);
3189    s->ret = delete_snapshot(s->tag, true, s->devices, s->errp);
3190    job_progress_update(&s->common, 1);
3191
3192    qmp_snapshot_job_free(s);
3193    aio_co_wake(s->co);
3194}
3195
3196static int coroutine_fn snapshot_save_job_run(Job *job, Error **errp)
3197{
3198    SnapshotJob *s = container_of(job, SnapshotJob, common);
3199    s->errp = errp;
3200    s->co = qemu_coroutine_self();
3201    aio_bh_schedule_oneshot(qemu_get_aio_context(),
3202                            snapshot_save_job_bh, job);
3203    qemu_coroutine_yield();
3204    return s->ret ? 0 : -1;
3205}
3206
3207static int coroutine_fn snapshot_load_job_run(Job *job, Error **errp)
3208{
3209    SnapshotJob *s = container_of(job, SnapshotJob, common);
3210    s->errp = errp;
3211    s->co = qemu_coroutine_self();
3212    aio_bh_schedule_oneshot(qemu_get_aio_context(),
3213                            snapshot_load_job_bh, job);
3214    qemu_coroutine_yield();
3215    return s->ret ? 0 : -1;
3216}
3217
3218static int coroutine_fn snapshot_delete_job_run(Job *job, Error **errp)
3219{
3220    SnapshotJob *s = container_of(job, SnapshotJob, common);
3221    s->errp = errp;
3222    s->co = qemu_coroutine_self();
3223    aio_bh_schedule_oneshot(qemu_get_aio_context(),
3224                            snapshot_delete_job_bh, job);
3225    qemu_coroutine_yield();
3226    return s->ret ? 0 : -1;
3227}
3228
3229
3230static const JobDriver snapshot_load_job_driver = {
3231    .instance_size = sizeof(SnapshotJob),
3232    .job_type      = JOB_TYPE_SNAPSHOT_LOAD,
3233    .run           = snapshot_load_job_run,
3234};
3235
3236static const JobDriver snapshot_save_job_driver = {
3237    .instance_size = sizeof(SnapshotJob),
3238    .job_type      = JOB_TYPE_SNAPSHOT_SAVE,
3239    .run           = snapshot_save_job_run,
3240};
3241
3242static const JobDriver snapshot_delete_job_driver = {
3243    .instance_size = sizeof(SnapshotJob),
3244    .job_type      = JOB_TYPE_SNAPSHOT_DELETE,
3245    .run           = snapshot_delete_job_run,
3246};
3247
3248
3249void qmp_snapshot_save(const char *job_id,
3250                       const char *tag,
3251                       const char *vmstate,
3252                       strList *devices,
3253                       Error **errp)
3254{
3255    SnapshotJob *s;
3256
3257    s = job_create(job_id, &snapshot_save_job_driver, NULL,
3258                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3259                   NULL, NULL, errp);
3260    if (!s) {
3261        return;
3262    }
3263
3264    s->tag = g_strdup(tag);
3265    s->vmstate = g_strdup(vmstate);
3266    s->devices = QAPI_CLONE(strList, devices);
3267
3268    job_start(&s->common);
3269}
3270
3271void qmp_snapshot_load(const char *job_id,
3272                       const char *tag,
3273                       const char *vmstate,
3274                       strList *devices,
3275                       Error **errp)
3276{
3277    SnapshotJob *s;
3278
3279    s = job_create(job_id, &snapshot_load_job_driver, NULL,
3280                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3281                   NULL, NULL, errp);
3282    if (!s) {
3283        return;
3284    }
3285
3286    s->tag = g_strdup(tag);
3287    s->vmstate = g_strdup(vmstate);
3288    s->devices = QAPI_CLONE(strList, devices);
3289
3290    job_start(&s->common);
3291}
3292
3293void qmp_snapshot_delete(const char *job_id,
3294                         const char *tag,
3295                         strList *devices,
3296                         Error **errp)
3297{
3298    SnapshotJob *s;
3299
3300    s = job_create(job_id, &snapshot_delete_job_driver, NULL,
3301                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3302                   NULL, NULL, errp);
3303    if (!s) {
3304        return;
3305    }
3306
3307    s->tag = g_strdup(tag);
3308    s->devices = QAPI_CLONE(strList, devices);
3309
3310    job_start(&s->common);
3311}
3312