qemu/migration/savevm.c
<<
>>
Prefs
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2009-2015 Red Hat Inc
   6 *
   7 * Authors:
   8 *  Juan Quintela <quintela@redhat.com>
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28
  29#include "qemu/osdep.h"
  30#include "hw/boards.h"
  31#include "net/net.h"
  32#include "migration.h"
  33#include "migration/snapshot.h"
  34#include "migration/vmstate.h"
  35#include "migration/misc.h"
  36#include "migration/register.h"
  37#include "migration/global_state.h"
  38#include "ram.h"
  39#include "qemu-file-channel.h"
  40#include "qemu-file.h"
  41#include "savevm.h"
  42#include "postcopy-ram.h"
  43#include "qapi/error.h"
  44#include "qapi/qapi-commands-migration.h"
  45#include "qapi/qapi-commands-misc.h"
  46#include "qapi/qmp/qerror.h"
  47#include "qemu/error-report.h"
  48#include "sysemu/cpus.h"
  49#include "exec/memory.h"
  50#include "exec/target_page.h"
  51#include "trace.h"
  52#include "qemu/iov.h"
  53#include "qemu/main-loop.h"
  54#include "block/snapshot.h"
  55#include "qemu/cutils.h"
  56#include "io/channel-buffer.h"
  57#include "io/channel-file.h"
  58#include "sysemu/replay.h"
  59#include "sysemu/runstate.h"
  60#include "sysemu/sysemu.h"
  61#include "sysemu/xen.h"
  62#include "qjson.h"
  63#include "migration/colo.h"
  64#include "qemu/bitmap.h"
  65#include "net/announce.h"
  66
  67const unsigned int postcopy_ram_discard_version = 0;
  68
  69/* Subcommands for QEMU_VM_COMMAND */
  70enum qemu_vm_cmd {
  71    MIG_CMD_INVALID = 0,   /* Must be 0 */
  72    MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
  73    MIG_CMD_PING,              /* Request a PONG on the RP */
  74
  75    MIG_CMD_POSTCOPY_ADVISE,       /* Prior to any page transfers, just
  76                                      warn we might want to do PC */
  77    MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
  78                                      pages as it's running. */
  79    MIG_CMD_POSTCOPY_RUN,          /* Start execution */
  80
  81    MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
  82                                      were previously sent during
  83                                      precopy but are dirty. */
  84    MIG_CMD_PACKAGED,          /* Send a wrapped stream within this stream */
  85    MIG_CMD_ENABLE_COLO,       /* Enable COLO */
  86    MIG_CMD_POSTCOPY_RESUME,   /* resume postcopy on dest */
  87    MIG_CMD_RECV_BITMAP,       /* Request for recved bitmap on dst */
  88    MIG_CMD_MAX
  89};
  90
  91#define MAX_VM_CMD_PACKAGED_SIZE UINT32_MAX
  92static struct mig_cmd_args {
  93    ssize_t     len; /* -1 = variable */
  94    const char *name;
  95} mig_cmd_args[] = {
  96    [MIG_CMD_INVALID]          = { .len = -1, .name = "INVALID" },
  97    [MIG_CMD_OPEN_RETURN_PATH] = { .len =  0, .name = "OPEN_RETURN_PATH" },
  98    [MIG_CMD_PING]             = { .len = sizeof(uint32_t), .name = "PING" },
  99    [MIG_CMD_POSTCOPY_ADVISE]  = { .len = -1, .name = "POSTCOPY_ADVISE" },
 100    [MIG_CMD_POSTCOPY_LISTEN]  = { .len =  0, .name = "POSTCOPY_LISTEN" },
 101    [MIG_CMD_POSTCOPY_RUN]     = { .len =  0, .name = "POSTCOPY_RUN" },
 102    [MIG_CMD_POSTCOPY_RAM_DISCARD] = {
 103                                   .len = -1, .name = "POSTCOPY_RAM_DISCARD" },
 104    [MIG_CMD_POSTCOPY_RESUME]  = { .len =  0, .name = "POSTCOPY_RESUME" },
 105    [MIG_CMD_PACKAGED]         = { .len =  4, .name = "PACKAGED" },
 106    [MIG_CMD_RECV_BITMAP]      = { .len = -1, .name = "RECV_BITMAP" },
 107    [MIG_CMD_MAX]              = { .len = -1, .name = "MAX" },
 108};
 109
 110/* Note for MIG_CMD_POSTCOPY_ADVISE:
 * The format of the arguments depends on the postcopy mode:
 * - postcopy RAM only
 *   uint64_t host page size
 *   uint64_t target page size
 115 *
 116 * - postcopy RAM and postcopy dirty bitmaps
 117 *   format is the same as for postcopy RAM only
 118 *
 119 * - postcopy dirty bitmaps only
 120 *   Nothing. Command length field is 0.
 121 *
 122 * Be careful: adding a new postcopy entity with some other parameters should
 123 * not break format self-description ability. Good way is to introduce some
 124 * generic extendable format with an exception for two old entities.
 125 */
 126
 127/***********************************************************/
 128/* savevm/loadvm support */
 129
 130static ssize_t block_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
 131                                   int64_t pos, Error **errp)
 132{
 133    int ret;
 134    QEMUIOVector qiov;
 135
 136    qemu_iovec_init_external(&qiov, iov, iovcnt);
 137    ret = bdrv_writev_vmstate(opaque, &qiov, pos);
 138    if (ret < 0) {
 139        return ret;
 140    }
 141
 142    return qiov.size;
 143}
 144
 145static ssize_t block_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
 146                                size_t size, Error **errp)
 147{
 148    return bdrv_load_vmstate(opaque, buf, pos, size);
 149}
 150
 151static int bdrv_fclose(void *opaque, Error **errp)
 152{
 153    return bdrv_flush(opaque);
 154}
 155
 156static const QEMUFileOps bdrv_read_ops = {
 157    .get_buffer = block_get_buffer,
 158    .close =      bdrv_fclose
 159};
 160
 161static const QEMUFileOps bdrv_write_ops = {
 162    .writev_buffer  = block_writev_buffer,
 163    .close          = bdrv_fclose
 164};
 165
 166static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
 167{
 168    if (is_writable) {
 169        return qemu_fopen_ops(bs, &bdrv_write_ops);
 170    }
 171    return qemu_fopen_ops(bs, &bdrv_read_ops);
 172}
 173
 174
 175/* QEMUFile timer support.
 176 * Not in qemu-file.c to not add qemu-timer.c as dependency to qemu-file.c
 177 */
 178
 179void timer_put(QEMUFile *f, QEMUTimer *ts)
 180{
 181    uint64_t expire_time;
 182
 183    expire_time = timer_expire_time_ns(ts);
 184    qemu_put_be64(f, expire_time);
 185}
 186
 187void timer_get(QEMUFile *f, QEMUTimer *ts)
 188{
 189    uint64_t expire_time;
 190
 191    expire_time = qemu_get_be64(f);
 192    if (expire_time != -1) {
 193        timer_mod_ns(ts, expire_time);
 194    } else {
 195        timer_del(ts);
 196    }
 197}
 198
 199
 200/* VMState timer support.
 201 * Not in vmstate.c to not add qemu-timer.c as dependency to vmstate.c
 202 */
 203
 204static int get_timer(QEMUFile *f, void *pv, size_t size,
 205                     const VMStateField *field)
 206{
 207    QEMUTimer *v = pv;
 208    timer_get(f, v);
 209    return 0;
 210}
 211
 212static int put_timer(QEMUFile *f, void *pv, size_t size,
 213                     const VMStateField *field, QJSON *vmdesc)
 214{
 215    QEMUTimer *v = pv;
 216    timer_put(f, v);
 217
 218    return 0;
 219}
 220
 221const VMStateInfo vmstate_info_timer = {
 222    .name = "timer",
 223    .get  = get_timer,
 224    .put  = put_timer,
 225};
 226
 227
 228typedef struct CompatEntry {
 229    char idstr[256];
 230    int instance_id;
 231} CompatEntry;
 232
 233typedef struct SaveStateEntry {
 234    QTAILQ_ENTRY(SaveStateEntry) entry;
 235    char idstr[256];
 236    uint32_t instance_id;
 237    int alias_id;
 238    int version_id;
 239    /* version id read from the stream */
 240    int load_version_id;
 241    int section_id;
 242    /* section id read from the stream */
 243    int load_section_id;
 244    const SaveVMHandlers *ops;
 245    const VMStateDescription *vmsd;
 246    void *opaque;
 247    CompatEntry *compat;
 248    int is_ram;
 249} SaveStateEntry;
 250
 251typedef struct SaveState {
 252    QTAILQ_HEAD(, SaveStateEntry) handlers;
 253    SaveStateEntry *handler_pri_head[MIG_PRI_MAX + 1];
 254    int global_section_id;
 255    uint32_t len;
 256    const char *name;
 257    uint32_t target_page_bits;
 258    uint32_t caps_count;
 259    MigrationCapability *capabilities;
 260    QemuUUID uuid;
 261} SaveState;
 262
 263static SaveState savevm_state = {
 264    .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
 265    .handler_pri_head = { [MIG_PRI_DEFAULT ... MIG_PRI_MAX] = NULL },
 266    .global_section_id = 0,
 267};
 268
 269static bool should_validate_capability(int capability)
 270{
 271    assert(capability >= 0 && capability < MIGRATION_CAPABILITY__MAX);
 272    /* Validate only new capabilities to keep compatibility. */
 273    switch (capability) {
 274    case MIGRATION_CAPABILITY_X_IGNORE_SHARED:
 275        return true;
 276    default:
 277        return false;
 278    }
 279}
 280
 281static uint32_t get_validatable_capabilities_count(void)
 282{
 283    MigrationState *s = migrate_get_current();
 284    uint32_t result = 0;
 285    int i;
 286    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
 287        if (should_validate_capability(i) && s->enabled_capabilities[i]) {
 288            result++;
 289        }
 290    }
 291    return result;
 292}
 293
 294static int configuration_pre_save(void *opaque)
 295{
 296    SaveState *state = opaque;
 297    const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
 298    MigrationState *s = migrate_get_current();
 299    int i, j;
 300
 301    state->len = strlen(current_name);
 302    state->name = current_name;
 303    state->target_page_bits = qemu_target_page_bits();
 304
 305    state->caps_count = get_validatable_capabilities_count();
 306    state->capabilities = g_renew(MigrationCapability, state->capabilities,
 307                                  state->caps_count);
 308    for (i = j = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
 309        if (should_validate_capability(i) && s->enabled_capabilities[i]) {
 310            state->capabilities[j++] = i;
 311        }
 312    }
 313    state->uuid = qemu_uuid;
 314
 315    return 0;
 316}
 317
 318static int configuration_pre_load(void *opaque)
 319{
 320    SaveState *state = opaque;
 321
 322    /* If there is no target-page-bits subsection it means the source
 323     * predates the variable-target-page-bits support and is using the
 324     * minimum possible value for this CPU.
 325     */
 326    state->target_page_bits = qemu_target_page_bits_min();
 327    return 0;
 328}
 329
 330static bool configuration_validate_capabilities(SaveState *state)
 331{
 332    bool ret = true;
 333    MigrationState *s = migrate_get_current();
 334    unsigned long *source_caps_bm;
 335    int i;
 336
 337    source_caps_bm = bitmap_new(MIGRATION_CAPABILITY__MAX);
 338    for (i = 0; i < state->caps_count; i++) {
 339        MigrationCapability capability = state->capabilities[i];
 340        set_bit(capability, source_caps_bm);
 341    }
 342
 343    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
 344        bool source_state, target_state;
 345        if (!should_validate_capability(i)) {
 346            continue;
 347        }
 348        source_state = test_bit(i, source_caps_bm);
 349        target_state = s->enabled_capabilities[i];
 350        if (source_state != target_state) {
 351            error_report("Capability %s is %s, but received capability is %s",
 352                         MigrationCapability_str(i),
 353                         target_state ? "on" : "off",
 354                         source_state ? "on" : "off");
 355            ret = false;
 356            /* Don't break here to report all failed capabilities */
 357        }
 358    }
 359
 360    g_free(source_caps_bm);
 361    return ret;
 362}
 363
 364static int configuration_post_load(void *opaque, int version_id)
 365{
 366    SaveState *state = opaque;
 367    const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
 368
 369    if (strncmp(state->name, current_name, state->len) != 0) {
 370        error_report("Machine type received is '%.*s' and local is '%s'",
 371                     (int) state->len, state->name, current_name);
 372        return -EINVAL;
 373    }
 374
 375    if (state->target_page_bits != qemu_target_page_bits()) {
 376        error_report("Received TARGET_PAGE_BITS is %d but local is %d",
 377                     state->target_page_bits, qemu_target_page_bits());
 378        return -EINVAL;
 379    }
 380
 381    if (!configuration_validate_capabilities(state)) {
 382        return -EINVAL;
 383    }
 384
 385    return 0;
 386}
 387
 388static int get_capability(QEMUFile *f, void *pv, size_t size,
 389                          const VMStateField *field)
 390{
 391    MigrationCapability *capability = pv;
 392    char capability_str[UINT8_MAX + 1];
 393    uint8_t len;
 394    int i;
 395
 396    len = qemu_get_byte(f);
 397    qemu_get_buffer(f, (uint8_t *)capability_str, len);
 398    capability_str[len] = '\0';
 399    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
 400        if (!strcmp(MigrationCapability_str(i), capability_str)) {
 401            *capability = i;
 402            return 0;
 403        }
 404    }
 405    error_report("Received unknown capability %s", capability_str);
 406    return -EINVAL;
 407}
 408
 409static int put_capability(QEMUFile *f, void *pv, size_t size,
 410                          const VMStateField *field, QJSON *vmdesc)
 411{
 412    MigrationCapability *capability = pv;
 413    const char *capability_str = MigrationCapability_str(*capability);
 414    size_t len = strlen(capability_str);
 415    assert(len <= UINT8_MAX);
 416
 417    qemu_put_byte(f, len);
 418    qemu_put_buffer(f, (uint8_t *)capability_str, len);
 419    return 0;
 420}
 421
 422static const VMStateInfo vmstate_info_capability = {
 423    .name = "capability",
 424    .get  = get_capability,
 425    .put  = put_capability,
 426};
 427
 428/* The target-page-bits subsection is present only if the
 429 * target page size is not the same as the default (ie the
 430 * minimum page size for a variable-page-size guest CPU).
 431 * If it is present then it contains the actual target page
 432 * bits for the machine, and migration will fail if the
 433 * two ends don't agree about it.
 434 */
 435static bool vmstate_target_page_bits_needed(void *opaque)
 436{
 437    return qemu_target_page_bits()
 438        > qemu_target_page_bits_min();
 439}
 440
 441static const VMStateDescription vmstate_target_page_bits = {
 442    .name = "configuration/target-page-bits",
 443    .version_id = 1,
 444    .minimum_version_id = 1,
 445    .needed = vmstate_target_page_bits_needed,
 446    .fields = (VMStateField[]) {
 447        VMSTATE_UINT32(target_page_bits, SaveState),
 448        VMSTATE_END_OF_LIST()
 449    }
 450};
 451
 452static bool vmstate_capabilites_needed(void *opaque)
 453{
 454    return get_validatable_capabilities_count() > 0;
 455}
 456
 457static const VMStateDescription vmstate_capabilites = {
 458    .name = "configuration/capabilities",
 459    .version_id = 1,
 460    .minimum_version_id = 1,
 461    .needed = vmstate_capabilites_needed,
 462    .fields = (VMStateField[]) {
 463        VMSTATE_UINT32_V(caps_count, SaveState, 1),
 464        VMSTATE_VARRAY_UINT32_ALLOC(capabilities, SaveState, caps_count, 1,
 465                                    vmstate_info_capability,
 466                                    MigrationCapability),
 467        VMSTATE_END_OF_LIST()
 468    }
 469};
 470
 471static bool vmstate_uuid_needed(void *opaque)
 472{
 473    return qemu_uuid_set && migrate_validate_uuid();
 474}
 475
 476static int vmstate_uuid_post_load(void *opaque, int version_id)
 477{
 478    SaveState *state = opaque;
 479    char uuid_src[UUID_FMT_LEN + 1];
 480    char uuid_dst[UUID_FMT_LEN + 1];
 481
 482    if (!qemu_uuid_set) {
 483        /*
 484         * It's warning because user might not know UUID in some cases,
 485         * e.g. load an old snapshot
 486         */
 487        qemu_uuid_unparse(&state->uuid, uuid_src);
 488        warn_report("UUID is received %s, but local uuid isn't set",
 489                     uuid_src);
 490        return 0;
 491    }
 492    if (!qemu_uuid_is_equal(&state->uuid, &qemu_uuid)) {
 493        qemu_uuid_unparse(&state->uuid, uuid_src);
 494        qemu_uuid_unparse(&qemu_uuid, uuid_dst);
 495        error_report("UUID received is %s and local is %s", uuid_src, uuid_dst);
 496        return -EINVAL;
 497    }
 498    return 0;
 499}
 500
 501static const VMStateDescription vmstate_uuid = {
 502    .name = "configuration/uuid",
 503    .version_id = 1,
 504    .minimum_version_id = 1,
 505    .needed = vmstate_uuid_needed,
 506    .post_load = vmstate_uuid_post_load,
 507    .fields = (VMStateField[]) {
 508        VMSTATE_UINT8_ARRAY_V(uuid.data, SaveState, sizeof(QemuUUID), 1),
 509        VMSTATE_END_OF_LIST()
 510    }
 511};
 512
 513static const VMStateDescription vmstate_configuration = {
 514    .name = "configuration",
 515    .version_id = 1,
 516    .pre_load = configuration_pre_load,
 517    .post_load = configuration_post_load,
 518    .pre_save = configuration_pre_save,
 519    .fields = (VMStateField[]) {
 520        VMSTATE_UINT32(len, SaveState),
 521        VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len),
 522        VMSTATE_END_OF_LIST()
 523    },
 524    .subsections = (const VMStateDescription*[]) {
 525        &vmstate_target_page_bits,
 526        &vmstate_capabilites,
 527        &vmstate_uuid,
 528        NULL
 529    }
 530};
 531
 532static void dump_vmstate_vmsd(FILE *out_file,
 533                              const VMStateDescription *vmsd, int indent,
 534                              bool is_subsection);
 535
 536static void dump_vmstate_vmsf(FILE *out_file, const VMStateField *field,
 537                              int indent)
 538{
 539    fprintf(out_file, "%*s{\n", indent, "");
 540    indent += 2;
 541    fprintf(out_file, "%*s\"field\": \"%s\",\n", indent, "", field->name);
 542    fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
 543            field->version_id);
 544    fprintf(out_file, "%*s\"field_exists\": %s,\n", indent, "",
 545            field->field_exists ? "true" : "false");
 546    fprintf(out_file, "%*s\"size\": %zu", indent, "", field->size);
 547    if (field->vmsd != NULL) {
 548        fprintf(out_file, ",\n");
 549        dump_vmstate_vmsd(out_file, field->vmsd, indent, false);
 550    }
 551    fprintf(out_file, "\n%*s}", indent - 2, "");
 552}
 553
 554static void dump_vmstate_vmss(FILE *out_file,
 555                              const VMStateDescription **subsection,
 556                              int indent)
 557{
 558    if (*subsection != NULL) {
 559        dump_vmstate_vmsd(out_file, *subsection, indent, true);
 560    }
 561}
 562
 563static void dump_vmstate_vmsd(FILE *out_file,
 564                              const VMStateDescription *vmsd, int indent,
 565                              bool is_subsection)
 566{
 567    if (is_subsection) {
 568        fprintf(out_file, "%*s{\n", indent, "");
 569    } else {
 570        fprintf(out_file, "%*s\"%s\": {\n", indent, "", "Description");
 571    }
 572    indent += 2;
 573    fprintf(out_file, "%*s\"name\": \"%s\",\n", indent, "", vmsd->name);
 574    fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
 575            vmsd->version_id);
 576    fprintf(out_file, "%*s\"minimum_version_id\": %d", indent, "",
 577            vmsd->minimum_version_id);
 578    if (vmsd->fields != NULL) {
 579        const VMStateField *field = vmsd->fields;
 580        bool first;
 581
 582        fprintf(out_file, ",\n%*s\"Fields\": [\n", indent, "");
 583        first = true;
 584        while (field->name != NULL) {
 585            if (field->flags & VMS_MUST_EXIST) {
 586                /* Ignore VMSTATE_VALIDATE bits; these don't get migrated */
 587                field++;
 588                continue;
 589            }
 590            if (!first) {
 591                fprintf(out_file, ",\n");
 592            }
 593            dump_vmstate_vmsf(out_file, field, indent + 2);
 594            field++;
 595            first = false;
 596        }
 597        fprintf(out_file, "\n%*s]", indent, "");
 598    }
 599    if (vmsd->subsections != NULL) {
 600        const VMStateDescription **subsection = vmsd->subsections;
 601        bool first;
 602
 603        fprintf(out_file, ",\n%*s\"Subsections\": [\n", indent, "");
 604        first = true;
 605        while (*subsection != NULL) {
 606            if (!first) {
 607                fprintf(out_file, ",\n");
 608            }
 609            dump_vmstate_vmss(out_file, subsection, indent + 2);
 610            subsection++;
 611            first = false;
 612        }
 613        fprintf(out_file, "\n%*s]", indent, "");
 614    }
 615    fprintf(out_file, "\n%*s}", indent - 2, "");
 616}
 617
 618static void dump_machine_type(FILE *out_file)
 619{
 620    MachineClass *mc;
 621
 622    mc = MACHINE_GET_CLASS(current_machine);
 623
 624    fprintf(out_file, "  \"vmschkmachine\": {\n");
 625    fprintf(out_file, "    \"Name\": \"%s\"\n", mc->name);
 626    fprintf(out_file, "  },\n");
 627}
 628
 629void dump_vmstate_json_to_file(FILE *out_file)
 630{
 631    GSList *list, *elt;
 632    bool first;
 633
 634    fprintf(out_file, "{\n");
 635    dump_machine_type(out_file);
 636
 637    first = true;
 638    list = object_class_get_list(TYPE_DEVICE, true);
 639    for (elt = list; elt; elt = elt->next) {
 640        DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data,
 641                                             TYPE_DEVICE);
 642        const char *name;
 643        int indent = 2;
 644
 645        if (!dc->vmsd) {
 646            continue;
 647        }
 648
 649        if (!first) {
 650            fprintf(out_file, ",\n");
 651        }
 652        name = object_class_get_name(OBJECT_CLASS(dc));
 653        fprintf(out_file, "%*s\"%s\": {\n", indent, "", name);
 654        indent += 2;
 655        fprintf(out_file, "%*s\"Name\": \"%s\",\n", indent, "", name);
 656        fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
 657                dc->vmsd->version_id);
 658        fprintf(out_file, "%*s\"minimum_version_id\": %d,\n", indent, "",
 659                dc->vmsd->minimum_version_id);
 660
 661        dump_vmstate_vmsd(out_file, dc->vmsd, indent, false);
 662
 663        fprintf(out_file, "\n%*s}", indent - 2, "");
 664        first = false;
 665    }
 666    fprintf(out_file, "\n}\n");
 667    fclose(out_file);
 668    g_slist_free(list);
 669}
 670
 671static uint32_t calculate_new_instance_id(const char *idstr)
 672{
 673    SaveStateEntry *se;
 674    uint32_t instance_id = 0;
 675
 676    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
 677        if (strcmp(idstr, se->idstr) == 0
 678            && instance_id <= se->instance_id) {
 679            instance_id = se->instance_id + 1;
 680        }
 681    }
 682    /* Make sure we never loop over without being noticed */
 683    assert(instance_id != VMSTATE_INSTANCE_ID_ANY);
 684    return instance_id;
 685}
 686
 687static int calculate_compat_instance_id(const char *idstr)
 688{
 689    SaveStateEntry *se;
 690    int instance_id = 0;
 691
 692    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
 693        if (!se->compat) {
 694            continue;
 695        }
 696
 697        if (strcmp(idstr, se->compat->idstr) == 0
 698            && instance_id <= se->compat->instance_id) {
 699            instance_id = se->compat->instance_id + 1;
 700        }
 701    }
 702    return instance_id;
 703}
 704
 705static inline MigrationPriority save_state_priority(SaveStateEntry *se)
 706{
 707    if (se->vmsd) {
 708        return se->vmsd->priority;
 709    }
 710    return MIG_PRI_DEFAULT;
 711}
 712
 713static void savevm_state_handler_insert(SaveStateEntry *nse)
 714{
 715    MigrationPriority priority = save_state_priority(nse);
 716    SaveStateEntry *se;
 717    int i;
 718
 719    assert(priority <= MIG_PRI_MAX);
 720
 721    for (i = priority - 1; i >= 0; i--) {
 722        se = savevm_state.handler_pri_head[i];
 723        if (se != NULL) {
 724            assert(save_state_priority(se) < priority);
 725            break;
 726        }
 727    }
 728
 729    if (i >= 0) {
 730        QTAILQ_INSERT_BEFORE(se, nse, entry);
 731    } else {
 732        QTAILQ_INSERT_TAIL(&savevm_state.handlers, nse, entry);
 733    }
 734
 735    if (savevm_state.handler_pri_head[priority] == NULL) {
 736        savevm_state.handler_pri_head[priority] = nse;
 737    }
 738}
 739
 740static void savevm_state_handler_remove(SaveStateEntry *se)
 741{
 742    SaveStateEntry *next;
 743    MigrationPriority priority = save_state_priority(se);
 744
 745    if (se == savevm_state.handler_pri_head[priority]) {
 746        next = QTAILQ_NEXT(se, entry);
 747        if (next != NULL && save_state_priority(next) == priority) {
 748            savevm_state.handler_pri_head[priority] = next;
 749        } else {
 750            savevm_state.handler_pri_head[priority] = NULL;
 751        }
 752    }
 753    QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
 754}
 755
 756/* TODO: Individual devices generally have very little idea about the rest
 757   of the system, so instance_id should be removed/replaced.
 758   Meanwhile pass -1 as instance_id if you do not already have a clearly
 759   distinguishing id for all instances of your device class. */
 760int register_savevm_live(const char *idstr,
 761                         uint32_t instance_id,
 762                         int version_id,
 763                         const SaveVMHandlers *ops,
 764                         void *opaque)
 765{
 766    SaveStateEntry *se;
 767
 768    se = g_new0(SaveStateEntry, 1);
 769    se->version_id = version_id;
 770    se->section_id = savevm_state.global_section_id++;
 771    se->ops = ops;
 772    se->opaque = opaque;
 773    se->vmsd = NULL;
 774    /* if this is a live_savem then set is_ram */
 775    if (ops->save_setup != NULL) {
 776        se->is_ram = 1;
 777    }
 778
 779    pstrcat(se->idstr, sizeof(se->idstr), idstr);
 780
 781    if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
 782        se->instance_id = calculate_new_instance_id(se->idstr);
 783    } else {
 784        se->instance_id = instance_id;
 785    }
 786    assert(!se->compat || se->instance_id == 0);
 787    savevm_state_handler_insert(se);
 788    return 0;
 789}
 790
 791void unregister_savevm(VMStateIf *obj, const char *idstr, void *opaque)
 792{
 793    SaveStateEntry *se, *new_se;
 794    char id[256] = "";
 795
 796    if (obj) {
 797        char *oid = vmstate_if_get_id(obj);
 798        if (oid) {
 799            pstrcpy(id, sizeof(id), oid);
 800            pstrcat(id, sizeof(id), "/");
 801            g_free(oid);
 802        }
 803    }
 804    pstrcat(id, sizeof(id), idstr);
 805
 806    QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
 807        if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) {
 808            savevm_state_handler_remove(se);
 809            g_free(se->compat);
 810            g_free(se);
 811        }
 812    }
 813}
 814
 815int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id,
 816                                   const VMStateDescription *vmsd,
 817                                   void *opaque, int alias_id,
 818                                   int required_for_version,
 819                                   Error **errp)
 820{
 821    SaveStateEntry *se;
 822
 823    /* If this triggers, alias support can be dropped for the vmsd. */
 824    assert(alias_id == -1 || required_for_version >= vmsd->minimum_version_id);
 825
 826    se = g_new0(SaveStateEntry, 1);
 827    se->version_id = vmsd->version_id;
 828    se->section_id = savevm_state.global_section_id++;
 829    se->opaque = opaque;
 830    se->vmsd = vmsd;
 831    se->alias_id = alias_id;
 832
 833    if (obj) {
 834        char *id = vmstate_if_get_id(obj);
 835        if (id) {
 836            if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
 837                sizeof(se->idstr)) {
 838                error_setg(errp, "Path too long for VMState (%s)", id);
 839                g_free(id);
 840                g_free(se);
 841
 842                return -1;
 843            }
 844            g_free(id);
 845
 846            se->compat = g_new0(CompatEntry, 1);
 847            pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name);
 848            se->compat->instance_id = instance_id == VMSTATE_INSTANCE_ID_ANY ?
 849                         calculate_compat_instance_id(vmsd->name) : instance_id;
 850            instance_id = VMSTATE_INSTANCE_ID_ANY;
 851        }
 852    }
 853    pstrcat(se->idstr, sizeof(se->idstr), vmsd->name);
 854
 855    if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
 856        se->instance_id = calculate_new_instance_id(se->idstr);
 857    } else {
 858        se->instance_id = instance_id;
 859    }
 860    assert(!se->compat || se->instance_id == 0);
 861    savevm_state_handler_insert(se);
 862    return 0;
 863}
 864
 865void vmstate_unregister(VMStateIf *obj, const VMStateDescription *vmsd,
 866                        void *opaque)
 867{
 868    SaveStateEntry *se, *new_se;
 869
 870    QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
 871        if (se->vmsd == vmsd && se->opaque == opaque) {
 872            savevm_state_handler_remove(se);
 873            g_free(se->compat);
 874            g_free(se);
 875        }
 876    }
 877}
 878
 879static int vmstate_load(QEMUFile *f, SaveStateEntry *se)
 880{
 881    trace_vmstate_load(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
 882    if (!se->vmsd) {         /* Old style */
 883        return se->ops->load_state(f, se->opaque, se->load_version_id);
 884    }
 885    return vmstate_load_state(f, se->vmsd, se->opaque, se->load_version_id);
 886}
 887
 888static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se, QJSON *vmdesc)
 889{
 890    int64_t old_offset, size;
 891
 892    old_offset = qemu_ftell_fast(f);
 893    se->ops->save_state(f, se->opaque);
 894    size = qemu_ftell_fast(f) - old_offset;
 895
 896    if (vmdesc) {
 897        json_prop_int(vmdesc, "size", size);
 898        json_start_array(vmdesc, "fields");
 899        json_start_object(vmdesc, NULL);
 900        json_prop_str(vmdesc, "name", "data");
 901        json_prop_int(vmdesc, "size", size);
 902        json_prop_str(vmdesc, "type", "buffer");
 903        json_end_object(vmdesc);
 904        json_end_array(vmdesc);
 905    }
 906}
 907
 908static int vmstate_save(QEMUFile *f, SaveStateEntry *se, QJSON *vmdesc)
 909{
 910    trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
 911    if (!se->vmsd) {
 912        vmstate_save_old_style(f, se, vmdesc);
 913        return 0;
 914    }
 915    return vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
 916}
 917
 918/*
 919 * Write the header for device section (QEMU_VM_SECTION START/END/PART/FULL)
 920 */
 921static void save_section_header(QEMUFile *f, SaveStateEntry *se,
 922                                uint8_t section_type)
 923{
 924    qemu_put_byte(f, section_type);
 925    qemu_put_be32(f, se->section_id);
 926
 927    if (section_type == QEMU_VM_SECTION_FULL ||
 928        section_type == QEMU_VM_SECTION_START) {
 929        /* ID string */
 930        size_t len = strlen(se->idstr);
 931        qemu_put_byte(f, len);
 932        qemu_put_buffer(f, (uint8_t *)se->idstr, len);
 933
 934        qemu_put_be32(f, se->instance_id);
 935        qemu_put_be32(f, se->version_id);
 936    }
 937}
 938
 939/*
 940 * Write a footer onto device sections that catches cases misformatted device
 941 * sections.
 942 */
 943static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
 944{
 945    if (migrate_get_current()->send_section_footer) {
 946        qemu_put_byte(f, QEMU_VM_SECTION_FOOTER);
 947        qemu_put_be32(f, se->section_id);
 948    }
 949}
 950
 951/**
 952 * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
 953 *                           command and associated data.
 954 *
 955 * @f: File to send command on
 956 * @command: Command type to send
 957 * @len: Length of associated data
 958 * @data: Data associated with command.
 959 */
 960static void qemu_savevm_command_send(QEMUFile *f,
 961                                     enum qemu_vm_cmd command,
 962                                     uint16_t len,
 963                                     uint8_t *data)
 964{
 965    trace_savevm_command_send(command, len);
 966    qemu_put_byte(f, QEMU_VM_COMMAND);
 967    qemu_put_be16(f, (uint16_t)command);
 968    qemu_put_be16(f, len);
 969    qemu_put_buffer(f, data, len);
 970    qemu_fflush(f);
 971}
 972
 973void qemu_savevm_send_colo_enable(QEMUFile *f)
 974{
 975    trace_savevm_send_colo_enable();
 976    qemu_savevm_command_send(f, MIG_CMD_ENABLE_COLO, 0, NULL);
 977}
 978
 979void qemu_savevm_send_ping(QEMUFile *f, uint32_t value)
 980{
 981    uint32_t buf;
 982
 983    trace_savevm_send_ping(value);
 984    buf = cpu_to_be32(value);
 985    qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf);
 986}
 987
 988void qemu_savevm_send_open_return_path(QEMUFile *f)
 989{
 990    trace_savevm_send_open_return_path();
 991    qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL);
 992}
 993
 994/* We have a buffer of data to send; we don't want that all to be loaded
 995 * by the command itself, so the command contains just the length of the
 996 * extra buffer that we then send straight after it.
 997 * TODO: Must be a better way to organise that
 998 *
 999 * Returns:
1000 *    0 on success
1001 *    -ve on error
1002 */
1003int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len)
1004{
1005    uint32_t tmp;
1006
1007    if (len > MAX_VM_CMD_PACKAGED_SIZE) {
1008        error_report("%s: Unreasonably large packaged state: %zu",
1009                     __func__, len);
1010        return -1;
1011    }
1012
1013    tmp = cpu_to_be32(len);
1014
1015    trace_qemu_savevm_send_packaged();
1016    qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp);
1017
1018    qemu_put_buffer(f, buf, len);
1019
1020    return 0;
1021}
1022
1023/* Send prior to any postcopy transfer */
1024void qemu_savevm_send_postcopy_advise(QEMUFile *f)
1025{
1026    if (migrate_postcopy_ram()) {
1027        uint64_t tmp[2];
1028        tmp[0] = cpu_to_be64(ram_pagesize_summary());
1029        tmp[1] = cpu_to_be64(qemu_target_page_size());
1030
1031        trace_qemu_savevm_send_postcopy_advise();
1032        qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE,
1033                                 16, (uint8_t *)tmp);
1034    } else {
1035        qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 0, NULL);
1036    }
1037}
1038
1039/* Sent prior to starting the destination running in postcopy, discard pages
1040 * that have already been sent but redirtied on the source.
1041 * CMD_POSTCOPY_RAM_DISCARD consist of:
1042 *      byte   version (0)
1043 *      byte   Length of name field (not including 0)
1044 *  n x byte   RAM block name
1045 *      byte   0 terminator (just for safety)
1046 *  n x        Byte ranges within the named RAMBlock
1047 *      be64   Start of the range
1048 *      be64   Length
1049 *
1050 *  name:  RAMBlock name that these entries are part of
1051 *  len: Number of page entries
1052 *  start_list: 'len' addresses
1053 *  length_list: 'len' addresses
1054 *
1055 */
1056void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
1057                                           uint16_t len,
1058                                           uint64_t *start_list,
1059                                           uint64_t *length_list)
1060{
1061    uint8_t *buf;
1062    uint16_t tmplen;
1063    uint16_t t;
1064    size_t name_len = strlen(name);
1065
1066    trace_qemu_savevm_send_postcopy_ram_discard(name, len);
1067    assert(name_len < 256);
1068    buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len);
1069    buf[0] = postcopy_ram_discard_version;
1070    buf[1] = name_len;
1071    memcpy(buf + 2, name, name_len);
1072    tmplen = 2 + name_len;
1073    buf[tmplen++] = '\0';
1074
1075    for (t = 0; t < len; t++) {
1076        stq_be_p(buf + tmplen, start_list[t]);
1077        tmplen += 8;
1078        stq_be_p(buf + tmplen, length_list[t]);
1079        tmplen += 8;
1080    }
1081    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
1082    g_free(buf);
1083}
1084
1085/* Get the destination into a state where it can receive postcopy data. */
1086void qemu_savevm_send_postcopy_listen(QEMUFile *f)
1087{
1088    trace_savevm_send_postcopy_listen();
1089    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
1090}
1091
1092/* Kick the destination into running */
1093void qemu_savevm_send_postcopy_run(QEMUFile *f)
1094{
1095    trace_savevm_send_postcopy_run();
1096    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
1097}
1098
1099void qemu_savevm_send_postcopy_resume(QEMUFile *f)
1100{
1101    trace_savevm_send_postcopy_resume();
1102    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RESUME, 0, NULL);
1103}
1104
1105void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name)
1106{
1107    size_t len;
1108    char buf[256];
1109
1110    trace_savevm_send_recv_bitmap(block_name);
1111
1112    buf[0] = len = strlen(block_name);
1113    memcpy(buf + 1, block_name, len);
1114
1115    qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf);
1116}
1117
1118bool qemu_savevm_state_blocked(Error **errp)
1119{
1120    SaveStateEntry *se;
1121
1122    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1123        if (se->vmsd && se->vmsd->unmigratable) {
1124            error_setg(errp, "State blocked by non-migratable device '%s'",
1125                       se->idstr);
1126            return true;
1127        }
1128    }
1129    return false;
1130}
1131
1132void qemu_savevm_state_header(QEMUFile *f)
1133{
1134    trace_savevm_state_header();
1135    qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1136    qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1137
1138    if (migrate_get_current()->send_configuration) {
1139        qemu_put_byte(f, QEMU_VM_CONFIGURATION);
1140        vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
1141    }
1142}
1143
1144bool qemu_savevm_state_guest_unplug_pending(void)
1145{
1146    SaveStateEntry *se;
1147
1148    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1149        if (se->vmsd && se->vmsd->dev_unplug_pending &&
1150            se->vmsd->dev_unplug_pending(se->opaque)) {
1151            return true;
1152        }
1153    }
1154
1155    return false;
1156}
1157
1158void qemu_savevm_state_setup(QEMUFile *f)
1159{
1160    SaveStateEntry *se;
1161    Error *local_err = NULL;
1162    int ret;
1163
1164    trace_savevm_state_setup();
1165    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1166        if (!se->ops || !se->ops->save_setup) {
1167            continue;
1168        }
1169        if (se->ops->is_active) {
1170            if (!se->ops->is_active(se->opaque)) {
1171                continue;
1172            }
1173        }
1174        save_section_header(f, se, QEMU_VM_SECTION_START);
1175
1176        ret = se->ops->save_setup(f, se->opaque);
1177        save_section_footer(f, se);
1178        if (ret < 0) {
1179            qemu_file_set_error(f, ret);
1180            break;
1181        }
1182    }
1183
1184    if (precopy_notify(PRECOPY_NOTIFY_SETUP, &local_err)) {
1185        error_report_err(local_err);
1186    }
1187}
1188
1189int qemu_savevm_state_resume_prepare(MigrationState *s)
1190{
1191    SaveStateEntry *se;
1192    int ret;
1193
1194    trace_savevm_state_resume_prepare();
1195
1196    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1197        if (!se->ops || !se->ops->resume_prepare) {
1198            continue;
1199        }
1200        if (se->ops->is_active) {
1201            if (!se->ops->is_active(se->opaque)) {
1202                continue;
1203            }
1204        }
1205        ret = se->ops->resume_prepare(s, se->opaque);
1206        if (ret < 0) {
1207            return ret;
1208        }
1209    }
1210
1211    return 0;
1212}
1213
1214/*
1215 * this function has three return values:
1216 *   negative: there was one error, and we have -errno.
1217 *   0 : We haven't finished, caller have to go again
1218 *   1 : We have finished, we can go to complete phase
1219 */
1220int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
1221{
1222    SaveStateEntry *se;
1223    int ret = 1;
1224
1225    trace_savevm_state_iterate();
1226    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1227        if (!se->ops || !se->ops->save_live_iterate) {
1228            continue;
1229        }
1230        if (se->ops->is_active &&
1231            !se->ops->is_active(se->opaque)) {
1232            continue;
1233        }
1234        if (se->ops->is_active_iterate &&
1235            !se->ops->is_active_iterate(se->opaque)) {
1236            continue;
1237        }
1238        /*
1239         * In the postcopy phase, any device that doesn't know how to
1240         * do postcopy should have saved it's state in the _complete
1241         * call that's already run, it might get confused if we call
1242         * iterate afterwards.
1243         */
1244        if (postcopy &&
1245            !(se->ops->has_postcopy && se->ops->has_postcopy(se->opaque))) {
1246            continue;
1247        }
1248        if (qemu_file_rate_limit(f)) {
1249            return 0;
1250        }
1251        trace_savevm_section_start(se->idstr, se->section_id);
1252
1253        save_section_header(f, se, QEMU_VM_SECTION_PART);
1254
1255        ret = se->ops->save_live_iterate(f, se->opaque);
1256        trace_savevm_section_end(se->idstr, se->section_id, ret);
1257        save_section_footer(f, se);
1258
1259        if (ret < 0) {
1260            error_report("failed to save SaveStateEntry with id(name): %d(%s)",
1261                         se->section_id, se->idstr);
1262            qemu_file_set_error(f, ret);
1263        }
1264        if (ret <= 0) {
1265            /* Do not proceed to the next vmstate before this one reported
1266               completion of the current stage. This serializes the migration
1267               and reduces the probability that a faster changing state is
1268               synchronized over and over again. */
1269            break;
1270        }
1271    }
1272    return ret;
1273}
1274
1275static bool should_send_vmdesc(void)
1276{
1277    MachineState *machine = MACHINE(qdev_get_machine());
1278    bool in_postcopy = migration_in_postcopy();
1279    return !machine->suppress_vmdesc && !in_postcopy;
1280}
1281
1282/*
1283 * Calls the save_live_complete_postcopy methods
1284 * causing the last few pages to be sent immediately and doing any associated
1285 * cleanup.
1286 * Note postcopy also calls qemu_savevm_state_complete_precopy to complete
1287 * all the other devices, but that happens at the point we switch to postcopy.
1288 */
1289void qemu_savevm_state_complete_postcopy(QEMUFile *f)
1290{
1291    SaveStateEntry *se;
1292    int ret;
1293
1294    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1295        if (!se->ops || !se->ops->save_live_complete_postcopy) {
1296            continue;
1297        }
1298        if (se->ops->is_active) {
1299            if (!se->ops->is_active(se->opaque)) {
1300                continue;
1301            }
1302        }
1303        trace_savevm_section_start(se->idstr, se->section_id);
1304        /* Section type */
1305        qemu_put_byte(f, QEMU_VM_SECTION_END);
1306        qemu_put_be32(f, se->section_id);
1307
1308        ret = se->ops->save_live_complete_postcopy(f, se->opaque);
1309        trace_savevm_section_end(se->idstr, se->section_id, ret);
1310        save_section_footer(f, se);
1311        if (ret < 0) {
1312            qemu_file_set_error(f, ret);
1313            return;
1314        }
1315    }
1316
1317    qemu_put_byte(f, QEMU_VM_EOF);
1318    qemu_fflush(f);
1319}
1320
1321static
1322int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
1323{
1324    SaveStateEntry *se;
1325    int ret;
1326
1327    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1328        if (!se->ops ||
1329            (in_postcopy && se->ops->has_postcopy &&
1330             se->ops->has_postcopy(se->opaque)) ||
1331            !se->ops->save_live_complete_precopy) {
1332            continue;
1333        }
1334
1335        if (se->ops->is_active) {
1336            if (!se->ops->is_active(se->opaque)) {
1337                continue;
1338            }
1339        }
1340        trace_savevm_section_start(se->idstr, se->section_id);
1341
1342        save_section_header(f, se, QEMU_VM_SECTION_END);
1343
1344        ret = se->ops->save_live_complete_precopy(f, se->opaque);
1345        trace_savevm_section_end(se->idstr, se->section_id, ret);
1346        save_section_footer(f, se);
1347        if (ret < 0) {
1348            qemu_file_set_error(f, ret);
1349            return -1;
1350        }
1351    }
1352
1353    return 0;
1354}
1355
1356static
1357int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
1358                                                    bool in_postcopy,
1359                                                    bool inactivate_disks)
1360{
1361    g_autoptr(QJSON) vmdesc = NULL;
1362    int vmdesc_len;
1363    SaveStateEntry *se;
1364    int ret;
1365
1366    vmdesc = qjson_new();
1367    json_prop_int(vmdesc, "page_size", qemu_target_page_size());
1368    json_start_array(vmdesc, "devices");
1369    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1370
1371        if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1372            continue;
1373        }
1374        if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1375            trace_savevm_section_skip(se->idstr, se->section_id);
1376            continue;
1377        }
1378
1379        trace_savevm_section_start(se->idstr, se->section_id);
1380
1381        json_start_object(vmdesc, NULL);
1382        json_prop_str(vmdesc, "name", se->idstr);
1383        json_prop_int(vmdesc, "instance_id", se->instance_id);
1384
1385        save_section_header(f, se, QEMU_VM_SECTION_FULL);
1386        ret = vmstate_save(f, se, vmdesc);
1387        if (ret) {
1388            qemu_file_set_error(f, ret);
1389            return ret;
1390        }
1391        trace_savevm_section_end(se->idstr, se->section_id, 0);
1392        save_section_footer(f, se);
1393
1394        json_end_object(vmdesc);
1395    }
1396
1397    if (inactivate_disks) {
1398        /* Inactivate before sending QEMU_VM_EOF so that the
1399         * bdrv_invalidate_cache_all() on the other end won't fail. */
1400        ret = bdrv_inactivate_all();
1401        if (ret) {
1402            error_report("%s: bdrv_inactivate_all() failed (%d)",
1403                         __func__, ret);
1404            qemu_file_set_error(f, ret);
1405            return ret;
1406        }
1407    }
1408    if (!in_postcopy) {
1409        /* Postcopy stream will still be going */
1410        qemu_put_byte(f, QEMU_VM_EOF);
1411    }
1412
1413    json_end_array(vmdesc);
1414    qjson_finish(vmdesc);
1415    vmdesc_len = strlen(qjson_get_str(vmdesc));
1416
1417    if (should_send_vmdesc()) {
1418        qemu_put_byte(f, QEMU_VM_VMDESCRIPTION);
1419        qemu_put_be32(f, vmdesc_len);
1420        qemu_put_buffer(f, (uint8_t *)qjson_get_str(vmdesc), vmdesc_len);
1421    }
1422
1423    return 0;
1424}
1425
1426int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
1427                                       bool inactivate_disks)
1428{
1429    int ret;
1430    Error *local_err = NULL;
1431    bool in_postcopy = migration_in_postcopy();
1432
1433    if (precopy_notify(PRECOPY_NOTIFY_COMPLETE, &local_err)) {
1434        error_report_err(local_err);
1435    }
1436
1437    trace_savevm_state_complete_precopy();
1438
1439    cpu_synchronize_all_states();
1440
1441    if (!in_postcopy || iterable_only) {
1442        ret = qemu_savevm_state_complete_precopy_iterable(f, in_postcopy);
1443        if (ret) {
1444            return ret;
1445        }
1446    }
1447
1448    if (iterable_only) {
1449        goto flush;
1450    }
1451
1452    ret = qemu_savevm_state_complete_precopy_non_iterable(f, in_postcopy,
1453                                                          inactivate_disks);
1454    if (ret) {
1455        return ret;
1456    }
1457
1458flush:
1459    qemu_fflush(f);
1460    return 0;
1461}
1462
1463/* Give an estimate of the amount left to be transferred,
1464 * the result is split into the amount for units that can and
1465 * for units that can't do postcopy.
1466 */
1467void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
1468                               uint64_t *res_precopy_only,
1469                               uint64_t *res_compatible,
1470                               uint64_t *res_postcopy_only)
1471{
1472    SaveStateEntry *se;
1473
1474    *res_precopy_only = 0;
1475    *res_compatible = 0;
1476    *res_postcopy_only = 0;
1477
1478
1479    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1480        if (!se->ops || !se->ops->save_live_pending) {
1481            continue;
1482        }
1483        if (se->ops->is_active) {
1484            if (!se->ops->is_active(se->opaque)) {
1485                continue;
1486            }
1487        }
1488        se->ops->save_live_pending(f, se->opaque, threshold_size,
1489                                   res_precopy_only, res_compatible,
1490                                   res_postcopy_only);
1491    }
1492}
1493
1494void qemu_savevm_state_cleanup(void)
1495{
1496    SaveStateEntry *se;
1497    Error *local_err = NULL;
1498
1499    if (precopy_notify(PRECOPY_NOTIFY_CLEANUP, &local_err)) {
1500        error_report_err(local_err);
1501    }
1502
1503    trace_savevm_state_cleanup();
1504    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1505        if (se->ops && se->ops->save_cleanup) {
1506            se->ops->save_cleanup(se->opaque);
1507        }
1508    }
1509}
1510
1511static int qemu_savevm_state(QEMUFile *f, Error **errp)
1512{
1513    int ret;
1514    MigrationState *ms = migrate_get_current();
1515    MigrationStatus status;
1516
1517    if (migration_is_running(ms->state)) {
1518        error_setg(errp, QERR_MIGRATION_ACTIVE);
1519        return -EINVAL;
1520    }
1521
1522    if (migrate_use_block()) {
1523        error_setg(errp, "Block migration and snapshots are incompatible");
1524        return -EINVAL;
1525    }
1526
1527    migrate_init(ms);
1528    memset(&ram_counters, 0, sizeof(ram_counters));
1529    ms->to_dst_file = f;
1530
1531    qemu_mutex_unlock_iothread();
1532    qemu_savevm_state_header(f);
1533    qemu_savevm_state_setup(f);
1534    qemu_mutex_lock_iothread();
1535
1536    while (qemu_file_get_error(f) == 0) {
1537        if (qemu_savevm_state_iterate(f, false) > 0) {
1538            break;
1539        }
1540    }
1541
1542    ret = qemu_file_get_error(f);
1543    if (ret == 0) {
1544        qemu_savevm_state_complete_precopy(f, false, false);
1545        ret = qemu_file_get_error(f);
1546    }
1547    qemu_savevm_state_cleanup();
1548    if (ret != 0) {
1549        error_setg_errno(errp, -ret, "Error while writing VM state");
1550    }
1551
1552    if (ret != 0) {
1553        status = MIGRATION_STATUS_FAILED;
1554    } else {
1555        status = MIGRATION_STATUS_COMPLETED;
1556    }
1557    migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, status);
1558
1559    /* f is outer parameter, it should not stay in global migration state after
1560     * this function finished */
1561    ms->to_dst_file = NULL;
1562
1563    return ret;
1564}
1565
1566void qemu_savevm_live_state(QEMUFile *f)
1567{
1568    /* save QEMU_VM_SECTION_END section */
1569    qemu_savevm_state_complete_precopy(f, true, false);
1570    qemu_put_byte(f, QEMU_VM_EOF);
1571}
1572
1573int qemu_save_device_state(QEMUFile *f)
1574{
1575    SaveStateEntry *se;
1576
1577    if (!migration_in_colo_state()) {
1578        qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1579        qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1580    }
1581    cpu_synchronize_all_states();
1582
1583    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1584        int ret;
1585
1586        if (se->is_ram) {
1587            continue;
1588        }
1589        if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1590            continue;
1591        }
1592        if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1593            continue;
1594        }
1595
1596        save_section_header(f, se, QEMU_VM_SECTION_FULL);
1597
1598        ret = vmstate_save(f, se, NULL);
1599        if (ret) {
1600            return ret;
1601        }
1602
1603        save_section_footer(f, se);
1604    }
1605
1606    qemu_put_byte(f, QEMU_VM_EOF);
1607
1608    return qemu_file_get_error(f);
1609}
1610
1611static SaveStateEntry *find_se(const char *idstr, uint32_t instance_id)
1612{
1613    SaveStateEntry *se;
1614
1615    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1616        if (!strcmp(se->idstr, idstr) &&
1617            (instance_id == se->instance_id ||
1618             instance_id == se->alias_id))
1619            return se;
1620        /* Migrating from an older version? */
1621        if (strstr(se->idstr, idstr) && se->compat) {
1622            if (!strcmp(se->compat->idstr, idstr) &&
1623                (instance_id == se->compat->instance_id ||
1624                 instance_id == se->alias_id))
1625                return se;
1626        }
1627    }
1628    return NULL;
1629}
1630
/* Positive, non-error exit codes for the loadvm loops. */
enum LoadVMExitCodes {
    /* Allow a command to quit all layers of nested loadvm loops */
    LOADVM_QUIT = 1,
};
1635
1636/* ------ incoming postcopy messages ------ */
1637/* 'advise' arrives before any transfers just to tell us that a postcopy
1638 * *might* happen - it might be skipped if precopy transferred everything
1639 * quickly.
1640 */
1641static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
1642                                         uint16_t len)
1643{
1644    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
1645    uint64_t remote_pagesize_summary, local_pagesize_summary, remote_tps;
1646    Error *local_err = NULL;
1647
1648    trace_loadvm_postcopy_handle_advise();
1649    if (ps != POSTCOPY_INCOMING_NONE) {
1650        error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
1651        return -1;
1652    }
1653
1654    switch (len) {
1655    case 0:
1656        if (migrate_postcopy_ram()) {
1657            error_report("RAM postcopy is enabled but have 0 byte advise");
1658            return -EINVAL;
1659        }
1660        return 0;
1661    case 8 + 8:
1662        if (!migrate_postcopy_ram()) {
1663            error_report("RAM postcopy is disabled but have 16 byte advise");
1664            return -EINVAL;
1665        }
1666        break;
1667    default:
1668        error_report("CMD_POSTCOPY_ADVISE invalid length (%d)", len);
1669        return -EINVAL;
1670    }
1671
1672    if (!postcopy_ram_supported_by_host(mis)) {
1673        postcopy_state_set(POSTCOPY_INCOMING_NONE);
1674        return -1;
1675    }
1676
1677    remote_pagesize_summary = qemu_get_be64(mis->from_src_file);
1678    local_pagesize_summary = ram_pagesize_summary();
1679
1680    if (remote_pagesize_summary != local_pagesize_summary)  {
1681        /*
1682         * This detects two potential causes of mismatch:
1683         *   a) A mismatch in host page sizes
1684         *      Some combinations of mismatch are probably possible but it gets
1685         *      a bit more complicated.  In particular we need to place whole
1686         *      host pages on the dest at once, and we need to ensure that we
1687         *      handle dirtying to make sure we never end up sending part of
1688         *      a hostpage on it's own.
1689         *   b) The use of different huge page sizes on source/destination
1690         *      a more fine grain test is performed during RAM block migration
1691         *      but this test here causes a nice early clear failure, and
1692         *      also fails when passed to an older qemu that doesn't
1693         *      do huge pages.
1694         */
1695        error_report("Postcopy needs matching RAM page sizes (s=%" PRIx64
1696                                                             " d=%" PRIx64 ")",
1697                     remote_pagesize_summary, local_pagesize_summary);
1698        return -1;
1699    }
1700
1701    remote_tps = qemu_get_be64(mis->from_src_file);
1702    if (remote_tps != qemu_target_page_size()) {
1703        /*
1704         * Again, some differences could be dealt with, but for now keep it
1705         * simple.
1706         */
1707        error_report("Postcopy needs matching target page sizes (s=%d d=%zd)",
1708                     (int)remote_tps, qemu_target_page_size());
1709        return -1;
1710    }
1711
1712    if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_ADVISE, &local_err)) {
1713        error_report_err(local_err);
1714        return -1;
1715    }
1716
1717    if (ram_postcopy_incoming_init(mis)) {
1718        return -1;
1719    }
1720
1721    return 0;
1722}
1723
1724/* After postcopy we will be told to throw some pages away since they're
1725 * dirty and will have to be demand fetched.  Must happen before CPU is
1726 * started.
1727 * There can be 0..many of these messages, each encoding multiple pages.
1728 */
1729static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
1730                                              uint16_t len)
1731{
1732    int tmp;
1733    char ramid[256];
1734    PostcopyState ps = postcopy_state_get();
1735
1736    trace_loadvm_postcopy_ram_handle_discard();
1737
1738    switch (ps) {
1739    case POSTCOPY_INCOMING_ADVISE:
1740        /* 1st discard */
1741        tmp = postcopy_ram_prepare_discard(mis);
1742        if (tmp) {
1743            return tmp;
1744        }
1745        break;
1746
1747    case POSTCOPY_INCOMING_DISCARD:
1748        /* Expected state */
1749        break;
1750
1751    default:
1752        error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
1753                     ps);
1754        return -1;
1755    }
1756    /* We're expecting a
1757     *    Version (0)
1758     *    a RAM ID string (length byte, name, 0 term)
1759     *    then at least 1 16 byte chunk
1760    */
1761    if (len < (1 + 1 + 1 + 1 + 2 * 8)) {
1762        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1763        return -1;
1764    }
1765
1766    tmp = qemu_get_byte(mis->from_src_file);
1767    if (tmp != postcopy_ram_discard_version) {
1768        error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
1769        return -1;
1770    }
1771
1772    if (!qemu_get_counted_string(mis->from_src_file, ramid)) {
1773        error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
1774        return -1;
1775    }
1776    tmp = qemu_get_byte(mis->from_src_file);
1777    if (tmp != 0) {
1778        error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp);
1779        return -1;
1780    }
1781
1782    len -= 3 + strlen(ramid);
1783    if (len % 16) {
1784        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1785        return -1;
1786    }
1787    trace_loadvm_postcopy_ram_handle_discard_header(ramid, len);
1788    while (len) {
1789        uint64_t start_addr, block_length;
1790        start_addr = qemu_get_be64(mis->from_src_file);
1791        block_length = qemu_get_be64(mis->from_src_file);
1792
1793        len -= 16;
1794        int ret = ram_discard_range(ramid, start_addr, block_length);
1795        if (ret) {
1796            return ret;
1797        }
1798    }
1799    trace_loadvm_postcopy_ram_handle_discard_end();
1800
1801    return 0;
1802}
1803
/*
 * Triggered by a postcopy_listen command; this thread takes over reading
 * the input stream, leaving the main thread free to carry on loading the rest
 * of the device state (from RAM).
 * (TODO:This could do with being in a postcopy file - but there again it's
 * just another input loop, not that postcopy specific)
 *
 * @opaque is not used.  The thread returns NULL on the success path; on an
 * unrecoverable load failure it terminates the process with
 * exit(EXIT_FAILURE).
 */
static void *postcopy_ram_listen_thread(void *opaque)
{
    MigrationIncomingState *mis = migration_incoming_get_current();
    QEMUFile *f = mis->from_src_file;
    int load_res;
    MigrationState *migr = migrate_get_current();

    /* Hold a reference on the migration object; dropped before returning */
    object_ref(OBJECT(migr));

    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);
    /* Signal listen_thread_sem now that the state has been switched */
    qemu_sem_post(&mis->listen_thread_sem);
    trace_postcopy_ram_listen_thread_start();

    rcu_register_thread();
    /*
     * Because we're a thread and not a coroutine we can't yield
     * in qemu_file, and thus we must be blocking now.
     */
    qemu_file_set_blocking(f, true);
    load_res = qemu_loadvm_state_main(f, mis);

    /*
     * This is tricky, but, mis->from_src_file can change after it
     * returns, when postcopy recovery happened. In the future, we may
     * want a wrapper for the QEMUFile handle.
     */
    f = mis->from_src_file;

    /* And non-blocking again so we don't block in any cleanup */
    qemu_file_set_blocking(f, false);

    trace_postcopy_ram_listen_thread_exit();
    if (load_res < 0) {
        qemu_file_set_error(f, load_res);
        dirty_bitmap_mig_cancel_incoming();
        /*
         * A failure while already RUNNING, with RAM postcopy off and dirty
         * bitmap migration on, is downgraded to a warning: load_res is
         * reset to 0 so the success path below is taken.
         */
        if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
            !migrate_postcopy_ram() && migrate_dirty_bitmaps())
        {
            error_report("%s: loadvm failed during postcopy: %d. All states "
                         "are migrated except dirty bitmaps. Some dirty "
                         "bitmaps may be lost, and present migrated dirty "
                         "bitmaps are correctly migrated and valid.",
                         __func__, load_res);
            load_res = 0; /* prevent further exit() */
        } else {
            error_report("%s: loadvm failed: %d", __func__, load_res);
            migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                                           MIGRATION_STATUS_FAILED);
        }
    }
    if (load_res >= 0) {
        /*
         * This looks good, but it's possible that the device loading in the
         * main thread hasn't finished yet, and so we might not be in 'RUN'
         * state yet; wait for the end of the main thread.
         */
        qemu_event_wait(&mis->main_thread_load_event);
    }
    /* Runs on both the success and the failure path */
    postcopy_ram_incoming_cleanup(mis);

    if (load_res < 0) {
        /*
         * If something went wrong then we have a bad state so exit;
         * depending how far we got it might be possible at this point
         * to leave the guest running and fire MCEs for pages that never
         * arrived as a desperate recovery step.
         */
        rcu_unregister_thread();
        exit(EXIT_FAILURE);
    }

    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
                                   MIGRATION_STATUS_COMPLETED);
    /*
     * If everything has worked fine, then the main thread has waited
     * for us to start, and we're the last use of the mis.
     * (If something broke then qemu will have to exit anyway since it's
     * got a bad migration state).
     */
    migration_incoming_state_destroy();
    qemu_loadvm_state_cleanup();

    rcu_unregister_thread();
    /*
     * NOTE(review): mis is still written after
     * migration_incoming_state_destroy(); presumably the object itself
     * outlives that call - confirm against its definition.
     */
    mis->have_listen_thread = false;
    postcopy_state_set(POSTCOPY_INCOMING_END);

    object_unref(OBJECT(migr));

    return NULL;
}
1902
1903/* After this message we must be able to immediately receive postcopy data */
1904static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
1905{
1906    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
1907    trace_loadvm_postcopy_handle_listen();
1908    Error *local_err = NULL;
1909
1910    if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
1911        error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
1912        return -1;
1913    }
1914    if (ps == POSTCOPY_INCOMING_ADVISE) {
1915        /*
1916         * A rare case, we entered listen without having to do any discards,
1917         * so do the setup that's normally done at the time of the 1st discard.
1918         */
1919        if (migrate_postcopy_ram()) {
1920            postcopy_ram_prepare_discard(mis);
1921        }
1922    }
1923
1924    /*
1925     * Sensitise RAM - can now generate requests for blocks that don't exist
1926     * However, at this point the CPU shouldn't be running, and the IO
1927     * shouldn't be doing anything yet so don't actually expect requests
1928     */
1929    if (migrate_postcopy_ram()) {
1930        if (postcopy_ram_incoming_setup(mis)) {
1931            postcopy_ram_incoming_cleanup(mis);
1932            return -1;
1933        }
1934    }
1935
1936    if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_LISTEN, &local_err)) {
1937        error_report_err(local_err);
1938        return -1;
1939    }
1940
1941    mis->have_listen_thread = true;
1942    /* Start up the listening thread and wait for it to signal ready */
1943    qemu_sem_init(&mis->listen_thread_sem, 0);
1944    qemu_thread_create(&mis->listen_thread, "postcopy/listen",
1945                       postcopy_ram_listen_thread, NULL,
1946                       QEMU_THREAD_DETACHED);
1947    qemu_sem_wait(&mis->listen_thread_sem);
1948    qemu_sem_destroy(&mis->listen_thread_sem);
1949
1950    return 0;
1951}
1952
1953static void loadvm_postcopy_handle_run_bh(void *opaque)
1954{
1955    Error *local_err = NULL;
1956    MigrationIncomingState *mis = opaque;
1957
1958    /* TODO we should move all of this lot into postcopy_ram.c or a shared code
1959     * in migration.c
1960     */
1961    cpu_synchronize_all_post_init();
1962
1963    qemu_announce_self(&mis->announce_timer, migrate_announce_params());
1964
1965    /* Make sure all file formats flush their mutable metadata.
1966     * If we get an error here, just don't restart the VM yet. */
1967    bdrv_invalidate_cache_all(&local_err);
1968    if (local_err) {
1969        error_report_err(local_err);
1970        local_err = NULL;
1971        autostart = false;
1972    }
1973
1974    trace_loadvm_postcopy_handle_run_cpu_sync();
1975
1976    trace_loadvm_postcopy_handle_run_vmstart();
1977
1978    dirty_bitmap_mig_before_vm_start();
1979
1980    if (autostart) {
1981        /* Hold onto your hats, starting the CPU */
1982        vm_start();
1983    } else {
1984        /* leave it paused and let management decide when to start the CPU */
1985        runstate_set(RUN_STATE_PAUSED);
1986    }
1987
1988    qemu_bh_delete(mis->bh);
1989}
1990
1991/* After all discards we can start running and asking for pages */
1992static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
1993{
1994    PostcopyState ps = postcopy_state_get();
1995
1996    trace_loadvm_postcopy_handle_run();
1997    if (ps != POSTCOPY_INCOMING_LISTENING) {
1998        error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
1999        return -1;
2000    }
2001
2002    postcopy_state_set(POSTCOPY_INCOMING_RUNNING);
2003    mis->bh = qemu_bh_new(loadvm_postcopy_handle_run_bh, mis);
2004    qemu_bh_schedule(mis->bh);
2005
2006    /* We need to finish reading the stream from the package
2007     * and also stop reading anything more from the stream that loaded the
2008     * package (since it's now being read by the listener thread).
2009     * LOADVM_QUIT will quit all the layers of nested loadvm loops.
2010     */
2011    return LOADVM_QUIT;
2012}
2013
2014static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis)
2015{
2016    if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
2017        error_report("%s: illegal resume received", __func__);
2018        /* Don't fail the load, only for this. */
2019        return 0;
2020    }
2021
2022    /*
2023     * This means source VM is ready to resume the postcopy migration.
2024     * It's time to switch state and release the fault thread to
2025     * continue service page faults.
2026     */
2027    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
2028                      MIGRATION_STATUS_POSTCOPY_ACTIVE);
2029    qemu_sem_post(&mis->postcopy_pause_sem_fault);
2030
2031    trace_loadvm_postcopy_handle_resume();
2032
2033    /* Tell source that "we are ready" */
2034    migrate_send_rp_resume_ack(mis, MIGRATION_RESUME_ACK_VALUE);
2035
2036    return 0;
2037}
2038
2039/**
2040 * Immediately following this command is a blob of data containing an embedded
2041 * chunk of migration stream; read it and load it.
2042 *
2043 * @mis: Incoming state
2044 * @length: Length of packaged data to read
2045 *
2046 * Returns: Negative values on error
2047 *
2048 */
2049static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
2050{
2051    int ret;
2052    size_t length;
2053    QIOChannelBuffer *bioc;
2054
2055    length = qemu_get_be32(mis->from_src_file);
2056    trace_loadvm_handle_cmd_packaged(length);
2057
2058    if (length > MAX_VM_CMD_PACKAGED_SIZE) {
2059        error_report("Unreasonably large packaged state: %zu", length);
2060        return -1;
2061    }
2062
2063    bioc = qio_channel_buffer_new(length);
2064    qio_channel_set_name(QIO_CHANNEL(bioc), "migration-loadvm-buffer");
2065    ret = qemu_get_buffer(mis->from_src_file,
2066                          bioc->data,
2067                          length);
2068    if (ret != length) {
2069        object_unref(OBJECT(bioc));
2070        error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%zu",
2071                     ret, length);
2072        return (ret < 0) ? ret : -EAGAIN;
2073    }
2074    bioc->usage += length;
2075    trace_loadvm_handle_cmd_packaged_received(ret);
2076
2077    QEMUFile *packf = qemu_fopen_channel_input(QIO_CHANNEL(bioc));
2078
2079    ret = qemu_loadvm_state_main(packf, mis);
2080    trace_loadvm_handle_cmd_packaged_main(ret);
2081    qemu_fclose(packf);
2082    object_unref(OBJECT(bioc));
2083
2084    return ret;
2085}
2086
2087/*
2088 * Handle request that source requests for recved_bitmap on
2089 * destination. Payload format:
2090 *
2091 * len (1 byte) + ramblock_name (<255 bytes)
2092 */
2093static int loadvm_handle_recv_bitmap(MigrationIncomingState *mis,
2094                                     uint16_t len)
2095{
2096    QEMUFile *file = mis->from_src_file;
2097    RAMBlock *rb;
2098    char block_name[256];
2099    size_t cnt;
2100
2101    cnt = qemu_get_counted_string(file, block_name);
2102    if (!cnt) {
2103        error_report("%s: failed to read block name", __func__);
2104        return -EINVAL;
2105    }
2106
2107    /* Validate before using the data */
2108    if (qemu_file_get_error(file)) {
2109        return qemu_file_get_error(file);
2110    }
2111
2112    if (len != cnt + 1) {
2113        error_report("%s: invalid payload length (%d)", __func__, len);
2114        return -EINVAL;
2115    }
2116
2117    rb = qemu_ram_block_by_name(block_name);
2118    if (!rb) {
2119        error_report("%s: block '%s' not found", __func__, block_name);
2120        return -EINVAL;
2121    }
2122
2123    migrate_send_rp_recv_bitmap(mis, block_name);
2124
2125    trace_loadvm_handle_recv_bitmap(block_name);
2126
2127    return 0;
2128}
2129
2130static int loadvm_process_enable_colo(MigrationIncomingState *mis)
2131{
2132    int ret = migration_incoming_enable_colo();
2133
2134    if (!ret) {
2135        ret = colo_init_ram_cache();
2136        if (ret) {
2137            migration_incoming_disable_colo();
2138        }
2139    }
2140    return ret;
2141}
2142
2143/*
2144 * Process an incoming 'QEMU_VM_COMMAND'
2145 * 0           just a normal return
2146 * LOADVM_QUIT All good, but exit the loop
2147 * <0          Error
2148 */
2149static int loadvm_process_command(QEMUFile *f)
2150{
2151    MigrationIncomingState *mis = migration_incoming_get_current();
2152    uint16_t cmd;
2153    uint16_t len;
2154    uint32_t tmp32;
2155
2156    cmd = qemu_get_be16(f);
2157    len = qemu_get_be16(f);
2158
2159    /* Check validity before continue processing of cmds */
2160    if (qemu_file_get_error(f)) {
2161        return qemu_file_get_error(f);
2162    }
2163
2164    trace_loadvm_process_command(cmd, len);
2165    if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
2166        error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
2167        return -EINVAL;
2168    }
2169
2170    if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {
2171        error_report("%s received with bad length - expecting %zu, got %d",
2172                     mig_cmd_args[cmd].name,
2173                     (size_t)mig_cmd_args[cmd].len, len);
2174        return -ERANGE;
2175    }
2176
2177    switch (cmd) {
2178    case MIG_CMD_OPEN_RETURN_PATH:
2179        if (mis->to_src_file) {
2180            error_report("CMD_OPEN_RETURN_PATH called when RP already open");
2181            /* Not really a problem, so don't give up */
2182            return 0;
2183        }
2184        mis->to_src_file = qemu_file_get_return_path(f);
2185        if (!mis->to_src_file) {
2186            error_report("CMD_OPEN_RETURN_PATH failed");
2187            return -1;
2188        }
2189        break;
2190
2191    case MIG_CMD_PING:
2192        tmp32 = qemu_get_be32(f);
2193        trace_loadvm_process_command_ping(tmp32);
2194        if (!mis->to_src_file) {
2195            error_report("CMD_PING (0x%x) received with no return path",
2196                         tmp32);
2197            return -1;
2198        }
2199        migrate_send_rp_pong(mis, tmp32);
2200        break;
2201
2202    case MIG_CMD_PACKAGED:
2203        return loadvm_handle_cmd_packaged(mis);
2204
2205    case MIG_CMD_POSTCOPY_ADVISE:
2206        return loadvm_postcopy_handle_advise(mis, len);
2207
2208    case MIG_CMD_POSTCOPY_LISTEN:
2209        return loadvm_postcopy_handle_listen(mis);
2210
2211    case MIG_CMD_POSTCOPY_RUN:
2212        return loadvm_postcopy_handle_run(mis);
2213
2214    case MIG_CMD_POSTCOPY_RAM_DISCARD:
2215        return loadvm_postcopy_ram_handle_discard(mis, len);
2216
2217    case MIG_CMD_POSTCOPY_RESUME:
2218        return loadvm_postcopy_handle_resume(mis);
2219
2220    case MIG_CMD_RECV_BITMAP:
2221        return loadvm_handle_recv_bitmap(mis, len);
2222
2223    case MIG_CMD_ENABLE_COLO:
2224        return loadvm_process_enable_colo(mis);
2225    }
2226
2227    return 0;
2228}
2229
2230/*
2231 * Read a footer off the wire and check that it matches the expected section
2232 *
2233 * Returns: true if the footer was good
2234 *          false if there is a problem (and calls error_report to say why)
2235 */
2236static bool check_section_footer(QEMUFile *f, SaveStateEntry *se)
2237{
2238    int ret;
2239    uint8_t read_mark;
2240    uint32_t read_section_id;
2241
2242    if (!migrate_get_current()->send_section_footer) {
2243        /* No footer to check */
2244        return true;
2245    }
2246
2247    read_mark = qemu_get_byte(f);
2248
2249    ret = qemu_file_get_error(f);
2250    if (ret) {
2251        error_report("%s: Read section footer failed: %d",
2252                     __func__, ret);
2253        return false;
2254    }
2255
2256    if (read_mark != QEMU_VM_SECTION_FOOTER) {
2257        error_report("Missing section footer for %s", se->idstr);
2258        return false;
2259    }
2260
2261    read_section_id = qemu_get_be32(f);
2262    if (read_section_id != se->load_section_id) {
2263        error_report("Mismatched section id in footer for %s -"
2264                     " read 0x%x expected 0x%x",
2265                     se->idstr, read_section_id, se->load_section_id);
2266        return false;
2267    }
2268
2269    /* All good */
2270    return true;
2271}
2272
2273static int
2274qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis)
2275{
2276    uint32_t instance_id, version_id, section_id;
2277    SaveStateEntry *se;
2278    char idstr[256];
2279    int ret;
2280
2281    /* Read section start */
2282    section_id = qemu_get_be32(f);
2283    if (!qemu_get_counted_string(f, idstr)) {
2284        error_report("Unable to read ID string for section %u",
2285                     section_id);
2286        return -EINVAL;
2287    }
2288    instance_id = qemu_get_be32(f);
2289    version_id = qemu_get_be32(f);
2290
2291    ret = qemu_file_get_error(f);
2292    if (ret) {
2293        error_report("%s: Failed to read instance/version ID: %d",
2294                     __func__, ret);
2295        return ret;
2296    }
2297
2298    trace_qemu_loadvm_state_section_startfull(section_id, idstr,
2299            instance_id, version_id);
2300    /* Find savevm section */
2301    se = find_se(idstr, instance_id);
2302    if (se == NULL) {
2303        error_report("Unknown savevm section or instance '%s' %"PRIu32". "
2304                     "Make sure that your current VM setup matches your "
2305                     "saved VM setup, including any hotplugged devices",
2306                     idstr, instance_id);
2307        return -EINVAL;
2308    }
2309
2310    /* Validate version */
2311    if (version_id > se->version_id) {
2312        error_report("savevm: unsupported version %d for '%s' v%d",
2313                     version_id, idstr, se->version_id);
2314        return -EINVAL;
2315    }
2316    se->load_version_id = version_id;
2317    se->load_section_id = section_id;
2318
2319    /* Validate if it is a device's state */
2320    if (xen_enabled() && se->is_ram) {
2321        error_report("loadvm: %s RAM loading not allowed on Xen", idstr);
2322        return -EINVAL;
2323    }
2324
2325    ret = vmstate_load(f, se);
2326    if (ret < 0) {
2327        error_report("error while loading state for instance 0x%"PRIx32" of"
2328                     " device '%s'", instance_id, idstr);
2329        return ret;
2330    }
2331    if (!check_section_footer(f, se)) {
2332        return -EINVAL;
2333    }
2334
2335    return 0;
2336}
2337
2338static int
2339qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis)
2340{
2341    uint32_t section_id;
2342    SaveStateEntry *se;
2343    int ret;
2344
2345    section_id = qemu_get_be32(f);
2346
2347    ret = qemu_file_get_error(f);
2348    if (ret) {
2349        error_report("%s: Failed to read section ID: %d",
2350                     __func__, ret);
2351        return ret;
2352    }
2353
2354    trace_qemu_loadvm_state_section_partend(section_id);
2355    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2356        if (se->load_section_id == section_id) {
2357            break;
2358        }
2359    }
2360    if (se == NULL) {
2361        error_report("Unknown savevm section %d", section_id);
2362        return -EINVAL;
2363    }
2364
2365    ret = vmstate_load(f, se);
2366    if (ret < 0) {
2367        error_report("error while loading state section id %d(%s)",
2368                     section_id, se->idstr);
2369        return ret;
2370    }
2371    if (!check_section_footer(f, se)) {
2372        return -EINVAL;
2373    }
2374
2375    return 0;
2376}
2377
2378static int qemu_loadvm_state_header(QEMUFile *f)
2379{
2380    unsigned int v;
2381    int ret;
2382
2383    v = qemu_get_be32(f);
2384    if (v != QEMU_VM_FILE_MAGIC) {
2385        error_report("Not a migration stream");
2386        return -EINVAL;
2387    }
2388
2389    v = qemu_get_be32(f);
2390    if (v == QEMU_VM_FILE_VERSION_COMPAT) {
2391        error_report("SaveVM v2 format is obsolete and don't work anymore");
2392        return -ENOTSUP;
2393    }
2394    if (v != QEMU_VM_FILE_VERSION) {
2395        error_report("Unsupported migration stream version");
2396        return -ENOTSUP;
2397    }
2398
2399    if (migrate_get_current()->send_configuration) {
2400        if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
2401            error_report("Configuration section missing");
2402            qemu_loadvm_state_cleanup();
2403            return -EINVAL;
2404        }
2405        ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
2406
2407        if (ret) {
2408            qemu_loadvm_state_cleanup();
2409            return ret;
2410        }
2411    }
2412    return 0;
2413}
2414
2415static int qemu_loadvm_state_setup(QEMUFile *f)
2416{
2417    SaveStateEntry *se;
2418    int ret;
2419
2420    trace_loadvm_state_setup();
2421    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2422        if (!se->ops || !se->ops->load_setup) {
2423            continue;
2424        }
2425        if (se->ops->is_active) {
2426            if (!se->ops->is_active(se->opaque)) {
2427                continue;
2428            }
2429        }
2430
2431        ret = se->ops->load_setup(f, se->opaque);
2432        if (ret < 0) {
2433            qemu_file_set_error(f, ret);
2434            error_report("Load state of device %s failed", se->idstr);
2435            return ret;
2436        }
2437    }
2438    return 0;
2439}
2440
2441void qemu_loadvm_state_cleanup(void)
2442{
2443    SaveStateEntry *se;
2444
2445    trace_loadvm_state_cleanup();
2446    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2447        if (se->ops && se->ops->load_cleanup) {
2448            se->ops->load_cleanup(se->opaque);
2449        }
2450    }
2451}
2452
2453/* Return true if we should continue the migration, or false. */
2454static bool postcopy_pause_incoming(MigrationIncomingState *mis)
2455{
2456    trace_postcopy_pause_incoming();
2457
2458    assert(migrate_postcopy_ram());
2459
2460    /* Clear the triggered bit to allow one recovery */
2461    mis->postcopy_recover_triggered = false;
2462
2463    assert(mis->from_src_file);
2464    qemu_file_shutdown(mis->from_src_file);
2465    qemu_fclose(mis->from_src_file);
2466    mis->from_src_file = NULL;
2467
2468    assert(mis->to_src_file);
2469    qemu_file_shutdown(mis->to_src_file);
2470    qemu_mutex_lock(&mis->rp_mutex);
2471    qemu_fclose(mis->to_src_file);
2472    mis->to_src_file = NULL;
2473    qemu_mutex_unlock(&mis->rp_mutex);
2474
2475    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
2476                      MIGRATION_STATUS_POSTCOPY_PAUSED);
2477
2478    /* Notify the fault thread for the invalidated file handle */
2479    postcopy_fault_thread_notify(mis);
2480
2481    error_report("Detected IO failure for postcopy. "
2482                 "Migration paused.");
2483
2484    while (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
2485        qemu_sem_wait(&mis->postcopy_pause_sem_dst);
2486    }
2487
2488    trace_postcopy_pause_incoming_continued();
2489
2490    return true;
2491}
2492
2493int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
2494{
2495    uint8_t section_type;
2496    int ret = 0;
2497
2498retry:
2499    while (true) {
2500        section_type = qemu_get_byte(f);
2501
2502        if (qemu_file_get_error(f)) {
2503            ret = qemu_file_get_error(f);
2504            break;
2505        }
2506
2507        trace_qemu_loadvm_state_section(section_type);
2508        switch (section_type) {
2509        case QEMU_VM_SECTION_START:
2510        case QEMU_VM_SECTION_FULL:
2511            ret = qemu_loadvm_section_start_full(f, mis);
2512            if (ret < 0) {
2513                goto out;
2514            }
2515            break;
2516        case QEMU_VM_SECTION_PART:
2517        case QEMU_VM_SECTION_END:
2518            ret = qemu_loadvm_section_part_end(f, mis);
2519            if (ret < 0) {
2520                goto out;
2521            }
2522            break;
2523        case QEMU_VM_COMMAND:
2524            ret = loadvm_process_command(f);
2525            trace_qemu_loadvm_state_section_command(ret);
2526            if ((ret < 0) || (ret == LOADVM_QUIT)) {
2527                goto out;
2528            }
2529            break;
2530        case QEMU_VM_EOF:
2531            /* This is the end of migration */
2532            goto out;
2533        default:
2534            error_report("Unknown savevm section type %d", section_type);
2535            ret = -EINVAL;
2536            goto out;
2537        }
2538    }
2539
2540out:
2541    if (ret < 0) {
2542        qemu_file_set_error(f, ret);
2543
2544        /* Cancel bitmaps incoming regardless of recovery */
2545        dirty_bitmap_mig_cancel_incoming();
2546
2547        /*
2548         * If we are during an active postcopy, then we pause instead
2549         * of bail out to at least keep the VM's dirty data.  Note
2550         * that POSTCOPY_INCOMING_LISTENING stage is still not enough,
2551         * during which we're still receiving device states and we
2552         * still haven't yet started the VM on destination.
2553         *
2554         * Only RAM postcopy supports recovery. Still, if RAM postcopy is
2555         * enabled, canceled bitmaps postcopy will not affect RAM postcopy
2556         * recovering.
2557         */
2558        if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
2559            migrate_postcopy_ram() && postcopy_pause_incoming(mis)) {
2560            /* Reset f to point to the newly created channel */
2561            f = mis->from_src_file;
2562            goto retry;
2563        }
2564    }
2565    return ret;
2566}
2567
2568int qemu_loadvm_state(QEMUFile *f)
2569{
2570    MigrationIncomingState *mis = migration_incoming_get_current();
2571    Error *local_err = NULL;
2572    int ret;
2573
2574    if (qemu_savevm_state_blocked(&local_err)) {
2575        error_report_err(local_err);
2576        return -EINVAL;
2577    }
2578
2579    ret = qemu_loadvm_state_header(f);
2580    if (ret) {
2581        return ret;
2582    }
2583
2584    if (qemu_loadvm_state_setup(f) != 0) {
2585        return -EINVAL;
2586    }
2587
2588    cpu_synchronize_all_pre_loadvm();
2589
2590    ret = qemu_loadvm_state_main(f, mis);
2591    qemu_event_set(&mis->main_thread_load_event);
2592
2593    trace_qemu_loadvm_state_post_main(ret);
2594
2595    if (mis->have_listen_thread) {
2596        /* Listen thread still going, can't clean up yet */
2597        return ret;
2598    }
2599
2600    if (ret == 0) {
2601        ret = qemu_file_get_error(f);
2602    }
2603
2604    /*
2605     * Try to read in the VMDESC section as well, so that dumping tools that
2606     * intercept our migration stream have the chance to see it.
2607     */
2608
2609    /* We've got to be careful; if we don't read the data and just shut the fd
2610     * then the sender can error if we close while it's still sending.
2611     * We also mustn't read data that isn't there; some transports (RDMA)
2612     * will stall waiting for that data when the source has already closed.
2613     */
2614    if (ret == 0 && should_send_vmdesc()) {
2615        uint8_t *buf;
2616        uint32_t size;
2617        uint8_t  section_type = qemu_get_byte(f);
2618
2619        if (section_type != QEMU_VM_VMDESCRIPTION) {
2620            error_report("Expected vmdescription section, but got %d",
2621                         section_type);
2622            /*
2623             * It doesn't seem worth failing at this point since
2624             * we apparently have an otherwise valid VM state
2625             */
2626        } else {
2627            buf = g_malloc(0x1000);
2628            size = qemu_get_be32(f);
2629
2630            while (size > 0) {
2631                uint32_t read_chunk = MIN(size, 0x1000);
2632                qemu_get_buffer(f, buf, read_chunk);
2633                size -= read_chunk;
2634            }
2635            g_free(buf);
2636        }
2637    }
2638
2639    qemu_loadvm_state_cleanup();
2640    cpu_synchronize_all_post_init();
2641
2642    return ret;
2643}
2644
2645int qemu_load_device_state(QEMUFile *f)
2646{
2647    MigrationIncomingState *mis = migration_incoming_get_current();
2648    int ret;
2649
2650    /* Load QEMU_VM_SECTION_FULL section */
2651    ret = qemu_loadvm_state_main(f, mis);
2652    if (ret < 0) {
2653        error_report("Failed to load device state: %d", ret);
2654        return ret;
2655    }
2656
2657    cpu_synchronize_all_post_init();
2658    return 0;
2659}
2660
2661int save_snapshot(const char *name, Error **errp)
2662{
2663    BlockDriverState *bs, *bs1;
2664    QEMUSnapshotInfo sn1, *sn = &sn1, old_sn1, *old_sn = &old_sn1;
2665    int ret = -1, ret2;
2666    QEMUFile *f;
2667    int saved_vm_running;
2668    uint64_t vm_state_size;
2669    qemu_timeval tv;
2670    struct tm tm;
2671    AioContext *aio_context;
2672
2673    if (migration_is_blocked(errp)) {
2674        return ret;
2675    }
2676
2677    if (!replay_can_snapshot()) {
2678        error_setg(errp, "Record/replay does not allow making snapshot "
2679                   "right now. Try once more later.");
2680        return ret;
2681    }
2682
2683    if (!bdrv_all_can_snapshot(&bs)) {
2684        error_setg(errp, "Device '%s' is writable but does not support "
2685                   "snapshots", bdrv_get_device_name(bs));
2686        return ret;
2687    }
2688
2689    /* Delete old snapshots of the same name */
2690    if (name) {
2691        ret = bdrv_all_delete_snapshot(name, &bs1, errp);
2692        if (ret < 0) {
2693            error_prepend(errp, "Error while deleting snapshot on device "
2694                          "'%s': ", bdrv_get_device_name(bs1));
2695            return ret;
2696        }
2697    }
2698
2699    bs = bdrv_all_find_vmstate_bs();
2700    if (bs == NULL) {
2701        error_setg(errp, "No block device can accept snapshots");
2702        return ret;
2703    }
2704    aio_context = bdrv_get_aio_context(bs);
2705
2706    saved_vm_running = runstate_is_running();
2707
2708    ret = global_state_store();
2709    if (ret) {
2710        error_setg(errp, "Error saving global state");
2711        return ret;
2712    }
2713    vm_stop(RUN_STATE_SAVE_VM);
2714
2715    bdrv_drain_all_begin();
2716
2717    aio_context_acquire(aio_context);
2718
2719    memset(sn, 0, sizeof(*sn));
2720
2721    /* fill auxiliary fields */
2722    qemu_gettimeofday(&tv);
2723    sn->date_sec = tv.tv_sec;
2724    sn->date_nsec = tv.tv_usec * 1000;
2725    sn->vm_clock_nsec = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
2726
2727    if (name) {
2728        ret = bdrv_snapshot_find(bs, old_sn, name);
2729        if (ret >= 0) {
2730            pstrcpy(sn->name, sizeof(sn->name), old_sn->name);
2731            pstrcpy(sn->id_str, sizeof(sn->id_str), old_sn->id_str);
2732        } else {
2733            pstrcpy(sn->name, sizeof(sn->name), name);
2734        }
2735    } else {
2736        /* cast below needed for OpenBSD where tv_sec is still 'long' */
2737        localtime_r((const time_t *)&tv.tv_sec, &tm);
2738        strftime(sn->name, sizeof(sn->name), "vm-%Y%m%d%H%M%S", &tm);
2739    }
2740
2741    /* save the VM state */
2742    f = qemu_fopen_bdrv(bs, 1);
2743    if (!f) {
2744        error_setg(errp, "Could not open VM state file");
2745        goto the_end;
2746    }
2747    ret = qemu_savevm_state(f, errp);
2748    vm_state_size = qemu_ftell(f);
2749    ret2 = qemu_fclose(f);
2750    if (ret < 0) {
2751        goto the_end;
2752    }
2753    if (ret2 < 0) {
2754        ret = ret2;
2755        goto the_end;
2756    }
2757
2758    /* The bdrv_all_create_snapshot() call that follows acquires the AioContext
2759     * for itself.  BDRV_POLL_WHILE() does not support nested locking because
2760     * it only releases the lock once.  Therefore synchronous I/O will deadlock
2761     * unless we release the AioContext before bdrv_all_create_snapshot().
2762     */
2763    aio_context_release(aio_context);
2764    aio_context = NULL;
2765
2766    ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, &bs);
2767    if (ret < 0) {
2768        error_setg(errp, "Error while creating snapshot on '%s'",
2769                   bdrv_get_device_name(bs));
2770        goto the_end;
2771    }
2772
2773    ret = 0;
2774
2775 the_end:
2776    if (aio_context) {
2777        aio_context_release(aio_context);
2778    }
2779
2780    bdrv_drain_all_end();
2781
2782    if (saved_vm_running) {
2783        vm_start();
2784    }
2785    return ret;
2786}
2787
2788void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live,
2789                                Error **errp)
2790{
2791    QEMUFile *f;
2792    QIOChannelFile *ioc;
2793    int saved_vm_running;
2794    int ret;
2795
2796    if (!has_live) {
2797        /* live default to true so old version of Xen tool stack can have a
2798         * successfull live migration */
2799        live = true;
2800    }
2801
2802    saved_vm_running = runstate_is_running();
2803    vm_stop(RUN_STATE_SAVE_VM);
2804    global_state_store_running();
2805
2806    ioc = qio_channel_file_new_path(filename, O_WRONLY | O_CREAT, 0660, errp);
2807    if (!ioc) {
2808        goto the_end;
2809    }
2810    qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-save-state");
2811    f = qemu_fopen_channel_output(QIO_CHANNEL(ioc));
2812    object_unref(OBJECT(ioc));
2813    ret = qemu_save_device_state(f);
2814    if (ret < 0 || qemu_fclose(f) < 0) {
2815        error_setg(errp, QERR_IO_ERROR);
2816    } else {
2817        /* libxl calls the QMP command "stop" before calling
2818         * "xen-save-devices-state" and in case of migration failure, libxl
2819         * would call "cont".
2820         * So call bdrv_inactivate_all (release locks) here to let the other
2821         * side of the migration take controle of the images.
2822         */
2823        if (live && !saved_vm_running) {
2824            ret = bdrv_inactivate_all();
2825            if (ret) {
2826                error_setg(errp, "%s: bdrv_inactivate_all() failed (%d)",
2827                           __func__, ret);
2828            }
2829        }
2830    }
2831
2832 the_end:
2833    if (saved_vm_running) {
2834        vm_start();
2835    }
2836}
2837
2838void qmp_xen_load_devices_state(const char *filename, Error **errp)
2839{
2840    QEMUFile *f;
2841    QIOChannelFile *ioc;
2842    int ret;
2843
2844    /* Guest must be paused before loading the device state; the RAM state
2845     * will already have been loaded by xc
2846     */
2847    if (runstate_is_running()) {
2848        error_setg(errp, "Cannot update device state while vm is running");
2849        return;
2850    }
2851    vm_stop(RUN_STATE_RESTORE_VM);
2852
2853    ioc = qio_channel_file_new_path(filename, O_RDONLY | O_BINARY, 0, errp);
2854    if (!ioc) {
2855        return;
2856    }
2857    qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-load-state");
2858    f = qemu_fopen_channel_input(QIO_CHANNEL(ioc));
2859    object_unref(OBJECT(ioc));
2860
2861    ret = qemu_loadvm_state(f);
2862    qemu_fclose(f);
2863    if (ret < 0) {
2864        error_setg(errp, QERR_IO_ERROR);
2865    }
2866    migration_incoming_state_destroy();
2867}
2868
2869int load_snapshot(const char *name, Error **errp)
2870{
2871    BlockDriverState *bs, *bs_vm_state;
2872    QEMUSnapshotInfo sn;
2873    QEMUFile *f;
2874    int ret;
2875    AioContext *aio_context;
2876    MigrationIncomingState *mis = migration_incoming_get_current();
2877
2878    if (!replay_can_snapshot()) {
2879        error_setg(errp, "Record/replay does not allow loading snapshot "
2880                   "right now. Try once more later.");
2881        return -EINVAL;
2882    }
2883
2884    if (!bdrv_all_can_snapshot(&bs)) {
2885        error_setg(errp,
2886                   "Device '%s' is writable but does not support snapshots",
2887                   bdrv_get_device_name(bs));
2888        return -ENOTSUP;
2889    }
2890    ret = bdrv_all_find_snapshot(name, &bs);
2891    if (ret < 0) {
2892        error_setg(errp,
2893                   "Device '%s' does not have the requested snapshot '%s'",
2894                   bdrv_get_device_name(bs), name);
2895        return ret;
2896    }
2897
2898    bs_vm_state = bdrv_all_find_vmstate_bs();
2899    if (!bs_vm_state) {
2900        error_setg(errp, "No block device supports snapshots");
2901        return -ENOTSUP;
2902    }
2903    aio_context = bdrv_get_aio_context(bs_vm_state);
2904
2905    /* Don't even try to load empty VM states */
2906    aio_context_acquire(aio_context);
2907    ret = bdrv_snapshot_find(bs_vm_state, &sn, name);
2908    aio_context_release(aio_context);
2909    if (ret < 0) {
2910        return ret;
2911    } else if (sn.vm_state_size == 0) {
2912        error_setg(errp, "This is a disk-only snapshot. Revert to it "
2913                   " offline using qemu-img");
2914        return -EINVAL;
2915    }
2916
2917    /* Flush all IO requests so they don't interfere with the new state.  */
2918    bdrv_drain_all_begin();
2919
2920    ret = bdrv_all_goto_snapshot(name, &bs, errp);
2921    if (ret < 0) {
2922        error_prepend(errp, "Could not load snapshot '%s' on '%s': ",
2923                      name, bdrv_get_device_name(bs));
2924        goto err_drain;
2925    }
2926
2927    /* restore the VM state */
2928    f = qemu_fopen_bdrv(bs_vm_state, 0);
2929    if (!f) {
2930        error_setg(errp, "Could not open VM state file");
2931        ret = -EINVAL;
2932        goto err_drain;
2933    }
2934
2935    qemu_system_reset(SHUTDOWN_CAUSE_NONE);
2936    mis->from_src_file = f;
2937
2938    aio_context_acquire(aio_context);
2939    ret = qemu_loadvm_state(f);
2940    migration_incoming_state_destroy();
2941    aio_context_release(aio_context);
2942
2943    bdrv_drain_all_end();
2944
2945    if (ret < 0) {
2946        error_setg(errp, "Error %d while loading VM state", ret);
2947        return ret;
2948    }
2949
2950    return 0;
2951
2952err_drain:
2953    bdrv_drain_all_end();
2954    return ret;
2955}
2956
2957void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
2958{
2959    qemu_ram_set_idstr(mr->ram_block,
2960                       memory_region_name(mr), dev);
2961    qemu_ram_set_migratable(mr->ram_block);
2962}
2963
2964void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev)
2965{
2966    qemu_ram_unset_idstr(mr->ram_block);
2967    qemu_ram_unset_migratable(mr->ram_block);
2968}
2969
2970void vmstate_register_ram_global(MemoryRegion *mr)
2971{
2972    vmstate_register_ram(mr, NULL);
2973}
2974
2975bool vmstate_check_only_migratable(const VMStateDescription *vmsd)
2976{
2977    /* check needed if --only-migratable is specified */
2978    if (!only_migratable) {
2979        return true;
2980    }
2981
2982    return !(vmsd && vmsd->unmigratable);
2983}
2984