qemu/migration/savevm.c
/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2009-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "hw/boards.h"
#include "net/net.h"
#include "migration.h"
#include "migration/snapshot.h"
#include "migration/vmstate.h"
#include "migration/misc.h"
#include "migration/register.h"
#include "migration/global_state.h"
#include "ram.h"
#include "qemu-file-channel.h"
#include "qemu-file.h"
#include "savevm.h"
#include "postcopy-ram.h"
#include "qapi/error.h"
#include "qapi/qapi-commands-migration.h"
#include "qapi/qmp/json-writer.h"
#include "qapi/clone-visitor.h"
#include "qapi/qapi-builtin-visit.h"
#include "qapi/qmp/qerror.h"
#include "qemu/error-report.h"
#include "sysemu/cpus.h"
#include "exec/memory.h"
#include "exec/target_page.h"
#include "trace.h"
#include "qemu/iov.h"
#include "qemu/main-loop.h"
#include "block/snapshot.h"
#include "qemu/cutils.h"
#include "io/channel-buffer.h"
#include "io/channel-file.h"
#include "sysemu/replay.h"
#include "sysemu/runstate.h"
#include "sysemu/sysemu.h"
#include "sysemu/xen.h"
#include "migration/colo.h"
#include "qemu/bitmap.h"
#include "net/announce.h"
#include "qemu/yank.h"

const unsigned int postcopy_ram_discard_version;

/* Subcommands for QEMU_VM_COMMAND */
enum qemu_vm_cmd {
    MIG_CMD_INVALID = 0,   /* Must be 0 */
    MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
    MIG_CMD_PING,              /* Request a PONG on the RP */

    MIG_CMD_POSTCOPY_ADVISE,       /* Prior to any page transfers, just
                                      warn we might want to do postcopy */
    MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
                                      pages as it's running. */
    MIG_CMD_POSTCOPY_RUN,          /* Start execution */

    MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
                                      were previously sent during
                                      precopy but are dirty. */
    MIG_CMD_PACKAGED,          /* Send a wrapped stream within this stream */
    MIG_CMD_ENABLE_COLO,       /* Enable COLO */
    MIG_CMD_POSTCOPY_RESUME,   /* resume postcopy on dest */
    MIG_CMD_RECV_BITMAP,       /* Request the received bitmap on dst */
    MIG_CMD_MAX
};

#define MAX_VM_CMD_PACKAGED_SIZE UINT32_MAX
static struct mig_cmd_args {
    ssize_t     len; /* -1 = variable */
    const char *name;
} mig_cmd_args[] = {
    [MIG_CMD_INVALID]          = { .len = -1, .name = "INVALID" },
    [MIG_CMD_OPEN_RETURN_PATH] = { .len =  0, .name = "OPEN_RETURN_PATH" },
    [MIG_CMD_PING]             = { .len = sizeof(uint32_t), .name = "PING" },
    [MIG_CMD_POSTCOPY_ADVISE]  = { .len = -1, .name = "POSTCOPY_ADVISE" },
    [MIG_CMD_POSTCOPY_LISTEN]  = { .len =  0, .name = "POSTCOPY_LISTEN" },
    [MIG_CMD_POSTCOPY_RUN]     = { .len =  0, .name = "POSTCOPY_RUN" },
    [MIG_CMD_POSTCOPY_RAM_DISCARD] = {
                                   .len = -1, .name = "POSTCOPY_RAM_DISCARD" },
    [MIG_CMD_POSTCOPY_RESUME]  = { .len =  0, .name = "POSTCOPY_RESUME" },
    [MIG_CMD_PACKAGED]         = { .len =  4, .name = "PACKAGED" },
    [MIG_CMD_RECV_BITMAP]      = { .len = -1, .name = "RECV_BITMAP" },
    [MIG_CMD_MAX]              = { .len = -1, .name = "MAX" },
};

/* Note for MIG_CMD_POSTCOPY_ADVISE:
 * The format of the arguments depends on the postcopy mode:
 * - postcopy RAM only
 *   uint64_t host page size
 *   uint64_t target page size
 *
 * - postcopy RAM and postcopy dirty bitmaps
 *   format is the same as for postcopy RAM only
 *
 * - postcopy dirty bitmaps only
 *   Nothing. Command length field is 0.
 *
 * Be careful: adding a new postcopy entity with some other parameters should
 * not break format self-description ability. A good way is to introduce some
 * generic extendable format with an exception for the two old entities.
 */
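
/*
 * Illustrative sketch (not part of the original file): for "postcopy RAM
 * only", qemu_savevm_send_postcopy_advise() below sends the two uint64_t
 * arguments described above, so the command appears on the wire as:
 *
 *   byte   QEMU_VM_COMMAND
 *   be16   MIG_CMD_POSTCOPY_ADVISE
 *   be16   16                       (length of the data that follows)
 *   be64   ram_pagesize_summary()   (host page size summary)
 *   be64   qemu_target_page_size()
 */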

/***********************************************************/
/* savevm/loadvm support */

static ssize_t block_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
                                   int64_t pos, Error **errp)
{
    int ret;
    QEMUIOVector qiov;

    qemu_iovec_init_external(&qiov, iov, iovcnt);
    ret = bdrv_writev_vmstate(opaque, &qiov, pos);
    if (ret < 0) {
        return ret;
    }

    return qiov.size;
}

static ssize_t block_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
                                size_t size, Error **errp)
{
    return bdrv_load_vmstate(opaque, buf, pos, size);
}

static int bdrv_fclose(void *opaque, Error **errp)
{
    return bdrv_flush(opaque);
}

static const QEMUFileOps bdrv_read_ops = {
    .get_buffer = block_get_buffer,
    .close =      bdrv_fclose
};

static const QEMUFileOps bdrv_write_ops = {
    .writev_buffer  = block_writev_buffer,
    .close          = bdrv_fclose
};

static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
{
    if (is_writable) {
        return qemu_fopen_ops(bs, &bdrv_write_ops);
    }
    return qemu_fopen_ops(bs, &bdrv_read_ops);
}
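
/*
 * Usage sketch (illustrative, assuming the snapshot code paths elsewhere in
 * this file): a QEMUFile backed by a block device's vmstate area is obtained
 * and used roughly like this:
 *
 *   QEMUFile *f = qemu_fopen_bdrv(bs, 1);    // writable: bdrv_write_ops
 *   if (!f) { ... report error ... }
 *   ret = qemu_savevm_state(f, errp);        // stream lands in bs vmstate
 *   qemu_fclose(f);                          // flushes via bdrv_fclose()
 */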


/* QEMUFile timer support.
 * Not in qemu-file.c to not add qemu-timer.c as dependency to qemu-file.c
 */

void timer_put(QEMUFile *f, QEMUTimer *ts)
{
    uint64_t expire_time;

    expire_time = timer_expire_time_ns(ts);
    qemu_put_be64(f, expire_time);
}

void timer_get(QEMUFile *f, QEMUTimer *ts)
{
    uint64_t expire_time;

    expire_time = qemu_get_be64(f);
    if (expire_time != -1) {
        timer_mod_ns(ts, expire_time);
    } else {
        timer_del(ts);
    }
}
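
/*
 * Illustrative note (not part of the original file): timer_expire_time_ns()
 * returns -1 for a timer that is not pending, so the be64 value -1 doubles
 * as a "deleted" sentinel and a round trip preserves both states:
 *
 *   timer_put(f, ts);    // pending: expire time in ns; not pending: -1
 *   ...
 *   timer_get(f, ts);    // re-arms via timer_mod_ns(), or timer_del()
 */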


/* VMState timer support.
 * Not in vmstate.c to not add qemu-timer.c as dependency to vmstate.c
 */

static int get_timer(QEMUFile *f, void *pv, size_t size,
                     const VMStateField *field)
{
    QEMUTimer *v = pv;
    timer_get(f, v);
    return 0;
}

static int put_timer(QEMUFile *f, void *pv, size_t size,
                     const VMStateField *field, JSONWriter *vmdesc)
{
    QEMUTimer *v = pv;
    timer_put(f, v);

    return 0;
}

const VMStateInfo vmstate_info_timer = {
    .name = "timer",
    .get  = get_timer,
    .put  = put_timer,
};


typedef struct CompatEntry {
    char idstr[256];
    int instance_id;
} CompatEntry;

typedef struct SaveStateEntry {
    QTAILQ_ENTRY(SaveStateEntry) entry;
    char idstr[256];
    uint32_t instance_id;
    int alias_id;
    int version_id;
    /* version id read from the stream */
    int load_version_id;
    int section_id;
    /* section id read from the stream */
    int load_section_id;
    const SaveVMHandlers *ops;
    const VMStateDescription *vmsd;
    void *opaque;
    CompatEntry *compat;
    int is_ram;
} SaveStateEntry;

typedef struct SaveState {
    QTAILQ_HEAD(, SaveStateEntry) handlers;
    SaveStateEntry *handler_pri_head[MIG_PRI_MAX + 1];
    int global_section_id;
    uint32_t len;
    const char *name;
    uint32_t target_page_bits;
    uint32_t caps_count;
    MigrationCapability *capabilities;
    QemuUUID uuid;
} SaveState;

static SaveState savevm_state = {
    .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
    .handler_pri_head = { [MIG_PRI_DEFAULT ... MIG_PRI_MAX] = NULL },
    .global_section_id = 0,
};

static bool should_validate_capability(int capability)
{
    assert(capability >= 0 && capability < MIGRATION_CAPABILITY__MAX);
    /* Validate only new capabilities to keep compatibility. */
    switch (capability) {
    case MIGRATION_CAPABILITY_X_IGNORE_SHARED:
        return true;
    default:
        return false;
    }
}

static uint32_t get_validatable_capabilities_count(void)
{
    MigrationState *s = migrate_get_current();
    uint32_t result = 0;
    int i;
    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
        if (should_validate_capability(i) && s->enabled_capabilities[i]) {
            result++;
        }
    }
    return result;
}

static int configuration_pre_save(void *opaque)
{
    SaveState *state = opaque;
    const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
    MigrationState *s = migrate_get_current();
    int i, j;

    state->len = strlen(current_name);
    state->name = current_name;
    state->target_page_bits = qemu_target_page_bits();

    state->caps_count = get_validatable_capabilities_count();
    state->capabilities = g_renew(MigrationCapability, state->capabilities,
                                  state->caps_count);
    for (i = j = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
        if (should_validate_capability(i) && s->enabled_capabilities[i]) {
            state->capabilities[j++] = i;
        }
    }
    state->uuid = qemu_uuid;

    return 0;
}

static int configuration_post_save(void *opaque)
{
    SaveState *state = opaque;

    g_free(state->capabilities);
    state->capabilities = NULL;
    state->caps_count = 0;
    return 0;
}

static int configuration_pre_load(void *opaque)
{
    SaveState *state = opaque;

    /* If there is no target-page-bits subsection it means the source
     * predates the variable-target-page-bits support and is using the
     * minimum possible value for this CPU.
     */
    state->target_page_bits = qemu_target_page_bits_min();
    return 0;
}

static bool configuration_validate_capabilities(SaveState *state)
{
    bool ret = true;
    MigrationState *s = migrate_get_current();
    unsigned long *source_caps_bm;
    int i;

    source_caps_bm = bitmap_new(MIGRATION_CAPABILITY__MAX);
    for (i = 0; i < state->caps_count; i++) {
        MigrationCapability capability = state->capabilities[i];
        set_bit(capability, source_caps_bm);
    }

    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
        bool source_state, target_state;
        if (!should_validate_capability(i)) {
            continue;
        }
        source_state = test_bit(i, source_caps_bm);
        target_state = s->enabled_capabilities[i];
        if (source_state != target_state) {
            error_report("Capability %s is %s, but received capability is %s",
                         MigrationCapability_str(i),
                         target_state ? "on" : "off",
                         source_state ? "on" : "off");
            ret = false;
            /* Don't break here to report all failed capabilities */
        }
    }

    g_free(source_caps_bm);
    return ret;
}

static int configuration_post_load(void *opaque, int version_id)
{
    SaveState *state = opaque;
    const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
    int ret = 0;

    if (strncmp(state->name, current_name, state->len) != 0) {
        error_report("Machine type received is '%.*s' and local is '%s'",
                     (int) state->len, state->name, current_name);
        ret = -EINVAL;
        goto out;
    }

    if (state->target_page_bits != qemu_target_page_bits()) {
        error_report("Received TARGET_PAGE_BITS is %d but local is %d",
                     state->target_page_bits, qemu_target_page_bits());
        ret = -EINVAL;
        goto out;
    }

    if (!configuration_validate_capabilities(state)) {
        ret = -EINVAL;
        goto out;
    }

out:
    g_free((void *)state->name);
    state->name = NULL;
    state->len = 0;
    g_free(state->capabilities);
    state->capabilities = NULL;
    state->caps_count = 0;

    return ret;
}

static int get_capability(QEMUFile *f, void *pv, size_t size,
                          const VMStateField *field)
{
    MigrationCapability *capability = pv;
    char capability_str[UINT8_MAX + 1];
    uint8_t len;
    int i;

    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)capability_str, len);
    capability_str[len] = '\0';
    for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) {
        if (!strcmp(MigrationCapability_str(i), capability_str)) {
            *capability = i;
            return 0;
        }
    }
    error_report("Received unknown capability %s", capability_str);
    return -EINVAL;
}

static int put_capability(QEMUFile *f, void *pv, size_t size,
                          const VMStateField *field, JSONWriter *vmdesc)
{
    MigrationCapability *capability = pv;
    const char *capability_str = MigrationCapability_str(*capability);
    size_t len = strlen(capability_str);
    assert(len <= UINT8_MAX);

    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)capability_str, len);
    return 0;
}

static const VMStateInfo vmstate_info_capability = {
    .name = "capability",
    .get  = get_capability,
    .put  = put_capability,
};

/* The target-page-bits subsection is present only if the
 * target page size is not the same as the default (i.e. the
 * minimum page size for a variable-page-size guest CPU).
 * If it is present then it contains the actual target page
 * bits for the machine, and migration will fail if the
 * two ends don't agree about it.
 */
static bool vmstate_target_page_bits_needed(void *opaque)
{
    return qemu_target_page_bits()
        > qemu_target_page_bits_min();
}

static const VMStateDescription vmstate_target_page_bits = {
    .name = "configuration/target-page-bits",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = vmstate_target_page_bits_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(target_page_bits, SaveState),
        VMSTATE_END_OF_LIST()
    }
};

static bool vmstate_capabilites_needed(void *opaque)
{
    return get_validatable_capabilities_count() > 0;
}

static const VMStateDescription vmstate_capabilites = {
    .name = "configuration/capabilities",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = vmstate_capabilites_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32_V(caps_count, SaveState, 1),
        VMSTATE_VARRAY_UINT32_ALLOC(capabilities, SaveState, caps_count, 1,
                                    vmstate_info_capability,
                                    MigrationCapability),
        VMSTATE_END_OF_LIST()
    }
};

static bool vmstate_uuid_needed(void *opaque)
{
    return qemu_uuid_set && migrate_validate_uuid();
}

static int vmstate_uuid_post_load(void *opaque, int version_id)
{
    SaveState *state = opaque;
    char uuid_src[UUID_FMT_LEN + 1];
    char uuid_dst[UUID_FMT_LEN + 1];

    if (!qemu_uuid_set) {
        /*
         * This is only a warning because the user might not know the UUID
         * in some cases, e.g. when loading an old snapshot.
         */
        qemu_uuid_unparse(&state->uuid, uuid_src);
        warn_report("UUID is received %s, but local uuid isn't set",
                     uuid_src);
        return 0;
    }
    if (!qemu_uuid_is_equal(&state->uuid, &qemu_uuid)) {
        qemu_uuid_unparse(&state->uuid, uuid_src);
        qemu_uuid_unparse(&qemu_uuid, uuid_dst);
        error_report("UUID received is %s and local is %s", uuid_src, uuid_dst);
        return -EINVAL;
    }
    return 0;
}

static const VMStateDescription vmstate_uuid = {
    .name = "configuration/uuid",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = vmstate_uuid_needed,
    .post_load = vmstate_uuid_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_UINT8_ARRAY_V(uuid.data, SaveState, sizeof(QemuUUID), 1),
        VMSTATE_END_OF_LIST()
    }
};

static const VMStateDescription vmstate_configuration = {
    .name = "configuration",
    .version_id = 1,
    .pre_load = configuration_pre_load,
    .post_load = configuration_post_load,
    .pre_save = configuration_pre_save,
    .post_save = configuration_post_save,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(len, SaveState),
        VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription *[]) {
        &vmstate_target_page_bits,
        &vmstate_capabilites,
        &vmstate_uuid,
        NULL
    }
};
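
/*
 * Illustrative sketch (hypothetical device, not part of the original file):
 * ordinary devices describe their state the same way "configuration" does
 * above and hang the description off their DeviceClass or register it with
 * vmstate_register():
 *
 *   typedef struct MyDev { uint32_t level; } MyDev;
 *
 *   static const VMStateDescription vmstate_mydev = {
 *       .name = "mydev",
 *       .version_id = 1,
 *       .minimum_version_id = 1,
 *       .fields = (VMStateField[]) {
 *           VMSTATE_UINT32(level, MyDev),
 *           VMSTATE_END_OF_LIST()
 *       }
 *   };
 */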

static void dump_vmstate_vmsd(FILE *out_file,
                              const VMStateDescription *vmsd, int indent,
                              bool is_subsection);

static void dump_vmstate_vmsf(FILE *out_file, const VMStateField *field,
                              int indent)
{
    fprintf(out_file, "%*s{\n", indent, "");
    indent += 2;
    fprintf(out_file, "%*s\"field\": \"%s\",\n", indent, "", field->name);
    fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
            field->version_id);
    fprintf(out_file, "%*s\"field_exists\": %s,\n", indent, "",
            field->field_exists ? "true" : "false");
    fprintf(out_file, "%*s\"size\": %zu", indent, "", field->size);
    if (field->vmsd != NULL) {
        fprintf(out_file, ",\n");
        dump_vmstate_vmsd(out_file, field->vmsd, indent, false);
    }
    fprintf(out_file, "\n%*s}", indent - 2, "");
}

static void dump_vmstate_vmss(FILE *out_file,
                              const VMStateDescription **subsection,
                              int indent)
{
    if (*subsection != NULL) {
        dump_vmstate_vmsd(out_file, *subsection, indent, true);
    }
}

static void dump_vmstate_vmsd(FILE *out_file,
                              const VMStateDescription *vmsd, int indent,
                              bool is_subsection)
{
    if (is_subsection) {
        fprintf(out_file, "%*s{\n", indent, "");
    } else {
        fprintf(out_file, "%*s\"%s\": {\n", indent, "", "Description");
    }
    indent += 2;
    fprintf(out_file, "%*s\"name\": \"%s\",\n", indent, "", vmsd->name);
    fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
            vmsd->version_id);
    fprintf(out_file, "%*s\"minimum_version_id\": %d", indent, "",
            vmsd->minimum_version_id);
    if (vmsd->fields != NULL) {
        const VMStateField *field = vmsd->fields;
        bool first;

        fprintf(out_file, ",\n%*s\"Fields\": [\n", indent, "");
        first = true;
        while (field->name != NULL) {
            if (field->flags & VMS_MUST_EXIST) {
                /* Ignore VMSTATE_VALIDATE bits; these don't get migrated */
                field++;
                continue;
            }
            if (!first) {
                fprintf(out_file, ",\n");
            }
            dump_vmstate_vmsf(out_file, field, indent + 2);
            field++;
            first = false;
        }
        fprintf(out_file, "\n%*s]", indent, "");
    }
    if (vmsd->subsections != NULL) {
        const VMStateDescription **subsection = vmsd->subsections;
        bool first;

        fprintf(out_file, ",\n%*s\"Subsections\": [\n", indent, "");
        first = true;
        while (*subsection != NULL) {
            if (!first) {
                fprintf(out_file, ",\n");
            }
            dump_vmstate_vmss(out_file, subsection, indent + 2);
            subsection++;
            first = false;
        }
        fprintf(out_file, "\n%*s]", indent, "");
    }
    fprintf(out_file, "\n%*s}", indent - 2, "");
}

static void dump_machine_type(FILE *out_file)
{
    MachineClass *mc;

    mc = MACHINE_GET_CLASS(current_machine);

    fprintf(out_file, "  \"vmschkmachine\": {\n");
    fprintf(out_file, "    \"Name\": \"%s\"\n", mc->name);
    fprintf(out_file, "  },\n");
}

void dump_vmstate_json_to_file(FILE *out_file)
{
    GSList *list, *elt;
    bool first;

    fprintf(out_file, "{\n");
    dump_machine_type(out_file);

    first = true;
    list = object_class_get_list(TYPE_DEVICE, true);
    for (elt = list; elt; elt = elt->next) {
        DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data,
                                             TYPE_DEVICE);
        const char *name;
        int indent = 2;

        if (!dc->vmsd) {
            continue;
        }

        if (!first) {
            fprintf(out_file, ",\n");
        }
        name = object_class_get_name(OBJECT_CLASS(dc));
        fprintf(out_file, "%*s\"%s\": {\n", indent, "", name);
        indent += 2;
        fprintf(out_file, "%*s\"Name\": \"%s\",\n", indent, "", name);
        fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
                dc->vmsd->version_id);
        fprintf(out_file, "%*s\"minimum_version_id\": %d,\n", indent, "",
                dc->vmsd->minimum_version_id);

        dump_vmstate_vmsd(out_file, dc->vmsd, indent, false);

        fprintf(out_file, "\n%*s}", indent - 2, "");
        first = false;
    }
    fprintf(out_file, "\n}\n");
    fclose(out_file);
    g_slist_free(list);
}

static uint32_t calculate_new_instance_id(const char *idstr)
{
    SaveStateEntry *se;
    uint32_t instance_id = 0;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (strcmp(idstr, se->idstr) == 0
            && instance_id <= se->instance_id) {
            instance_id = se->instance_id + 1;
        }
    }
    /* Make sure we never loop over without being noticed */
    assert(instance_id != VMSTATE_INSTANCE_ID_ANY);
    return instance_id;
}
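
/*
 * Illustrative example (not part of the original file): registering three
 * handlers with idstr "ram" and VMSTATE_INSTANCE_ID_ANY yields instance ids
 * 0, 1 and 2 in registration order; the assert above only fires if the
 * counter ever reaches the VMSTATE_INSTANCE_ID_ANY sentinel itself.
 */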

static int calculate_compat_instance_id(const char *idstr)
{
    SaveStateEntry *se;
    int instance_id = 0;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->compat) {
            continue;
        }

        if (strcmp(idstr, se->compat->idstr) == 0
            && instance_id <= se->compat->instance_id) {
            instance_id = se->compat->instance_id + 1;
        }
    }
    return instance_id;
}

static inline MigrationPriority save_state_priority(SaveStateEntry *se)
{
    if (se->vmsd) {
        return se->vmsd->priority;
    }
    return MIG_PRI_DEFAULT;
}

static void savevm_state_handler_insert(SaveStateEntry *nse)
{
    MigrationPriority priority = save_state_priority(nse);
    SaveStateEntry *se;
    int i;

    assert(priority <= MIG_PRI_MAX);

    for (i = priority - 1; i >= 0; i--) {
        se = savevm_state.handler_pri_head[i];
        if (se != NULL) {
            assert(save_state_priority(se) < priority);
            break;
        }
    }

    if (i >= 0) {
        QTAILQ_INSERT_BEFORE(se, nse, entry);
    } else {
        QTAILQ_INSERT_TAIL(&savevm_state.handlers, nse, entry);
    }

    if (savevm_state.handler_pri_head[priority] == NULL) {
        savevm_state.handler_pri_head[priority] = nse;
    }
}

static void savevm_state_handler_remove(SaveStateEntry *se)
{
    SaveStateEntry *next;
    MigrationPriority priority = save_state_priority(se);

    if (se == savevm_state.handler_pri_head[priority]) {
        next = QTAILQ_NEXT(se, entry);
        if (next != NULL && save_state_priority(next) == priority) {
            savevm_state.handler_pri_head[priority] = next;
        } else {
            savevm_state.handler_pri_head[priority] = NULL;
        }
    }
    QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
}
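
/*
 * Illustrative note (not part of the original file): the handlers list is
 * kept sorted by descending priority and handler_pri_head[p] caches the
 * first entry of priority p (or NULL), e.g. for priorities 2, 2, 0:
 *
 *   handlers:         [A pri=2] -> [B pri=2] -> [C pri=0]
 *   handler_pri_head: [2] = A, [1] = NULL, [0] = C
 *
 * Insertion scans downward for the next lower-priority head and inserts in
 * front of it, keeping both structures consistent in O(MIG_PRI_MAX) time.
 */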

/* TODO: Individual devices generally have very little idea about the rest
   of the system, so instance_id should be removed/replaced.
   Meanwhile pass -1 as instance_id if you do not already have a clearly
   distinguishing id for all instances of your device class. */
int register_savevm_live(const char *idstr,
                         uint32_t instance_id,
                         int version_id,
                         const SaveVMHandlers *ops,
                         void *opaque)
{
    SaveStateEntry *se;

    se = g_new0(SaveStateEntry, 1);
    se->version_id = version_id;
    se->section_id = savevm_state.global_section_id++;
    se->ops = ops;
    se->opaque = opaque;
    se->vmsd = NULL;
    /* if this is a live savevm handler then set is_ram */
    if (ops->save_setup != NULL) {
        se->is_ram = 1;
    }

    pstrcat(se->idstr, sizeof(se->idstr), idstr);

    if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
        se->instance_id = calculate_new_instance_id(se->idstr);
    } else {
        se->instance_id = instance_id;
    }
    assert(!se->compat || se->instance_id == 0);
    savevm_state_handler_insert(se);
    return 0;
}

void unregister_savevm(VMStateIf *obj, const char *idstr, void *opaque)
{
    SaveStateEntry *se, *new_se;
    char id[256] = "";

    if (obj) {
        char *oid = vmstate_if_get_id(obj);
        if (oid) {
            pstrcpy(id, sizeof(id), oid);
            pstrcat(id, sizeof(id), "/");
            g_free(oid);
        }
    }
    pstrcat(id, sizeof(id), idstr);

    QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
        if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) {
            savevm_state_handler_remove(se);
            g_free(se->compat);
            g_free(se);
        }
    }
}

int vmstate_register_with_alias_id(VMStateIf *obj, uint32_t instance_id,
                                   const VMStateDescription *vmsd,
                                   void *opaque, int alias_id,
                                   int required_for_version,
                                   Error **errp)
{
    SaveStateEntry *se;

    /* If this triggers, alias support can be dropped for the vmsd. */
    assert(alias_id == -1 || required_for_version >= vmsd->minimum_version_id);

    se = g_new0(SaveStateEntry, 1);
    se->version_id = vmsd->version_id;
    se->section_id = savevm_state.global_section_id++;
    se->opaque = opaque;
    se->vmsd = vmsd;
    se->alias_id = alias_id;

    if (obj) {
        char *id = vmstate_if_get_id(obj);
        if (id) {
            if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
                sizeof(se->idstr)) {
                error_setg(errp, "Path too long for VMState (%s)", id);
                g_free(id);
                g_free(se);

                return -1;
            }
            g_free(id);

            se->compat = g_new0(CompatEntry, 1);
            pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name);
            se->compat->instance_id = instance_id == VMSTATE_INSTANCE_ID_ANY ?
                         calculate_compat_instance_id(vmsd->name) : instance_id;
            instance_id = VMSTATE_INSTANCE_ID_ANY;
        }
    }
    pstrcat(se->idstr, sizeof(se->idstr), vmsd->name);

    if (instance_id == VMSTATE_INSTANCE_ID_ANY) {
        se->instance_id = calculate_new_instance_id(se->idstr);
    } else {
        se->instance_id = instance_id;
    }
    assert(!se->compat || se->instance_id == 0);
    savevm_state_handler_insert(se);
    return 0;
}

void vmstate_unregister(VMStateIf *obj, const VMStateDescription *vmsd,
                        void *opaque)
{
    SaveStateEntry *se, *new_se;

    QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
        if (se->vmsd == vmsd && se->opaque == opaque) {
            savevm_state_handler_remove(se);
            g_free(se->compat);
            g_free(se);
        }
    }
}

static int vmstate_load(QEMUFile *f, SaveStateEntry *se)
{
    trace_vmstate_load(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
    if (!se->vmsd) {         /* Old style */
        return se->ops->load_state(f, se->opaque, se->load_version_id);
    }
    return vmstate_load_state(f, se->vmsd, se->opaque, se->load_version_id);
}

static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se,
                                   JSONWriter *vmdesc)
{
    int64_t old_offset, size;

    old_offset = qemu_ftell_fast(f);
    se->ops->save_state(f, se->opaque);
    size = qemu_ftell_fast(f) - old_offset;

    if (vmdesc) {
        json_writer_int64(vmdesc, "size", size);
        json_writer_start_array(vmdesc, "fields");
        json_writer_start_object(vmdesc, NULL);
        json_writer_str(vmdesc, "name", "data");
        json_writer_int64(vmdesc, "size", size);
        json_writer_str(vmdesc, "type", "buffer");
        json_writer_end_object(vmdesc);
        json_writer_end_array(vmdesc);
    }
}

static int vmstate_save(QEMUFile *f, SaveStateEntry *se,
                        JSONWriter *vmdesc)
{
    trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
    if (!se->vmsd) {
        vmstate_save_old_style(f, se, vmdesc);
        return 0;
    }
    return vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
}

/*
 * Write the header for a device section (QEMU_VM_SECTION START/END/PART/FULL)
 */
static void save_section_header(QEMUFile *f, SaveStateEntry *se,
                                uint8_t section_type)
{
    qemu_put_byte(f, section_type);
    qemu_put_be32(f, se->section_id);

    if (section_type == QEMU_VM_SECTION_FULL ||
        section_type == QEMU_VM_SECTION_START) {
        /* ID string */
        size_t len = strlen(se->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)se->idstr, len);

        qemu_put_be32(f, se->instance_id);
        qemu_put_be32(f, se->version_id);
    }
}
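
/*
 * Illustrative sketch (not part of the original file): a FULL or START
 * section header therefore looks like this on the wire:
 *
 *   byte   section type (e.g. QEMU_VM_SECTION_FULL)
 *   be32   section id
 *   byte   len(idstr)
 *   n x    byte idstr
 *   be32   instance id
 *   be32   version id
 *
 * PART and END headers stop after the section id; the idstr/instance/version
 * triple is only sent when a section is first introduced.
 */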

/*
 * Write a footer onto device sections to catch cases of misformatted
 * device sections.
 */
static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
{
    if (migrate_get_current()->send_section_footer) {
        qemu_put_byte(f, QEMU_VM_SECTION_FOOTER);
        qemu_put_be32(f, se->section_id);
    }
}

/**
 * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
 *                           command and associated data.
 *
 * @f: File to send command on
 * @command: Command type to send
 * @len: Length of associated data
 * @data: Data associated with command.
 */
static void qemu_savevm_command_send(QEMUFile *f,
                                     enum qemu_vm_cmd command,
                                     uint16_t len,
                                     uint8_t *data)
{
    trace_savevm_command_send(command, len);
    qemu_put_byte(f, QEMU_VM_COMMAND);
    qemu_put_be16(f, (uint16_t)command);
    qemu_put_be16(f, len);
    qemu_put_buffer(f, data, len);
    qemu_fflush(f);
}
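
/*
 * Illustrative sketch (not part of the original file): every subcommand is
 * framed identically, so e.g. qemu_savevm_send_ping(f, 42) below emits:
 *
 *   byte   QEMU_VM_COMMAND
 *   be16   MIG_CMD_PING
 *   be16   4     (sizeof(uint32_t))
 *   be32   42    (the payload, big-endian)
 */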

void qemu_savevm_send_colo_enable(QEMUFile *f)
{
    trace_savevm_send_colo_enable();
    qemu_savevm_command_send(f, MIG_CMD_ENABLE_COLO, 0, NULL);
}

void qemu_savevm_send_ping(QEMUFile *f, uint32_t value)
{
    uint32_t buf;

    trace_savevm_send_ping(value);
    buf = cpu_to_be32(value);
    qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf);
}

void qemu_savevm_send_open_return_path(QEMUFile *f)
{
    trace_savevm_send_open_return_path();
    qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL);
}

/* We have a buffer of data to send; we don't want it all to be loaded
 * by the command itself, so the command contains just the length of the
 * extra buffer that we then send straight after it.
 * TODO: Must be a better way to organise that
 *
 * Returns:
 *    0 on success
 *    -ve on error
 */
int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len)
{
    uint32_t tmp;

    if (len > MAX_VM_CMD_PACKAGED_SIZE) {
        error_report("%s: Unreasonably large packaged state: %zu",
                     __func__, len);
        return -1;
    }

    tmp = cpu_to_be32(len);

    trace_qemu_savevm_send_packaged();
    qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp);

    qemu_put_buffer(f, buf, len);

    return 0;
}

/* Send prior to any postcopy transfer */
void qemu_savevm_send_postcopy_advise(QEMUFile *f)
{
    if (migrate_postcopy_ram()) {
        uint64_t tmp[2];
        tmp[0] = cpu_to_be64(ram_pagesize_summary());
        tmp[1] = cpu_to_be64(qemu_target_page_size());

        trace_qemu_savevm_send_postcopy_advise();
        qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE,
                                 16, (uint8_t *)tmp);
    } else {
        qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 0, NULL);
    }
}

/* Sent prior to starting the destination running in postcopy, discard pages
 * that have already been sent but redirtied on the source.
 * CMD_POSTCOPY_RAM_DISCARD consists of:
 *      byte   version (0)
 *      byte   Length of name field (not including 0)
 *  n x byte   RAM block name
 *      byte   0 terminator (just for safety)
 *  n x        Byte ranges within the named RAMBlock
 *      be64   Start of the range
 *      be64   Length
 *
 *  name:  RAMBlock name that these entries are part of
 *  len: Number of page entries
 *  start_list: 'len' addresses
 *  length_list: 'len' addresses
 *
 */
void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
                                           uint16_t len,
                                           uint64_t *start_list,
                                           uint64_t *length_list)
{
    uint8_t *buf;
    uint16_t tmplen;
    uint16_t t;
    size_t name_len = strlen(name);

    trace_qemu_savevm_send_postcopy_ram_discard(name, len);
    assert(name_len < 256);
    buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len);
    buf[0] = postcopy_ram_discard_version;
    buf[1] = name_len;
    memcpy(buf + 2, name, name_len);
    tmplen = 2 + name_len;
    buf[tmplen++] = '\0';

    for (t = 0; t < len; t++) {
        stq_be_p(buf + tmplen, start_list[t]);
        tmplen += 8;
        stq_be_p(buf + tmplen, length_list[t]);
        tmplen += 8;
    }
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
    g_free(buf);
}

/* Get the destination into a state where it can receive postcopy data. */
void qemu_savevm_send_postcopy_listen(QEMUFile *f)
{
    trace_savevm_send_postcopy_listen();
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
}

/* Kick the destination into running */
void qemu_savevm_send_postcopy_run(QEMUFile *f)
{
    trace_savevm_send_postcopy_run();
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
}

void qemu_savevm_send_postcopy_resume(QEMUFile *f)
{
    trace_savevm_send_postcopy_resume();
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RESUME, 0, NULL);
}

void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name)
{
    size_t len;
    char buf[256];

    trace_savevm_send_recv_bitmap(block_name);

    buf[0] = len = strlen(block_name);
    memcpy(buf + 1, block_name, len);

    qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf);
}

bool qemu_savevm_state_blocked(Error **errp)
{
    SaveStateEntry *se;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (se->vmsd && se->vmsd->unmigratable) {
            error_setg(errp, "State blocked by non-migratable device '%s'",
                       se->idstr);
            return true;
        }
    }
    return false;
}

void qemu_savevm_non_migratable_list(strList **reasons)
{
    SaveStateEntry *se;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (se->vmsd && se->vmsd->unmigratable) {
            QAPI_LIST_PREPEND(*reasons,
                              g_strdup_printf("non-migratable device: %s",
                                              se->idstr));
        }
    }
}

void qemu_savevm_state_header(QEMUFile *f)
{
    trace_savevm_state_header();
    qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
    qemu_put_be32(f, QEMU_VM_FILE_VERSION);

    if (migrate_get_current()->send_configuration) {
        qemu_put_byte(f, QEMU_VM_CONFIGURATION);
        vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
    }
}

bool qemu_savevm_state_guest_unplug_pending(void)
{
    SaveStateEntry *se;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (se->vmsd && se->vmsd->dev_unplug_pending &&
            se->vmsd->dev_unplug_pending(se->opaque)) {
            return true;
        }
    }

    return false;
}

void qemu_savevm_state_setup(QEMUFile *f)
{
    SaveStateEntry *se;
    Error *local_err = NULL;
    int ret;

    trace_savevm_state_setup();
    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->ops || !se->ops->save_setup) {
            continue;
        }
        if (se->ops->is_active) {
            if (!se->ops->is_active(se->opaque)) {
                continue;
            }
        }
        save_section_header(f, se, QEMU_VM_SECTION_START);

        ret = se->ops->save_setup(f, se->opaque);
        save_section_footer(f, se);
        if (ret < 0) {
            qemu_file_set_error(f, ret);
            break;
        }
    }

    if (precopy_notify(PRECOPY_NOTIFY_SETUP, &local_err)) {
        error_report_err(local_err);
    }
}

int qemu_savevm_state_resume_prepare(MigrationState *s)
{
    SaveStateEntry *se;
    int ret;

    trace_savevm_state_resume_prepare();

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->ops || !se->ops->resume_prepare) {
            continue;
        }
        if (se->ops->is_active) {
            if (!se->ops->is_active(se->opaque)) {
                continue;
            }
        }
        ret = se->ops->resume_prepare(s, se->opaque);
        if (ret < 0) {
            return ret;
        }
    }

    return 0;
}

/*
 * This function has three return values:
 *   negative: there was an error; the value is -errno.
 *   0: we haven't finished; the caller has to call us again.
 *   1: we have finished; we can move to the completion phase.
 */
int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
{
    SaveStateEntry *se;
    int ret = 1;

    trace_savevm_state_iterate();
    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->ops || !se->ops->save_live_iterate) {
            continue;
        }
        if (se->ops->is_active &&
            !se->ops->is_active(se->opaque)) {
            continue;
        }
        if (se->ops->is_active_iterate &&
            !se->ops->is_active_iterate(se->opaque)) {
            continue;
        }
        /*
         * In the postcopy phase, any device that doesn't know how to
         * do postcopy should have saved its state in the _complete
         * call that's already run; it might get confused if we call
         * iterate afterwards.
         */
        if (postcopy &&
            !(se->ops->has_postcopy && se->ops->has_postcopy(se->opaque))) {
            continue;
        }
        if (qemu_file_rate_limit(f)) {
            return 0;
        }
        trace_savevm_section_start(se->idstr, se->section_id);

        save_section_header(f, se, QEMU_VM_SECTION_PART);

        ret = se->ops->save_live_iterate(f, se->opaque);
        trace_savevm_section_end(se->idstr, se->section_id, ret);
        save_section_footer(f, se);

        if (ret < 0) {
            error_report("failed to save SaveStateEntry with id(name): %d(%s)",
                         se->section_id, se->idstr);
            qemu_file_set_error(f, ret);
        }
        if (ret <= 0) {
            /* Do not proceed to the next vmstate before this one reported
               completion of the current stage. This serializes the migration
               and reduces the probability that a faster changing state is
               synchronized over and over again. */
            break;
        }
    }
    return ret;
}
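
/*
 * Illustrative caller sketch (not part of the original file): the return
 * convention above reduces the main save loop to "iterate until 1, or until
 * the file reports an error", as qemu_savevm_state() does below:
 *
 *   while (qemu_file_get_error(f) == 0) {
 *       if (qemu_savevm_state_iterate(f, false) > 0) {
 *           break;    // all iterable state has converged
 *       }
 *   }
 */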

static bool should_send_vmdesc(void)
{
    MachineState *machine = MACHINE(qdev_get_machine());
    bool in_postcopy = migration_in_postcopy();
    return !machine->suppress_vmdesc && !in_postcopy;
}

/*
 * Calls the save_live_complete_postcopy methods,
 * causing the last few pages to be sent immediately and doing any associated
 * cleanup.
 * Note postcopy also calls qemu_savevm_state_complete_precopy to complete
 * all the other devices, but that happens at the point we switch to postcopy.
 */
void qemu_savevm_state_complete_postcopy(QEMUFile *f)
{
    SaveStateEntry *se;
    int ret;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->ops || !se->ops->save_live_complete_postcopy) {
            continue;
        }
        if (se->ops->is_active) {
            if (!se->ops->is_active(se->opaque)) {
                continue;
            }
        }
        trace_savevm_section_start(se->idstr, se->section_id);
        /* Section type */
        qemu_put_byte(f, QEMU_VM_SECTION_END);
        qemu_put_be32(f, se->section_id);

        ret = se->ops->save_live_complete_postcopy(f, se->opaque);
        trace_savevm_section_end(se->idstr, se->section_id, ret);
        save_section_footer(f, se);
        if (ret < 0) {
            qemu_file_set_error(f, ret);
            return;
        }
    }

    qemu_put_byte(f, QEMU_VM_EOF);
    qemu_fflush(f);
}

static
int qemu_savevm_state_complete_precopy_iterable(QEMUFile *f, bool in_postcopy)
{
    SaveStateEntry *se;
    int ret;

    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
        if (!se->ops ||
            (in_postcopy && se->ops->has_postcopy &&
             se->ops->has_postcopy(se->opaque)) ||
            !se->ops->save_live_complete_precopy) {
            continue;
        }

        if (se->ops->is_active) {
            if (!se->ops->is_active(se->opaque)) {
                continue;
            }
        }
        trace_savevm_section_start(se->idstr, se->section_id);

        save_section_header(f, se, QEMU_VM_SECTION_END);

        ret = se->ops->save_live_complete_precopy(f, se->opaque);
        trace_savevm_section_end(se->idstr, se->section_id, ret);
        save_section_footer(f, se);
        if (ret < 0) {
            qemu_file_set_error(f, ret);
            return -1;
        }
    }

    return 0;
}

int qemu_savevm_state_complete_precopy_non_iterable(QEMUFile *f,
                                                    bool in_postcopy,
                                                    bool inactivate_disks)
{
    g_autoptr(JSONWriter) vmdesc = NULL;
    int vmdesc_len;
    SaveStateEntry *se;
    int ret;

    vmdesc = json_writer_new(false);
    json_writer_start_object(vmdesc, NULL);
    json_writer_int64(vmdesc, "page_size", qemu_target_page_size());
    json_writer_start_array(vmdesc, "devices");
    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {

        if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
            continue;
        }
        if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
            trace_savevm_section_skip(se->idstr, se->section_id);
            continue;
        }

        trace_savevm_section_start(se->idstr, se->section_id);

        json_writer_start_object(vmdesc, NULL);
        json_writer_str(vmdesc, "name", se->idstr);
        json_writer_int64(vmdesc, "instance_id", se->instance_id);

        save_section_header(f, se, QEMU_VM_SECTION_FULL);
        ret = vmstate_save(f, se, vmdesc);
        if (ret) {
            qemu_file_set_error(f, ret);
            return ret;
        }
        trace_savevm_section_end(se->idstr, se->section_id, 0);
        save_section_footer(f, se);

        json_writer_end_object(vmdesc);
    }

    if (inactivate_disks) {
        /* Inactivate before sending QEMU_VM_EOF so that the
         * bdrv_invalidate_cache_all() on the other end won't fail. */
        ret = bdrv_inactivate_all();
        if (ret) {
            error_report("%s: bdrv_inactivate_all() failed (%d)",
                         __func__, ret);
            qemu_file_set_error(f, ret);
            return ret;
        }
    }
    if (!in_postcopy) {
        /* Postcopy stream will still be going */
        qemu_put_byte(f, QEMU_VM_EOF);
    }

    json_writer_end_array(vmdesc);
    json_writer_end_object(vmdesc);
    vmdesc_len = strlen(json_writer_get(vmdesc));

    if (should_send_vmdesc()) {
        qemu_put_byte(f, QEMU_VM_VMDESCRIPTION);
        qemu_put_be32(f, vmdesc_len);
        qemu_put_buffer(f, (uint8_t *)json_writer_get(vmdesc), vmdesc_len);
    }

    return 0;
}
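
/*
 * Illustrative sketch (not part of the original file): the vmdesc built
 * above is a JSON document roughly of this shape, with per-device field
 * lists filled in by vmstate_save():
 *
 *   {
 *     "page_size": 4096,
 *     "devices": [
 *       { "name": "timer", "instance_id": 0, "fields": [ ... ] },
 *       { "name": "ram",   "instance_id": 0, "fields": [ ... ] }
 *     ]
 *   }
 */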
1465
1466int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
1467                                       bool inactivate_disks)
1468{
1469    int ret;
1470    Error *local_err = NULL;
1471    bool in_postcopy = migration_in_postcopy();
1472
1473    if (precopy_notify(PRECOPY_NOTIFY_COMPLETE, &local_err)) {
1474        error_report_err(local_err);
1475    }
1476
1477    trace_savevm_state_complete_precopy();
1478
1479    cpu_synchronize_all_states();
1480
1481    if (!in_postcopy || iterable_only) {
1482        ret = qemu_savevm_state_complete_precopy_iterable(f, in_postcopy);
1483        if (ret) {
1484            return ret;
1485        }
1486    }
1487
1488    if (iterable_only) {
1489        goto flush;
1490    }
1491
1492    ret = qemu_savevm_state_complete_precopy_non_iterable(f, in_postcopy,
1493                                                          inactivate_disks);
1494    if (ret) {
1495        return ret;
1496    }
1497
1498flush:
1499    qemu_fflush(f);
1500    return 0;
1501}
1502
1503/* Give an estimate of the amount left to be transferred,
1504 * the result is split into the amount for units that can and
1505 * for units that can't do postcopy.
1506 */
1507void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
1508                               uint64_t *res_precopy_only,
1509                               uint64_t *res_compatible,
1510                               uint64_t *res_postcopy_only)
1511{
1512    SaveStateEntry *se;
1513
1514    *res_precopy_only = 0;
1515    *res_compatible = 0;
1516    *res_postcopy_only = 0;
1517
1518
1519    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1520        if (!se->ops || !se->ops->save_live_pending) {
1521            continue;
1522        }
1523        if (se->ops->is_active) {
1524            if (!se->ops->is_active(se->opaque)) {
1525                continue;
1526            }
1527        }
1528        se->ops->save_live_pending(f, se->opaque, threshold_size,
1529                                   res_precopy_only, res_compatible,
1530                                   res_postcopy_only);
1531    }
1532}
1533
1534void qemu_savevm_state_cleanup(void)
1535{
1536    SaveStateEntry *se;
1537    Error *local_err = NULL;
1538
1539    if (precopy_notify(PRECOPY_NOTIFY_CLEANUP, &local_err)) {
1540        error_report_err(local_err);
1541    }
1542
1543    trace_savevm_state_cleanup();
1544    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1545        if (se->ops && se->ops->save_cleanup) {
1546            se->ops->save_cleanup(se->opaque);
1547        }
1548    }
1549}
1550
1551static int qemu_savevm_state(QEMUFile *f, Error **errp)
1552{
1553    int ret;
1554    MigrationState *ms = migrate_get_current();
1555    MigrationStatus status;
1556
1557    if (migration_is_running(ms->state)) {
1558        error_setg(errp, QERR_MIGRATION_ACTIVE);
1559        return -EINVAL;
1560    }
1561
1562    if (migrate_use_block()) {
1563        error_setg(errp, "Block migration and snapshots are incompatible");
1564        return -EINVAL;
1565    }
1566
1567    migrate_init(ms);
1568    memset(&ram_counters, 0, sizeof(ram_counters));
1569    ms->to_dst_file = f;
1570
1571    qemu_mutex_unlock_iothread();
1572    qemu_savevm_state_header(f);
1573    qemu_savevm_state_setup(f);
1574    qemu_mutex_lock_iothread();
1575
1576    while (qemu_file_get_error(f) == 0) {
1577        if (qemu_savevm_state_iterate(f, false) > 0) {
1578            break;
1579        }
1580    }
1581
1582    ret = qemu_file_get_error(f);
1583    if (ret == 0) {
1584        qemu_savevm_state_complete_precopy(f, false, false);
1585        ret = qemu_file_get_error(f);
1586    }
1587    qemu_savevm_state_cleanup();
1588    if (ret != 0) {
1589        error_setg_errno(errp, -ret, "Error while writing VM state");
1590    }
1591
1592    if (ret != 0) {
1593        status = MIGRATION_STATUS_FAILED;
1594    } else {
1595        status = MIGRATION_STATUS_COMPLETED;
1596    }
1597    migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, status);
1598
1599    /* f is outer parameter, it should not stay in global migration state after
1600     * this function finished */
1601    ms->to_dst_file = NULL;
1602
1603    return ret;
1604}
1605
1606void qemu_savevm_live_state(QEMUFile *f)
1607{
1608    /* save QEMU_VM_SECTION_END section */
1609    qemu_savevm_state_complete_precopy(f, true, false);
1610    qemu_put_byte(f, QEMU_VM_EOF);
1611}
1612
1613int qemu_save_device_state(QEMUFile *f)
1614{
1615    SaveStateEntry *se;
1616
1617    if (!migration_in_colo_state()) {
1618        qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1619        qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1620    }
1621    cpu_synchronize_all_states();
1622
1623    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1624        int ret;
1625
1626        if (se->is_ram) {
1627            continue;
1628        }
1629        if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1630            continue;
1631        }
1632        if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1633            continue;
1634        }
1635
1636        save_section_header(f, se, QEMU_VM_SECTION_FULL);
1637
1638        ret = vmstate_save(f, se, NULL);
1639        if (ret) {
1640            return ret;
1641        }
1642
1643        save_section_footer(f, se);
1644    }
1645
1646    qemu_put_byte(f, QEMU_VM_EOF);
1647
1648    return qemu_file_get_error(f);
1649}
1650
1651static SaveStateEntry *find_se(const char *idstr, uint32_t instance_id)
1652{
1653    SaveStateEntry *se;
1654
1655    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1656        if (!strcmp(se->idstr, idstr) &&
1657            (instance_id == se->instance_id ||
1658             instance_id == se->alias_id))
1659            return se;
1660        /* Migrating from an older version? */
1661        if (strstr(se->idstr, idstr) && se->compat) {
1662            if (!strcmp(se->compat->idstr, idstr) &&
1663                (instance_id == se->compat->instance_id ||
1664                 instance_id == se->alias_id))
1665                return se;
1666        }
1667    }
1668    return NULL;
1669}
1670
1671enum LoadVMExitCodes {
1672    /* Allow a command to quit all layers of nested loadvm loops */
1673    LOADVM_QUIT     =  1,
1674};
1675
1676/* ------ incoming postcopy messages ------ */
1677/* 'advise' arrives before any transfers just to tell us that a postcopy
1678 * *might* happen - it might be skipped if precopy transferred everything
1679 * quickly.
1680 */
1681static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
1682                                         uint16_t len)
1683{
1684    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
1685    uint64_t remote_pagesize_summary, local_pagesize_summary, remote_tps;
1686    Error *local_err = NULL;
1687
1688    trace_loadvm_postcopy_handle_advise();
1689    if (ps != POSTCOPY_INCOMING_NONE) {
1690        error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
1691        return -1;
1692    }
1693
1694    switch (len) {
1695    case 0:
1696        if (migrate_postcopy_ram()) {
1697            error_report("RAM postcopy is enabled but advise is 0 bytes");
1698            return -EINVAL;
1699        }
1700        return 0;
1701    case 8 + 8:
1702        if (!migrate_postcopy_ram()) {
1703            error_report("RAM postcopy is disabled but advise is 16 bytes");
1704            return -EINVAL;
1705        }
1706        break;
1707    default:
1708        error_report("CMD_POSTCOPY_ADVISE invalid length (%d)", len);
1709        return -EINVAL;
1710    }
1711
1712    if (!postcopy_ram_supported_by_host(mis)) {
1713        postcopy_state_set(POSTCOPY_INCOMING_NONE);
1714        return -1;
1715    }
1716
1717    remote_pagesize_summary = qemu_get_be64(mis->from_src_file);
1718    local_pagesize_summary = ram_pagesize_summary();
1719
1720    if (remote_pagesize_summary != local_pagesize_summary)  {
1721        /*
1722         * This detects two potential causes of mismatch:
1723         *   a) A mismatch in host page sizes
1724         *      Some combinations of mismatch are probably possible but it gets
1725         *      a bit more complicated.  In particular we need to place whole
1726         *      host pages on the dest at once, and we need to ensure that we
1727         *      handle dirtying to make sure we never end up sending part of
1728         *      a hostpage on its own.
1729         *   b) The use of different huge page sizes on source/destination.
1730         *      A finer-grained test is performed during RAM block migration,
1731         *      but this test gives a clear early failure and also fails
1732         *      when passed to an older qemu that doesn't support
1733         *      huge pages.
1734         */
1735        error_report("Postcopy needs matching RAM page sizes (s=%" PRIx64
1736                                                             " d=%" PRIx64 ")",
1737                     remote_pagesize_summary, local_pagesize_summary);
1738        return -1;
1739    }
1740
1741    remote_tps = qemu_get_be64(mis->from_src_file);
1742    if (remote_tps != qemu_target_page_size()) {
1743        /*
1744         * Again, some differences could be dealt with, but for now keep it
1745         * simple.
1746         */
1747        error_report("Postcopy needs matching target page sizes (s=%d d=%zd)",
1748                     (int)remote_tps, qemu_target_page_size());
1749        return -1;
1750    }
1751
1752    if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_ADVISE, &local_err)) {
1753        error_report_err(local_err);
1754        return -1;
1755    }
1756
1757    if (ram_postcopy_incoming_init(mis)) {
1758        return -1;
1759    }
1760
1761    return 0;
1762}
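
/*
 * For reference, the ADVISE payload as consumed above is either empty
 * (when RAM postcopy is disabled) or two big-endian 64-bit values:
 *
 *     be64 remote pagesize summary  (ram_pagesize_summary() on the source)
 *     be64 remote target page size  (qemu_target_page_size() on the source)
 *
 * Both must match the destination's values for postcopy to proceed.
 */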
1763
1764/* After postcopy we will be told to throw some pages away since they're
1765 * dirty and will have to be demand fetched.  Must happen before CPU is
1766 * started.
1767 * There can be 0..many of these messages, each encoding multiple pages.
1768 */
1769static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
1770                                              uint16_t len)
1771{
1772    int tmp;
1773    char ramid[256];
1774    PostcopyState ps = postcopy_state_get();
1775
1776    trace_loadvm_postcopy_ram_handle_discard();
1777
1778    switch (ps) {
1779    case POSTCOPY_INCOMING_ADVISE:
1780        /* 1st discard */
1781        tmp = postcopy_ram_prepare_discard(mis);
1782        if (tmp) {
1783            return tmp;
1784        }
1785        break;
1786
1787    case POSTCOPY_INCOMING_DISCARD:
1788        /* Expected state */
1789        break;
1790
1791    default:
1792        error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
1793                     ps);
1794        return -1;
1795    }
1796    /* We're expecting a
1797     *    version byte (0)
1798     *    a RAM ID string (length byte, name, nul terminator)
1799     *    then at least one 16-byte chunk
1800     */
1801    if (len < (1 + 1 + 1 + 1 + 2 * 8)) {
1802        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1803        return -1;
1804    }
1805
1806    tmp = qemu_get_byte(mis->from_src_file);
1807    if (tmp != postcopy_ram_discard_version) {
1808        error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
1809        return -1;
1810    }
1811
1812    if (!qemu_get_counted_string(mis->from_src_file, ramid)) {
1813        error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
1814        return -1;
1815    }
1816    tmp = qemu_get_byte(mis->from_src_file);
1817    if (tmp != 0) {
1818        error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp);
1819        return -1;
1820    }
1821
1822    len -= 3 + strlen(ramid);
1823    if (len % 16) {
1824        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1825        return -1;
1826    }
1827    trace_loadvm_postcopy_ram_handle_discard_header(ramid, len);
1828    while (len) {
1829        uint64_t start_addr, block_length;
1830        start_addr = qemu_get_be64(mis->from_src_file);
1831        block_length = qemu_get_be64(mis->from_src_file);
1832
1833        len -= 16;
1834        int ret = ram_discard_range(ramid, start_addr, block_length);
1835        if (ret) {
1836            return ret;
1837        }
1838    }
1839    trace_loadvm_postcopy_ram_handle_discard_end();
1840
1841    return 0;
1842}
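
/*
 * For reference, the RAM_DISCARD payload as parsed above:
 *
 *     byte  version (postcopy_ram_discard_version)
 *     byte  length of the RAMBlock ID string
 *     bytes RAMBlock ID string
 *     byte  0 (nul terminator)
 *     one or more 16-byte chunks of:
 *         be64 start address (offset within the RAMBlock)
 *         be64 block length
 */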
1843
1844/*
1845 * Triggered by a postcopy_listen command; this thread takes over reading
1846 * the input stream, leaving the main thread free to carry on loading the rest
1847 * of the device state (from RAM).
1848 * (TODO: This could do with being in a postcopy file, but then again it's
1849 * just another input loop, not that postcopy specific.)
1850 */
1851static void *postcopy_ram_listen_thread(void *opaque)
1852{
1853    MigrationIncomingState *mis = migration_incoming_get_current();
1854    QEMUFile *f = mis->from_src_file;
1855    int load_res;
1856    MigrationState *migr = migrate_get_current();
1857
1858    object_ref(OBJECT(migr));
1859
1860    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
1861                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);
1862    qemu_sem_post(&mis->listen_thread_sem);
1863    trace_postcopy_ram_listen_thread_start();
1864
1865    rcu_register_thread();
1866    /*
1867     * Because we're a thread and not a coroutine we can't yield
1868     * in qemu_file, and thus we must be blocking now.
1869     */
1870    qemu_file_set_blocking(f, true);
1871    load_res = qemu_loadvm_state_main(f, mis);
1872
1873    /*
1874     * This is tricky: mis->from_src_file can change after
1875     * qemu_loadvm_state_main() returns if postcopy recovery happened.
1876     * In the future, we may want a wrapper for the QEMUFile handle.
1877     */
1878    f = mis->from_src_file;
1879
1880    /* And non-blocking again so we don't block in any cleanup */
1881    qemu_file_set_blocking(f, false);
1882
1883    trace_postcopy_ram_listen_thread_exit();
1884    if (load_res < 0) {
1885        qemu_file_set_error(f, load_res);
1886        dirty_bitmap_mig_cancel_incoming();
1887        if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
1888            !migrate_postcopy_ram() && migrate_dirty_bitmaps())
1889        {
1890            error_report("%s: loadvm failed during postcopy: %d. All states "
1891                         "are migrated except dirty bitmaps. Some dirty "
1892                         "bitmaps may be lost, and present migrated dirty "
1893                         "bitmaps are correctly migrated and valid.",
1894                         __func__, load_res);
1895            load_res = 0; /* prevent further exit() */
1896        } else {
1897            error_report("%s: loadvm failed: %d", __func__, load_res);
1898            migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1899                                           MIGRATION_STATUS_FAILED);
1900        }
1901    }
1902    if (load_res >= 0) {
1903        /*
1904         * This looks good, but it's possible that the device loading in the
1905         * main thread hasn't finished yet, and so we might not be in 'RUN'
1906         * state yet; wait for the end of the main thread.
1907         */
1908        qemu_event_wait(&mis->main_thread_load_event);
1909    }
1910    postcopy_ram_incoming_cleanup(mis);
1911
1912    if (load_res < 0) {
1913        /*
1914         * If something went wrong then we have a bad state so exit;
1915         * depending how far we got it might be possible at this point
1916         * to leave the guest running and fire MCEs for pages that never
1917         * arrived as a desperate recovery step.
1918         */
1919        rcu_unregister_thread();
1920        exit(EXIT_FAILURE);
1921    }
1922
1923    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1924                                   MIGRATION_STATUS_COMPLETED);
1925    /*
1926     * If everything has worked fine, then the main thread has waited
1927     * for us to start, and we're the last use of the mis.
1928     * (If something broke then qemu will have to exit anyway since it's
1929     * got a bad migration state).
1930     */
1931    migration_incoming_state_destroy();
1932    qemu_loadvm_state_cleanup();
1933
1934    rcu_unregister_thread();
1935    mis->have_listen_thread = false;
1936    postcopy_state_set(POSTCOPY_INCOMING_END);
1937
1938    object_unref(OBJECT(migr));
1939
1940    return NULL;
1941}
1942
1943/* After this message we must be able to immediately receive postcopy data */
1944static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
1945{
1946    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
1947    Error *local_err = NULL;
1948    trace_loadvm_postcopy_handle_listen();
1949
1950    if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
1951        error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
1952        return -1;
1953    }
1954    if (ps == POSTCOPY_INCOMING_ADVISE) {
1955        /*
1956         * A rare case, we entered listen without having to do any discards,
1957         * so do the setup that's normally done at the time of the 1st discard.
1958         */
1959        if (migrate_postcopy_ram() && postcopy_ram_prepare_discard(mis)) {
1960            return -1;
1961        }
1962    }
1963
1964    /*
1965     * Sensitise RAM - can now generate requests for blocks that don't exist.
1966     * However, at this point the CPU shouldn't be running, and the I/O
1967     * shouldn't be doing anything yet, so we don't actually expect requests.
1968     */
1969    if (migrate_postcopy_ram()) {
1970        if (postcopy_ram_incoming_setup(mis)) {
1971            postcopy_ram_incoming_cleanup(mis);
1972            return -1;
1973        }
1974    }
1975
1976    if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_LISTEN, &local_err)) {
1977        error_report_err(local_err);
1978        return -1;
1979    }
1980
1981    mis->have_listen_thread = true;
1982    /* Start up the listening thread and wait for it to signal ready */
1983    qemu_sem_init(&mis->listen_thread_sem, 0);
1984    qemu_thread_create(&mis->listen_thread, "postcopy/listen",
1985                       postcopy_ram_listen_thread, NULL,
1986                       QEMU_THREAD_DETACHED);
1987    qemu_sem_wait(&mis->listen_thread_sem);
1988    qemu_sem_destroy(&mis->listen_thread_sem);
1989
1990    return 0;
1991}
1992
1993static void loadvm_postcopy_handle_run_bh(void *opaque)
1994{
1995    Error *local_err = NULL;
1996    MigrationIncomingState *mis = opaque;
1997
1998    /* TODO: we should move all of this lot into postcopy_ram.c or shared
1999     * code in migration.c
2000     */
2001    cpu_synchronize_all_post_init();
2002
2003    qemu_announce_self(&mis->announce_timer, migrate_announce_params());
2004
2005    /* Make sure all file formats flush their mutable metadata.
2006     * If we get an error here, just don't restart the VM yet. */
2007    bdrv_invalidate_cache_all(&local_err);
2008    if (local_err) {
2009        error_report_err(local_err);
2010        local_err = NULL;
2011        autostart = false;
2012    }
2013
2014    trace_loadvm_postcopy_handle_run_cpu_sync();
2015
2016    trace_loadvm_postcopy_handle_run_vmstart();
2017
2018    dirty_bitmap_mig_before_vm_start();
2019
2020    if (autostart) {
2021        /* Hold onto your hats, starting the CPU */
2022        vm_start();
2023    } else {
2024        /* leave it paused and let management decide when to start the CPU */
2025        runstate_set(RUN_STATE_PAUSED);
2026    }
2027
2028    qemu_bh_delete(mis->bh);
2029}
2030
2031/* After all discards we can start running and asking for pages */
2032static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
2033{
2034    PostcopyState ps = postcopy_state_get();
2035
2036    trace_loadvm_postcopy_handle_run();
2037    if (ps != POSTCOPY_INCOMING_LISTENING) {
2038        error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
2039        return -1;
2040    }
2041
2042    postcopy_state_set(POSTCOPY_INCOMING_RUNNING);
2043    mis->bh = qemu_bh_new(loadvm_postcopy_handle_run_bh, mis);
2044    qemu_bh_schedule(mis->bh);
2045
2046    /* We need to finish reading the stream from the package
2047     * and also stop reading anything more from the stream that loaded the
2048     * package (since it's now being read by the listener thread).
2049     * LOADVM_QUIT will quit all the layers of nested loadvm loops.
2050     */
2051    return LOADVM_QUIT;
2052}
2053
2054/* Must be called with page_request_mutex held */
2055static gboolean postcopy_sync_page_req(gpointer key, gpointer value,
2056                                       gpointer data)
2057{
2058    MigrationIncomingState *mis = data;
2059    void *host_addr = (void *) key;
2060    ram_addr_t rb_offset;
2061    RAMBlock *rb;
2062    int ret;
2063
2064    rb = qemu_ram_block_from_host(host_addr, true, &rb_offset);
2065    if (!rb) {
2066        /*
2067         * This should _never_ happen.  However, be nice to a migrating VM and
2068         * don't crash/assert.  Post an error (we deliberately avoid *_once
2069         * because we do want to see all the illegal addresses; the guest can
2070         * never trigger this, so we're safe) and move on to the next entry.
2071         */
2072        error_report("%s: illegal host addr %p", __func__, host_addr);
2073        /* Try the next entry */
2074        return FALSE;
2075    }
2076
2077    ret = migrate_send_rp_message_req_pages(mis, rb, rb_offset);
2078    if (ret) {
2079        /* Please refer to above comment. */
2080        error_report("%s: send rp message failed for addr %p",
2081                     __func__, host_addr);
2082        return FALSE;
2083    }
2084
2085    trace_postcopy_page_req_sync(host_addr);
2086
2087    return FALSE;
2088}
2089
2090static void migrate_send_rp_req_pages_pending(MigrationIncomingState *mis)
2091{
2092    WITH_QEMU_LOCK_GUARD(&mis->page_request_mutex) {
2093        g_tree_foreach(mis->page_requested, postcopy_sync_page_req, mis);
2094    }
2095}
2096
2097static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis)
2098{
2099    if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
2100        error_report("%s: illegal resume received", __func__);
2101        /* Don't fail the whole load just because of this. */
2102        return 0;
2103    }
2104
2105    /*
2106     * Reset the last_rb before we resend any page req to source again, since
2107     * the source should have it reset already.
2108     */
2109    mis->last_rb = NULL;
2110
2111    /*
2112     * This means the source VM is ready to resume the postcopy migration.
2113     */
2114    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
2115                      MIGRATION_STATUS_POSTCOPY_ACTIVE);
2116
2117    trace_loadvm_postcopy_handle_resume();
2118
2119    /* Tell source that "we are ready" */
2120    migrate_send_rp_resume_ack(mis, MIGRATION_RESUME_ACK_VALUE);
2121
2122    /*
2123     * After a postcopy recovery, the source should have lost the postcopy
2124     * queue, or potentially the requested pages could have been lost during
2125     * the network down phase.  Let's re-sync with the source VM by re-sending
2126     * all the pending pages that we eagerly need, so these threads won't get
2127     * blocked too long due to the recovery.
2128     *
2129     * Without this procedure, the faulted destination VM threads (waiting
2130     * for page requests right before the postcopy was interrupted) can hang
2131     * until the source sends those pages during background copying, or until
2132     * another thread happens to fault on the same address.
2133     */
2134    migrate_send_rp_req_pages_pending(mis);
2135
2136    /*
2137     * It's time to switch state and release the fault thread to continue
2138     * servicing page faults.  Note that this must happen after the
2139     * above call to migrate_send_rp_req_pages_pending().  In short:
2140     * migrate_send_rp_message_req_pages() is not thread safe, yet.
2141     */
2142    qemu_sem_post(&mis->postcopy_pause_sem_fault);
2143
2144    return 0;
2145}
2146
2147/**
2148 * Immediately following this command is a blob of data containing an embedded
2149 * chunk of migration stream; read it and load it.  The length of the
2150 * packaged data is read from the stream itself.
2151 *
2152 * @mis: Incoming state
2153 *
2154 * Returns: Negative values on error
2155 *
2156 */
2157static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
2158{
2159    int ret;
2160    size_t length;
2161    QIOChannelBuffer *bioc;
2162
2163    length = qemu_get_be32(mis->from_src_file);
2164    trace_loadvm_handle_cmd_packaged(length);
2165
2166    if (length > MAX_VM_CMD_PACKAGED_SIZE) {
2167        error_report("Unreasonably large packaged state: %zu", length);
2168        return -1;
2169    }
2170
2171    bioc = qio_channel_buffer_new(length);
2172    qio_channel_set_name(QIO_CHANNEL(bioc), "migration-loadvm-buffer");
2173    ret = qemu_get_buffer(mis->from_src_file,
2174                          bioc->data,
2175                          length);
2176    if (ret != length) {
2177        object_unref(OBJECT(bioc));
2178        error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%zu",
2179                     ret, length);
2180        return (ret < 0) ? ret : -EAGAIN;
2181    }
2182    bioc->usage += length;
2183    trace_loadvm_handle_cmd_packaged_received(ret);
2184
2185    QEMUFile *packf = qemu_fopen_channel_input(QIO_CHANNEL(bioc));
2186
2187    ret = qemu_loadvm_state_main(packf, mis);
2188    trace_loadvm_handle_cmd_packaged_main(ret);
2189    qemu_fclose(packf);
2190    object_unref(OBJECT(bioc));
2191
2192    return ret;
2193}
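
/*
 * For reference, the CMD_PACKAGED payload as consumed above:
 *
 *     be32 length (capped at MAX_VM_CMD_PACKAGED_SIZE)
 *     byte data[length], itself a complete migration stream that is
 *          replayed through qemu_loadvm_state_main()
 *
 * This lets the source wrap a chunk of migration stream (such as the
 * device state sent around the start of postcopy) inside the main stream.
 */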
2194
2195/*
2196 * Handle a request from the source for the recved_bitmap of a RAMBlock
2197 * on the destination. Payload format:
2198 *
2199 * len (1 byte) + ramblock_name (<255 bytes)
2200 */
2201static int loadvm_handle_recv_bitmap(MigrationIncomingState *mis,
2202                                     uint16_t len)
2203{
2204    QEMUFile *file = mis->from_src_file;
2205    RAMBlock *rb;
2206    char block_name[256];
2207    size_t cnt;
2208
2209    cnt = qemu_get_counted_string(file, block_name);
2210    if (!cnt) {
2211        error_report("%s: failed to read block name", __func__);
2212        return -EINVAL;
2213    }
2214
2215    /* Validate before using the data */
2216    if (qemu_file_get_error(file)) {
2217        return qemu_file_get_error(file);
2218    }
2219
2220    if (len != cnt + 1) {
2221        error_report("%s: invalid payload length (%d)", __func__, len);
2222        return -EINVAL;
2223    }
2224
2225    rb = qemu_ram_block_by_name(block_name);
2226    if (!rb) {
2227        error_report("%s: block '%s' not found", __func__, block_name);
2228        return -EINVAL;
2229    }
2230
2231    migrate_send_rp_recv_bitmap(mis, block_name);
2232
2233    trace_loadvm_handle_recv_bitmap(block_name);
2234
2235    return 0;
2236}
2237
2238static int loadvm_process_enable_colo(MigrationIncomingState *mis)
2239{
2240    int ret = migration_incoming_enable_colo();
2241
2242    if (!ret) {
2243        ret = colo_init_ram_cache();
2244        if (ret) {
2245            migration_incoming_disable_colo();
2246        }
2247    }
2248    return ret;
2249}
2250
2251/*
2252 * Process an incoming 'QEMU_VM_COMMAND'
2253 * 0           just a normal return
2254 * LOADVM_QUIT All good, but exit the loop
2255 * <0          Error
2256 */
2257static int loadvm_process_command(QEMUFile *f)
2258{
2259    MigrationIncomingState *mis = migration_incoming_get_current();
2260    uint16_t cmd;
2261    uint16_t len;
2262    uint32_t tmp32;
2263
2264    cmd = qemu_get_be16(f);
2265    len = qemu_get_be16(f);
2266
2267    /* Check validity before continuing to process commands */
2268    if (qemu_file_get_error(f)) {
2269        return qemu_file_get_error(f);
2270    }
2271
2272    trace_loadvm_process_command(cmd, len);
2273    if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
2274        error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
2275        return -EINVAL;
2276    }
2277
2278    if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {
2279        error_report("%s received with bad length - expecting %zu, got %d",
2280                     mig_cmd_args[cmd].name,
2281                     (size_t)mig_cmd_args[cmd].len, len);
2282        return -ERANGE;
2283    }
2284
2285    switch (cmd) {
2286    case MIG_CMD_OPEN_RETURN_PATH:
2287        if (mis->to_src_file) {
2288            error_report("CMD_OPEN_RETURN_PATH called when RP already open");
2289            /* Not really a problem, so don't give up */
2290            return 0;
2291        }
2292        mis->to_src_file = qemu_file_get_return_path(f);
2293        if (!mis->to_src_file) {
2294            error_report("CMD_OPEN_RETURN_PATH failed");
2295            return -1;
2296        }
2297        break;
2298
2299    case MIG_CMD_PING:
2300        tmp32 = qemu_get_be32(f);
2301        trace_loadvm_process_command_ping(tmp32);
2302        if (!mis->to_src_file) {
2303            error_report("CMD_PING (0x%x) received with no return path",
2304                         tmp32);
2305            return -1;
2306        }
2307        migrate_send_rp_pong(mis, tmp32);
2308        break;
2309
2310    case MIG_CMD_PACKAGED:
2311        return loadvm_handle_cmd_packaged(mis);
2312
2313    case MIG_CMD_POSTCOPY_ADVISE:
2314        return loadvm_postcopy_handle_advise(mis, len);
2315
2316    case MIG_CMD_POSTCOPY_LISTEN:
2317        return loadvm_postcopy_handle_listen(mis);
2318
2319    case MIG_CMD_POSTCOPY_RUN:
2320        return loadvm_postcopy_handle_run(mis);
2321
2322    case MIG_CMD_POSTCOPY_RAM_DISCARD:
2323        return loadvm_postcopy_ram_handle_discard(mis, len);
2324
2325    case MIG_CMD_POSTCOPY_RESUME:
2326        return loadvm_postcopy_handle_resume(mis);
2327
2328    case MIG_CMD_RECV_BITMAP:
2329        return loadvm_handle_recv_bitmap(mis, len);
2330
2331    case MIG_CMD_ENABLE_COLO:
2332        return loadvm_process_enable_colo(mis);
2333    }
2334
2335    return 0;
2336}
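
/*
 * For reference, the QEMU_VM_COMMAND packet as parsed above:
 *
 *     be16 cmd (one of the MIG_CMD_* values)
 *     be16 len (payload length; validated against mig_cmd_args[cmd].len
 *               unless that is -1, which means variable length)
 *     byte data[len]
 */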
2337
2338/*
2339 * Read a footer off the wire and check that it matches the expected section
2340 *
2341 * Returns: true if the footer was good
2342 *          false if there is a problem (and calls error_report to say why)
2343 */
2344static bool check_section_footer(QEMUFile *f, SaveStateEntry *se)
2345{
2346    int ret;
2347    uint8_t read_mark;
2348    uint32_t read_section_id;
2349
2350    if (!migrate_get_current()->send_section_footer) {
2351        /* No footer to check */
2352        return true;
2353    }
2354
2355    read_mark = qemu_get_byte(f);
2356
2357    ret = qemu_file_get_error(f);
2358    if (ret) {
2359        error_report("%s: Read section footer failed: %d",
2360                     __func__, ret);
2361        return false;
2362    }
2363
2364    if (read_mark != QEMU_VM_SECTION_FOOTER) {
2365        error_report("Missing section footer for %s", se->idstr);
2366        return false;
2367    }
2368
2369    read_section_id = qemu_get_be32(f);
2370    if (read_section_id != se->load_section_id) {
2371        error_report("Mismatched section id in footer for %s -"
2372                     " read 0x%x expected 0x%x",
2373                     se->idstr, read_section_id, se->load_section_id);
2374        return false;
2375    }
2376
2377    /* All good */
2378    return true;
2379}
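
/*
 * For reference, the footer as read above (only present when the source
 * sets send_section_footer):
 *
 *     byte QEMU_VM_SECTION_FOOTER
 *     be32 section id (must match the id from the section header)
 */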
2380
2381static int
2382qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis)
2383{
2384    uint32_t instance_id, version_id, section_id;
2385    SaveStateEntry *se;
2386    char idstr[256];
2387    int ret;
2388
2389    /* Read section start */
2390    section_id = qemu_get_be32(f);
2391    if (!qemu_get_counted_string(f, idstr)) {
2392        error_report("Unable to read ID string for section %u",
2393                     section_id);
2394        return -EINVAL;
2395    }
2396    instance_id = qemu_get_be32(f);
2397    version_id = qemu_get_be32(f);
2398
2399    ret = qemu_file_get_error(f);
2400    if (ret) {
2401        error_report("%s: Failed to read instance/version ID: %d",
2402                     __func__, ret);
2403        return ret;
2404    }
2405
2406    trace_qemu_loadvm_state_section_startfull(section_id, idstr,
2407            instance_id, version_id);
2408    /* Find savevm section */
2409    se = find_se(idstr, instance_id);
2410    if (se == NULL) {
2411        error_report("Unknown savevm section or instance '%s' %"PRIu32". "
2412                     "Make sure that your current VM setup matches your "
2413                     "saved VM setup, including any hotplugged devices",
2414                     idstr, instance_id);
2415        return -EINVAL;
2416    }
2417
2418    /* Validate version */
2419    if (version_id > se->version_id) {
2420        error_report("savevm: unsupported version %d for '%s' v%d",
2421                     version_id, idstr, se->version_id);
2422        return -EINVAL;
2423    }
2424    se->load_version_id = version_id;
2425    se->load_section_id = section_id;
2426
2427    /* Validate if it is a device's state */
2428    if (xen_enabled() && se->is_ram) {
2429        error_report("loadvm: %s RAM loading not allowed on Xen", idstr);
2430        return -EINVAL;
2431    }
2432
2433    ret = vmstate_load(f, se);
2434    if (ret < 0) {
2435        error_report("error while loading state for instance 0x%"PRIx32" of"
2436                     " device '%s'", instance_id, idstr);
2437        return ret;
2438    }
2439    if (!check_section_footer(f, se)) {
2440        return -EINVAL;
2441    }
2442
2443    return 0;
2444}
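
/*
 * For reference, the SECTION_START/SECTION_FULL header as parsed above:
 *
 *     be32 section id
 *     byte idstr length, followed by the idstr bytes
 *     be32 instance id
 *     be32 version id
 *
 * followed by the device's vmstate and, optionally, a section footer.
 */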
2445
2446static int
2447qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis)
2448{
2449    uint32_t section_id;
2450    SaveStateEntry *se;
2451    int ret;
2452
2453    section_id = qemu_get_be32(f);
2454
2455    ret = qemu_file_get_error(f);
2456    if (ret) {
2457        error_report("%s: Failed to read section ID: %d",
2458                     __func__, ret);
2459        return ret;
2460    }
2461
2462    trace_qemu_loadvm_state_section_partend(section_id);
2463    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2464        if (se->load_section_id == section_id) {
2465            break;
2466        }
2467    }
2468    if (se == NULL) {
2469        error_report("Unknown savevm section %d", section_id);
2470        return -EINVAL;
2471    }
2472
2473    ret = vmstate_load(f, se);
2474    if (ret < 0) {
2475        error_report("error while loading state section id %d(%s)",
2476                     section_id, se->idstr);
2477        return ret;
2478    }
2479    if (!check_section_footer(f, se)) {
2480        return -EINVAL;
2481    }
2482
2483    return 0;
2484}
2485
2486static int qemu_loadvm_state_header(QEMUFile *f)
2487{
2488    unsigned int v;
2489    int ret;
2490
2491    v = qemu_get_be32(f);
2492    if (v != QEMU_VM_FILE_MAGIC) {
2493        error_report("Not a migration stream");
2494        return -EINVAL;
2495    }
2496
2497    v = qemu_get_be32(f);
2498    if (v == QEMU_VM_FILE_VERSION_COMPAT) {
2499        error_report("SaveVM v2 format is obsolete and don't work anymore");
2500        return -ENOTSUP;
2501    }
2502    if (v != QEMU_VM_FILE_VERSION) {
2503        error_report("Unsupported migration stream version");
2504        return -ENOTSUP;
2505    }
2506
2507    if (migrate_get_current()->send_configuration) {
2508        if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
2509            error_report("Configuration section missing");
2510            qemu_loadvm_state_cleanup();
2511            return -EINVAL;
2512        }
2513        ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
2514
2515        if (ret) {
2516            qemu_loadvm_state_cleanup();
2517            return ret;
2518        }
2519    }
2520    return 0;
2521}
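
/*
 * For reference, the stream header as validated above:
 *
 *     be32 QEMU_VM_FILE_MAGIC
 *     be32 QEMU_VM_FILE_VERSION
 *     optionally, when the source sends a configuration:
 *         byte QEMU_VM_CONFIGURATION followed by the configuration vmstate
 */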
2522
2523static int qemu_loadvm_state_setup(QEMUFile *f)
2524{
2525    SaveStateEntry *se;
2526    int ret;
2527
2528    trace_loadvm_state_setup();
2529    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2530        if (!se->ops || !se->ops->load_setup) {
2531            continue;
2532        }
2533        if (se->ops->is_active) {
2534            if (!se->ops->is_active(se->opaque)) {
2535                continue;
2536            }
2537        }
2538
2539        ret = se->ops->load_setup(f, se->opaque);
2540        if (ret < 0) {
2541            qemu_file_set_error(f, ret);
2542            error_report("Load state of device %s failed", se->idstr);
2543            return ret;
2544        }
2545    }
2546    return 0;
2547}
2548
2549void qemu_loadvm_state_cleanup(void)
2550{
2551    SaveStateEntry *se;
2552
2553    trace_loadvm_state_cleanup();
2554    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2555        if (se->ops && se->ops->load_cleanup) {
2556            se->ops->load_cleanup(se->opaque);
2557        }
2558    }
2559}
2560
2561/* Return true if we should continue the migration, false otherwise. */
2562static bool postcopy_pause_incoming(MigrationIncomingState *mis)
2563{
2564    trace_postcopy_pause_incoming();
2565
2566    assert(migrate_postcopy_ram());
2567
2568    /* Clear the triggered bit to allow one recovery */
2569    mis->postcopy_recover_triggered = false;
2570
2571    assert(mis->from_src_file);
2572    qemu_file_shutdown(mis->from_src_file);
2573    qemu_fclose(mis->from_src_file);
2574    mis->from_src_file = NULL;
2575
2576    assert(mis->to_src_file);
2577    qemu_file_shutdown(mis->to_src_file);
2578    qemu_mutex_lock(&mis->rp_mutex);
2579    qemu_fclose(mis->to_src_file);
2580    mis->to_src_file = NULL;
2581    qemu_mutex_unlock(&mis->rp_mutex);
2582
2583    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
2584                      MIGRATION_STATUS_POSTCOPY_PAUSED);
2585
2586    /* Notify the fault thread about the invalidated file handle */
2587    postcopy_fault_thread_notify(mis);
2588
2589    error_report("Detected IO failure for postcopy. "
2590                 "Migration paused.");
2591
2592    while (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
2593        qemu_sem_wait(&mis->postcopy_pause_sem_dst);
2594    }
2595
2596    trace_postcopy_pause_incoming_continued();
2597
2598    return true;
2599}
2600
2601int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
2602{
2603    uint8_t section_type;
2604    int ret = 0;
2605
2606retry:
2607    while (true) {
2608        section_type = qemu_get_byte(f);
2609
2610        if (qemu_file_get_error(f)) {
2611            ret = qemu_file_get_error(f);
2612            break;
2613        }
2614
2615        trace_qemu_loadvm_state_section(section_type);
2616        switch (section_type) {
2617        case QEMU_VM_SECTION_START:
2618        case QEMU_VM_SECTION_FULL:
2619            ret = qemu_loadvm_section_start_full(f, mis);
2620            if (ret < 0) {
2621                goto out;
2622            }
2623            break;
2624        case QEMU_VM_SECTION_PART:
2625        case QEMU_VM_SECTION_END:
2626            ret = qemu_loadvm_section_part_end(f, mis);
2627            if (ret < 0) {
2628                goto out;
2629            }
2630            break;
2631        case QEMU_VM_COMMAND:
2632            ret = loadvm_process_command(f);
2633            trace_qemu_loadvm_state_section_command(ret);
2634            if ((ret < 0) || (ret == LOADVM_QUIT)) {
2635                goto out;
2636            }
2637            break;
2638        case QEMU_VM_EOF:
2639            /* This is the end of migration */
2640            goto out;
2641        default:
2642            error_report("Unknown savevm section type %d", section_type);
2643            ret = -EINVAL;
2644            goto out;
2645        }
2646    }
2647
2648out:
2649    if (ret < 0) {
2650        qemu_file_set_error(f, ret);
2651
2652        /* Cancel bitmaps incoming regardless of recovery */
2653        dirty_bitmap_mig_cancel_incoming();
2654
2655        /*
2656         * If we are in an active postcopy, pause instead of bailing
2657         * out, to at least preserve the VM's dirty data.  Note that
2658         * the POSTCOPY_INCOMING_LISTENING stage is still not enough,
2659         * since we're still receiving device states there and haven't
2660         * yet started the VM on the destination.
2661         *
2662         * Only RAM postcopy supports recovery. Still, if RAM postcopy is
2663         * enabled, the cancelled bitmaps postcopy will not affect RAM
2664         * postcopy recovery.
2665         */
2666        if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
2667            migrate_postcopy_ram() && postcopy_pause_incoming(mis)) {
2668            /* Reset f to point to the newly created channel */
2669            f = mis->from_src_file;
2670            goto retry;
2671        }
2672    }
2673    return ret;
2674}
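
/*
 * Putting the pieces together, the loop above consumes a stream of the form:
 *
 *     { byte section type, section body } ...
 *     byte QEMU_VM_EOF
 *
 * where the section type selects START/FULL, PART/END or COMMAND handling,
 * and an error either fails the load or, during an active RAM postcopy,
 * pauses it so that it can be recovered later.
 */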
2675
2676int qemu_loadvm_state(QEMUFile *f)
2677{
2678    MigrationIncomingState *mis = migration_incoming_get_current();
2679    Error *local_err = NULL;
2680    int ret;
2681
2682    if (qemu_savevm_state_blocked(&local_err)) {
2683        error_report_err(local_err);
2684        return -EINVAL;
2685    }
2686
2687    ret = qemu_loadvm_state_header(f);
2688    if (ret) {
2689        return ret;
2690    }
2691
2692    if (qemu_loadvm_state_setup(f) != 0) {
2693        return -EINVAL;
2694    }
2695
2696    cpu_synchronize_all_pre_loadvm();
2697
2698    ret = qemu_loadvm_state_main(f, mis);
2699    qemu_event_set(&mis->main_thread_load_event);
2700
2701    trace_qemu_loadvm_state_post_main(ret);
2702
2703    if (mis->have_listen_thread) {
2704        /* Listen thread still going, can't clean up yet */
2705        return ret;
2706    }
2707
2708    if (ret == 0) {
2709        ret = qemu_file_get_error(f);
2710    }
2711
2712    /*
2713     * Try to read in the VMDESC section as well, so that dumping tools that
2714     * intercept our migration stream have the chance to see it.
2715     */
2716
2717    /* We've got to be careful; if we don't read the data and just shut the fd
2718     * then the sender can hit an error if we close while it's still sending.
2719     * We also mustn't read data that isn't there; some transports (RDMA)
2720     * will stall waiting for that data when the source has already closed.
2721     */
2722    if (ret == 0 && should_send_vmdesc()) {
2723        uint8_t *buf;
2724        uint32_t size;
2725        uint8_t  section_type = qemu_get_byte(f);
2726
2727        if (section_type != QEMU_VM_VMDESCRIPTION) {
2728            error_report("Expected vmdescription section, but got %d",
2729                         section_type);
2730            /*
2731             * It doesn't seem worth failing at this point since
2732             * we apparently have an otherwise valid VM state
2733             */
2734        } else {
2735            buf = g_malloc(0x1000);
2736            size = qemu_get_be32(f);
2737
2738            while (size > 0) {
2739                uint32_t read_chunk = MIN(size, 0x1000);
2740                qemu_get_buffer(f, buf, read_chunk);
2741                size -= read_chunk;
2742            }
2743            g_free(buf);
2744        }
2745    }
2746
2747    qemu_loadvm_state_cleanup();
2748    cpu_synchronize_all_post_init();
2749
2750    return ret;
2751}
2752
2753int qemu_load_device_state(QEMUFile *f)
2754{
2755    MigrationIncomingState *mis = migration_incoming_get_current();
2756    int ret;
2757
2758    /* Load QEMU_VM_SECTION_FULL section */
2759    ret = qemu_loadvm_state_main(f, mis);
2760    if (ret < 0) {
2761        error_report("Failed to load device state: %d", ret);
2762        return ret;
2763    }
2764
2765    cpu_synchronize_all_post_init();
2766    return 0;
2767}
2768
2769bool save_snapshot(const char *name, bool overwrite, const char *vmstate,
2770                  bool has_devices, strList *devices, Error **errp)
2771{
2772    BlockDriverState *bs;
2773    QEMUSnapshotInfo sn1, *sn = &sn1;
2774    int ret = -1, ret2;
2775    QEMUFile *f;
2776    int saved_vm_running;
2777    uint64_t vm_state_size;
2778    qemu_timeval tv;
2779    struct tm tm;
2780    AioContext *aio_context;
2781
2782    if (migration_is_blocked(errp)) {
2783        return false;
2784    }
2785
2786    if (!replay_can_snapshot()) {
2787        error_setg(errp, "Record/replay does not allow making snapshot "
2788                   "right now. Try once more later.");
2789        return false;
2790    }
2791
2792    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
2793        return false;
2794    }
2795
2796    /* Delete old snapshots of the same name */
2797    if (name) {
2798        if (overwrite) {
2799            if (bdrv_all_delete_snapshot(name, has_devices,
2800                                         devices, errp) < 0) {
2801                return false;
2802            }
2803        } else {
2804            ret2 = bdrv_all_has_snapshot(name, has_devices, devices, errp);
2805            if (ret2 < 0) {
2806                return false;
2807            }
2808            if (ret2 == 1) {
2809                error_setg(errp,
2810                           "Snapshot '%s' already exists in one or more devices",
2811                           name);
2812                return false;
2813            }
2814        }
2815    }
2816
2817    bs = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
2818    if (bs == NULL) {
2819        return false;
2820    }
2821    aio_context = bdrv_get_aio_context(bs);
2822
2823    saved_vm_running = runstate_is_running();
2824
2825    ret = global_state_store();
2826    if (ret) {
2827        error_setg(errp, "Error saving global state");
2828        return false;
2829    }
2830    vm_stop(RUN_STATE_SAVE_VM);
2831
2832    bdrv_drain_all_begin();
2833
2834    aio_context_acquire(aio_context);
2835
2836    memset(sn, 0, sizeof(*sn));
2837
2838    /* fill auxiliary fields */
2839    qemu_gettimeofday(&tv);
2840    sn->date_sec = tv.tv_sec;
2841    sn->date_nsec = tv.tv_usec * 1000;
2842    sn->vm_clock_nsec = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
2843    if (replay_mode != REPLAY_MODE_NONE) {
2844        sn->icount = replay_get_current_icount();
2845    } else {
2846        sn->icount = -1ULL;
2847    }
2848
2849    if (name) {
2850        pstrcpy(sn->name, sizeof(sn->name), name);
2851    } else {
2852        /* cast below needed for OpenBSD where tv_sec is still 'long' */
2853        localtime_r((const time_t *)&tv.tv_sec, &tm);
2854        strftime(sn->name, sizeof(sn->name), "vm-%Y%m%d%H%M%S", &tm);
2855    }
2856
2857    /* save the VM state */
2858    f = qemu_fopen_bdrv(bs, 1);
2859    if (!f) {
2860        error_setg(errp, "Could not open VM state file");
2861        goto the_end;
2862    }
2863    ret = qemu_savevm_state(f, errp);
2864    vm_state_size = qemu_ftell(f);
2865    ret2 = qemu_fclose(f);
2866    if (ret < 0) {
2867        goto the_end;
2868    }
2869    if (ret2 < 0) {
2870        ret = ret2;
2871        goto the_end;
2872    }
2873
2874    /* The bdrv_all_create_snapshot() call that follows acquires the AioContext
2875     * for itself.  BDRV_POLL_WHILE() does not support nested locking because
2876     * it only releases the lock once.  Therefore synchronous I/O will deadlock
2877     * unless we release the AioContext before bdrv_all_create_snapshot().
2878     */
2879    aio_context_release(aio_context);
2880    aio_context = NULL;
2881
2882    ret = bdrv_all_create_snapshot(sn, bs, vm_state_size,
2883                                   has_devices, devices, errp);
2884    if (ret < 0) {
2885        bdrv_all_delete_snapshot(sn->name, has_devices, devices, NULL);
2886        goto the_end;
2887    }
2888
2889    ret = 0;
2890
2891 the_end:
2892    if (aio_context) {
2893        aio_context_release(aio_context);
2894    }
2895
2896    bdrv_drain_all_end();
2897
2898    if (saved_vm_running) {
2899        vm_start();
2900    }
2901    return ret == 0;
2902}
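
/*
 * A minimal, illustrative caller of save_snapshot() (not part of this file;
 * the tag "checkpoint1" is just an example), saving all snapshottable
 * devices with the default vmstate block:
 *
 *     Error *err = NULL;
 *
 *     if (!save_snapshot("checkpoint1", false, NULL, false, NULL, &err)) {
 *         error_report_err(err);
 *     }
 */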
2903
2904void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live,
2905                                Error **errp)
2906{
2907    QEMUFile *f;
2908    QIOChannelFile *ioc;
2909    int saved_vm_running;
2910    int ret;
2911
2912    if (!has_live) {
2913        /* 'live' defaults to true so old versions of the Xen tool stack
2914         * can still perform a successful live migration */
2915        live = true;
2916    }
2917
2918    saved_vm_running = runstate_is_running();
2919    vm_stop(RUN_STATE_SAVE_VM);
2920    global_state_store_running();
2921
2922    ioc = qio_channel_file_new_path(filename, O_WRONLY | O_CREAT | O_TRUNC,
2923                                    0660, errp);
2924    if (!ioc) {
2925        goto the_end;
2926    }
2927    qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-save-state");
2928    f = qemu_fopen_channel_output(QIO_CHANNEL(ioc));
2929    object_unref(OBJECT(ioc));
2930    ret = qemu_save_device_state(f);
2931    if (ret < 0 || qemu_fclose(f) < 0) {
2932        error_setg(errp, QERR_IO_ERROR);
2933    } else {
2934        /* libxl calls the QMP command "stop" before calling
2935         * "xen-save-devices-state" and in case of migration failure, libxl
2936         * would call "cont".
2937         * So call bdrv_inactivate_all (release locks) here to let the other
2938         * side of the migration take control of the images.
2939         */
2940        if (live && !saved_vm_running) {
2941            ret = bdrv_inactivate_all();
2942            if (ret) {
2943                error_setg(errp, "%s: bdrv_inactivate_all() failed (%d)",
2944                           __func__, ret);
2945            }
2946        }
2947    }
2948
2949 the_end:
2950    if (saved_vm_running) {
2951        vm_start();
2952    }
2953}
2954
2955void qmp_xen_load_devices_state(const char *filename, Error **errp)
2956{
2957    QEMUFile *f;
2958    QIOChannelFile *ioc;
2959    int ret;
2960
2961    /* Guest must be paused before loading the device state; the RAM state
2962     * will already have been loaded by xc
2963     */
2964    if (runstate_is_running()) {
2965        error_setg(errp, "Cannot update device state while vm is running");
2966        return;
2967    }
2968    vm_stop(RUN_STATE_RESTORE_VM);
2969
2970    ioc = qio_channel_file_new_path(filename, O_RDONLY | O_BINARY, 0, errp);
2971    if (!ioc) {
2972        return;
2973    }
2974    qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-load-state");
2975    f = qemu_fopen_channel_input(QIO_CHANNEL(ioc));
2976    object_unref(OBJECT(ioc));
2977
2978    ret = qemu_loadvm_state(f);
2979    qemu_fclose(f);
2980    if (ret < 0) {
2981        error_setg(errp, QERR_IO_ERROR);
2982    }
2983    migration_incoming_state_destroy();
2984}
2985
2986bool load_snapshot(const char *name, const char *vmstate,
2987                   bool has_devices, strList *devices, Error **errp)
2988{
2989    BlockDriverState *bs_vm_state;
2990    QEMUSnapshotInfo sn;
2991    QEMUFile *f;
2992    int ret;
2993    AioContext *aio_context;
2994    MigrationIncomingState *mis = migration_incoming_get_current();
2995
2996    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
2997        return false;
2998    }
2999    ret = bdrv_all_has_snapshot(name, has_devices, devices, errp);
3000    if (ret < 0) {
3001        return false;
3002    }
3003    if (ret == 0) {
3004        error_setg(errp, "Snapshot '%s' does not exist in one or more devices",
3005                   name);
3006        return false;
3007    }
3008
3009    bs_vm_state = bdrv_all_find_vmstate_bs(vmstate, has_devices, devices, errp);
3010    if (!bs_vm_state) {
3011        return false;
3012    }
3013    aio_context = bdrv_get_aio_context(bs_vm_state);
3014
3015    /* Don't even try to load empty VM states */
3016    aio_context_acquire(aio_context);
3017    ret = bdrv_snapshot_find(bs_vm_state, &sn, name);
3018    aio_context_release(aio_context);
3019    if (ret < 0) {
3020        return false;
3021    } else if (sn.vm_state_size == 0) {
3022        error_setg(errp, "This is a disk-only snapshot. Revert to it "
3023                   " offline using qemu-img");
3024        return false;
3025    }
3026
3027    /*
3028     * Flush the record/replay queue. The VM state is about to change,
3029     * so we don't need to preserve its consistency.
3030     */
3031    replay_flush_events();
3032
3033    /* Flush all IO requests so they don't interfere with the new state.  */
3034    bdrv_drain_all_begin();
3035
3036    ret = bdrv_all_goto_snapshot(name, has_devices, devices, errp);
3037    if (ret < 0) {
3038        goto err_drain;
3039    }
3040
3041    /* restore the VM state */
3042    f = qemu_fopen_bdrv(bs_vm_state, 0);
3043    if (!f) {
3044        error_setg(errp, "Could not open VM state file");
3045        goto err_drain;
3046    }
3047
3048    qemu_system_reset(SHUTDOWN_CAUSE_NONE);
3049    mis->from_src_file = f;
3050
3051    if (!yank_register_instance(MIGRATION_YANK_INSTANCE, errp)) {
3052        ret = -EINVAL;
3053        goto err_drain;
3054    }
3055    aio_context_acquire(aio_context);
3056    ret = qemu_loadvm_state(f);
3057    migration_incoming_state_destroy();
3058    aio_context_release(aio_context);
3059
3060    bdrv_drain_all_end();
3061
3062    if (ret < 0) {
3063        error_setg(errp, "Error %d while loading VM state", ret);
3064        return false;
3065    }
3066
3067    return true;
3068
3069err_drain:
3070    bdrv_drain_all_end();
3071    return false;
3072}
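
/*
 * A minimal, illustrative caller of load_snapshot() (not part of this file;
 * the tag matches the save_snapshot() example above).  On success the VM
 * state has been replaced by the snapshot contents:
 *
 *     Error *err = NULL;
 *
 *     if (!load_snapshot("checkpoint1", NULL, false, NULL, &err)) {
 *         error_report_err(err);
 *     }
 */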
3073
3074bool delete_snapshot(const char *name, bool has_devices,
3075                     strList *devices, Error **errp)
3076{
3077    if (!bdrv_all_can_snapshot(has_devices, devices, errp)) {
3078        return false;
3079    }
3080
3081    if (bdrv_all_delete_snapshot(name, has_devices, devices, errp) < 0) {
3082        return false;
3083    }
3084
3085    return true;
3086}
3087
3088void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
3089{
3090    qemu_ram_set_idstr(mr->ram_block,
3091                       memory_region_name(mr), dev);
3092    qemu_ram_set_migratable(mr->ram_block);
3093}
3094
3095void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev)
3096{
3097    qemu_ram_unset_idstr(mr->ram_block);
3098    qemu_ram_unset_migratable(mr->ram_block);
3099}
3100
3101void vmstate_register_ram_global(MemoryRegion *mr)
3102{
3103    vmstate_register_ram(mr, NULL);
3104}
3105
3106bool vmstate_check_only_migratable(const VMStateDescription *vmsd)
3107{
3108    /* This check is only needed if --only-migratable is specified */
3109    if (!only_migratable) {
3110        return true;
3111    }
3112
3113    return !(vmsd && vmsd->unmigratable);
3114}
3115
3116typedef struct SnapshotJob {
3117    Job common;
3118    char *tag;
3119    char *vmstate;
3120    strList *devices;
3121    Coroutine *co;
3122    Error **errp;
3123    bool ret;
3124} SnapshotJob;
3125
3126static void qmp_snapshot_job_free(SnapshotJob *s)
3127{
3128    g_free(s->tag);
3129    g_free(s->vmstate);
3130    qapi_free_strList(s->devices);
3131}
3132
3133
3134static void snapshot_load_job_bh(void *opaque)
3135{
3136    Job *job = opaque;
3137    SnapshotJob *s = container_of(job, SnapshotJob, common);
3138    int orig_vm_running;
3139
3140    job_progress_set_remaining(&s->common, 1);
3141
3142    orig_vm_running = runstate_is_running();
3143    vm_stop(RUN_STATE_RESTORE_VM);
3144
3145    s->ret = load_snapshot(s->tag, s->vmstate, true, s->devices, s->errp);
3146    if (s->ret && orig_vm_running) {
3147        vm_start();
3148    }
3149
3150    job_progress_update(&s->common, 1);
3151
3152    qmp_snapshot_job_free(s);
3153    aio_co_wake(s->co);
3154}
3155
3156static void snapshot_save_job_bh(void *opaque)
3157{
3158    Job *job = opaque;
3159    SnapshotJob *s = container_of(job, SnapshotJob, common);
3160
3161    job_progress_set_remaining(&s->common, 1);
3162    s->ret = save_snapshot(s->tag, false, s->vmstate,
3163                           true, s->devices, s->errp);
3164    job_progress_update(&s->common, 1);
3165
3166    qmp_snapshot_job_free(s);
3167    aio_co_wake(s->co);
3168}
3169
3170static void snapshot_delete_job_bh(void *opaque)
3171{
3172    Job *job = opaque;
3173    SnapshotJob *s = container_of(job, SnapshotJob, common);
3174
3175    job_progress_set_remaining(&s->common, 1);
3176    s->ret = delete_snapshot(s->tag, true, s->devices, s->errp);
3177    job_progress_update(&s->common, 1);
3178
3179    qmp_snapshot_job_free(s);
3180    aio_co_wake(s->co);
3181}
3182
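/*
 * The three run methods below share a pattern: each job executes in a
 * coroutine, schedules its real work (the snapshot_*_job_bh above) as a
 * bottom half on the main AioContext, then yields; the bottom half does
 * the work in the main loop, stores the result in s->ret and wakes the
 * coroutine with aio_co_wake().
 */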
3183static int coroutine_fn snapshot_save_job_run(Job *job, Error **errp)
3184{
3185    SnapshotJob *s = container_of(job, SnapshotJob, common);
3186    s->errp = errp;
3187    s->co = qemu_coroutine_self();
3188    aio_bh_schedule_oneshot(qemu_get_aio_context(),
3189                            snapshot_save_job_bh, job);
3190    qemu_coroutine_yield();
3191    return s->ret ? 0 : -1;
3192}
3193
3194static int coroutine_fn snapshot_load_job_run(Job *job, Error **errp)
3195{
3196    SnapshotJob *s = container_of(job, SnapshotJob, common);
3197    s->errp = errp;
3198    s->co = qemu_coroutine_self();
3199    aio_bh_schedule_oneshot(qemu_get_aio_context(),
3200                            snapshot_load_job_bh, job);
3201    qemu_coroutine_yield();
3202    return s->ret ? 0 : -1;
3203}
3204
3205static int coroutine_fn snapshot_delete_job_run(Job *job, Error **errp)
3206{
3207    SnapshotJob *s = container_of(job, SnapshotJob, common);
3208    s->errp = errp;
3209    s->co = qemu_coroutine_self();
3210    aio_bh_schedule_oneshot(qemu_get_aio_context(),
3211                            snapshot_delete_job_bh, job);
3212    qemu_coroutine_yield();
3213    return s->ret ? 0 : -1;
3214}
3215
3216
3217static const JobDriver snapshot_load_job_driver = {
3218    .instance_size = sizeof(SnapshotJob),
3219    .job_type      = JOB_TYPE_SNAPSHOT_LOAD,
3220    .run           = snapshot_load_job_run,
3221};
3222
3223static const JobDriver snapshot_save_job_driver = {
3224    .instance_size = sizeof(SnapshotJob),
3225    .job_type      = JOB_TYPE_SNAPSHOT_SAVE,
3226    .run           = snapshot_save_job_run,
3227};
3228
3229static const JobDriver snapshot_delete_job_driver = {
3230    .instance_size = sizeof(SnapshotJob),
3231    .job_type      = JOB_TYPE_SNAPSHOT_DELETE,
3232    .run           = snapshot_delete_job_run,
3233};
3234
3235
3236void qmp_snapshot_save(const char *job_id,
3237                       const char *tag,
3238                       const char *vmstate,
3239                       strList *devices,
3240                       Error **errp)
3241{
3242    SnapshotJob *s;
3243
3244    s = job_create(job_id, &snapshot_save_job_driver, NULL,
3245                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3246                   NULL, NULL, errp);
3247    if (!s) {
3248        return;
3249    }
3250
3251    s->tag = g_strdup(tag);
3252    s->vmstate = g_strdup(vmstate);
3253    s->devices = QAPI_CLONE(strList, devices);
3254
3255    job_start(&s->common);
3256}
3257
3258void qmp_snapshot_load(const char *job_id,
3259                       const char *tag,
3260                       const char *vmstate,
3261                       strList *devices,
3262                       Error **errp)
3263{
3264    SnapshotJob *s;
3265
3266    s = job_create(job_id, &snapshot_load_job_driver, NULL,
3267                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3268                   NULL, NULL, errp);
3269    if (!s) {
3270        return;
3271    }
3272
3273    s->tag = g_strdup(tag);
3274    s->vmstate = g_strdup(vmstate);
3275    s->devices = QAPI_CLONE(strList, devices);
3276
3277    job_start(&s->common);
3278}
3279
3280void qmp_snapshot_delete(const char *job_id,
3281                         const char *tag,
3282                         strList *devices,
3283                         Error **errp)
3284{
3285    SnapshotJob *s;
3286
3287    s = job_create(job_id, &snapshot_delete_job_driver, NULL,
3288                   qemu_get_aio_context(), JOB_MANUAL_DISMISS,
3289                   NULL, NULL, errp);
3290    if (!s) {
3291        return;
3292    }
3293
3294    s->tag = g_strdup(tag);
3295    s->devices = QAPI_CLONE(strList, devices);
3296
3297    job_start(&s->common);
3298}
3299