qemu/migration/savevm.c
<<
>>
Prefs
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2009-2015 Red Hat Inc
   6 *
   7 * Authors:
   8 *  Juan Quintela <quintela@redhat.com>
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28
  29#include "qemu/osdep.h"
  30#include "hw/boards.h"
  31#include "hw/xen/xen.h"
  32#include "net/net.h"
  33#include "migration.h"
  34#include "migration/snapshot.h"
  35#include "migration/misc.h"
  36#include "migration/register.h"
  37#include "migration/global_state.h"
  38#include "ram.h"
  39#include "qemu-file-channel.h"
  40#include "qemu-file.h"
  41#include "savevm.h"
  42#include "postcopy-ram.h"
  43#include "qapi/error.h"
  44#include "qapi/qapi-commands-migration.h"
  45#include "qapi/qapi-commands-misc.h"
  46#include "qapi/qmp/qerror.h"
  47#include "qemu/error-report.h"
  48#include "sysemu/cpus.h"
  49#include "exec/memory.h"
  50#include "exec/target_page.h"
  51#include "trace.h"
  52#include "qemu/iov.h"
  53#include "block/snapshot.h"
  54#include "qemu/cutils.h"
  55#include "io/channel-buffer.h"
  56#include "io/channel-file.h"
  57#include "sysemu/replay.h"
  58#include "qjson.h"
  59
/* EtherType for RARP; defined here only when no included header provides it */
#ifndef ETH_P_RARP
#define ETH_P_RARP 0x8035
#endif
/* Constants written into the RARP header built by announce_self_create() */
#define ARP_HTYPE_ETH 0x0001
#define ARP_PTYPE_IP 0x0800
#define ARP_OP_REQUEST_REV 0x3

/* Stored in the version byte of a MIG_CMD_POSTCOPY_RAM_DISCARD payload */
const unsigned int postcopy_ram_discard_version = 0;
  68
/*
 * Subcommands for QEMU_VM_COMMAND.
 *
 * The enumerator value is what qemu_savevm_command_send() puts on the wire
 * as a 16-bit command id, and it is also the index into mig_cmd_args[].
 */
enum qemu_vm_cmd {
    MIG_CMD_INVALID = 0,   /* Must be 0 */
    MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
    MIG_CMD_PING,              /* Request a PONG on the RP */

    MIG_CMD_POSTCOPY_ADVISE,       /* Prior to any page transfers, just
                                      warn we might want to do PC */
    MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
                                      pages as it's running. */
    MIG_CMD_POSTCOPY_RUN,          /* Start execution */

    MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
                                      were previously sent during
                                      precopy but are dirty. */
    MIG_CMD_PACKAGED,          /* Send a wrapped stream within this stream */
    MIG_CMD_POSTCOPY_RESUME,   /* resume postcopy on dest */
    MIG_CMD_RECV_BITMAP,       /* Request for recved bitmap on dst */
    MIG_CMD_MAX                /* Last enumerator; one past the final command */
};
  89
/* Largest packaged blob qemu_savevm_send_packaged() agrees to send */
#define MAX_VM_CMD_PACKAGED_SIZE UINT32_MAX
/*
 * Per-command metadata, indexed by enum qemu_vm_cmd.
 * The entries use designated initializers, so their textual order does not
 * have to follow the enum order.
 */
static struct mig_cmd_args {
    ssize_t     len; /* -1 = variable */
    const char *name; /* Command name without the MIG_CMD_ prefix */
} mig_cmd_args[] = {
    [MIG_CMD_INVALID]          = { .len = -1, .name = "INVALID" },
    [MIG_CMD_OPEN_RETURN_PATH] = { .len =  0, .name = "OPEN_RETURN_PATH" },
    [MIG_CMD_PING]             = { .len = sizeof(uint32_t), .name = "PING" },
    [MIG_CMD_POSTCOPY_ADVISE]  = { .len = -1, .name = "POSTCOPY_ADVISE" },
    [MIG_CMD_POSTCOPY_LISTEN]  = { .len =  0, .name = "POSTCOPY_LISTEN" },
    [MIG_CMD_POSTCOPY_RUN]     = { .len =  0, .name = "POSTCOPY_RUN" },
    [MIG_CMD_POSTCOPY_RAM_DISCARD] = {
                                   .len = -1, .name = "POSTCOPY_RAM_DISCARD" },
    [MIG_CMD_POSTCOPY_RESUME]  = { .len =  0, .name = "POSTCOPY_RESUME" },
    /* 4 = the be32 length word sent by qemu_savevm_send_packaged() */
    [MIG_CMD_PACKAGED]         = { .len =  4, .name = "PACKAGED" },
    [MIG_CMD_RECV_BITMAP]      = { .len = -1, .name = "RECV_BITMAP" },
    [MIG_CMD_MAX]              = { .len = -1, .name = "MAX" },
};
 108
/* Note for MIG_CMD_POSTCOPY_ADVISE:
 * The format of the arguments depends on the postcopy mode:
 * - postcopy RAM only
 *   uint64_t host page size
 *   uint64_t target page size
 *
 * - postcopy RAM and postcopy dirty bitmaps
 *   format is the same as for postcopy RAM only
 *
 * - postcopy dirty bitmaps only
 *   Nothing. Command length field is 0.
 *
 * Be careful: adding a new postcopy entity with some other parameters should
 * not break the format's self-description ability. A good way is to introduce
 * a generic extendable format with an exception for the two old entities.
 */
 125
 126static int announce_self_create(uint8_t *buf,
 127                                uint8_t *mac_addr)
 128{
 129    /* Ethernet header. */
 130    memset(buf, 0xff, 6);         /* destination MAC addr */
 131    memcpy(buf + 6, mac_addr, 6); /* source MAC addr */
 132    *(uint16_t *)(buf + 12) = htons(ETH_P_RARP); /* ethertype */
 133
 134    /* RARP header. */
 135    *(uint16_t *)(buf + 14) = htons(ARP_HTYPE_ETH); /* hardware addr space */
 136    *(uint16_t *)(buf + 16) = htons(ARP_PTYPE_IP); /* protocol addr space */
 137    *(buf + 18) = 6; /* hardware addr length (ethernet) */
 138    *(buf + 19) = 4; /* protocol addr length (IPv4) */
 139    *(uint16_t *)(buf + 20) = htons(ARP_OP_REQUEST_REV); /* opcode */
 140    memcpy(buf + 22, mac_addr, 6); /* source hw addr */
 141    memset(buf + 28, 0x00, 4);     /* source protocol addr */
 142    memcpy(buf + 32, mac_addr, 6); /* target hw addr */
 143    memset(buf + 38, 0x00, 4);     /* target protocol addr */
 144
 145    /* Padding to get up to 60 bytes (ethernet min packet size, minus FCS). */
 146    memset(buf + 42, 0x00, 18);
 147
 148    return 60; /* len (FCS will be added by hardware) */
 149}
 150
 151static void qemu_announce_self_iter(NICState *nic, void *opaque)
 152{
 153    uint8_t buf[60];
 154    int len;
 155
 156    trace_qemu_announce_self_iter(qemu_ether_ntoa(&nic->conf->macaddr));
 157    len = announce_self_create(buf, nic->conf->macaddr.a);
 158
 159    qemu_send_packet_raw(qemu_get_queue(nic), buf, len);
 160}
 161
 162
 163static void qemu_announce_self_once(void *opaque)
 164{
 165    static int count = SELF_ANNOUNCE_ROUNDS;
 166    QEMUTimer *timer = *(QEMUTimer **)opaque;
 167
 168    qemu_foreach_nic(qemu_announce_self_iter, NULL);
 169
 170    if (--count) {
 171        /* delay 50ms, 150ms, 250ms, ... */
 172        timer_mod(timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) +
 173                  self_announce_delay(count));
 174    } else {
 175            timer_del(timer);
 176            timer_free(timer);
 177    }
 178}
 179
 180void qemu_announce_self(void)
 181{
 182    static QEMUTimer *timer;
 183    timer = timer_new_ms(QEMU_CLOCK_REALTIME, qemu_announce_self_once, &timer);
 184    qemu_announce_self_once(&timer);
 185}
 186
 187/***********************************************************/
 188/* savevm/loadvm support */
 189
 190static ssize_t block_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
 191                                   int64_t pos)
 192{
 193    int ret;
 194    QEMUIOVector qiov;
 195
 196    qemu_iovec_init_external(&qiov, iov, iovcnt);
 197    ret = bdrv_writev_vmstate(opaque, &qiov, pos);
 198    if (ret < 0) {
 199        return ret;
 200    }
 201
 202    return qiov.size;
 203}
 204
 205static ssize_t block_get_buffer(void *opaque, uint8_t *buf, int64_t pos,
 206                                size_t size)
 207{
 208    return bdrv_load_vmstate(opaque, buf, pos, size);
 209}
 210
 211static int bdrv_fclose(void *opaque)
 212{
 213    return bdrv_flush(opaque);
 214}
 215
/* QEMUFile operations used by qemu_fopen_bdrv() for a read-only vmstate file */
static const QEMUFileOps bdrv_read_ops = {
    .get_buffer = block_get_buffer,
    .close =      bdrv_fclose
};
 220
/* QEMUFile operations used by qemu_fopen_bdrv() for a writable vmstate file */
static const QEMUFileOps bdrv_write_ops = {
    .writev_buffer  = block_writev_buffer,
    .close          = bdrv_fclose
};
 225
 226static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable)
 227{
 228    if (is_writable) {
 229        return qemu_fopen_ops(bs, &bdrv_write_ops);
 230    }
 231    return qemu_fopen_ops(bs, &bdrv_read_ops);
 232}
 233
 234
 235/* QEMUFile timer support.
 236 * Not in qemu-file.c to not add qemu-timer.c as dependency to qemu-file.c
 237 */
 238
 239void timer_put(QEMUFile *f, QEMUTimer *ts)
 240{
 241    uint64_t expire_time;
 242
 243    expire_time = timer_expire_time_ns(ts);
 244    qemu_put_be64(f, expire_time);
 245}
 246
 247void timer_get(QEMUFile *f, QEMUTimer *ts)
 248{
 249    uint64_t expire_time;
 250
 251    expire_time = qemu_get_be64(f);
 252    if (expire_time != -1) {
 253        timer_mod_ns(ts, expire_time);
 254    } else {
 255        timer_del(ts);
 256    }
 257}
 258
 259
 260/* VMState timer support.
 261 * Not in vmstate.c to not add qemu-timer.c as dependency to vmstate.c
 262 */
 263
 264static int get_timer(QEMUFile *f, void *pv, size_t size, VMStateField *field)
 265{
 266    QEMUTimer *v = pv;
 267    timer_get(f, v);
 268    return 0;
 269}
 270
 271static int put_timer(QEMUFile *f, void *pv, size_t size, VMStateField *field,
 272                     QJSON *vmdesc)
 273{
 274    QEMUTimer *v = pv;
 275    timer_put(f, v);
 276
 277    return 0;
 278}
 279
/* VMState field type for QEMUTimer, implemented by get_timer()/put_timer() */
const VMStateInfo vmstate_info_timer = {
    .name = "timer",
    .get  = get_timer,
    .put  = put_timer,
};
 285
 286
/*
 * Identification of an entry as given by the registering caller, kept when
 * the entry's primary idstr was prefixed with a qdev device path.
 */
typedef struct CompatEntry {
    char idstr[256];   /* caller-supplied name, without device path */
    int instance_id;   /* caller-supplied id, or one computed by
                          calculate_compat_instance_id() */
} CompatEntry;
 291
/* One registered saver/loader, linked into savevm_state.handlers */
typedef struct SaveStateEntry {
    QTAILQ_ENTRY(SaveStateEntry) entry;
    /* "<qdev path>/<name>" when a device path is available, else "<name>" */
    char idstr[256];
    /* written after idstr in START/FULL section headers */
    int instance_id;
    /* set by vmstate_register_with_alias_id(); zero for old-style entries */
    int alias_id;
    int version_id;
    /* version id read from the stream */
    int load_version_id;
    /* allocated from savevm_state.global_section_id at registration */
    int section_id;
    /* section id read from the stream */
    int load_section_id;
    /* old-style handlers; used when vmsd is NULL */
    SaveVMHandlers *ops;
    /* VMState description; NULL for old-style entries */
    const VMStateDescription *vmsd;
    /* passed back to the handlers / vmstate code */
    void *opaque;
    /* legacy identification, allocated only when a device path was found */
    CompatEntry *compat;
    /* set to 1 by register_savevm_live() when ops->save_setup is provided */
    int is_ram;
} SaveStateEntry;
 309
/* Global savevm bookkeeping plus the state of the "configuration" section */
typedef struct SaveState {
    /* registered entries, ordered by savevm_state_handler_insert() */
    QTAILQ_HEAD(, SaveStateEntry) handlers;
    /* next section id to hand out at registration time */
    int global_section_id;
    /* length in bytes of 'name' */
    uint32_t len;
    /* machine type name; set to the local machine's name before saving */
    const char *name;
    /* target page bits sent/received in the target-page-bits subsection */
    uint32_t target_page_bits;
} SaveState;
 317
/* The single global SaveState instance: empty handler list, ids start at 0 */
static SaveState savevm_state = {
    .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers),
    .global_section_id = 0,
};
 322
 323static int configuration_pre_save(void *opaque)
 324{
 325    SaveState *state = opaque;
 326    const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
 327
 328    state->len = strlen(current_name);
 329    state->name = current_name;
 330    state->target_page_bits = qemu_target_page_bits();
 331
 332    return 0;
 333}
 334
 335static int configuration_pre_load(void *opaque)
 336{
 337    SaveState *state = opaque;
 338
 339    /* If there is no target-page-bits subsection it means the source
 340     * predates the variable-target-page-bits support and is using the
 341     * minimum possible value for this CPU.
 342     */
 343    state->target_page_bits = qemu_target_page_bits_min();
 344    return 0;
 345}
 346
 347static int configuration_post_load(void *opaque, int version_id)
 348{
 349    SaveState *state = opaque;
 350    const char *current_name = MACHINE_GET_CLASS(current_machine)->name;
 351
 352    if (strncmp(state->name, current_name, state->len) != 0) {
 353        error_report("Machine type received is '%.*s' and local is '%s'",
 354                     (int) state->len, state->name, current_name);
 355        return -EINVAL;
 356    }
 357
 358    if (state->target_page_bits != qemu_target_page_bits()) {
 359        error_report("Received TARGET_PAGE_BITS is %d but local is %d",
 360                     state->target_page_bits, qemu_target_page_bits());
 361        return -EINVAL;
 362    }
 363
 364    return 0;
 365}
 366
 367/* The target-page-bits subsection is present only if the
 368 * target page size is not the same as the default (ie the
 369 * minimum page size for a variable-page-size guest CPU).
 370 * If it is present then it contains the actual target page
 371 * bits for the machine, and migration will fail if the
 372 * two ends don't agree about it.
 373 */
 374static bool vmstate_target_page_bits_needed(void *opaque)
 375{
 376    return qemu_target_page_bits()
 377        > qemu_target_page_bits_min();
 378}
 379
/*
 * Optional subsection of "configuration" carrying SaveState.target_page_bits;
 * emitted only when vmstate_target_page_bits_needed() returns true.
 */
static const VMStateDescription vmstate_target_page_bits = {
    .name = "configuration/target-page-bits",
    .version_id = 1,
    .minimum_version_id = 1,
    .needed = vmstate_target_page_bits_needed,
    .fields = (VMStateField[]) {
        VMSTATE_UINT32(target_page_bits, SaveState),
        VMSTATE_END_OF_LIST()
    }
};
 390
/*
 * The "configuration" section: the machine type name (length, then bytes)
 * plus the optional target-page-bits subsection.  The pre/post hooks fill in
 * the values before saving and validate them after loading.
 */
static const VMStateDescription vmstate_configuration = {
    .name = "configuration",
    .version_id = 1,
    .pre_load = configuration_pre_load,
    .post_load = configuration_post_load,
    .pre_save = configuration_pre_save,
    .fields = (VMStateField[]) {
        /* length of the machine type name */
        VMSTATE_UINT32(len, SaveState),
        /* the name itself, sized by 'len' */
        VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, len),
        VMSTATE_END_OF_LIST()
    },
    .subsections = (const VMStateDescription*[]) {
        &vmstate_target_page_bits,
        NULL
    }
};
 407
/*
 * Forward declaration: dump_vmstate_vmsf() and dump_vmstate_vmsd() call
 * each other.
 */
static void dump_vmstate_vmsd(FILE *out_file,
                              const VMStateDescription *vmsd, int indent,
                              bool is_subsection);
 411
 412static void dump_vmstate_vmsf(FILE *out_file, const VMStateField *field,
 413                              int indent)
 414{
 415    fprintf(out_file, "%*s{\n", indent, "");
 416    indent += 2;
 417    fprintf(out_file, "%*s\"field\": \"%s\",\n", indent, "", field->name);
 418    fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
 419            field->version_id);
 420    fprintf(out_file, "%*s\"field_exists\": %s,\n", indent, "",
 421            field->field_exists ? "true" : "false");
 422    fprintf(out_file, "%*s\"size\": %zu", indent, "", field->size);
 423    if (field->vmsd != NULL) {
 424        fprintf(out_file, ",\n");
 425        dump_vmstate_vmsd(out_file, field->vmsd, indent, false);
 426    }
 427    fprintf(out_file, "\n%*s}", indent - 2, "");
 428}
 429
 430static void dump_vmstate_vmss(FILE *out_file,
 431                              const VMStateDescription **subsection,
 432                              int indent)
 433{
 434    if (*subsection != NULL) {
 435        dump_vmstate_vmsd(out_file, *subsection, indent, true);
 436    }
 437}
 438
 439static void dump_vmstate_vmsd(FILE *out_file,
 440                              const VMStateDescription *vmsd, int indent,
 441                              bool is_subsection)
 442{
 443    if (is_subsection) {
 444        fprintf(out_file, "%*s{\n", indent, "");
 445    } else {
 446        fprintf(out_file, "%*s\"%s\": {\n", indent, "", "Description");
 447    }
 448    indent += 2;
 449    fprintf(out_file, "%*s\"name\": \"%s\",\n", indent, "", vmsd->name);
 450    fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
 451            vmsd->version_id);
 452    fprintf(out_file, "%*s\"minimum_version_id\": %d", indent, "",
 453            vmsd->minimum_version_id);
 454    if (vmsd->fields != NULL) {
 455        const VMStateField *field = vmsd->fields;
 456        bool first;
 457
 458        fprintf(out_file, ",\n%*s\"Fields\": [\n", indent, "");
 459        first = true;
 460        while (field->name != NULL) {
 461            if (field->flags & VMS_MUST_EXIST) {
 462                /* Ignore VMSTATE_VALIDATE bits; these don't get migrated */
 463                field++;
 464                continue;
 465            }
 466            if (!first) {
 467                fprintf(out_file, ",\n");
 468            }
 469            dump_vmstate_vmsf(out_file, field, indent + 2);
 470            field++;
 471            first = false;
 472        }
 473        fprintf(out_file, "\n%*s]", indent, "");
 474    }
 475    if (vmsd->subsections != NULL) {
 476        const VMStateDescription **subsection = vmsd->subsections;
 477        bool first;
 478
 479        fprintf(out_file, ",\n%*s\"Subsections\": [\n", indent, "");
 480        first = true;
 481        while (*subsection != NULL) {
 482            if (!first) {
 483                fprintf(out_file, ",\n");
 484            }
 485            dump_vmstate_vmss(out_file, subsection, indent + 2);
 486            subsection++;
 487            first = false;
 488        }
 489        fprintf(out_file, "\n%*s]", indent, "");
 490    }
 491    fprintf(out_file, "\n%*s}", indent - 2, "");
 492}
 493
 494static void dump_machine_type(FILE *out_file)
 495{
 496    MachineClass *mc;
 497
 498    mc = MACHINE_GET_CLASS(current_machine);
 499
 500    fprintf(out_file, "  \"vmschkmachine\": {\n");
 501    fprintf(out_file, "    \"Name\": \"%s\"\n", mc->name);
 502    fprintf(out_file, "  },\n");
 503}
 504
 505void dump_vmstate_json_to_file(FILE *out_file)
 506{
 507    GSList *list, *elt;
 508    bool first;
 509
 510    fprintf(out_file, "{\n");
 511    dump_machine_type(out_file);
 512
 513    first = true;
 514    list = object_class_get_list(TYPE_DEVICE, true);
 515    for (elt = list; elt; elt = elt->next) {
 516        DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data,
 517                                             TYPE_DEVICE);
 518        const char *name;
 519        int indent = 2;
 520
 521        if (!dc->vmsd) {
 522            continue;
 523        }
 524
 525        if (!first) {
 526            fprintf(out_file, ",\n");
 527        }
 528        name = object_class_get_name(OBJECT_CLASS(dc));
 529        fprintf(out_file, "%*s\"%s\": {\n", indent, "", name);
 530        indent += 2;
 531        fprintf(out_file, "%*s\"Name\": \"%s\",\n", indent, "", name);
 532        fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "",
 533                dc->vmsd->version_id);
 534        fprintf(out_file, "%*s\"minimum_version_id\": %d,\n", indent, "",
 535                dc->vmsd->minimum_version_id);
 536
 537        dump_vmstate_vmsd(out_file, dc->vmsd, indent, false);
 538
 539        fprintf(out_file, "\n%*s}", indent - 2, "");
 540        first = false;
 541    }
 542    fprintf(out_file, "\n}\n");
 543    fclose(out_file);
 544}
 545
 546static int calculate_new_instance_id(const char *idstr)
 547{
 548    SaveStateEntry *se;
 549    int instance_id = 0;
 550
 551    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
 552        if (strcmp(idstr, se->idstr) == 0
 553            && instance_id <= se->instance_id) {
 554            instance_id = se->instance_id + 1;
 555        }
 556    }
 557    return instance_id;
 558}
 559
 560static int calculate_compat_instance_id(const char *idstr)
 561{
 562    SaveStateEntry *se;
 563    int instance_id = 0;
 564
 565    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
 566        if (!se->compat) {
 567            continue;
 568        }
 569
 570        if (strcmp(idstr, se->compat->idstr) == 0
 571            && instance_id <= se->compat->instance_id) {
 572            instance_id = se->compat->instance_id + 1;
 573        }
 574    }
 575    return instance_id;
 576}
 577
 578static inline MigrationPriority save_state_priority(SaveStateEntry *se)
 579{
 580    if (se->vmsd) {
 581        return se->vmsd->priority;
 582    }
 583    return MIG_PRI_DEFAULT;
 584}
 585
 586static void savevm_state_handler_insert(SaveStateEntry *nse)
 587{
 588    MigrationPriority priority = save_state_priority(nse);
 589    SaveStateEntry *se;
 590
 591    assert(priority <= MIG_PRI_MAX);
 592
 593    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
 594        if (save_state_priority(se) < priority) {
 595            break;
 596        }
 597    }
 598
 599    if (se) {
 600        QTAILQ_INSERT_BEFORE(se, nse, entry);
 601    } else {
 602        QTAILQ_INSERT_TAIL(&savevm_state.handlers, nse, entry);
 603    }
 604}
 605
 606/* TODO: Individual devices generally have very little idea about the rest
 607   of the system, so instance_id should be removed/replaced.
 608   Meanwhile pass -1 as instance_id if you do not already have a clearly
 609   distinguishing id for all instances of your device class. */
 610int register_savevm_live(DeviceState *dev,
 611                         const char *idstr,
 612                         int instance_id,
 613                         int version_id,
 614                         SaveVMHandlers *ops,
 615                         void *opaque)
 616{
 617    SaveStateEntry *se;
 618
 619    se = g_new0(SaveStateEntry, 1);
 620    se->version_id = version_id;
 621    se->section_id = savevm_state.global_section_id++;
 622    se->ops = ops;
 623    se->opaque = opaque;
 624    se->vmsd = NULL;
 625    /* if this is a live_savem then set is_ram */
 626    if (ops->save_setup != NULL) {
 627        se->is_ram = 1;
 628    }
 629
 630    if (dev) {
 631        char *id = qdev_get_dev_path(dev);
 632        if (id) {
 633            if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
 634                sizeof(se->idstr)) {
 635                error_report("Path too long for VMState (%s)", id);
 636                g_free(id);
 637                g_free(se);
 638
 639                return -1;
 640            }
 641            g_free(id);
 642
 643            se->compat = g_new0(CompatEntry, 1);
 644            pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), idstr);
 645            se->compat->instance_id = instance_id == -1 ?
 646                         calculate_compat_instance_id(idstr) : instance_id;
 647            instance_id = -1;
 648        }
 649    }
 650    pstrcat(se->idstr, sizeof(se->idstr), idstr);
 651
 652    if (instance_id == -1) {
 653        se->instance_id = calculate_new_instance_id(se->idstr);
 654    } else {
 655        se->instance_id = instance_id;
 656    }
 657    assert(!se->compat || se->instance_id == 0);
 658    savevm_state_handler_insert(se);
 659    return 0;
 660}
 661
 662void unregister_savevm(DeviceState *dev, const char *idstr, void *opaque)
 663{
 664    SaveStateEntry *se, *new_se;
 665    char id[256] = "";
 666
 667    if (dev) {
 668        char *path = qdev_get_dev_path(dev);
 669        if (path) {
 670            pstrcpy(id, sizeof(id), path);
 671            pstrcat(id, sizeof(id), "/");
 672            g_free(path);
 673        }
 674    }
 675    pstrcat(id, sizeof(id), idstr);
 676
 677    QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
 678        if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) {
 679            QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
 680            g_free(se->compat);
 681            g_free(se);
 682        }
 683    }
 684}
 685
 686int vmstate_register_with_alias_id(DeviceState *dev, int instance_id,
 687                                   const VMStateDescription *vmsd,
 688                                   void *opaque, int alias_id,
 689                                   int required_for_version,
 690                                   Error **errp)
 691{
 692    SaveStateEntry *se;
 693
 694    /* If this triggers, alias support can be dropped for the vmsd. */
 695    assert(alias_id == -1 || required_for_version >= vmsd->minimum_version_id);
 696
 697    se = g_new0(SaveStateEntry, 1);
 698    se->version_id = vmsd->version_id;
 699    se->section_id = savevm_state.global_section_id++;
 700    se->opaque = opaque;
 701    se->vmsd = vmsd;
 702    se->alias_id = alias_id;
 703
 704    if (dev) {
 705        char *id = qdev_get_dev_path(dev);
 706        if (id) {
 707            if (snprintf(se->idstr, sizeof(se->idstr), "%s/", id) >=
 708                sizeof(se->idstr)) {
 709                error_setg(errp, "Path too long for VMState (%s)", id);
 710                g_free(id);
 711                g_free(se);
 712
 713                return -1;
 714            }
 715            g_free(id);
 716
 717            se->compat = g_new0(CompatEntry, 1);
 718            pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name);
 719            se->compat->instance_id = instance_id == -1 ?
 720                         calculate_compat_instance_id(vmsd->name) : instance_id;
 721            instance_id = -1;
 722        }
 723    }
 724    pstrcat(se->idstr, sizeof(se->idstr), vmsd->name);
 725
 726    if (instance_id == -1) {
 727        se->instance_id = calculate_new_instance_id(se->idstr);
 728    } else {
 729        se->instance_id = instance_id;
 730    }
 731    assert(!se->compat || se->instance_id == 0);
 732    savevm_state_handler_insert(se);
 733    return 0;
 734}
 735
 736void vmstate_unregister(DeviceState *dev, const VMStateDescription *vmsd,
 737                        void *opaque)
 738{
 739    SaveStateEntry *se, *new_se;
 740
 741    QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) {
 742        if (se->vmsd == vmsd && se->opaque == opaque) {
 743            QTAILQ_REMOVE(&savevm_state.handlers, se, entry);
 744            g_free(se->compat);
 745            g_free(se);
 746        }
 747    }
 748}
 749
 750static int vmstate_load(QEMUFile *f, SaveStateEntry *se)
 751{
 752    trace_vmstate_load(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
 753    if (!se->vmsd) {         /* Old style */
 754        return se->ops->load_state(f, se->opaque, se->load_version_id);
 755    }
 756    return vmstate_load_state(f, se->vmsd, se->opaque, se->load_version_id);
 757}
 758
 759static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se, QJSON *vmdesc)
 760{
 761    int64_t old_offset, size;
 762
 763    old_offset = qemu_ftell_fast(f);
 764    se->ops->save_state(f, se->opaque);
 765    size = qemu_ftell_fast(f) - old_offset;
 766
 767    if (vmdesc) {
 768        json_prop_int(vmdesc, "size", size);
 769        json_start_array(vmdesc, "fields");
 770        json_start_object(vmdesc, NULL);
 771        json_prop_str(vmdesc, "name", "data");
 772        json_prop_int(vmdesc, "size", size);
 773        json_prop_str(vmdesc, "type", "buffer");
 774        json_end_object(vmdesc);
 775        json_end_array(vmdesc);
 776    }
 777}
 778
 779static int vmstate_save(QEMUFile *f, SaveStateEntry *se, QJSON *vmdesc)
 780{
 781    trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)");
 782    if (!se->vmsd) {
 783        vmstate_save_old_style(f, se, vmdesc);
 784        return 0;
 785    }
 786    return vmstate_save_state(f, se->vmsd, se->opaque, vmdesc);
 787}
 788
 789/*
 790 * Write the header for device section (QEMU_VM_SECTION START/END/PART/FULL)
 791 */
 792static void save_section_header(QEMUFile *f, SaveStateEntry *se,
 793                                uint8_t section_type)
 794{
 795    qemu_put_byte(f, section_type);
 796    qemu_put_be32(f, se->section_id);
 797
 798    if (section_type == QEMU_VM_SECTION_FULL ||
 799        section_type == QEMU_VM_SECTION_START) {
 800        /* ID string */
 801        size_t len = strlen(se->idstr);
 802        qemu_put_byte(f, len);
 803        qemu_put_buffer(f, (uint8_t *)se->idstr, len);
 804
 805        qemu_put_be32(f, se->instance_id);
 806        qemu_put_be32(f, se->version_id);
 807    }
 808}
 809
 810/*
 811 * Write a footer onto device sections that catches cases misformatted device
 812 * sections.
 813 */
 814static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
 815{
 816    if (migrate_get_current()->send_section_footer) {
 817        qemu_put_byte(f, QEMU_VM_SECTION_FOOTER);
 818        qemu_put_be32(f, se->section_id);
 819    }
 820}
 821
 822/**
 823 * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
 824 *                           command and associated data.
 825 *
 826 * @f: File to send command on
 827 * @command: Command type to send
 828 * @len: Length of associated data
 829 * @data: Data associated with command.
 830 */
 831static void qemu_savevm_command_send(QEMUFile *f,
 832                                     enum qemu_vm_cmd command,
 833                                     uint16_t len,
 834                                     uint8_t *data)
 835{
 836    trace_savevm_command_send(command, len);
 837    qemu_put_byte(f, QEMU_VM_COMMAND);
 838    qemu_put_be16(f, (uint16_t)command);
 839    qemu_put_be16(f, len);
 840    qemu_put_buffer(f, data, len);
 841    qemu_fflush(f);
 842}
 843
 844void qemu_savevm_send_ping(QEMUFile *f, uint32_t value)
 845{
 846    uint32_t buf;
 847
 848    trace_savevm_send_ping(value);
 849    buf = cpu_to_be32(value);
 850    qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf);
 851}
 852
/* Send MIG_CMD_OPEN_RETURN_PATH (no payload) on @f */
void qemu_savevm_send_open_return_path(QEMUFile *f)
{
    trace_savevm_send_open_return_path();
    qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL);
}
 858
 859/* We have a buffer of data to send; we don't want that all to be loaded
 860 * by the command itself, so the command contains just the length of the
 861 * extra buffer that we then send straight after it.
 862 * TODO: Must be a better way to organise that
 863 *
 864 * Returns:
 865 *    0 on success
 866 *    -ve on error
 867 */
 868int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len)
 869{
 870    uint32_t tmp;
 871
 872    if (len > MAX_VM_CMD_PACKAGED_SIZE) {
 873        error_report("%s: Unreasonably large packaged state: %zu",
 874                     __func__, len);
 875        return -1;
 876    }
 877
 878    tmp = cpu_to_be32(len);
 879
 880    trace_qemu_savevm_send_packaged();
 881    qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp);
 882
 883    qemu_put_buffer(f, buf, len);
 884
 885    return 0;
 886}
 887
 888/* Send prior to any postcopy transfer */
 889void qemu_savevm_send_postcopy_advise(QEMUFile *f)
 890{
 891    if (migrate_postcopy_ram()) {
 892        uint64_t tmp[2];
 893        tmp[0] = cpu_to_be64(ram_pagesize_summary());
 894        tmp[1] = cpu_to_be64(qemu_target_page_size());
 895
 896        trace_qemu_savevm_send_postcopy_advise();
 897        qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE,
 898                                 16, (uint8_t *)tmp);
 899    } else {
 900        qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 0, NULL);
 901    }
 902}
 903
 904/* Sent prior to starting the destination running in postcopy, discard pages
 905 * that have already been sent but redirtied on the source.
 906 * CMD_POSTCOPY_RAM_DISCARD consist of:
 907 *      byte   version (0)
 908 *      byte   Length of name field (not including 0)
 909 *  n x byte   RAM block name
 910 *      byte   0 terminator (just for safety)
 911 *  n x        Byte ranges within the named RAMBlock
 912 *      be64   Start of the range
 913 *      be64   Length
 914 *
 915 *  name:  RAMBlock name that these entries are part of
 916 *  len: Number of page entries
 917 *  start_list: 'len' addresses
 918 *  length_list: 'len' addresses
 919 *
 920 */
 921void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
 922                                           uint16_t len,
 923                                           uint64_t *start_list,
 924                                           uint64_t *length_list)
 925{
 926    uint8_t *buf;
 927    uint16_t tmplen;
 928    uint16_t t;
 929    size_t name_len = strlen(name);
 930
 931    trace_qemu_savevm_send_postcopy_ram_discard(name, len);
 932    assert(name_len < 256);
 933    buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len);
 934    buf[0] = postcopy_ram_discard_version;
 935    buf[1] = name_len;
 936    memcpy(buf + 2, name, name_len);
 937    tmplen = 2 + name_len;
 938    buf[tmplen++] = '\0';
 939
 940    for (t = 0; t < len; t++) {
 941        stq_be_p(buf + tmplen, start_list[t]);
 942        tmplen += 8;
 943        stq_be_p(buf + tmplen, length_list[t]);
 944        tmplen += 8;
 945    }
 946    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
 947    g_free(buf);
 948}
 949
/*
 * Get the destination into a state where it can receive postcopy data.
 * Sends MIG_CMD_POSTCOPY_LISTEN on @f with an empty payload.
 */
void qemu_savevm_send_postcopy_listen(QEMUFile *f)
{
    trace_savevm_send_postcopy_listen();
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
}
 956
/*
 * Kick the destination into running.
 * Sends MIG_CMD_POSTCOPY_RUN on @f with an empty payload.
 */
void qemu_savevm_send_postcopy_run(QEMUFile *f)
{
    trace_savevm_send_postcopy_run();
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
}
 963
/*
 * Sends MIG_CMD_POSTCOPY_RESUME on @f with an empty payload; the
 * destination's handler for this command treats it as "the source is
 * ready to resume the postcopy migration".
 */
void qemu_savevm_send_postcopy_resume(QEMUFile *f)
{
    trace_savevm_send_postcopy_resume();
    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RESUME, 0, NULL);
}
 969
 970void qemu_savevm_send_recv_bitmap(QEMUFile *f, char *block_name)
 971{
 972    size_t len;
 973    char buf[256];
 974
 975    trace_savevm_send_recv_bitmap(block_name);
 976
 977    buf[0] = len = strlen(block_name);
 978    memcpy(buf + 1, block_name, len);
 979
 980    qemu_savevm_command_send(f, MIG_CMD_RECV_BITMAP, len + 1, (uint8_t *)buf);
 981}
 982
 983bool qemu_savevm_state_blocked(Error **errp)
 984{
 985    SaveStateEntry *se;
 986
 987    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
 988        if (se->vmsd && se->vmsd->unmigratable) {
 989            error_setg(errp, "State blocked by non-migratable device '%s'",
 990                       se->idstr);
 991            return true;
 992        }
 993    }
 994    return false;
 995}
 996
 997void qemu_savevm_state_header(QEMUFile *f)
 998{
 999    trace_savevm_state_header();
1000    qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1001    qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1002
1003    if (migrate_get_current()->send_configuration) {
1004        qemu_put_byte(f, QEMU_VM_CONFIGURATION);
1005        vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
1006    }
1007}
1008
1009void qemu_savevm_state_setup(QEMUFile *f)
1010{
1011    SaveStateEntry *se;
1012    int ret;
1013
1014    trace_savevm_state_setup();
1015    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1016        if (!se->ops || !se->ops->save_setup) {
1017            continue;
1018        }
1019        if (se->ops && se->ops->is_active) {
1020            if (!se->ops->is_active(se->opaque)) {
1021                continue;
1022            }
1023        }
1024        save_section_header(f, se, QEMU_VM_SECTION_START);
1025
1026        ret = se->ops->save_setup(f, se->opaque);
1027        save_section_footer(f, se);
1028        if (ret < 0) {
1029            qemu_file_set_error(f, ret);
1030            break;
1031        }
1032    }
1033}
1034
1035int qemu_savevm_state_resume_prepare(MigrationState *s)
1036{
1037    SaveStateEntry *se;
1038    int ret;
1039
1040    trace_savevm_state_resume_prepare();
1041
1042    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1043        if (!se->ops || !se->ops->resume_prepare) {
1044            continue;
1045        }
1046        if (se->ops && se->ops->is_active) {
1047            if (!se->ops->is_active(se->opaque)) {
1048                continue;
1049            }
1050        }
1051        ret = se->ops->resume_prepare(s, se->opaque);
1052        if (ret < 0) {
1053            return ret;
1054        }
1055    }
1056
1057    return 0;
1058}
1059
1060/*
1061 * this function has three return values:
1062 *   negative: there was one error, and we have -errno.
1063 *   0 : We haven't finished, caller have to go again
1064 *   1 : We have finished, we can go to complete phase
1065 */
1066int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
1067{
1068    SaveStateEntry *se;
1069    int ret = 1;
1070
1071    trace_savevm_state_iterate();
1072    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1073        if (!se->ops || !se->ops->save_live_iterate) {
1074            continue;
1075        }
1076        if (se->ops && se->ops->is_active) {
1077            if (!se->ops->is_active(se->opaque)) {
1078                continue;
1079            }
1080        }
1081        if (se->ops && se->ops->is_active_iterate) {
1082            if (!se->ops->is_active_iterate(se->opaque)) {
1083                continue;
1084            }
1085        }
1086        /*
1087         * In the postcopy phase, any device that doesn't know how to
1088         * do postcopy should have saved it's state in the _complete
1089         * call that's already run, it might get confused if we call
1090         * iterate afterwards.
1091         */
1092        if (postcopy &&
1093            !(se->ops->has_postcopy && se->ops->has_postcopy(se->opaque))) {
1094            continue;
1095        }
1096        if (qemu_file_rate_limit(f)) {
1097            return 0;
1098        }
1099        trace_savevm_section_start(se->idstr, se->section_id);
1100
1101        save_section_header(f, se, QEMU_VM_SECTION_PART);
1102
1103        ret = se->ops->save_live_iterate(f, se->opaque);
1104        trace_savevm_section_end(se->idstr, se->section_id, ret);
1105        save_section_footer(f, se);
1106
1107        if (ret < 0) {
1108            qemu_file_set_error(f, ret);
1109        }
1110        if (ret <= 0) {
1111            /* Do not proceed to the next vmstate before this one reported
1112               completion of the current stage. This serializes the migration
1113               and reduces the probability that a faster changing state is
1114               synchronized over and over again. */
1115            break;
1116        }
1117    }
1118    return ret;
1119}
1120
1121static bool should_send_vmdesc(void)
1122{
1123    MachineState *machine = MACHINE(qdev_get_machine());
1124    bool in_postcopy = migration_in_postcopy();
1125    return !machine->suppress_vmdesc && !in_postcopy;
1126}
1127
1128/*
1129 * Calls the save_live_complete_postcopy methods
1130 * causing the last few pages to be sent immediately and doing any associated
1131 * cleanup.
1132 * Note postcopy also calls qemu_savevm_state_complete_precopy to complete
1133 * all the other devices, but that happens at the point we switch to postcopy.
1134 */
1135void qemu_savevm_state_complete_postcopy(QEMUFile *f)
1136{
1137    SaveStateEntry *se;
1138    int ret;
1139
1140    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1141        if (!se->ops || !se->ops->save_live_complete_postcopy) {
1142            continue;
1143        }
1144        if (se->ops && se->ops->is_active) {
1145            if (!se->ops->is_active(se->opaque)) {
1146                continue;
1147            }
1148        }
1149        trace_savevm_section_start(se->idstr, se->section_id);
1150        /* Section type */
1151        qemu_put_byte(f, QEMU_VM_SECTION_END);
1152        qemu_put_be32(f, se->section_id);
1153
1154        ret = se->ops->save_live_complete_postcopy(f, se->opaque);
1155        trace_savevm_section_end(se->idstr, se->section_id, ret);
1156        save_section_footer(f, se);
1157        if (ret < 0) {
1158            qemu_file_set_error(f, ret);
1159            return;
1160        }
1161    }
1162
1163    qemu_put_byte(f, QEMU_VM_EOF);
1164    qemu_fflush(f);
1165}
1166
1167int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
1168                                       bool inactivate_disks)
1169{
1170    QJSON *vmdesc;
1171    int vmdesc_len;
1172    SaveStateEntry *se;
1173    int ret;
1174    bool in_postcopy = migration_in_postcopy();
1175
1176    trace_savevm_state_complete_precopy();
1177
1178    cpu_synchronize_all_states();
1179
1180    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1181        if (!se->ops ||
1182            (in_postcopy && se->ops->has_postcopy &&
1183             se->ops->has_postcopy(se->opaque)) ||
1184            (in_postcopy && !iterable_only) ||
1185            !se->ops->save_live_complete_precopy) {
1186            continue;
1187        }
1188
1189        if (se->ops && se->ops->is_active) {
1190            if (!se->ops->is_active(se->opaque)) {
1191                continue;
1192            }
1193        }
1194        trace_savevm_section_start(se->idstr, se->section_id);
1195
1196        save_section_header(f, se, QEMU_VM_SECTION_END);
1197
1198        ret = se->ops->save_live_complete_precopy(f, se->opaque);
1199        trace_savevm_section_end(se->idstr, se->section_id, ret);
1200        save_section_footer(f, se);
1201        if (ret < 0) {
1202            qemu_file_set_error(f, ret);
1203            return -1;
1204        }
1205    }
1206
1207    if (iterable_only) {
1208        return 0;
1209    }
1210
1211    vmdesc = qjson_new();
1212    json_prop_int(vmdesc, "page_size", qemu_target_page_size());
1213    json_start_array(vmdesc, "devices");
1214    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1215
1216        if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1217            continue;
1218        }
1219        if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1220            trace_savevm_section_skip(se->idstr, se->section_id);
1221            continue;
1222        }
1223
1224        trace_savevm_section_start(se->idstr, se->section_id);
1225
1226        json_start_object(vmdesc, NULL);
1227        json_prop_str(vmdesc, "name", se->idstr);
1228        json_prop_int(vmdesc, "instance_id", se->instance_id);
1229
1230        save_section_header(f, se, QEMU_VM_SECTION_FULL);
1231        ret = vmstate_save(f, se, vmdesc);
1232        if (ret) {
1233            qemu_file_set_error(f, ret);
1234            return ret;
1235        }
1236        trace_savevm_section_end(se->idstr, se->section_id, 0);
1237        save_section_footer(f, se);
1238
1239        json_end_object(vmdesc);
1240    }
1241
1242    if (inactivate_disks) {
1243        /* Inactivate before sending QEMU_VM_EOF so that the
1244         * bdrv_invalidate_cache_all() on the other end won't fail. */
1245        ret = bdrv_inactivate_all();
1246        if (ret) {
1247            error_report("%s: bdrv_inactivate_all() failed (%d)",
1248                         __func__, ret);
1249            qemu_file_set_error(f, ret);
1250            return ret;
1251        }
1252    }
1253    if (!in_postcopy) {
1254        /* Postcopy stream will still be going */
1255        qemu_put_byte(f, QEMU_VM_EOF);
1256    }
1257
1258    json_end_array(vmdesc);
1259    qjson_finish(vmdesc);
1260    vmdesc_len = strlen(qjson_get_str(vmdesc));
1261
1262    if (should_send_vmdesc()) {
1263        qemu_put_byte(f, QEMU_VM_VMDESCRIPTION);
1264        qemu_put_be32(f, vmdesc_len);
1265        qemu_put_buffer(f, (uint8_t *)qjson_get_str(vmdesc), vmdesc_len);
1266    }
1267    qjson_destroy(vmdesc);
1268
1269    qemu_fflush(f);
1270    return 0;
1271}
1272
1273/* Give an estimate of the amount left to be transferred,
1274 * the result is split into the amount for units that can and
1275 * for units that can't do postcopy.
1276 */
1277void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
1278                               uint64_t *res_precopy_only,
1279                               uint64_t *res_compatible,
1280                               uint64_t *res_postcopy_only)
1281{
1282    SaveStateEntry *se;
1283
1284    *res_precopy_only = 0;
1285    *res_compatible = 0;
1286    *res_postcopy_only = 0;
1287
1288
1289    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1290        if (!se->ops || !se->ops->save_live_pending) {
1291            continue;
1292        }
1293        if (se->ops && se->ops->is_active) {
1294            if (!se->ops->is_active(se->opaque)) {
1295                continue;
1296            }
1297        }
1298        se->ops->save_live_pending(f, se->opaque, threshold_size,
1299                                   res_precopy_only, res_compatible,
1300                                   res_postcopy_only);
1301    }
1302}
1303
1304void qemu_savevm_state_cleanup(void)
1305{
1306    SaveStateEntry *se;
1307
1308    trace_savevm_state_cleanup();
1309    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1310        if (se->ops && se->ops->save_cleanup) {
1311            se->ops->save_cleanup(se->opaque);
1312        }
1313    }
1314}
1315
1316static int qemu_savevm_state(QEMUFile *f, Error **errp)
1317{
1318    int ret;
1319    MigrationState *ms = migrate_get_current();
1320    MigrationStatus status;
1321
1322    migrate_init(ms);
1323
1324    ms->to_dst_file = f;
1325
1326    if (migration_is_blocked(errp)) {
1327        ret = -EINVAL;
1328        goto done;
1329    }
1330
1331    if (migrate_use_block()) {
1332        error_setg(errp, "Block migration and snapshots are incompatible");
1333        ret = -EINVAL;
1334        goto done;
1335    }
1336
1337    qemu_mutex_unlock_iothread();
1338    qemu_savevm_state_header(f);
1339    qemu_savevm_state_setup(f);
1340    qemu_mutex_lock_iothread();
1341
1342    while (qemu_file_get_error(f) == 0) {
1343        if (qemu_savevm_state_iterate(f, false) > 0) {
1344            break;
1345        }
1346    }
1347
1348    ret = qemu_file_get_error(f);
1349    if (ret == 0) {
1350        qemu_savevm_state_complete_precopy(f, false, false);
1351        ret = qemu_file_get_error(f);
1352    }
1353    qemu_savevm_state_cleanup();
1354    if (ret != 0) {
1355        error_setg_errno(errp, -ret, "Error while writing VM state");
1356    }
1357
1358done:
1359    if (ret != 0) {
1360        status = MIGRATION_STATUS_FAILED;
1361    } else {
1362        status = MIGRATION_STATUS_COMPLETED;
1363    }
1364    migrate_set_state(&ms->state, MIGRATION_STATUS_SETUP, status);
1365
1366    /* f is outer parameter, it should not stay in global migration state after
1367     * this function finished */
1368    ms->to_dst_file = NULL;
1369
1370    return ret;
1371}
1372
1373static int qemu_save_device_state(QEMUFile *f)
1374{
1375    SaveStateEntry *se;
1376
1377    qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
1378    qemu_put_be32(f, QEMU_VM_FILE_VERSION);
1379
1380    cpu_synchronize_all_states();
1381
1382    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1383        int ret;
1384
1385        if (se->is_ram) {
1386            continue;
1387        }
1388        if ((!se->ops || !se->ops->save_state) && !se->vmsd) {
1389            continue;
1390        }
1391        if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) {
1392            continue;
1393        }
1394
1395        save_section_header(f, se, QEMU_VM_SECTION_FULL);
1396
1397        ret = vmstate_save(f, se, NULL);
1398        if (ret) {
1399            return ret;
1400        }
1401
1402        save_section_footer(f, se);
1403    }
1404
1405    qemu_put_byte(f, QEMU_VM_EOF);
1406
1407    return qemu_file_get_error(f);
1408}
1409
1410static SaveStateEntry *find_se(const char *idstr, int instance_id)
1411{
1412    SaveStateEntry *se;
1413
1414    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
1415        if (!strcmp(se->idstr, idstr) &&
1416            (instance_id == se->instance_id ||
1417             instance_id == se->alias_id))
1418            return se;
1419        /* Migrating from an older version? */
1420        if (strstr(se->idstr, idstr) && se->compat) {
1421            if (!strcmp(se->compat->idstr, idstr) &&
1422                (instance_id == se->compat->instance_id ||
1423                 instance_id == se->alias_id))
1424                return se;
1425        }
1426    }
1427    return NULL;
1428}
1429
/*
 * Non-error return codes used by the loadvm command handlers, in addition
 * to 0 (normal return) and negative values (errors).
 */
enum LoadVMExitCodes {
    /* Allow a command to quit all layers of nested loadvm loops */
    LOADVM_QUIT     =  1,
};
1434
1435static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
1436
1437/* ------ incoming postcopy messages ------ */
1438/* 'advise' arrives before any transfers just to tell us that a postcopy
1439 * *might* happen - it might be skipped if precopy transferred everything
1440 * quickly.
1441 */
1442static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis,
1443                                         uint16_t len)
1444{
1445    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
1446    uint64_t remote_pagesize_summary, local_pagesize_summary, remote_tps;
1447    Error *local_err = NULL;
1448
1449    trace_loadvm_postcopy_handle_advise();
1450    if (ps != POSTCOPY_INCOMING_NONE) {
1451        error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
1452        return -1;
1453    }
1454
1455    switch (len) {
1456    case 0:
1457        if (migrate_postcopy_ram()) {
1458            error_report("RAM postcopy is enabled but have 0 byte advise");
1459            return -EINVAL;
1460        }
1461        return 0;
1462    case 8 + 8:
1463        if (!migrate_postcopy_ram()) {
1464            error_report("RAM postcopy is disabled but have 16 byte advise");
1465            return -EINVAL;
1466        }
1467        break;
1468    default:
1469        error_report("CMD_POSTCOPY_ADVISE invalid length (%d)", len);
1470        return -EINVAL;
1471    }
1472
1473    if (!postcopy_ram_supported_by_host(mis)) {
1474        postcopy_state_set(POSTCOPY_INCOMING_NONE);
1475        return -1;
1476    }
1477
1478    remote_pagesize_summary = qemu_get_be64(mis->from_src_file);
1479    local_pagesize_summary = ram_pagesize_summary();
1480
1481    if (remote_pagesize_summary != local_pagesize_summary)  {
1482        /*
1483         * This detects two potential causes of mismatch:
1484         *   a) A mismatch in host page sizes
1485         *      Some combinations of mismatch are probably possible but it gets
1486         *      a bit more complicated.  In particular we need to place whole
1487         *      host pages on the dest at once, and we need to ensure that we
1488         *      handle dirtying to make sure we never end up sending part of
1489         *      a hostpage on it's own.
1490         *   b) The use of different huge page sizes on source/destination
1491         *      a more fine grain test is performed during RAM block migration
1492         *      but this test here causes a nice early clear failure, and
1493         *      also fails when passed to an older qemu that doesn't
1494         *      do huge pages.
1495         */
1496        error_report("Postcopy needs matching RAM page sizes (s=%" PRIx64
1497                                                             " d=%" PRIx64 ")",
1498                     remote_pagesize_summary, local_pagesize_summary);
1499        return -1;
1500    }
1501
1502    remote_tps = qemu_get_be64(mis->from_src_file);
1503    if (remote_tps != qemu_target_page_size()) {
1504        /*
1505         * Again, some differences could be dealt with, but for now keep it
1506         * simple.
1507         */
1508        error_report("Postcopy needs matching target page sizes (s=%d d=%zd)",
1509                     (int)remote_tps, qemu_target_page_size());
1510        return -1;
1511    }
1512
1513    if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_ADVISE, &local_err)) {
1514        error_report_err(local_err);
1515        return -1;
1516    }
1517
1518    if (ram_postcopy_incoming_init(mis)) {
1519        return -1;
1520    }
1521
1522    postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
1523
1524    return 0;
1525}
1526
1527/* After postcopy we will be told to throw some pages away since they're
1528 * dirty and will have to be demand fetched.  Must happen before CPU is
1529 * started.
1530 * There can be 0..many of these messages, each encoding multiple pages.
1531 */
1532static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
1533                                              uint16_t len)
1534{
1535    int tmp;
1536    char ramid[256];
1537    PostcopyState ps = postcopy_state_get();
1538
1539    trace_loadvm_postcopy_ram_handle_discard();
1540
1541    switch (ps) {
1542    case POSTCOPY_INCOMING_ADVISE:
1543        /* 1st discard */
1544        tmp = postcopy_ram_prepare_discard(mis);
1545        if (tmp) {
1546            return tmp;
1547        }
1548        break;
1549
1550    case POSTCOPY_INCOMING_DISCARD:
1551        /* Expected state */
1552        break;
1553
1554    default:
1555        error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
1556                     ps);
1557        return -1;
1558    }
1559    /* We're expecting a
1560     *    Version (0)
1561     *    a RAM ID string (length byte, name, 0 term)
1562     *    then at least 1 16 byte chunk
1563    */
1564    if (len < (1 + 1 + 1 + 1 + 2 * 8)) {
1565        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1566        return -1;
1567    }
1568
1569    tmp = qemu_get_byte(mis->from_src_file);
1570    if (tmp != postcopy_ram_discard_version) {
1571        error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
1572        return -1;
1573    }
1574
1575    if (!qemu_get_counted_string(mis->from_src_file, ramid)) {
1576        error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
1577        return -1;
1578    }
1579    tmp = qemu_get_byte(mis->from_src_file);
1580    if (tmp != 0) {
1581        error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp);
1582        return -1;
1583    }
1584
1585    len -= 3 + strlen(ramid);
1586    if (len % 16) {
1587        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
1588        return -1;
1589    }
1590    trace_loadvm_postcopy_ram_handle_discard_header(ramid, len);
1591    while (len) {
1592        uint64_t start_addr, block_length;
1593        start_addr = qemu_get_be64(mis->from_src_file);
1594        block_length = qemu_get_be64(mis->from_src_file);
1595
1596        len -= 16;
1597        int ret = ram_discard_range(ramid, start_addr, block_length);
1598        if (ret) {
1599            return ret;
1600        }
1601    }
1602    trace_loadvm_postcopy_ram_handle_discard_end();
1603
1604    return 0;
1605}
1606
1607/*
1608 * Triggered by a postcopy_listen command; this thread takes over reading
1609 * the input stream, leaving the main thread free to carry on loading the rest
1610 * of the device state (from RAM).
1611 * (TODO:This could do with being in a postcopy file - but there again it's
1612 * just another input loop, not that postcopy specific)
1613 */
1614static void *postcopy_ram_listen_thread(void *opaque)
1615{
1616    MigrationIncomingState *mis = migration_incoming_get_current();
1617    QEMUFile *f = mis->from_src_file;
1618    int load_res;
1619
1620    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
1621                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);
1622    qemu_sem_post(&mis->listen_thread_sem);
1623    trace_postcopy_ram_listen_thread_start();
1624
1625    /*
1626     * Because we're a thread and not a coroutine we can't yield
1627     * in qemu_file, and thus we must be blocking now.
1628     */
1629    qemu_file_set_blocking(f, true);
1630    load_res = qemu_loadvm_state_main(f, mis);
1631
1632    /*
1633     * This is tricky, but, mis->from_src_file can change after it
1634     * returns, when postcopy recovery happened. In the future, we may
1635     * want a wrapper for the QEMUFile handle.
1636     */
1637    f = mis->from_src_file;
1638
1639    /* And non-blocking again so we don't block in any cleanup */
1640    qemu_file_set_blocking(f, false);
1641
1642    trace_postcopy_ram_listen_thread_exit();
1643    if (load_res < 0) {
1644        error_report("%s: loadvm failed: %d", __func__, load_res);
1645        qemu_file_set_error(f, load_res);
1646        migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1647                                       MIGRATION_STATUS_FAILED);
1648    } else {
1649        /*
1650         * This looks good, but it's possible that the device loading in the
1651         * main thread hasn't finished yet, and so we might not be in 'RUN'
1652         * state yet; wait for the end of the main thread.
1653         */
1654        qemu_event_wait(&mis->main_thread_load_event);
1655    }
1656    postcopy_ram_incoming_cleanup(mis);
1657
1658    if (load_res < 0) {
1659        /*
1660         * If something went wrong then we have a bad state so exit;
1661         * depending how far we got it might be possible at this point
1662         * to leave the guest running and fire MCEs for pages that never
1663         * arrived as a desperate recovery step.
1664         */
1665        exit(EXIT_FAILURE);
1666    }
1667
1668    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
1669                                   MIGRATION_STATUS_COMPLETED);
1670    /*
1671     * If everything has worked fine, then the main thread has waited
1672     * for us to start, and we're the last use of the mis.
1673     * (If something broke then qemu will have to exit anyway since it's
1674     * got a bad migration state).
1675     */
1676    migration_incoming_state_destroy();
1677    qemu_loadvm_state_cleanup();
1678
1679    return NULL;
1680}
1681
1682/* After this message we must be able to immediately receive postcopy data */
1683static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
1684{
1685    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
1686    trace_loadvm_postcopy_handle_listen();
1687    Error *local_err = NULL;
1688
1689    if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
1690        error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
1691        return -1;
1692    }
1693    if (ps == POSTCOPY_INCOMING_ADVISE) {
1694        /*
1695         * A rare case, we entered listen without having to do any discards,
1696         * so do the setup that's normally done at the time of the 1st discard.
1697         */
1698        if (migrate_postcopy_ram()) {
1699            postcopy_ram_prepare_discard(mis);
1700        }
1701    }
1702
1703    /*
1704     * Sensitise RAM - can now generate requests for blocks that don't exist
1705     * However, at this point the CPU shouldn't be running, and the IO
1706     * shouldn't be doing anything yet so don't actually expect requests
1707     */
1708    if (migrate_postcopy_ram()) {
1709        if (postcopy_ram_enable_notify(mis)) {
1710            return -1;
1711        }
1712    }
1713
1714    if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_LISTEN, &local_err)) {
1715        error_report_err(local_err);
1716        return -1;
1717    }
1718
1719    if (mis->have_listen_thread) {
1720        error_report("CMD_POSTCOPY_RAM_LISTEN already has a listen thread");
1721        return -1;
1722    }
1723
1724    mis->have_listen_thread = true;
1725    /* Start up the listening thread and wait for it to signal ready */
1726    qemu_sem_init(&mis->listen_thread_sem, 0);
1727    qemu_thread_create(&mis->listen_thread, "postcopy/listen",
1728                       postcopy_ram_listen_thread, NULL,
1729                       QEMU_THREAD_DETACHED);
1730    qemu_sem_wait(&mis->listen_thread_sem);
1731    qemu_sem_destroy(&mis->listen_thread_sem);
1732
1733    return 0;
1734}
1735
1736
/*
 * Data handed to loadvm_postcopy_handle_run_bh(): the bottom half itself,
 * so that the handler can delete it once it has run.
 */
typedef struct {
    QEMUBH *bh;
} HandleRunBhData;
1740
1741static void loadvm_postcopy_handle_run_bh(void *opaque)
1742{
1743    Error *local_err = NULL;
1744    HandleRunBhData *data = opaque;
1745
1746    /* TODO we should move all of this lot into postcopy_ram.c or a shared code
1747     * in migration.c
1748     */
1749    cpu_synchronize_all_post_init();
1750
1751    qemu_announce_self();
1752
1753    /* Make sure all file formats flush their mutable metadata.
1754     * If we get an error here, just don't restart the VM yet. */
1755    bdrv_invalidate_cache_all(&local_err);
1756    if (local_err) {
1757        error_report_err(local_err);
1758        local_err = NULL;
1759        autostart = false;
1760    }
1761
1762    trace_loadvm_postcopy_handle_run_cpu_sync();
1763    cpu_synchronize_all_post_init();
1764
1765    trace_loadvm_postcopy_handle_run_vmstart();
1766
1767    dirty_bitmap_mig_before_vm_start();
1768
1769    if (autostart) {
1770        /* Hold onto your hats, starting the CPU */
1771        vm_start();
1772    } else {
1773        /* leave it paused and let management decide when to start the CPU */
1774        runstate_set(RUN_STATE_PAUSED);
1775    }
1776
1777    qemu_bh_delete(data->bh);
1778    g_free(data);
1779}
1780
1781/* After all discards we can start running and asking for pages */
1782static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
1783{
1784    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_RUNNING);
1785    HandleRunBhData *data;
1786
1787    trace_loadvm_postcopy_handle_run();
1788    if (ps != POSTCOPY_INCOMING_LISTENING) {
1789        error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
1790        return -1;
1791    }
1792
1793    data = g_new(HandleRunBhData, 1);
1794    data->bh = qemu_bh_new(loadvm_postcopy_handle_run_bh, data);
1795    qemu_bh_schedule(data->bh);
1796
1797    /* We need to finish reading the stream from the package
1798     * and also stop reading anything more from the stream that loaded the
1799     * package (since it's now being read by the listener thread).
1800     * LOADVM_QUIT will quit all the layers of nested loadvm loops.
1801     */
1802    return LOADVM_QUIT;
1803}
1804
1805static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis)
1806{
1807    if (mis->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
1808        error_report("%s: illegal resume received", __func__);
1809        /* Don't fail the load, only for this. */
1810        return 0;
1811    }
1812
1813    /*
1814     * This means source VM is ready to resume the postcopy migration.
1815     * It's time to switch state and release the fault thread to
1816     * continue service page faults.
1817     */
1818    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_RECOVER,
1819                      MIGRATION_STATUS_POSTCOPY_ACTIVE);
1820    qemu_sem_post(&mis->postcopy_pause_sem_fault);
1821
1822    trace_loadvm_postcopy_handle_resume();
1823
1824    /* Tell source that "we are ready" */
1825    migrate_send_rp_resume_ack(mis, MIGRATION_RESUME_ACK_VALUE);
1826
1827    return 0;
1828}
1829
1830/**
1831 * Immediately following this command is a blob of data containing an embedded
1832 * chunk of migration stream; read it and load it.
1833 *
1834 * @mis: Incoming state
1835 * @length: Length of packaged data to read
1836 *
1837 * Returns: Negative values on error
1838 *
1839 */
1840static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
1841{
1842    int ret;
1843    size_t length;
1844    QIOChannelBuffer *bioc;
1845
1846    length = qemu_get_be32(mis->from_src_file);
1847    trace_loadvm_handle_cmd_packaged(length);
1848
1849    if (length > MAX_VM_CMD_PACKAGED_SIZE) {
1850        error_report("Unreasonably large packaged state: %zu", length);
1851        return -1;
1852    }
1853
1854    bioc = qio_channel_buffer_new(length);
1855    qio_channel_set_name(QIO_CHANNEL(bioc), "migration-loadvm-buffer");
1856    ret = qemu_get_buffer(mis->from_src_file,
1857                          bioc->data,
1858                          length);
1859    if (ret != length) {
1860        object_unref(OBJECT(bioc));
1861        error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%zu",
1862                     ret, length);
1863        return (ret < 0) ? ret : -EAGAIN;
1864    }
1865    bioc->usage += length;
1866    trace_loadvm_handle_cmd_packaged_received(ret);
1867
1868    QEMUFile *packf = qemu_fopen_channel_input(QIO_CHANNEL(bioc));
1869
1870    ret = qemu_loadvm_state_main(packf, mis);
1871    trace_loadvm_handle_cmd_packaged_main(ret);
1872    qemu_fclose(packf);
1873    object_unref(OBJECT(bioc));
1874
1875    return ret;
1876}
1877
1878/*
1879 * Handle request that source requests for recved_bitmap on
1880 * destination. Payload format:
1881 *
1882 * len (1 byte) + ramblock_name (<255 bytes)
1883 */
1884static int loadvm_handle_recv_bitmap(MigrationIncomingState *mis,
1885                                     uint16_t len)
1886{
1887    QEMUFile *file = mis->from_src_file;
1888    RAMBlock *rb;
1889    char block_name[256];
1890    size_t cnt;
1891
1892    cnt = qemu_get_counted_string(file, block_name);
1893    if (!cnt) {
1894        error_report("%s: failed to read block name", __func__);
1895        return -EINVAL;
1896    }
1897
1898    /* Validate before using the data */
1899    if (qemu_file_get_error(file)) {
1900        return qemu_file_get_error(file);
1901    }
1902
1903    if (len != cnt + 1) {
1904        error_report("%s: invalid payload length (%d)", __func__, len);
1905        return -EINVAL;
1906    }
1907
1908    rb = qemu_ram_block_by_name(block_name);
1909    if (!rb) {
1910        error_report("%s: block '%s' not found", __func__, block_name);
1911        return -EINVAL;
1912    }
1913
1914    migrate_send_rp_recv_bitmap(mis, block_name);
1915
1916    trace_loadvm_handle_recv_bitmap(block_name);
1917
1918    return 0;
1919}
1920
1921/*
1922 * Process an incoming 'QEMU_VM_COMMAND'
1923 * 0           just a normal return
1924 * LOADVM_QUIT All good, but exit the loop
1925 * <0          Error
1926 */
1927static int loadvm_process_command(QEMUFile *f)
1928{
1929    MigrationIncomingState *mis = migration_incoming_get_current();
1930    uint16_t cmd;
1931    uint16_t len;
1932    uint32_t tmp32;
1933
1934    cmd = qemu_get_be16(f);
1935    len = qemu_get_be16(f);
1936
1937    /* Check validity before continue processing of cmds */
1938    if (qemu_file_get_error(f)) {
1939        return qemu_file_get_error(f);
1940    }
1941
1942    trace_loadvm_process_command(cmd, len);
1943    if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
1944        error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
1945        return -EINVAL;
1946    }
1947
1948    if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {
1949        error_report("%s received with bad length - expecting %zu, got %d",
1950                     mig_cmd_args[cmd].name,
1951                     (size_t)mig_cmd_args[cmd].len, len);
1952        return -ERANGE;
1953    }
1954
1955    switch (cmd) {
1956    case MIG_CMD_OPEN_RETURN_PATH:
1957        if (mis->to_src_file) {
1958            error_report("CMD_OPEN_RETURN_PATH called when RP already open");
1959            /* Not really a problem, so don't give up */
1960            return 0;
1961        }
1962        mis->to_src_file = qemu_file_get_return_path(f);
1963        if (!mis->to_src_file) {
1964            error_report("CMD_OPEN_RETURN_PATH failed");
1965            return -1;
1966        }
1967        break;
1968
1969    case MIG_CMD_PING:
1970        tmp32 = qemu_get_be32(f);
1971        trace_loadvm_process_command_ping(tmp32);
1972        if (!mis->to_src_file) {
1973            error_report("CMD_PING (0x%x) received with no return path",
1974                         tmp32);
1975            return -1;
1976        }
1977        migrate_send_rp_pong(mis, tmp32);
1978        break;
1979
1980    case MIG_CMD_PACKAGED:
1981        return loadvm_handle_cmd_packaged(mis);
1982
1983    case MIG_CMD_POSTCOPY_ADVISE:
1984        return loadvm_postcopy_handle_advise(mis, len);
1985
1986    case MIG_CMD_POSTCOPY_LISTEN:
1987        return loadvm_postcopy_handle_listen(mis);
1988
1989    case MIG_CMD_POSTCOPY_RUN:
1990        return loadvm_postcopy_handle_run(mis);
1991
1992    case MIG_CMD_POSTCOPY_RAM_DISCARD:
1993        return loadvm_postcopy_ram_handle_discard(mis, len);
1994
1995    case MIG_CMD_POSTCOPY_RESUME:
1996        return loadvm_postcopy_handle_resume(mis);
1997
1998    case MIG_CMD_RECV_BITMAP:
1999        return loadvm_handle_recv_bitmap(mis, len);
2000    }
2001
2002    return 0;
2003}
2004
2005/*
2006 * Read a footer off the wire and check that it matches the expected section
2007 *
2008 * Returns: true if the footer was good
2009 *          false if there is a problem (and calls error_report to say why)
2010 */
2011static bool check_section_footer(QEMUFile *f, SaveStateEntry *se)
2012{
2013    int ret;
2014    uint8_t read_mark;
2015    uint32_t read_section_id;
2016
2017    if (!migrate_get_current()->send_section_footer) {
2018        /* No footer to check */
2019        return true;
2020    }
2021
2022    read_mark = qemu_get_byte(f);
2023
2024    ret = qemu_file_get_error(f);
2025    if (ret) {
2026        error_report("%s: Read section footer failed: %d",
2027                     __func__, ret);
2028        return false;
2029    }
2030
2031    if (read_mark != QEMU_VM_SECTION_FOOTER) {
2032        error_report("Missing section footer for %s", se->idstr);
2033        return false;
2034    }
2035
2036    read_section_id = qemu_get_be32(f);
2037    if (read_section_id != se->load_section_id) {
2038        error_report("Mismatched section id in footer for %s -"
2039                     " read 0x%x expected 0x%x",
2040                     se->idstr, read_section_id, se->load_section_id);
2041        return false;
2042    }
2043
2044    /* All good */
2045    return true;
2046}
2047
2048static int
2049qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis)
2050{
2051    uint32_t instance_id, version_id, section_id;
2052    SaveStateEntry *se;
2053    char idstr[256];
2054    int ret;
2055
2056    /* Read section start */
2057    section_id = qemu_get_be32(f);
2058    if (!qemu_get_counted_string(f, idstr)) {
2059        error_report("Unable to read ID string for section %u",
2060                     section_id);
2061        return -EINVAL;
2062    }
2063    instance_id = qemu_get_be32(f);
2064    version_id = qemu_get_be32(f);
2065
2066    ret = qemu_file_get_error(f);
2067    if (ret) {
2068        error_report("%s: Failed to read instance/version ID: %d",
2069                     __func__, ret);
2070        return ret;
2071    }
2072
2073    trace_qemu_loadvm_state_section_startfull(section_id, idstr,
2074            instance_id, version_id);
2075    /* Find savevm section */
2076    se = find_se(idstr, instance_id);
2077    if (se == NULL) {
2078        error_report("Unknown savevm section or instance '%s' %d",
2079                     idstr, instance_id);
2080        return -EINVAL;
2081    }
2082
2083    /* Validate version */
2084    if (version_id > se->version_id) {
2085        error_report("savevm: unsupported version %d for '%s' v%d",
2086                     version_id, idstr, se->version_id);
2087        return -EINVAL;
2088    }
2089    se->load_version_id = version_id;
2090    se->load_section_id = section_id;
2091
2092    /* Validate if it is a device's state */
2093    if (xen_enabled() && se->is_ram) {
2094        error_report("loadvm: %s RAM loading not allowed on Xen", idstr);
2095        return -EINVAL;
2096    }
2097
2098    ret = vmstate_load(f, se);
2099    if (ret < 0) {
2100        error_report("error while loading state for instance 0x%x of"
2101                     " device '%s'", instance_id, idstr);
2102        return ret;
2103    }
2104    if (!check_section_footer(f, se)) {
2105        return -EINVAL;
2106    }
2107
2108    return 0;
2109}
2110
2111static int
2112qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis)
2113{
2114    uint32_t section_id;
2115    SaveStateEntry *se;
2116    int ret;
2117
2118    section_id = qemu_get_be32(f);
2119
2120    ret = qemu_file_get_error(f);
2121    if (ret) {
2122        error_report("%s: Failed to read section ID: %d",
2123                     __func__, ret);
2124        return ret;
2125    }
2126
2127    trace_qemu_loadvm_state_section_partend(section_id);
2128    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2129        if (se->load_section_id == section_id) {
2130            break;
2131        }
2132    }
2133    if (se == NULL) {
2134        error_report("Unknown savevm section %d", section_id);
2135        return -EINVAL;
2136    }
2137
2138    ret = vmstate_load(f, se);
2139    if (ret < 0) {
2140        error_report("error while loading state section id %d(%s)",
2141                     section_id, se->idstr);
2142        return ret;
2143    }
2144    if (!check_section_footer(f, se)) {
2145        return -EINVAL;
2146    }
2147
2148    return 0;
2149}
2150
2151static int qemu_loadvm_state_setup(QEMUFile *f)
2152{
2153    SaveStateEntry *se;
2154    int ret;
2155
2156    trace_loadvm_state_setup();
2157    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2158        if (!se->ops || !se->ops->load_setup) {
2159            continue;
2160        }
2161        if (se->ops && se->ops->is_active) {
2162            if (!se->ops->is_active(se->opaque)) {
2163                continue;
2164            }
2165        }
2166
2167        ret = se->ops->load_setup(f, se->opaque);
2168        if (ret < 0) {
2169            qemu_file_set_error(f, ret);
2170            error_report("Load state of device %s failed", se->idstr);
2171            return ret;
2172        }
2173    }
2174    return 0;
2175}
2176
2177void qemu_loadvm_state_cleanup(void)
2178{
2179    SaveStateEntry *se;
2180
2181    trace_loadvm_state_cleanup();
2182    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
2183        if (se->ops && se->ops->load_cleanup) {
2184            se->ops->load_cleanup(se->opaque);
2185        }
2186    }
2187}
2188
2189/* Return true if we should continue the migration, or false. */
2190static bool postcopy_pause_incoming(MigrationIncomingState *mis)
2191{
2192    trace_postcopy_pause_incoming();
2193
2194    /* Clear the triggered bit to allow one recovery */
2195    mis->postcopy_recover_triggered = false;
2196
2197    assert(mis->from_src_file);
2198    qemu_file_shutdown(mis->from_src_file);
2199    qemu_fclose(mis->from_src_file);
2200    mis->from_src_file = NULL;
2201
2202    assert(mis->to_src_file);
2203    qemu_file_shutdown(mis->to_src_file);
2204    qemu_mutex_lock(&mis->rp_mutex);
2205    qemu_fclose(mis->to_src_file);
2206    mis->to_src_file = NULL;
2207    qemu_mutex_unlock(&mis->rp_mutex);
2208
2209    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
2210                      MIGRATION_STATUS_POSTCOPY_PAUSED);
2211
2212    /* Notify the fault thread for the invalidated file handle */
2213    postcopy_fault_thread_notify(mis);
2214
2215    error_report("Detected IO failure for postcopy. "
2216                 "Migration paused.");
2217
2218    while (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
2219        qemu_sem_wait(&mis->postcopy_pause_sem_dst);
2220    }
2221
2222    trace_postcopy_pause_incoming_continued();
2223
2224    return true;
2225}
2226
2227static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
2228{
2229    uint8_t section_type;
2230    int ret = 0;
2231
2232retry:
2233    while (true) {
2234        section_type = qemu_get_byte(f);
2235
2236        if (qemu_file_get_error(f)) {
2237            ret = qemu_file_get_error(f);
2238            break;
2239        }
2240
2241        trace_qemu_loadvm_state_section(section_type);
2242        switch (section_type) {
2243        case QEMU_VM_SECTION_START:
2244        case QEMU_VM_SECTION_FULL:
2245            ret = qemu_loadvm_section_start_full(f, mis);
2246            if (ret < 0) {
2247                goto out;
2248            }
2249            break;
2250        case QEMU_VM_SECTION_PART:
2251        case QEMU_VM_SECTION_END:
2252            ret = qemu_loadvm_section_part_end(f, mis);
2253            if (ret < 0) {
2254                goto out;
2255            }
2256            break;
2257        case QEMU_VM_COMMAND:
2258            ret = loadvm_process_command(f);
2259            trace_qemu_loadvm_state_section_command(ret);
2260            if ((ret < 0) || (ret & LOADVM_QUIT)) {
2261                goto out;
2262            }
2263            break;
2264        case QEMU_VM_EOF:
2265            /* This is the end of migration */
2266            goto out;
2267        default:
2268            error_report("Unknown savevm section type %d", section_type);
2269            ret = -EINVAL;
2270            goto out;
2271        }
2272    }
2273
2274out:
2275    if (ret < 0) {
2276        qemu_file_set_error(f, ret);
2277
2278        /*
2279         * If we are during an active postcopy, then we pause instead
2280         * of bail out to at least keep the VM's dirty data.  Note
2281         * that POSTCOPY_INCOMING_LISTENING stage is still not enough,
2282         * during which we're still receiving device states and we
2283         * still haven't yet started the VM on destination.
2284         */
2285        if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
2286            postcopy_pause_incoming(mis)) {
2287            /* Reset f to point to the newly created channel */
2288            f = mis->from_src_file;
2289            goto retry;
2290        }
2291    }
2292    return ret;
2293}
2294
2295int qemu_loadvm_state(QEMUFile *f)
2296{
2297    MigrationIncomingState *mis = migration_incoming_get_current();
2298    Error *local_err = NULL;
2299    unsigned int v;
2300    int ret;
2301
2302    if (qemu_savevm_state_blocked(&local_err)) {
2303        error_report_err(local_err);
2304        return -EINVAL;
2305    }
2306
2307    v = qemu_get_be32(f);
2308    if (v != QEMU_VM_FILE_MAGIC) {
2309        error_report("Not a migration stream");
2310        return -EINVAL;
2311    }
2312
2313    v = qemu_get_be32(f);
2314    if (v == QEMU_VM_FILE_VERSION_COMPAT) {
2315        error_report("SaveVM v2 format is obsolete and don't work anymore");
2316        return -ENOTSUP;
2317    }
2318    if (v != QEMU_VM_FILE_VERSION) {
2319        error_report("Unsupported migration stream version");
2320        return -ENOTSUP;
2321    }
2322
2323    if (qemu_loadvm_state_setup(f) != 0) {
2324        return -EINVAL;
2325    }
2326
2327    if (migrate_get_current()->send_configuration) {
2328        if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
2329            error_report("Configuration section missing");
2330            return -EINVAL;
2331        }
2332        ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
2333
2334        if (ret) {
2335            return ret;
2336        }
2337    }
2338
2339    cpu_synchronize_all_pre_loadvm();
2340
2341    ret = qemu_loadvm_state_main(f, mis);
2342    qemu_event_set(&mis->main_thread_load_event);
2343
2344    trace_qemu_loadvm_state_post_main(ret);
2345
2346    if (mis->have_listen_thread) {
2347        /* Listen thread still going, can't clean up yet */
2348        return ret;
2349    }
2350
2351    if (ret == 0) {
2352        ret = qemu_file_get_error(f);
2353    }
2354
2355    /*
2356     * Try to read in the VMDESC section as well, so that dumping tools that
2357     * intercept our migration stream have the chance to see it.
2358     */
2359
2360    /* We've got to be careful; if we don't read the data and just shut the fd
2361     * then the sender can error if we close while it's still sending.
2362     * We also mustn't read data that isn't there; some transports (RDMA)
2363     * will stall waiting for that data when the source has already closed.
2364     */
2365    if (ret == 0 && should_send_vmdesc()) {
2366        uint8_t *buf;
2367        uint32_t size;
2368        uint8_t  section_type = qemu_get_byte(f);
2369
2370        if (section_type != QEMU_VM_VMDESCRIPTION) {
2371            error_report("Expected vmdescription section, but got %d",
2372                         section_type);
2373            /*
2374             * It doesn't seem worth failing at this point since
2375             * we apparently have an otherwise valid VM state
2376             */
2377        } else {
2378            buf = g_malloc(0x1000);
2379            size = qemu_get_be32(f);
2380
2381            while (size > 0) {
2382                uint32_t read_chunk = MIN(size, 0x1000);
2383                qemu_get_buffer(f, buf, read_chunk);
2384                size -= read_chunk;
2385            }
2386            g_free(buf);
2387        }
2388    }
2389
2390    qemu_loadvm_state_cleanup();
2391    cpu_synchronize_all_post_init();
2392
2393    return ret;
2394}
2395
2396int save_snapshot(const char *name, Error **errp)
2397{
2398    BlockDriverState *bs, *bs1;
2399    QEMUSnapshotInfo sn1, *sn = &sn1, old_sn1, *old_sn = &old_sn1;
2400    int ret = -1;
2401    QEMUFile *f;
2402    int saved_vm_running;
2403    uint64_t vm_state_size;
2404    qemu_timeval tv;
2405    struct tm tm;
2406    AioContext *aio_context;
2407
2408    if (!replay_can_snapshot()) {
2409        error_report("Record/replay does not allow making snapshot "
2410                     "right now. Try once more later.");
2411        return ret;
2412    }
2413
2414    if (!bdrv_all_can_snapshot(&bs)) {
2415        error_setg(errp, "Device '%s' is writable but does not support "
2416                   "snapshots", bdrv_get_device_name(bs));
2417        return ret;
2418    }
2419
2420    /* Delete old snapshots of the same name */
2421    if (name) {
2422        ret = bdrv_all_delete_snapshot(name, &bs1, errp);
2423        if (ret < 0) {
2424            error_prepend(errp, "Error while deleting snapshot on device "
2425                          "'%s': ", bdrv_get_device_name(bs1));
2426            return ret;
2427        }
2428    }
2429
2430    bs = bdrv_all_find_vmstate_bs();
2431    if (bs == NULL) {
2432        error_setg(errp, "No block device can accept snapshots");
2433        return ret;
2434    }
2435    aio_context = bdrv_get_aio_context(bs);
2436
2437    saved_vm_running = runstate_is_running();
2438
2439    ret = global_state_store();
2440    if (ret) {
2441        error_setg(errp, "Error saving global state");
2442        return ret;
2443    }
2444    vm_stop(RUN_STATE_SAVE_VM);
2445
2446    bdrv_drain_all_begin();
2447
2448    aio_context_acquire(aio_context);
2449
2450    memset(sn, 0, sizeof(*sn));
2451
2452    /* fill auxiliary fields */
2453    qemu_gettimeofday(&tv);
2454    sn->date_sec = tv.tv_sec;
2455    sn->date_nsec = tv.tv_usec * 1000;
2456    sn->vm_clock_nsec = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
2457
2458    if (name) {
2459        ret = bdrv_snapshot_find(bs, old_sn, name);
2460        if (ret >= 0) {
2461            pstrcpy(sn->name, sizeof(sn->name), old_sn->name);
2462            pstrcpy(sn->id_str, sizeof(sn->id_str), old_sn->id_str);
2463        } else {
2464            pstrcpy(sn->name, sizeof(sn->name), name);
2465        }
2466    } else {
2467        /* cast below needed for OpenBSD where tv_sec is still 'long' */
2468        localtime_r((const time_t *)&tv.tv_sec, &tm);
2469        strftime(sn->name, sizeof(sn->name), "vm-%Y%m%d%H%M%S", &tm);
2470    }
2471
2472    /* save the VM state */
2473    f = qemu_fopen_bdrv(bs, 1);
2474    if (!f) {
2475        error_setg(errp, "Could not open VM state file");
2476        goto the_end;
2477    }
2478    ret = qemu_savevm_state(f, errp);
2479    vm_state_size = qemu_ftell(f);
2480    qemu_fclose(f);
2481    if (ret < 0) {
2482        goto the_end;
2483    }
2484
2485    /* The bdrv_all_create_snapshot() call that follows acquires the AioContext
2486     * for itself.  BDRV_POLL_WHILE() does not support nested locking because
2487     * it only releases the lock once.  Therefore synchronous I/O will deadlock
2488     * unless we release the AioContext before bdrv_all_create_snapshot().
2489     */
2490    aio_context_release(aio_context);
2491    aio_context = NULL;
2492
2493    ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, &bs);
2494    if (ret < 0) {
2495        error_setg(errp, "Error while creating snapshot on '%s'",
2496                   bdrv_get_device_name(bs));
2497        goto the_end;
2498    }
2499
2500    ret = 0;
2501
2502 the_end:
2503    if (aio_context) {
2504        aio_context_release(aio_context);
2505    }
2506
2507    bdrv_drain_all_end();
2508
2509    if (saved_vm_running) {
2510        vm_start();
2511    }
2512    return ret;
2513}
2514
2515void qmp_xen_save_devices_state(const char *filename, bool has_live, bool live,
2516                                Error **errp)
2517{
2518    QEMUFile *f;
2519    QIOChannelFile *ioc;
2520    int saved_vm_running;
2521    int ret;
2522
2523    if (!has_live) {
2524        /* live default to true so old version of Xen tool stack can have a
2525         * successfull live migration */
2526        live = true;
2527    }
2528
2529    saved_vm_running = runstate_is_running();
2530    vm_stop(RUN_STATE_SAVE_VM);
2531    global_state_store_running();
2532
2533    ioc = qio_channel_file_new_path(filename, O_WRONLY | O_CREAT, 0660, errp);
2534    if (!ioc) {
2535        goto the_end;
2536    }
2537    qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-save-state");
2538    f = qemu_fopen_channel_output(QIO_CHANNEL(ioc));
2539    object_unref(OBJECT(ioc));
2540    ret = qemu_save_device_state(f);
2541    if (ret < 0 || qemu_fclose(f) < 0) {
2542        error_setg(errp, QERR_IO_ERROR);
2543    } else {
2544        /* libxl calls the QMP command "stop" before calling
2545         * "xen-save-devices-state" and in case of migration failure, libxl
2546         * would call "cont".
2547         * So call bdrv_inactivate_all (release locks) here to let the other
2548         * side of the migration take controle of the images.
2549         */
2550        if (live && !saved_vm_running) {
2551            ret = bdrv_inactivate_all();
2552            if (ret) {
2553                error_setg(errp, "%s: bdrv_inactivate_all() failed (%d)",
2554                           __func__, ret);
2555            }
2556        }
2557    }
2558
2559 the_end:
2560    if (saved_vm_running) {
2561        vm_start();
2562    }
2563}
2564
2565void qmp_xen_load_devices_state(const char *filename, Error **errp)
2566{
2567    QEMUFile *f;
2568    QIOChannelFile *ioc;
2569    int ret;
2570
2571    /* Guest must be paused before loading the device state; the RAM state
2572     * will already have been loaded by xc
2573     */
2574    if (runstate_is_running()) {
2575        error_setg(errp, "Cannot update device state while vm is running");
2576        return;
2577    }
2578    vm_stop(RUN_STATE_RESTORE_VM);
2579
2580    ioc = qio_channel_file_new_path(filename, O_RDONLY | O_BINARY, 0, errp);
2581    if (!ioc) {
2582        return;
2583    }
2584    qio_channel_set_name(QIO_CHANNEL(ioc), "migration-xen-load-state");
2585    f = qemu_fopen_channel_input(QIO_CHANNEL(ioc));
2586    object_unref(OBJECT(ioc));
2587
2588    ret = qemu_loadvm_state(f);
2589    qemu_fclose(f);
2590    if (ret < 0) {
2591        error_setg(errp, QERR_IO_ERROR);
2592    }
2593    migration_incoming_state_destroy();
2594}
2595
2596int load_snapshot(const char *name, Error **errp)
2597{
2598    BlockDriverState *bs, *bs_vm_state;
2599    QEMUSnapshotInfo sn;
2600    QEMUFile *f;
2601    int ret;
2602    AioContext *aio_context;
2603    MigrationIncomingState *mis = migration_incoming_get_current();
2604
2605    if (!replay_can_snapshot()) {
2606        error_report("Record/replay does not allow loading snapshot "
2607                     "right now. Try once more later.");
2608        return -EINVAL;
2609    }
2610
2611    if (!bdrv_all_can_snapshot(&bs)) {
2612        error_setg(errp,
2613                   "Device '%s' is writable but does not support snapshots",
2614                   bdrv_get_device_name(bs));
2615        return -ENOTSUP;
2616    }
2617    ret = bdrv_all_find_snapshot(name, &bs);
2618    if (ret < 0) {
2619        error_setg(errp,
2620                   "Device '%s' does not have the requested snapshot '%s'",
2621                   bdrv_get_device_name(bs), name);
2622        return ret;
2623    }
2624
2625    bs_vm_state = bdrv_all_find_vmstate_bs();
2626    if (!bs_vm_state) {
2627        error_setg(errp, "No block device supports snapshots");
2628        return -ENOTSUP;
2629    }
2630    aio_context = bdrv_get_aio_context(bs_vm_state);
2631
2632    /* Don't even try to load empty VM states */
2633    aio_context_acquire(aio_context);
2634    ret = bdrv_snapshot_find(bs_vm_state, &sn, name);
2635    aio_context_release(aio_context);
2636    if (ret < 0) {
2637        return ret;
2638    } else if (sn.vm_state_size == 0) {
2639        error_setg(errp, "This is a disk-only snapshot. Revert to it "
2640                   " offline using qemu-img");
2641        return -EINVAL;
2642    }
2643
2644    /* Flush all IO requests so they don't interfere with the new state.  */
2645    bdrv_drain_all_begin();
2646
2647    ret = bdrv_all_goto_snapshot(name, &bs, errp);
2648    if (ret < 0) {
2649        error_prepend(errp, "Could not load snapshot '%s' on '%s': ",
2650                      name, bdrv_get_device_name(bs));
2651        goto err_drain;
2652    }
2653
2654    /* restore the VM state */
2655    f = qemu_fopen_bdrv(bs_vm_state, 0);
2656    if (!f) {
2657        error_setg(errp, "Could not open VM state file");
2658        ret = -EINVAL;
2659        goto err_drain;
2660    }
2661
2662    qemu_system_reset(SHUTDOWN_CAUSE_NONE);
2663    mis->from_src_file = f;
2664
2665    aio_context_acquire(aio_context);
2666    ret = qemu_loadvm_state(f);
2667    migration_incoming_state_destroy();
2668    aio_context_release(aio_context);
2669
2670    bdrv_drain_all_end();
2671
2672    if (ret < 0) {
2673        error_setg(errp, "Error %d while loading VM state", ret);
2674        return ret;
2675    }
2676
2677    return 0;
2678
2679err_drain:
2680    bdrv_drain_all_end();
2681    return ret;
2682}
2683
2684void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev)
2685{
2686    qemu_ram_set_idstr(mr->ram_block,
2687                       memory_region_name(mr), dev);
2688    qemu_ram_set_migratable(mr->ram_block);
2689}
2690
2691void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev)
2692{
2693    qemu_ram_unset_idstr(mr->ram_block);
2694    qemu_ram_unset_migratable(mr->ram_block);
2695}
2696
2697void vmstate_register_ram_global(MemoryRegion *mr)
2698{
2699    vmstate_register_ram(mr, NULL);
2700}
2701
2702bool vmstate_check_only_migratable(const VMStateDescription *vmsd)
2703{
2704    /* check needed if --only-migratable is specified */
2705    if (!migrate_get_current()->only_migratable) {
2706        return true;
2707    }
2708
2709    return !(vmsd && vmsd->unmigratable);
2710}
2711