qemu/migration/ram.c
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2011-2015 Red Hat Inc
   6 *
   7 * Authors:
   8 *  Juan Quintela <quintela@redhat.com>
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28
  29#include "qemu/osdep.h"
  30#include "cpu.h"
  31#include <zlib.h>
  32#include "qemu/cutils.h"
  33#include "qemu/bitops.h"
  34#include "qemu/bitmap.h"
  35#include "qemu/main-loop.h"
  36#include "xbzrle.h"
  37#include "ram.h"
  38#include "migration.h"
  39#include "socket.h"
  40#include "migration/register.h"
  41#include "migration/misc.h"
  42#include "qemu-file.h"
  43#include "postcopy-ram.h"
  44#include "page_cache.h"
  45#include "qemu/error-report.h"
  46#include "qapi/error.h"
  47#include "qapi/qapi-events-migration.h"
  48#include "qapi/qmp/qerror.h"
  49#include "trace.h"
  50#include "exec/ram_addr.h"
  51#include "exec/target_page.h"
  52#include "qemu/rcu_queue.h"
  53#include "migration/colo.h"
  54#include "block.h"
  55#include "sysemu/sysemu.h"
  56#include "qemu/uuid.h"
  57#include "savevm.h"
  58#include "qemu/iov.h"
  59
  60/***********************************************************/
  61/* ram save/restore */
  62
  63/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, and it
  64 * worked for pages that were filled with the same char.  We switched
  65 * it to only search for the zero value.  To avoid confusion with
  66 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
  67 */
  68
  69#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  70#define RAM_SAVE_FLAG_ZERO     0x02
  71#define RAM_SAVE_FLAG_MEM_SIZE 0x04
  72#define RAM_SAVE_FLAG_PAGE     0x08
  73#define RAM_SAVE_FLAG_EOS      0x10
  74#define RAM_SAVE_FLAG_CONTINUE 0x20
  75#define RAM_SAVE_FLAG_XBZRLE   0x40
  76/* 0x80 is reserved in migration.h; start with 0x100 next */
  77#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  78
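/*
 * Illustrative note (not part of the original file): RAM offsets on the
 * wire are page aligned, so the flag bits above fit below TARGET_PAGE_MASK
 * and can travel in the same 64-bit value as the offset, roughly:
 *
 *     qemu_put_be64(f, offset | RAM_SAVE_FLAG_PAGE);   // save side
 *     flags = addr & ~TARGET_PAGE_MASK;                // load side
 *     addr &= TARGET_PAGE_MASK;
 *
 * save_page_header() further down is where this happens on the save side.
 */
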
  79static inline bool is_zero_range(uint8_t *p, uint64_t size)
  80{
  81    return buffer_is_zero(p, size);
  82}
  83
  84XBZRLECacheStats xbzrle_counters;
  85
  86/* This struct contains the XBZRLE cache and a static page
  87   used by the compression */
  88static struct {
  89    /* buffer used for XBZRLE encoding */
  90    uint8_t *encoded_buf;
  91    /* buffer for storing page content */
  92    uint8_t *current_buf;
  93    /* Cache for XBZRLE, Protected by lock. */
  94    PageCache *cache;
  95    QemuMutex lock;
  96    /* it will store a page full of zeros */
  97    uint8_t *zero_target_page;
  98    /* buffer used for XBZRLE decoding */
  99    uint8_t *decoded_buf;
 100} XBZRLE;
 101
 102static void XBZRLE_cache_lock(void)
 103{
 104    if (migrate_use_xbzrle())
 105        qemu_mutex_lock(&XBZRLE.lock);
 106}
 107
 108static void XBZRLE_cache_unlock(void)
 109{
 110    if (migrate_use_xbzrle())
 111        qemu_mutex_unlock(&XBZRLE.lock);
 112}
 113
 114/**
 115 * xbzrle_cache_resize: resize the xbzrle cache
 116 *
 117 * This function is called from qmp_migrate_set_cache_size in the main
 118 * thread, possibly while a migration is in progress.  A running
 119 * migration may be using the cache and might finish during this call,
 120 * hence changes to the cache are protected by XBZRLE.lock.
 121 *
 122 * Returns 0 for success or -1 for error
 123 *
 124 * @new_size: new cache size
 125 * @errp: set *errp with the reason if the check fails
 126 */
 127int xbzrle_cache_resize(int64_t new_size, Error **errp)
 128{
 129    PageCache *new_cache;
 130    int64_t ret = 0;
 131
 132    /* Check for truncation */
 133    if (new_size != (size_t)new_size) {
 134        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 135                   "exceeding address space");
 136        return -1;
 137    }
 138
 139    if (new_size == migrate_xbzrle_cache_size()) {
 140        /* nothing to do */
 141        return 0;
 142    }
 143
 144    XBZRLE_cache_lock();
 145
 146    if (XBZRLE.cache != NULL) {
 147        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 148        if (!new_cache) {
 149            ret = -1;
 150            goto out;
 151        }
 152
 153        cache_fini(XBZRLE.cache);
 154        XBZRLE.cache = new_cache;
 155    }
 156out:
 157    XBZRLE_cache_unlock();
 158    return ret;
 159}
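
/*
 * Illustrative sketch (not part of the original file): one way a caller
 * could resize the cache and report a failure.  The helper name and the
 * use of error_report_err() are this sketch's own choices, not code from
 * this file.
 */
static void G_GNUC_UNUSED xbzrle_cache_resize_example(int64_t new_size)
{
    Error *err = NULL;

    if (xbzrle_cache_resize(new_size, &err) < 0) {
        /* consume and print the error filled in by the resize above */
        error_report_err(err);
    }
}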
 160
 161/* Should be holding either ram_list.mutex, or the RCU lock. */
 162#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
 163    INTERNAL_RAMBLOCK_FOREACH(block)                   \
 164        if (!qemu_ram_is_migratable(block)) {} else
 165
 166#undef RAMBLOCK_FOREACH
 167
 168static void ramblock_recv_map_init(void)
 169{
 170    RAMBlock *rb;
 171
 172    RAMBLOCK_FOREACH_MIGRATABLE(rb) {
 173        assert(!rb->receivedmap);
 174        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 175    }
 176}
 177
 178int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 179{
 180    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 181                    rb->receivedmap);
 182}
 183
 184bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 185{
 186    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 187}
 188
 189void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 190{
 191    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 192}
 193
 194void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 195                                    size_t nr)
 196{
 197    bitmap_set_atomic(rb->receivedmap,
 198                      ramblock_recv_bitmap_offset(host_addr, rb),
 199                      nr);
 200}
 201
 202#define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 203
 204/*
 205 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 206 *
 207 * Returns the number of bytes sent (>0) on success, or <0 on error.
 208 */
 209int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 210                                  const char *block_name)
 211{
 212    RAMBlock *block = qemu_ram_block_by_name(block_name);
 213    unsigned long *le_bitmap, nbits;
 214    uint64_t size;
 215
 216    if (!block) {
 217        error_report("%s: invalid block name: %s", __func__, block_name);
 218        return -1;
 219    }
 220
 221    nbits = block->used_length >> TARGET_PAGE_BITS;
 222
 223    /*
 224     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 225     * machines we may need 4 more bytes for padding (see the comment
 226     * below).  So extend it a bit beforehand.
 227     */
 228    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 229
 230    /*
 231     * Always use little endian when sending the bitmap. This is
 232     * required when the source and destination VMs are not using the
 233     * same endianness. (Note: big endian won't work.)
 234     */
 235    bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 236
 237    /* Size of the bitmap, in bytes */
 238    size = DIV_ROUND_UP(nbits, 8);
 239
 240    /*
 241     * size is always aligned to 8 bytes for 64bit machines, but it
 242     * may not be true for 32bit machines. We need this padding to
 243     * make sure the migration can survive even between 32bit and
 244     * 64bit machines.
 245     */
 246    size = ROUND_UP(size, 8);
 247
 248    qemu_put_be64(file, size);
 249    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 250    /*
 251     * Mark the end, in case the middle part is corrupted for some
 252     * mysterious reason.
 253     */
 254    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 255    qemu_fflush(file);
 256
 257    g_free(le_bitmap);
 258
 259    if (qemu_file_get_error(file)) {
 260        return qemu_file_get_error(file);
 261    }
 262
 263    return size + sizeof(size);
 264}
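
/*
 * Illustrative sketch (not part of the original file): how the peer could
 * read back the layout written above (size, bitmap, ending marker).  The
 * helper name is this sketch's own; the real receive path lives elsewhere
 * in the migration code and also converts the bitmap from little endian.
 */
static int G_GNUC_UNUSED ramblock_recv_bitmap_read_sketch(QEMUFile *file,
                                                          uint8_t **bitmap)
{
    uint64_t size = qemu_get_be64(file);
    uint8_t *buf = g_malloc0(size);

    qemu_get_buffer(file, buf, size);

    /* the stream must finish with the ending marker written above */
    if (qemu_get_be64(file) != RAMBLOCK_RECV_BITMAP_ENDING) {
        g_free(buf);
        return -1;
    }

    *bitmap = buf;    /* still little endian; caller converts if needed */
    return qemu_file_get_error(file);
}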
 265
 266/*
 267 * An outstanding page request, on the source, having been received
 268 * and queued
 269 */
 270struct RAMSrcPageRequest {
 271    RAMBlock *rb;
 272    hwaddr    offset;
 273    hwaddr    len;
 274
 275    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 276};
 277
 278/* State of RAM for migration */
 279struct RAMState {
 280    /* QEMUFile used for this migration */
 281    QEMUFile *f;
 282    /* Last block that we have visited searching for dirty pages */
 283    RAMBlock *last_seen_block;
 284    /* Last block from where we have sent data */
 285    RAMBlock *last_sent_block;
 286    /* Last dirty target page we have sent */
 287    ram_addr_t last_page;
 288    /* last ram version we have seen */
 289    uint32_t last_version;
 290    /* We are in the first round */
 291    bool ram_bulk_stage;
 292    /* How many times we have dirtied too many pages */
 293    int dirty_rate_high_cnt;
 294    /* these variables are used for bitmap sync */
 295    /* last time we did a full bitmap_sync */
 296    int64_t time_last_bitmap_sync;
 297    /* bytes transferred at start_time */
 298    uint64_t bytes_xfer_prev;
 299    /* number of dirty pages since start_time */
 300    uint64_t num_dirty_pages_period;
 301    /* xbzrle misses since the beginning of the period */
 302    uint64_t xbzrle_cache_miss_prev;
 303    /* number of iterations at the beginning of period */
 304    uint64_t iterations_prev;
 305    /* Iterations since start */
 306    uint64_t iterations;
 307    /* number of dirty bits in the bitmap */
 308    uint64_t migration_dirty_pages;
 309    /* protects modification of the bitmap */
 310    QemuMutex bitmap_mutex;
 311    /* The RAMBlock used in the last src_page_requests */
 312    RAMBlock *last_req_rb;
 313    /* Queue of outstanding page requests from the destination */
 314    QemuMutex src_page_req_mutex;
 315    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
 316};
 317typedef struct RAMState RAMState;
 318
 319static RAMState *ram_state;
 320
 321uint64_t ram_bytes_remaining(void)
 322{
 323    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 324                       0;
 325}
 326
 327MigrationStats ram_counters;
 328
 329/* used by the search for pages to send */
 330struct PageSearchStatus {
 331    /* Current block being searched */
 332    RAMBlock    *block;
 333    /* Current page to search from */
 334    unsigned long page;
 335    /* Set once we wrap around */
 336    bool         complete_round;
 337};
 338typedef struct PageSearchStatus PageSearchStatus;
 339
 340struct CompressParam {
 341    bool done;
 342    bool quit;
 343    QEMUFile *file;
 344    QemuMutex mutex;
 345    QemuCond cond;
 346    RAMBlock *block;
 347    ram_addr_t offset;
 348
 349    /* internally used fields */
 350    z_stream stream;
 351    uint8_t *originbuf;
 352};
 353typedef struct CompressParam CompressParam;
 354
 355struct DecompressParam {
 356    bool done;
 357    bool quit;
 358    QemuMutex mutex;
 359    QemuCond cond;
 360    void *des;
 361    uint8_t *compbuf;
 362    int len;
 363    z_stream stream;
 364};
 365typedef struct DecompressParam DecompressParam;
 366
 367static CompressParam *comp_param;
 368static QemuThread *compress_threads;
 369/* comp_done_cond is used to wake up the migration thread when
 370 * one of the compression threads has finished the compression.
 371 * comp_done_lock is used together with comp_done_cond.
 372 */
 373static QemuMutex comp_done_lock;
 374static QemuCond comp_done_cond;
 375/* The empty QEMUFileOps will be used by file in CompressParam */
 376static const QEMUFileOps empty_ops = { };
 377
 378static QEMUFile *decomp_file;
 379static DecompressParam *decomp_param;
 380static QemuThread *decompress_threads;
 381static QemuMutex decomp_done_lock;
 382static QemuCond decomp_done_cond;
 383
 384static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 385                                ram_addr_t offset, uint8_t *source_buf);
 386
 387static void *do_data_compress(void *opaque)
 388{
 389    CompressParam *param = opaque;
 390    RAMBlock *block;
 391    ram_addr_t offset;
 392
 393    qemu_mutex_lock(&param->mutex);
 394    while (!param->quit) {
 395        if (param->block) {
 396            block = param->block;
 397            offset = param->offset;
 398            param->block = NULL;
 399            qemu_mutex_unlock(&param->mutex);
 400
 401            do_compress_ram_page(param->file, &param->stream, block, offset,
 402                                 param->originbuf);
 403
 404            qemu_mutex_lock(&comp_done_lock);
 405            param->done = true;
 406            qemu_cond_signal(&comp_done_cond);
 407            qemu_mutex_unlock(&comp_done_lock);
 408
 409            qemu_mutex_lock(&param->mutex);
 410        } else {
 411            qemu_cond_wait(&param->cond, &param->mutex);
 412        }
 413    }
 414    qemu_mutex_unlock(&param->mutex);
 415
 416    return NULL;
 417}
 418
 419static inline void terminate_compression_threads(void)
 420{
 421    int idx, thread_count;
 422
 423    thread_count = migrate_compress_threads();
 424
 425    for (idx = 0; idx < thread_count; idx++) {
 426        qemu_mutex_lock(&comp_param[idx].mutex);
 427        comp_param[idx].quit = true;
 428        qemu_cond_signal(&comp_param[idx].cond);
 429        qemu_mutex_unlock(&comp_param[idx].mutex);
 430    }
 431}
 432
 433static void compress_threads_save_cleanup(void)
 434{
 435    int i, thread_count;
 436
 437    if (!migrate_use_compression()) {
 438        return;
 439    }
 440    terminate_compression_threads();
 441    thread_count = migrate_compress_threads();
 442    for (i = 0; i < thread_count; i++) {
 443        /*
 444         * we use it as an indicator of whether the thread was
 445         * properly initialized or not
 446         */
 447        if (!comp_param[i].file) {
 448            break;
 449        }
 450        qemu_thread_join(compress_threads + i);
 451        qemu_mutex_destroy(&comp_param[i].mutex);
 452        qemu_cond_destroy(&comp_param[i].cond);
 453        deflateEnd(&comp_param[i].stream);
 454        g_free(comp_param[i].originbuf);
 455        qemu_fclose(comp_param[i].file);
 456        comp_param[i].file = NULL;
 457    }
 458    qemu_mutex_destroy(&comp_done_lock);
 459    qemu_cond_destroy(&comp_done_cond);
 460    g_free(compress_threads);
 461    g_free(comp_param);
 462    compress_threads = NULL;
 463    comp_param = NULL;
 464}
 465
 466static int compress_threads_save_setup(void)
 467{
 468    int i, thread_count;
 469
 470    if (!migrate_use_compression()) {
 471        return 0;
 472    }
 473    thread_count = migrate_compress_threads();
 474    compress_threads = g_new0(QemuThread, thread_count);
 475    comp_param = g_new0(CompressParam, thread_count);
 476    qemu_cond_init(&comp_done_cond);
 477    qemu_mutex_init(&comp_done_lock);
 478    for (i = 0; i < thread_count; i++) {
 479        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 480        if (!comp_param[i].originbuf) {
 481            goto exit;
 482        }
 483
 484        if (deflateInit(&comp_param[i].stream,
 485                        migrate_compress_level()) != Z_OK) {
 486            g_free(comp_param[i].originbuf);
 487            goto exit;
 488        }
 489
 490        /* comp_param[i].file is just used as a dummy buffer to save data,
 491         * set its ops to empty.
 492         */
 493        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 494        comp_param[i].done = true;
 495        comp_param[i].quit = false;
 496        qemu_mutex_init(&comp_param[i].mutex);
 497        qemu_cond_init(&comp_param[i].cond);
 498        qemu_thread_create(compress_threads + i, "compress",
 499                           do_data_compress, comp_param + i,
 500                           QEMU_THREAD_JOINABLE);
 501    }
 502    return 0;
 503
 504exit:
 505    compress_threads_save_cleanup();
 506    return -1;
 507}
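
/*
 * Illustrative sketch (not part of the original file): how work could be
 * handed to one of the compression threads above.  do_data_compress()
 * picks the job up once param->block becomes non-NULL.  A real caller
 * would first wait, under comp_done_lock, for param->done to be true;
 * the helper name here is this sketch's own.
 */
static void G_GNUC_UNUSED compress_dispatch_sketch(CompressParam *param,
                                                   RAMBlock *block,
                                                   ram_addr_t offset)
{
    qemu_mutex_lock(&param->mutex);
    param->block = block;
    param->offset = offset;
    qemu_cond_signal(&param->cond);
    qemu_mutex_unlock(&param->mutex);
}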
 508
 509/* Multiple fd's */
 510
 511#define MULTIFD_MAGIC 0x11223344U
 512#define MULTIFD_VERSION 1
 513
 514#define MULTIFD_FLAG_SYNC (1 << 0)
 515
 516typedef struct {
 517    uint32_t magic;
 518    uint32_t version;
 519    unsigned char uuid[16]; /* QemuUUID */
 520    uint8_t id;
 521} __attribute__((packed)) MultiFDInit_t;
 522
 523typedef struct {
 524    uint32_t magic;
 525    uint32_t version;
 526    uint32_t flags;
 527    uint32_t size;
 528    uint32_t used;
 529    uint64_t packet_num;
 530    char ramblock[256];
 531    uint64_t offset[];
 532} __attribute__((packed)) MultiFDPacket_t;
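
/*
 * Illustrative note (not part of the original file): on the wire each
 * multifd transfer is this header (sent at its full allocated length,
 * with `used` big-endian page offsets filled in) followed by `used` raw
 * pages of TARGET_PAGE_SIZE bytes each; multifd_send_thread() below
 * writes the two parts with separate channel writes.
 */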
 533
 534typedef struct {
 535    /* number of used pages */
 536    uint32_t used;
 537    /* number of allocated pages */
 538    uint32_t allocated;
 539    /* global number of generated multifd packets */
 540    uint64_t packet_num;
 541    /* offset of each page */
 542    ram_addr_t *offset;
 543    /* pointer to each page */
 544    struct iovec *iov;
 545    RAMBlock *block;
 546} MultiFDPages_t;
 547
 548typedef struct {
 549    /* these fields are not changed once the thread is created */
 550    /* channel number */
 551    uint8_t id;
 552    /* channel thread name */
 553    char *name;
 554    /* channel thread id */
 555    QemuThread thread;
 556    /* communication channel */
 557    QIOChannel *c;
 558    /* sem where to wait for more work */
 559    QemuSemaphore sem;
 560    /* this mutex protects the following parameters */
 561    QemuMutex mutex;
 562    /* is this channel thread running */
 563    bool running;
 564    /* should this thread finish */
 565    bool quit;
 566    /* thread has work to do */
 567    int pending_job;
 568    /* array of pages to send */
 569    MultiFDPages_t *pages;
 570    /* packet allocated len */
 571    uint32_t packet_len;
 572    /* pointer to the packet */
 573    MultiFDPacket_t *packet;
 574    /* multifd flags for each packet */
 575    uint32_t flags;
 576    /* global number of generated multifd packets */
 577    uint64_t packet_num;
 578    /* thread local variables */
 579    /* packets sent through this channel */
 580    uint64_t num_packets;
 581    /* pages sent through this channel */
 582    uint64_t num_pages;
 583    /* syncs main thread and channels */
 584    QemuSemaphore sem_sync;
 585}  MultiFDSendParams;
 586
 587typedef struct {
 588    /* these fields are not changed once the thread is created */
 589    /* channel number */
 590    uint8_t id;
 591    /* channel thread name */
 592    char *name;
 593    /* channel thread id */
 594    QemuThread thread;
 595    /* communication channel */
 596    QIOChannel *c;
 597    /* this mutex protects the following parameters */
 598    QemuMutex mutex;
 599    /* is this channel thread running */
 600    bool running;
 601    /* array of pages to receive */
 602    MultiFDPages_t *pages;
 603    /* packet allocated len */
 604    uint32_t packet_len;
 605    /* pointer to the packet */
 606    MultiFDPacket_t *packet;
 607    /* multifd flags for each packet */
 608    uint32_t flags;
 609    /* global number of generated multifd packets */
 610    uint64_t packet_num;
 611    /* thread local variables */
 613    /* packets received through this channel */
 614    uint64_t num_packets;
 615    /* pages received through this channel */
 615    uint64_t num_pages;
 616    /* syncs main thread and channels */
 617    QemuSemaphore sem_sync;
 618} MultiFDRecvParams;
 619
 620static int multifd_send_initial_packet(MultiFDSendParams *p, Error **errp)
 621{
 622    MultiFDInit_t msg;
 623    int ret;
 624
 625    msg.magic = cpu_to_be32(MULTIFD_MAGIC);
 626    msg.version = cpu_to_be32(MULTIFD_VERSION);
 627    msg.id = p->id;
 628    memcpy(msg.uuid, &qemu_uuid.data, sizeof(msg.uuid));
 629
 630    ret = qio_channel_write_all(p->c, (char *)&msg, sizeof(msg), errp);
 631    if (ret != 0) {
 632        return -1;
 633    }
 634    return 0;
 635}
 636
 637static int multifd_recv_initial_packet(QIOChannel *c, Error **errp)
 638{
 639    MultiFDInit_t msg;
 640    int ret;
 641
 642    ret = qio_channel_read_all(c, (char *)&msg, sizeof(msg), errp);
 643    if (ret != 0) {
 644        return -1;
 645    }
 646
 647    be32_to_cpus(&msg.magic);
 648    be32_to_cpus(&msg.version);
 649
 650    if (msg.magic != MULTIFD_MAGIC) {
 651        error_setg(errp, "multifd: received packet magic %x "
 652                   "expected %x", msg.magic, MULTIFD_MAGIC);
 653        return -1;
 654    }
 655
 656    if (msg.version != MULTIFD_VERSION) {
 657        error_setg(errp, "multifd: received packet version %d "
 658                   "expected %d", msg.version, MULTIFD_VERSION);
 659        return -1;
 660    }
 661
 662    if (memcmp(msg.uuid, &qemu_uuid, sizeof(qemu_uuid))) {
 663        char *uuid = qemu_uuid_unparse_strdup(&qemu_uuid);
 664        char *msg_uuid = qemu_uuid_unparse_strdup((const QemuUUID *)msg.uuid);
 665
 666        error_setg(errp, "multifd: received uuid '%s' and expected "
 667                   "uuid '%s' for channel %hhd", msg_uuid, uuid, msg.id);
 668        g_free(uuid);
 669        g_free(msg_uuid);
 670        return -1;
 671    }
 672
 673    if (msg.id > migrate_multifd_channels()) {
 674        error_setg(errp, "multifd: received channel id %d, "
 675                   "expected at most %d", msg.id, migrate_multifd_channels());
 676        return -1;
 677    }
 678
 679    return msg.id;
 680}
 681
 682static MultiFDPages_t *multifd_pages_init(size_t size)
 683{
 684    MultiFDPages_t *pages = g_new0(MultiFDPages_t, 1);
 685
 686    pages->allocated = size;
 687    pages->iov = g_new0(struct iovec, size);
 688    pages->offset = g_new0(ram_addr_t, size);
 689
 690    return pages;
 691}
 692
 693static void multifd_pages_clear(MultiFDPages_t *pages)
 694{
 695    pages->used = 0;
 696    pages->allocated = 0;
 697    pages->packet_num = 0;
 698    pages->block = NULL;
 699    g_free(pages->iov);
 700    pages->iov = NULL;
 701    g_free(pages->offset);
 702    pages->offset = NULL;
 703    g_free(pages);
 704}
 705
 706static void multifd_send_fill_packet(MultiFDSendParams *p)
 707{
 708    MultiFDPacket_t *packet = p->packet;
 709    int i;
 710
 711    packet->magic = cpu_to_be32(MULTIFD_MAGIC);
 712    packet->version = cpu_to_be32(MULTIFD_VERSION);
 713    packet->flags = cpu_to_be32(p->flags);
 714    packet->size = cpu_to_be32(migrate_multifd_page_count());
 715    packet->used = cpu_to_be32(p->pages->used);
 716    packet->packet_num = cpu_to_be64(p->packet_num);
 717
 718    if (p->pages->block) {
 719        strncpy(packet->ramblock, p->pages->block->idstr, 256);
 720    }
 721
 722    for (i = 0; i < p->pages->used; i++) {
 723        packet->offset[i] = cpu_to_be64(p->pages->offset[i]);
 724    }
 725}
 726
 727static int multifd_recv_unfill_packet(MultiFDRecvParams *p, Error **errp)
 728{
 729    MultiFDPacket_t *packet = p->packet;
 730    RAMBlock *block;
 731    int i;
 732
 733    be32_to_cpus(&packet->magic);
 734    if (packet->magic != MULTIFD_MAGIC) {
 735        error_setg(errp, "multifd: received packet "
 736                   "magic %x and expected magic %x",
 737                   packet->magic, MULTIFD_MAGIC);
 738        return -1;
 739    }
 740
 741    be32_to_cpus(&packet->version);
 742    if (packet->version != MULTIFD_VERSION) {
 743        error_setg(errp, "multifd: received packet "
 744                   "version %d and expected version %d",
 745                   packet->version, MULTIFD_VERSION);
 746        return -1;
 747    }
 748
 749    p->flags = be32_to_cpu(packet->flags);
 750
 751    be32_to_cpus(&packet->size);
 752    if (packet->size > migrate_multifd_page_count()) {
 753        error_setg(errp, "multifd: received packet "
 754                   "with size %d and expected maximum size %d",
 755                   packet->size, migrate_multifd_page_count());
 756        return -1;
 757    }
 758
 759    p->pages->used = be32_to_cpu(packet->used);
 760    if (p->pages->used > packet->size) {
 761        error_setg(errp, "multifd: received packet "
 762                   "with %d used pages and expected maximum of %d pages",
 763                   p->pages->used, packet->size);
 764        return -1;
 765    }
 766
 767    p->packet_num = be64_to_cpu(packet->packet_num);
 768
 769    if (p->pages->used) {
 770        /* make sure that ramblock is 0 terminated */
 771        packet->ramblock[255] = 0;
 772        block = qemu_ram_block_by_name(packet->ramblock);
 773        if (!block) {
 774            error_setg(errp, "multifd: unknown ram block %s",
 775                       packet->ramblock);
 776            return -1;
 777        }
 778    }
 779
 780    for (i = 0; i < p->pages->used; i++) {
 781        ram_addr_t offset = be64_to_cpu(packet->offset[i]);
 782
 783        if (offset > (block->used_length - TARGET_PAGE_SIZE)) {
 784            error_setg(errp, "multifd: offset too long " RAM_ADDR_FMT
 785                       " (max " RAM_ADDR_FMT ")",
 786                       offset, block->used_length - TARGET_PAGE_SIZE);
 787            return -1;
 788        }
 789        p->pages->iov[i].iov_base = block->host + offset;
 790        p->pages->iov[i].iov_len = TARGET_PAGE_SIZE;
 791    }
 792
 793    return 0;
 794}
 795
 796struct {
 797    MultiFDSendParams *params;
 798    /* number of created threads */
 799    int count;
 800    /* array of pages to send */
 801    MultiFDPages_t *pages;
 802    /* syncs main thread and channels */
 803    QemuSemaphore sem_sync;
 804    /* global number of generated multifd packets */
 805    uint64_t packet_num;
 806    /* send channels ready */
 807    QemuSemaphore channels_ready;
 808} *multifd_send_state;
 809
 810/*
 811 * How we use multifd_send_state->pages and channel->pages
 812 *
 813 * We create a pages array for each channel, and a main one.  Each time
 814 * we need to send a batch of pages we interchange the ones between
 815 * multifd_send_state and the channel that is sending it.  There are
 816 * two reasons for that:
 817 *    - to not have to do so many mallocs during migration
 818 *    - to make it easier to know what to free at the end of migration
 819 *
 820 * This way we always know who is the owner of each "pages" struct,
 821 * and we don't need any locking.  It belongs to the migration thread
 822 * or to the channel thread.  Switching is safe because the migration
 823 * thread is using the channel mutex when changing it, and the channel
 824 * thread has to have finished with its own, otherwise pending_job can't
 825 * be false.
 826 */
 827
 828static void multifd_send_pages(void)
 829{
 830    int i;
 831    static int next_channel;
 832    MultiFDSendParams *p = NULL; /* make gcc happy */
 833    MultiFDPages_t *pages = multifd_send_state->pages;
 834    uint64_t transferred;
 835
 836    qemu_sem_wait(&multifd_send_state->channels_ready);
 837    for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
 838        p = &multifd_send_state->params[i];
 839
 840        qemu_mutex_lock(&p->mutex);
 841        if (!p->pending_job) {
 842            p->pending_job++;
 843            next_channel = (i + 1) % migrate_multifd_channels();
 844            break;
 845        }
 846        qemu_mutex_unlock(&p->mutex);
 847    }
 848    p->pages->used = 0;
 849
 850    p->packet_num = multifd_send_state->packet_num++;
 851    p->pages->block = NULL;
 852    multifd_send_state->pages = p->pages;
 853    p->pages = pages;
 854    transferred = ((uint64_t) pages->used) * TARGET_PAGE_SIZE + p->packet_len;
 855    ram_counters.multifd_bytes += transferred;
 856    ram_counters.transferred += transferred;
 857    qemu_mutex_unlock(&p->mutex);
 858    qemu_sem_post(&p->sem);
 859}
 860
 861static void multifd_queue_page(RAMBlock *block, ram_addr_t offset)
 862{
 863    MultiFDPages_t *pages = multifd_send_state->pages;
 864
 865    if (!pages->block) {
 866        pages->block = block;
 867    }
 868
 869    if (pages->block == block) {
 870        pages->offset[pages->used] = offset;
 871        pages->iov[pages->used].iov_base = block->host + offset;
 872        pages->iov[pages->used].iov_len = TARGET_PAGE_SIZE;
 873        pages->used++;
 874
 875        if (pages->used < pages->allocated) {
 876            return;
 877        }
 878    }
 879
 880    multifd_send_pages();
 881
 882    if (pages->block != block) {
 883        multifd_queue_page(block, offset);
 884    }
 885}
 886
 887static void multifd_send_terminate_threads(Error *err)
 888{
 889    int i;
 890
 891    if (err) {
 892        MigrationState *s = migrate_get_current();
 893        migrate_set_error(s, err);
 894        if (s->state == MIGRATION_STATUS_SETUP ||
 895            s->state == MIGRATION_STATUS_PRE_SWITCHOVER ||
 896            s->state == MIGRATION_STATUS_DEVICE ||
 897            s->state == MIGRATION_STATUS_ACTIVE) {
 898            migrate_set_state(&s->state, s->state,
 899                              MIGRATION_STATUS_FAILED);
 900        }
 901    }
 902
 903    for (i = 0; i < migrate_multifd_channels(); i++) {
 904        MultiFDSendParams *p = &multifd_send_state->params[i];
 905
 906        qemu_mutex_lock(&p->mutex);
 907        p->quit = true;
 908        qemu_sem_post(&p->sem);
 909        qemu_mutex_unlock(&p->mutex);
 910    }
 911}
 912
 913int multifd_save_cleanup(Error **errp)
 914{
 915    int i;
 916    int ret = 0;
 917
 918    if (!migrate_use_multifd()) {
 919        return 0;
 920    }
 921    multifd_send_terminate_threads(NULL);
 922    for (i = 0; i < migrate_multifd_channels(); i++) {
 923        MultiFDSendParams *p = &multifd_send_state->params[i];
 924
 925        if (p->running) {
 926            qemu_thread_join(&p->thread);
 927        }
 928        socket_send_channel_destroy(p->c);
 929        p->c = NULL;
 930        qemu_mutex_destroy(&p->mutex);
 931        qemu_sem_destroy(&p->sem);
 932        qemu_sem_destroy(&p->sem_sync);
 933        g_free(p->name);
 934        p->name = NULL;
 935        multifd_pages_clear(p->pages);
 936        p->pages = NULL;
 937        p->packet_len = 0;
 938        g_free(p->packet);
 939        p->packet = NULL;
 940    }
 941    qemu_sem_destroy(&multifd_send_state->channels_ready);
 942    qemu_sem_destroy(&multifd_send_state->sem_sync);
 943    g_free(multifd_send_state->params);
 944    multifd_send_state->params = NULL;
 945    multifd_pages_clear(multifd_send_state->pages);
 946    multifd_send_state->pages = NULL;
 947    g_free(multifd_send_state);
 948    multifd_send_state = NULL;
 949    return ret;
 950}
 951
 952static void multifd_send_sync_main(void)
 953{
 954    int i;
 955
 956    if (!migrate_use_multifd()) {
 957        return;
 958    }
 959    if (multifd_send_state->pages->used) {
 960        multifd_send_pages();
 961    }
 962    for (i = 0; i < migrate_multifd_channels(); i++) {
 963        MultiFDSendParams *p = &multifd_send_state->params[i];
 964
 965        trace_multifd_send_sync_main_signal(p->id);
 966
 967        qemu_mutex_lock(&p->mutex);
 968
 969        p->packet_num = multifd_send_state->packet_num++;
 970        p->flags |= MULTIFD_FLAG_SYNC;
 971        p->pending_job++;
 972        qemu_mutex_unlock(&p->mutex);
 973        qemu_sem_post(&p->sem);
 974    }
 975    for (i = 0; i < migrate_multifd_channels(); i++) {
 976        MultiFDSendParams *p = &multifd_send_state->params[i];
 977
 978        trace_multifd_send_sync_main_wait(p->id);
 979        qemu_sem_wait(&multifd_send_state->sem_sync);
 980    }
 981    trace_multifd_send_sync_main(multifd_send_state->packet_num);
 982}
 983
 984static void *multifd_send_thread(void *opaque)
 985{
 986    MultiFDSendParams *p = opaque;
 987    Error *local_err = NULL;
 988    int ret;
 989
 990    trace_multifd_send_thread_start(p->id);
 991
 992    if (multifd_send_initial_packet(p, &local_err) < 0) {
 993        goto out;
 994    }
 995    /* initial packet */
 996    p->num_packets = 1;
 997
 998    while (true) {
 999        qemu_sem_wait(&p->sem);
1000        qemu_mutex_lock(&p->mutex);
1001
1002        if (p->pending_job) {
1003            uint32_t used = p->pages->used;
1004            uint64_t packet_num = p->packet_num;
1005            uint32_t flags = p->flags;
1006
1007            multifd_send_fill_packet(p);
1008            p->flags = 0;
1009            p->num_packets++;
1010            p->num_pages += used;
1011            p->pages->used = 0;
1012            qemu_mutex_unlock(&p->mutex);
1013
1014            trace_multifd_send(p->id, packet_num, used, flags);
1015
1016            ret = qio_channel_write_all(p->c, (void *)p->packet,
1017                                        p->packet_len, &local_err);
1018            if (ret != 0) {
1019                break;
1020            }
1021
1022            ret = qio_channel_writev_all(p->c, p->pages->iov, used, &local_err);
1023            if (ret != 0) {
1024                break;
1025            }
1026
1027            qemu_mutex_lock(&p->mutex);
1028            p->pending_job--;
1029            qemu_mutex_unlock(&p->mutex);
1030
1031            if (flags & MULTIFD_FLAG_SYNC) {
1032                qemu_sem_post(&multifd_send_state->sem_sync);
1033            }
1034            qemu_sem_post(&multifd_send_state->channels_ready);
1035        } else if (p->quit) {
1036            qemu_mutex_unlock(&p->mutex);
1037            break;
1038        } else {
1039            qemu_mutex_unlock(&p->mutex);
1040            /* sometimes there are spurious wakeups */
1041        }
1042    }
1043
1044out:
1045    if (local_err) {
1046        multifd_send_terminate_threads(local_err);
1047    }
1048
1049    qemu_mutex_lock(&p->mutex);
1050    p->running = false;
1051    qemu_mutex_unlock(&p->mutex);
1052
1053    trace_multifd_send_thread_end(p->id, p->num_packets, p->num_pages);
1054
1055    return NULL;
1056}
1057
1058static void multifd_new_send_channel_async(QIOTask *task, gpointer opaque)
1059{
1060    MultiFDSendParams *p = opaque;
1061    QIOChannel *sioc = QIO_CHANNEL(qio_task_get_source(task));
1062    Error *local_err = NULL;
1063
1064    if (qio_task_propagate_error(task, &local_err)) {
1065        if (multifd_save_cleanup(&local_err) != 0) {
1066            migrate_set_error(migrate_get_current(), local_err);
1067        }
1068    } else {
1069        p->c = QIO_CHANNEL(sioc);
1070        qio_channel_set_delay(p->c, false);
1071        p->running = true;
1072        qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
1073                           QEMU_THREAD_JOINABLE);
1074
1075        atomic_inc(&multifd_send_state->count);
1076    }
1077}
1078
1079int multifd_save_setup(void)
1080{
1081    int thread_count;
1082    uint32_t page_count = migrate_multifd_page_count();
1083    uint8_t i;
1084
1085    if (!migrate_use_multifd()) {
1086        return 0;
1087    }
1088    thread_count = migrate_multifd_channels();
1089    multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
1090    multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
1091    atomic_set(&multifd_send_state->count, 0);
1092    multifd_send_state->pages = multifd_pages_init(page_count);
1093    qemu_sem_init(&multifd_send_state->sem_sync, 0);
1094    qemu_sem_init(&multifd_send_state->channels_ready, 0);
1095
1096    for (i = 0; i < thread_count; i++) {
1097        MultiFDSendParams *p = &multifd_send_state->params[i];
1098
1099        qemu_mutex_init(&p->mutex);
1100        qemu_sem_init(&p->sem, 0);
1101        qemu_sem_init(&p->sem_sync, 0);
1102        p->quit = false;
1103        p->pending_job = 0;
1104        p->id = i;
1105        p->pages = multifd_pages_init(page_count);
1106        p->packet_len = sizeof(MultiFDPacket_t)
1107                      + sizeof(ram_addr_t) * page_count;
1108        p->packet = g_malloc0(p->packet_len);
1109        p->name = g_strdup_printf("multifdsend_%d", i);
1110        socket_send_channel_create(multifd_new_send_channel_async, p);
1111    }
1112    return 0;
1113}
1114
1115struct {
1116    MultiFDRecvParams *params;
1117    /* number of created threads */
1118    int count;
1119    /* syncs main thread and channels */
1120    QemuSemaphore sem_sync;
1121    /* global number of generated multifd packets */
1122    uint64_t packet_num;
1123} *multifd_recv_state;
1124
1125static void multifd_recv_terminate_threads(Error *err)
1126{
1127    int i;
1128
1129    if (err) {
1130        MigrationState *s = migrate_get_current();
1131        migrate_set_error(s, err);
1132        if (s->state == MIGRATION_STATUS_SETUP ||
1133            s->state == MIGRATION_STATUS_ACTIVE) {
1134            migrate_set_state(&s->state, s->state,
1135                              MIGRATION_STATUS_FAILED);
1136        }
1137    }
1138
1139    for (i = 0; i < migrate_multifd_channels(); i++) {
1140        MultiFDRecvParams *p = &multifd_recv_state->params[i];
1141
1142        qemu_mutex_lock(&p->mutex);
1143        /* We could arrive here for two reasons:
1144           - normal quit, i.e. everything went fine, just finished
1145           - error quit: We close the channels so the channel threads
1146             finish the qio_channel_read_all_eof() */
1147        qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
1148        qemu_mutex_unlock(&p->mutex);
1149    }
1150}
1151
1152int multifd_load_cleanup(Error **errp)
1153{
1154    int i;
1155    int ret = 0;
1156
1157    if (!migrate_use_multifd()) {
1158        return 0;
1159    }
1160    multifd_recv_terminate_threads(NULL);
1161    for (i = 0; i < migrate_multifd_channels(); i++) {
1162        MultiFDRecvParams *p = &multifd_recv_state->params[i];
1163
1164        if (p->running) {
1165            qemu_thread_join(&p->thread);
1166        }
1167        object_unref(OBJECT(p->c));
1168        p->c = NULL;
1169        qemu_mutex_destroy(&p->mutex);
1170        qemu_sem_destroy(&p->sem_sync);
1171        g_free(p->name);
1172        p->name = NULL;
1173        multifd_pages_clear(p->pages);
1174        p->pages = NULL;
1175        p->packet_len = 0;
1176        g_free(p->packet);
1177        p->packet = NULL;
1178    }
1179    qemu_sem_destroy(&multifd_recv_state->sem_sync);
1180    g_free(multifd_recv_state->params);
1181    multifd_recv_state->params = NULL;
1182    g_free(multifd_recv_state);
1183    multifd_recv_state = NULL;
1184
1185    return ret;
1186}
1187
1188static void multifd_recv_sync_main(void)
1189{
1190    int i;
1191
1192    if (!migrate_use_multifd()) {
1193        return;
1194    }
1195    for (i = 0; i < migrate_multifd_channels(); i++) {
1196        MultiFDRecvParams *p = &multifd_recv_state->params[i];
1197
1198        trace_multifd_recv_sync_main_wait(p->id);
1199        qemu_sem_wait(&multifd_recv_state->sem_sync);
1200        qemu_mutex_lock(&p->mutex);
1201        if (multifd_recv_state->packet_num < p->packet_num) {
1202            multifd_recv_state->packet_num = p->packet_num;
1203        }
1204        qemu_mutex_unlock(&p->mutex);
1205    }
1206    for (i = 0; i < migrate_multifd_channels(); i++) {
1207        MultiFDRecvParams *p = &multifd_recv_state->params[i];
1208
1209        trace_multifd_recv_sync_main_signal(p->id);
1210        qemu_sem_post(&p->sem_sync);
1211    }
1212    trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
1213}
1214
1215static void *multifd_recv_thread(void *opaque)
1216{
1217    MultiFDRecvParams *p = opaque;
1218    Error *local_err = NULL;
1219    int ret;
1220
1221    trace_multifd_recv_thread_start(p->id);
1222
1223    while (true) {
1224        uint32_t used;
1225        uint32_t flags;
1226
1227        ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
1228                                       p->packet_len, &local_err);
1229        if (ret == 0) {   /* EOF */
1230            break;
1231        }
1232        if (ret == -1) {   /* Error */
1233            break;
1234        }
1235
1236        qemu_mutex_lock(&p->mutex);
1237        ret = multifd_recv_unfill_packet(p, &local_err);
1238        if (ret) {
1239            qemu_mutex_unlock(&p->mutex);
1240            break;
1241        }
1242
1243        used = p->pages->used;
1244        flags = p->flags;
1245        trace_multifd_recv(p->id, p->packet_num, used, flags);
1246        p->num_packets++;
1247        p->num_pages += used;
1248        qemu_mutex_unlock(&p->mutex);
1249
1250        ret = qio_channel_readv_all(p->c, p->pages->iov, used, &local_err);
1251        if (ret != 0) {
1252            break;
1253        }
1254
1255        if (flags & MULTIFD_FLAG_SYNC) {
1256            qemu_sem_post(&multifd_recv_state->sem_sync);
1257            qemu_sem_wait(&p->sem_sync);
1258        }
1259    }
1260
1261    if (local_err) {
1262        multifd_recv_terminate_threads(local_err);
1263    }
1264    qemu_mutex_lock(&p->mutex);
1265    p->running = false;
1266    qemu_mutex_unlock(&p->mutex);
1267
1268    trace_multifd_recv_thread_end(p->id, p->num_packets, p->num_pages);
1269
1270    return NULL;
1271}
1272
1273int multifd_load_setup(void)
1274{
1275    int thread_count;
1276    uint32_t page_count = migrate_multifd_page_count();
1277    uint8_t i;
1278
1279    if (!migrate_use_multifd()) {
1280        return 0;
1281    }
1282    thread_count = migrate_multifd_channels();
1283    multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
1284    multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
1285    atomic_set(&multifd_recv_state->count, 0);
1286    qemu_sem_init(&multifd_recv_state->sem_sync, 0);
1287
1288    for (i = 0; i < thread_count; i++) {
1289        MultiFDRecvParams *p = &multifd_recv_state->params[i];
1290
1291        qemu_mutex_init(&p->mutex);
1292        qemu_sem_init(&p->sem_sync, 0);
1293        p->id = i;
1294        p->pages = multifd_pages_init(page_count);
1295        p->packet_len = sizeof(MultiFDPacket_t)
1296                      + sizeof(ram_addr_t) * page_count;
1297        p->packet = g_malloc0(p->packet_len);
1298        p->name = g_strdup_printf("multifdrecv_%d", i);
1299    }
1300    return 0;
1301}
1302
1303bool multifd_recv_all_channels_created(void)
1304{
1305    int thread_count = migrate_multifd_channels();
1306
1307    if (!migrate_use_multifd()) {
1308        return true;
1309    }
1310
1311    return thread_count == atomic_read(&multifd_recv_state->count);
1312}
1313
1314/* Return true if multifd is ready for the migration, otherwise false */
1315bool multifd_recv_new_channel(QIOChannel *ioc)
1316{
1317    MultiFDRecvParams *p;
1318    Error *local_err = NULL;
1319    int id;
1320
1321    id = multifd_recv_initial_packet(ioc, &local_err);
1322    if (id < 0) {
1323        multifd_recv_terminate_threads(local_err);
1324        return false;
1325    }
1326
1327    p = &multifd_recv_state->params[id];
1328    if (p->c != NULL) {
1329        error_setg(&local_err, "multifd: received id '%d' already setup",
1330                   id);
1331        multifd_recv_terminate_threads(local_err);
1332        return false;
1333    }
1334    p->c = ioc;
1335    object_ref(OBJECT(ioc));
1336    /* initial packet */
1337    p->num_packets = 1;
1338
1339    p->running = true;
1340    qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
1341                       QEMU_THREAD_JOINABLE);
1342    atomic_inc(&multifd_recv_state->count);
1343    return multifd_recv_state->count == migrate_multifd_channels();
1344}
1345
1346/**
1347 * save_page_header: write page header to wire
1348 *
1349 * If this is the 1st block, it also writes the block identification
1350 *
1351 * Returns the number of bytes written
1352 *
1353 * @f: QEMUFile where to send the data
1354 * @block: block that contains the page we want to send
1355 * @offset: offset inside the block for the page
1356 *          in the lower bits, it contains flags
1357 */
1358static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
1359                               ram_addr_t offset)
1360{
1361    size_t size, len;
1362
1363    if (block == rs->last_sent_block) {
1364        offset |= RAM_SAVE_FLAG_CONTINUE;
1365    }
1366    qemu_put_be64(f, offset);
1367    size = 8;
1368
1369    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
1370        len = strlen(block->idstr);
1371        qemu_put_byte(f, len);
1372        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
1373        size += 1 + len;
1374        rs->last_sent_block = block;
1375    }
1376    return size;
1377}
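
/*
 * Illustrative sketch (not part of the original file): how the load side
 * could parse the header written above.  TARGET_PAGE_MASK separates the
 * flag bits from the page-aligned offset; the helper name and the fixed
 * idstr buffer are this sketch's own.
 */
static ram_addr_t G_GNUC_UNUSED load_page_header_sketch(QEMUFile *f,
                                                        int *flags)
{
    ram_addr_t addr = qemu_get_be64(f);

    *flags = addr & ~TARGET_PAGE_MASK;
    addr &= TARGET_PAGE_MASK;

    if (!(*flags & RAM_SAVE_FLAG_CONTINUE)) {
        /* a new block starts with its idstr: 1-byte length + name */
        char id[256];
        int len = qemu_get_byte(f);

        qemu_get_buffer(f, (uint8_t *)id, len);
        id[len] = 0;
    }
    return addr;
}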
1378
1379/**
1380 * mig_throttle_guest_down: throttle down the guest
1381 *
1382 * Reduce the amount of guest cpu execution to hopefully slow down memory
1383 * writes. If the guest dirty memory rate is reduced below the rate at
1384 * which we can transfer pages to the destination then we should be
1385 * able to complete migration. Some workloads dirty memory way too
1386 * fast and will not effectively converge, even with auto-converge.
1387 */
1388static void mig_throttle_guest_down(void)
1389{
1390    MigrationState *s = migrate_get_current();
1391    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
1392    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
1393
1394    /* We have not started throttling yet. Let's start it. */
1395    if (!cpu_throttle_active()) {
1396        cpu_throttle_set(pct_initial);
1397    } else {
1398        /* Throttling already on, just increase the rate */
1399        cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
1400    }
1401}
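
/*
 * Worked example (illustrative, assuming the usual defaults of
 * cpu_throttle_initial=20 and cpu_throttle_increment=10): the first call
 * throttles the guest to 20%, and each later call while throttling is
 * still active raises it to 30%, 40%, and so on.
 */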
1402
1403/**
1404 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
1405 *
1406 * @rs: current RAM state
1407 * @current_addr: address for the zero page
1408 *
1409 * Update the xbzrle cache to reflect a page that's been sent as all 0.
1410 * The important thing is that a stale (not-yet-0'd) page be replaced
1411 * by the new data.
1412 * As a bonus, if the page wasn't in the cache it gets added so that
1413 * when a small write is made into the 0'd page it gets XBZRLE sent.
1414 */
1415static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
1416{
1417    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1418        return;
1419    }
1420
1421    /* We don't care if this fails to allocate a new cache page
1422     * as long as it updated an old one */
1423    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
1424                 ram_counters.dirty_sync_count);
1425}
1426
1427#define ENCODING_FLAG_XBZRLE 0x1
1428
1429/**
1430 * save_xbzrle_page: compress and send current page
1431 *
1432 * Returns: 1 means that we wrote the page
1433 *          0 means that the page is identical to the one already sent
1434 *          -1 means that xbzrle would be longer than normal
1435 *
1436 * @rs: current RAM state
1437 * @current_data: pointer to the address of the page contents
1438 * @current_addr: addr of the page
1439 * @block: block that contains the page we want to send
1440 * @offset: offset inside the block for the page
1441 * @last_stage: if we are at the completion stage
1442 */
1443static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
1444                            ram_addr_t current_addr, RAMBlock *block,
1445                            ram_addr_t offset, bool last_stage)
1446{
1447    int encoded_len = 0, bytes_xbzrle;
1448    uint8_t *prev_cached_page;
1449
1450    if (!cache_is_cached(XBZRLE.cache, current_addr,
1451                         ram_counters.dirty_sync_count)) {
1452        xbzrle_counters.cache_miss++;
1453        if (!last_stage) {
1454            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
1455                             ram_counters.dirty_sync_count) == -1) {
1456                return -1;
1457            } else {
1458                /* update *current_data when the page has been
1459                   inserted into cache */
1460                *current_data = get_cached_data(XBZRLE.cache, current_addr);
1461            }
1462        }
1463        return -1;
1464    }
1465
1466    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
1467
1468    /* save current buffer into memory */
1469    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
1470
1471    /* XBZRLE encoding (if there is no overflow) */
1472    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
1473                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
1474                                       TARGET_PAGE_SIZE);
1475    if (encoded_len == 0) {
1476        trace_save_xbzrle_page_skipping();
1477        return 0;
1478    } else if (encoded_len == -1) {
1479        trace_save_xbzrle_page_overflow();
1480        xbzrle_counters.overflow++;
1481        /* update data in the cache */
1482        if (!last_stage) {
1483            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
1484            *current_data = prev_cached_page;
1485        }
1486        return -1;
1487    }
1488
1489    /* we need to update the data in the cache, in order to get the same data */
1490    if (!last_stage) {
1491        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
1492    }
1493
1494    /* Send XBZRLE based compressed page */
1495    bytes_xbzrle = save_page_header(rs, rs->f, block,
1496                                    offset | RAM_SAVE_FLAG_XBZRLE);
1497    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
1498    qemu_put_be16(rs->f, encoded_len);
1499    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
1500    bytes_xbzrle += encoded_len + 1 + 2;
1501    xbzrle_counters.pages++;
1502    xbzrle_counters.bytes += bytes_xbzrle;
1503    ram_counters.transferred += bytes_xbzrle;
1504
1505    return 1;
1506}
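
/*
 * Illustrative sketch (not part of the original file): the matching load
 * side.  It assumes XBZRLE.decoded_buf has already been sized to at least
 * TARGET_PAGE_SIZE; the helper name is this sketch's own.
 */
static int G_GNUC_UNUSED load_xbzrle_sketch(QEMUFile *f, void *host)
{
    int len;

    if (qemu_get_byte(f) != ENCODING_FLAG_XBZRLE) {
        return -1;
    }

    len = qemu_get_be16(f);
    qemu_get_buffer(f, XBZRLE.decoded_buf, len);

    /* apply the delta on top of the current page contents */
    if (xbzrle_decode_buffer(XBZRLE.decoded_buf, len, host,
                             TARGET_PAGE_SIZE) == -1) {
        return -1;
    }
    return 0;
}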
1507
1508/**
1509 * migration_bitmap_find_dirty: find the next dirty page from start
1510 *
1511 * Called with rcu_read_lock() to protect migration_bitmap
1512 *
1513 * Returns the page offset within the memory region of the start of a dirty page
1514 *
1515 * @rs: current RAM state
1516 * @rb: RAMBlock where to search for dirty pages
1517 * @start: page where we start the search
1518 */
1519static inline
1520unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
1521                                          unsigned long start)
1522{
1523    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
1524    unsigned long *bitmap = rb->bmap;
1525    unsigned long next;
1526
1527    if (!qemu_ram_is_migratable(rb)) {
1528        return size;
1529    }
1530
1531    if (rs->ram_bulk_stage && start > 0) {
1532        next = start + 1;
1533    } else {
1534        next = find_next_bit(bitmap, size, start);
1535    }
1536
1537    return next;
1538}
1539
1540static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1541                                                RAMBlock *rb,
1542                                                unsigned long page)
1543{
1544    bool ret;
1545
1546    ret = test_and_clear_bit(page, rb->bmap);
1547
1548    if (ret) {
1549        rs->migration_dirty_pages--;
1550    }
1551    return ret;
1552}
1553
1554static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
1555                                        ram_addr_t start, ram_addr_t length)
1556{
1557    rs->migration_dirty_pages +=
1558        cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
1559                                              &rs->num_dirty_pages_period);
1560}
1561
1562/**
1563 * ram_pagesize_summary: calculate all the pagesizes of a VM
1564 *
1565 * Returns a summary bitmap of the page sizes of all RAMBlocks
1566 *
1567 * For VMs with just normal pages this is equivalent to the host page
1568 * size. If it's got some huge pages then it's the OR of all the
1569 * different page sizes.
1570 */
1571uint64_t ram_pagesize_summary(void)
1572{
1573    RAMBlock *block;
1574    uint64_t summary = 0;
1575
1576    RAMBLOCK_FOREACH_MIGRATABLE(block) {
1577        summary |= block->page_size;
1578    }
1579
1580    return summary;
1581}
1582
1583static void migration_update_rates(RAMState *rs, int64_t end_time)
1584{
1585    uint64_t iter_count = rs->iterations - rs->iterations_prev;
1586
1587    /* calculate period counters */
1588    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1589                / (end_time - rs->time_last_bitmap_sync);
1590
1591    if (!iter_count) {
1592        return;
1593    }
1594
1595    if (migrate_use_xbzrle()) {
1596        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1597            rs->xbzrle_cache_miss_prev) / iter_count;
1598        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1599    }
1600}
1601
1602static void migration_bitmap_sync(RAMState *rs)
1603{
1604    RAMBlock *block;
1605    int64_t end_time;
1606    uint64_t bytes_xfer_now;
1607
1608    ram_counters.dirty_sync_count++;
1609
1610    if (!rs->time_last_bitmap_sync) {
1611        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1612    }
1613
1614    trace_migration_bitmap_sync_start();
1615    memory_global_dirty_log_sync();
1616
1617    qemu_mutex_lock(&rs->bitmap_mutex);
1618    rcu_read_lock();
1619    RAMBLOCK_FOREACH_MIGRATABLE(block) {
1620        migration_bitmap_sync_range(rs, block, 0, block->used_length);
1621    }
1622    ram_counters.remaining = ram_bytes_remaining();
1623    rcu_read_unlock();
1624    qemu_mutex_unlock(&rs->bitmap_mutex);
1625
1626    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1627
1628    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1629
1630    /* more than 1 second = 1000 milliseconds */
1631    if (end_time > rs->time_last_bitmap_sync + 1000) {
1632        bytes_xfer_now = ram_counters.transferred;
1633
1634        /* During block migration the auto-converge logic incorrectly detects
1635         * that ram migration makes no progress. Avoid this by disabling the
1636         * throttling logic during the bulk phase of block migration. */
1637        if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1638            /* The following detection logic can be refined later. For now:
1639               Check to see if the dirtied bytes are 50% more than the approx.
1640               amount of bytes that just got transferred since the last time
1641               we were in this routine. If that happens twice, start or
1642               increase throttling */
1643
1644            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
1645                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
1646                (++rs->dirty_rate_high_cnt >= 2)) {
1647                    trace_migration_throttle();
1648                    rs->dirty_rate_high_cnt = 0;
1649                    mig_throttle_guest_down();
1650            }
1651        }
1652
1653        migration_update_rates(rs, end_time);
1654
1655        rs->iterations_prev = rs->iterations;
1656
1657        /* reset period counters */
1658        rs->time_last_bitmap_sync = end_time;
1659        rs->num_dirty_pages_period = 0;
1660        rs->bytes_xfer_prev = bytes_xfer_now;
1661    }
1662    if (migrate_use_events()) {
1663        qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
1664    }
1665}
1666
1667/**
1668 * save_zero_page: send the zero page to the stream
1669 *
1670 * Returns the number of pages written.
1671 *
1672 * @rs: current RAM state
1673 * @block: block that contains the page we want to send
1674 * @offset: offset inside the block for the page
1675 */
1676static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1677{
1678    uint8_t *p = block->host + offset;
1679    int pages = -1;
1680
1681    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1682        ram_counters.duplicate++;
1683        ram_counters.transferred +=
1684            save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
1685        qemu_put_byte(rs->f, 0);
1686        ram_counters.transferred += 1;
1687        pages = 1;
1688    }
1689
1690    return pages;
1691}
1692
1693static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1694{
1695    if (!migrate_release_ram() || !migration_in_postcopy()) {
1696        return;
1697    }
1698
1699    ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
1700}
1701
1702/*
1703 * @pages: the number of pages written by the control path,
1704 *        < 0 - error
1705 *        > 0 - number of pages written
1706 *
1707 * Returns true if the page has been saved by the control path, false otherwise.
1708 */
1709static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1710                              int *pages)
1711{
1712    uint64_t bytes_xmit = 0;
1713    int ret;
1714
1715    *pages = -1;
1716    ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1717                                &bytes_xmit);
1718    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1719        return false;
1720    }
1721
1722    if (bytes_xmit) {
1723        ram_counters.transferred += bytes_xmit;
1724        *pages = 1;
1725    }
1726
1727    if (ret == RAM_SAVE_CONTROL_DELAYED) {
1728        return true;
1729    }
1730
1731    if (bytes_xmit > 0) {
1732        ram_counters.normal++;
1733    } else if (bytes_xmit == 0) {
1734        ram_counters.duplicate++;
1735    }
1736
1737    return true;
1738}
1739
1740/*
1741 * save_normal_page: send the page directly to the stream
1742 *
1743 * Returns the number of pages written.
1744 *
1745 * @rs: current RAM state
1746 * @block: block that contains the page we want to send
1747 * @offset: offset inside the block for the page
1748 * @buf: the page to be sent
1749 * @async: send the page asynchronously
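 *
 * Wire format: page header with RAM_SAVE_FLAG_PAGE followed by the raw
 * TARGET_PAGE_SIZE bytes of the page.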
1750 */
1751static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1752                            uint8_t *buf, bool async)
1753{
1754    ram_counters.transferred += save_page_header(rs, rs->f, block,
1755                                                 offset | RAM_SAVE_FLAG_PAGE);
1756    if (async) {
1757        qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1758                              migrate_release_ram() &&
1759                              migration_in_postcopy());
1760    } else {
1761        qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1762    }
1763    ram_counters.transferred += TARGET_PAGE_SIZE;
1764    ram_counters.normal++;
1765    return 1;
1766}
1767
1768/**
1769 * ram_save_page: send the given page to the stream
1770 *
1771 * Returns the number of pages written.
1772 *          < 0 - error
1773 *          >=0 - Number of pages written - this might legally be 0
1774 *                if xbzrle noticed the page was the same.
1775 *
1776 * @rs: current RAM state
1777 * @pss: data about the page we want to send (block + page offset
1778 *       within the block)
1779 * @last_stage: if we are at the completion stage
1780 */
1781static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1782{
1783    int pages = -1;
1784    uint8_t *p;
1785    bool send_async = true;
1786    RAMBlock *block = pss->block;
1787    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1788    ram_addr_t current_addr = block->offset + offset;
1789
1790    p = block->host + offset;
1791    trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1792
1793    XBZRLE_cache_lock();
1794    if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1795        migrate_use_xbzrle()) {
1796        pages = save_xbzrle_page(rs, &p, current_addr, block,
1797                                 offset, last_stage);
1798        if (!last_stage) {
1799            /* Can't send this cached data async, since the cache page
1800             * might get updated before it gets to the wire
1801             */
1802            send_async = false;
1803        }
1804    }
1805
1806    /* XBZRLE overflow or normal page */
1807    if (pages == -1) {
1808        pages = save_normal_page(rs, block, offset, p, send_async);
1809    }
1810
1811    XBZRLE_cache_unlock();
1812
1813    return pages;
1814}
1815
1816static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1817                                 ram_addr_t offset)
1818{
1819    multifd_queue_page(block, offset);
1820    ram_counters.normal++;
1821
1822    return 1;
1823}
1824
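/*
 * do_compress_ram_page: compress one page and write it to @f
 *
 * Wire format: page header with RAM_SAVE_FLAG_COMPRESS_PAGE, then the
 * zlib stream produced by qemu_put_compression_data().  Returns the
 * number of bytes written, or 0 if compression failed.
 */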
1825static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1826                                ram_addr_t offset, uint8_t *source_buf)
1827{
1828    RAMState *rs = ram_state;
1829    int bytes_sent, blen;
1830    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1831
1832    bytes_sent = save_page_header(rs, f, block, offset |
1833                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
1834
1835    /*
1836     * Copy it to an internal buffer to avoid it being modified by the VM,
1837     * so that we can catch errors during compression and
1838     * decompression.
1839     */
1840    memcpy(source_buf, p, TARGET_PAGE_SIZE);
1841    blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1842    if (blen < 0) {
1843        bytes_sent = 0;
1844        qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1845        error_report("compressed data failed!");
1846    } else {
1847        bytes_sent += blen;
1848        ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1849    }
1850
1851    return bytes_sent;
1852}
1853
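/*
 * flush_compressed_data: wait for every compression thread to finish its
 * current page and move the per-thread buffered output into the main
 * migration stream
 *
 * Callers in this file flush when switching to a new RAMBlock and at the
 * end of an iteration, which keeps the 'cont'-flag ordering on the wire.
 */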
1854static void flush_compressed_data(RAMState *rs)
1855{
1856    int idx, len, thread_count;
1857
1858    if (!migrate_use_compression()) {
1859        return;
1860    }
1861    thread_count = migrate_compress_threads();
1862
1863    qemu_mutex_lock(&comp_done_lock);
1864    for (idx = 0; idx < thread_count; idx++) {
1865        while (!comp_param[idx].done) {
1866            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1867        }
1868    }
1869    qemu_mutex_unlock(&comp_done_lock);
1870
1871    for (idx = 0; idx < thread_count; idx++) {
1872        qemu_mutex_lock(&comp_param[idx].mutex);
1873        if (!comp_param[idx].quit) {
1874            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1875            ram_counters.transferred += len;
1876        }
1877        qemu_mutex_unlock(&comp_param[idx].mutex);
1878    }
1879}
1880
1881static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1882                                       ram_addr_t offset)
1883{
1884    param->block = block;
1885    param->offset = offset;
1886}
1887
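/*
 * compress_page_with_multi_thread: hand one page to an idle compression
 * thread
 *
 * Drains the chosen thread's previously compressed output into the main
 * stream, then wakes it up with the new (block, offset) pair; if no
 * thread is idle, wait on comp_done_cond.  Always returns 1 page.
 */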
1888static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1889                                           ram_addr_t offset)
1890{
1891    int idx, thread_count, bytes_xmit = -1, pages = -1;
1892
1893    thread_count = migrate_compress_threads();
1894    qemu_mutex_lock(&comp_done_lock);
1895    while (true) {
1896        for (idx = 0; idx < thread_count; idx++) {
1897            if (comp_param[idx].done) {
1898                comp_param[idx].done = false;
1899                bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1900                qemu_mutex_lock(&comp_param[idx].mutex);
1901                set_compress_params(&comp_param[idx], block, offset);
1902                qemu_cond_signal(&comp_param[idx].cond);
1903                qemu_mutex_unlock(&comp_param[idx].mutex);
1904                pages = 1;
1905                ram_counters.normal++;
1906                ram_counters.transferred += bytes_xmit;
1907                break;
1908            }
1909        }
1910        if (pages > 0) {
1911            break;
1912        } else {
1913            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1914        }
1915    }
1916    qemu_mutex_unlock(&comp_done_lock);
1917
1918    return pages;
1919}
1920
1921/**
1922 * find_dirty_block: find the next dirty page and update any state
1923 * associated with the search process.
1924 *
1925 * Returns true if a dirty page was found, false otherwise
1926 *
1927 * @rs: current RAM state
1928 * @pss: data about the state of the current dirty page scan
1929 * @again: set to false if the search has scanned the whole of RAM
1930 */
1931static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1932{
1933    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1934    if (pss->complete_round && pss->block == rs->last_seen_block &&
1935        pss->page >= rs->last_page) {
1936        /*
1937         * We've been once around the RAM and haven't found anything.
1938         * Give up.
1939         */
1940        *again = false;
1941        return false;
1942    }
1943    if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1944        /* Didn't find anything in this RAM Block */
1945        pss->page = 0;
1946        pss->block = QLIST_NEXT_RCU(pss->block, next);
1947        if (!pss->block) {
1948            /* Hit the end of the list */
1949            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1950            /* Flag that we've looped */
1951            pss->complete_round = true;
1952            rs->ram_bulk_stage = false;
1953            if (migrate_use_xbzrle()) {
1954                /* If xbzrle is on, stop using the data compression at this
1955                 * point. In theory, xbzrle can do better than compression.
1956                 */
1957                flush_compressed_data(rs);
1958            }
1959        }
1960        /* Didn't find anything this time, but try again on the new block */
1961        *again = true;
1962        return false;
1963    } else {
1964        /* Can go around again, but... */
1965        *again = true;
1966        /* We've found something so probably don't need to */
1967        return true;
1968    }
1969}
1970
1971/**
1972 * unqueue_page: gets a page of the queue
1973 *
1974 * Helper for 'get_queued_page' - gets a page off the queue
1975 *
1976 * Returns the block of the page (or NULL if none available)
1977 *
1978 * @rs: current RAM state
1979 * @offset: used to return the offset within the RAMBlock
1980 */
1981static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1982{
1983    RAMBlock *block = NULL;
1984
1985    qemu_mutex_lock(&rs->src_page_req_mutex);
1986    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1987        struct RAMSrcPageRequest *entry =
1988                                QSIMPLEQ_FIRST(&rs->src_page_requests);
1989        block = entry->rb;
1990        *offset = entry->offset;
1991
1992        if (entry->len > TARGET_PAGE_SIZE) {
1993            entry->len -= TARGET_PAGE_SIZE;
1994            entry->offset += TARGET_PAGE_SIZE;
1995        } else {
1996            memory_region_unref(block->mr);
1997            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1998            g_free(entry);
1999            migration_consume_urgent_request();
2000        }
2001    }
2002    qemu_mutex_unlock(&rs->src_page_req_mutex);
2003
2004    return block;
2005}
2006
2007/**
2008 * get_queued_page: unqueue a page from the postcopy requests
2009 *
2010 * Skips pages that are already sent (!dirty)
2011 *
2012 * Returns true if a queued page was found
2013 *
2014 * @rs: current RAM state
2015 * @pss: data about the state of the current dirty page scan
2016 */
2017static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2018{
2019    RAMBlock  *block;
2020    ram_addr_t offset;
2021    bool dirty;
2022
2023    do {
2024        block = unqueue_page(rs, &offset);
2025        /*
2026         * We're sending this page, and since it's postcopy nothing else
2027         * will dirty it, and we must make sure it doesn't get sent again
2028         * even if this queue request was received after the background
2029         * search already sent it.
2030         */
2031        if (block) {
2032            unsigned long page;
2033
2034            page = offset >> TARGET_PAGE_BITS;
2035            dirty = test_bit(page, block->bmap);
2036            if (!dirty) {
2037                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2038                       page, test_bit(page, block->unsentmap));
2039            } else {
2040                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2041            }
2042        }
2043
2044    } while (block && !dirty);
2045
2046    if (block) {
2047        /*
2048         * As soon as we start servicing pages out of order, then we have
2049         * to kill the bulk stage, since the bulk stage assumes
2050         * (in migration_bitmap_find_dirty) that every page is
2051         * dirty; that's no longer true.
2052         */
2053        rs->ram_bulk_stage = false;
2054
2055        /*
2056         * We want the background search to continue from the queued page
2057         * since the guest is likely to want other pages near to the page
2058         * it just requested.
2059         */
2060        pss->block = block;
2061        pss->page = offset >> TARGET_PAGE_BITS;
2062    }
2063
2064    return !!block;
2065}
2066
2067/**
2068 * migration_page_queue_free: drop any remaining pages in the ram
2069 * request queue
2070 *
2071 * It should be empty at the end anyway, but in error cases there may
2072 * be some left; any remaining pages are simply dropped.
2073 *
2074 */
2075static void migration_page_queue_free(RAMState *rs)
2076{
2077    struct RAMSrcPageRequest *mspr, *next_mspr;
2078    /* This queue generally should be empty - but in the case of a failed
2079    /* This queue should generally be empty - but in the case of a failed
2080     * migration it might have some entries left over.
2081    rcu_read_lock();
2082    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2083        memory_region_unref(mspr->rb->mr);
2084        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2085        g_free(mspr);
2086    }
2087    rcu_read_unlock();
2088}
2089
2090/**
2091 * ram_save_queue_pages: queue the page for transmission
2092 *
2093 * A request from the postcopy destination, for example.
2094 *
2095 * Returns zero on success or negative on error
2096 *
2097 * @rbname: Name of the RAMBlock of the request. NULL means the
2098 *          same as the last one.
2099 * @start: starting address from the start of the RAMBlock
2100 * @len: length (in bytes) to send
2101 */
2102int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2103{
2104    RAMBlock *ramblock;
2105    RAMState *rs = ram_state;
2106
2107    ram_counters.postcopy_requests++;
2108    rcu_read_lock();
2109    if (!rbname) {
2110        /* Reuse last RAMBlock */
2111        ramblock = rs->last_req_rb;
2112
2113        if (!ramblock) {
2114            /*
2115             * Shouldn't happen, we can't reuse the last RAMBlock if
2116             * it's the 1st request.
2117             */
2118            error_report("ram_save_queue_pages no previous block");
2119            goto err;
2120        }
2121    } else {
2122        ramblock = qemu_ram_block_by_name(rbname);
2123
2124        if (!ramblock) {
2125            /* We shouldn't be asked for a non-existent RAMBlock */
2126            error_report("ram_save_queue_pages no block '%s'", rbname);
2127            goto err;
2128        }
2129        rs->last_req_rb = ramblock;
2130    }
2131    trace_ram_save_queue_pages(ramblock->idstr, start, len);
2132    if (start+len > ramblock->used_length) {
2133        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2134                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2135                     __func__, start, len, ramblock->used_length);
2136        goto err;
2137    }
2138
2139    struct RAMSrcPageRequest *new_entry =
2140        g_malloc0(sizeof(struct RAMSrcPageRequest));
2141    new_entry->rb = ramblock;
2142    new_entry->offset = start;
2143    new_entry->len = len;
2144
2145    memory_region_ref(ramblock->mr);
2146    qemu_mutex_lock(&rs->src_page_req_mutex);
2147    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2148    migration_make_urgent_request();
2149    qemu_mutex_unlock(&rs->src_page_req_mutex);
2150    rcu_read_unlock();
2151
2152    return 0;
2153
2154err:
2155    rcu_read_unlock();
2156    return -1;
2157}
2158
2159static bool save_page_use_compression(RAMState *rs)
2160{
2161    if (!migrate_use_compression()) {
2162        return false;
2163    }
2164
2165    /*
2166     * If xbzrle is on, stop using the data compression after first
2167     * round of migration even if compression is enabled. In theory,
2168     * xbzrle can do better than compression.
2169     */
2170    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
2171        return true;
2172    }
2173
2174    return false;
2175}
2176
2177/**
2178 * ram_save_target_page: save one target page
2179 *
2180 * Returns the number of pages written
2181 *
2182 * @rs: current RAM state
2183 * @pss: data about the page we want to send
2184 * @last_stage: if we are at the completion stage
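 *
 * Dispatch order: the RDMA control path is tried first, then zero-page
 * detection, then multi-threaded compression (only while still inside
 * the last-sent block), then multifd, and finally the plain/xbzrle path
 * via ram_save_page().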
2185 */
2186static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2187                                bool last_stage)
2188{
2189    RAMBlock *block = pss->block;
2190    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
2191    int res;
2192
2193    if (control_save_page(rs, block, offset, &res)) {
2194        return res;
2195    }
2196
2197    /*
2198     * When starting the process of a new block, the first page of
2199     * the block should be sent out before other pages in the same
2200     * block, and all the pages in last block should have been sent
2201     * out, keeping this order is important, because the 'cont' flag
2202     * is used to avoid resending the block name.
2203     */
2204    if (block != rs->last_sent_block && save_page_use_compression(rs)) {
2205            flush_compressed_data(rs);
2206    }
2207
2208    res = save_zero_page(rs, block, offset);
2209    if (res > 0) {
2210        /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2211         * page would be stale
2212         */
2213        if (!save_page_use_compression(rs)) {
2214            XBZRLE_cache_lock();
2215            xbzrle_cache_zero_page(rs, block->offset + offset);
2216            XBZRLE_cache_unlock();
2217        }
2218        ram_release_pages(block->idstr, offset, res);
2219        return res;
2220    }
2221
2222    /*
2223     * Make sure the first page is sent out before other pages.
2224     *
2225     * We post it as a normal page since compressing it would consume
2226     * significant CPU resource.
2227     */
2228    if (block == rs->last_sent_block && save_page_use_compression(rs)) {
2229        return compress_page_with_multi_thread(rs, block, offset);
2230    } else if (migrate_use_multifd()) {
2231        return ram_save_multifd_page(rs, block, offset);
2232    }
2233
2234    return ram_save_page(rs, pss, last_stage);
2235}
2236
2237/**
2238 * ram_save_host_page: save a whole host page
2239 *
2240 * Starting at the page indicated by @pss, send pages up to the end of the
2241 * current host page. It's valid for the starting page to point into the middle of
2242 * a host page in which case the remainder of the hostpage is sent.
2243 * Only dirty target pages are sent. Note that the host page size may
2244 * be a huge page for this block.
2245 * The saving stops at the boundary of the used_length of the block
2246 * if the RAMBlock isn't a multiple of the host page size.
2247 *
2248 * Returns the number of pages written or negative on error
2249 *
2250 * @rs: current RAM state
2252 * @pss: data about the page we want to send
2253 * @last_stage: if we are at the completion stage
2254 */
2255static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2256                              bool last_stage)
2257{
2258    int tmppages, pages = 0;
2259    size_t pagesize_bits =
2260        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2261
2262    if (!qemu_ram_is_migratable(pss->block)) {
2263        error_report("block %s should not be migrated !", pss->block->idstr);
2264        return 0;
2265    }
2266
2267    do {
2268        /* Check if the page is dirty and if so send it */
2269        if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2270            pss->page++;
2271            continue;
2272        }
2273
2274        tmppages = ram_save_target_page(rs, pss, last_stage);
2275        if (tmppages < 0) {
2276            return tmppages;
2277        }
2278
2279        pages += tmppages;
2280        if (pss->block->unsentmap) {
2281            clear_bit(pss->page, pss->block->unsentmap);
2282        }
2283
2284        pss->page++;
2285    } while ((pss->page & (pagesize_bits - 1)) &&
2286             offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
2287
2288    /* The offset we leave with is the last one we looked at */
2289    pss->page--;
2290    return pages;
2291}
2292
2293/**
2294 * ram_find_and_save_block: finds a dirty page and sends it to f
2295 *
2296 * Called within an RCU critical section.
2297 *
2298 * Returns the number of pages written where zero means no dirty pages
2299 *
2300 * @rs: current RAM state
2301 * @last_stage: if we are at the completion stage
2302 *
2303 * On systems where host-page-size > target-page-size it will send all the
2304 * pages in a host page that are dirty.
2305 */
2306
2307static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2308{
2309    PageSearchStatus pss;
2310    int pages = 0;
2311    bool again, found;
2312
2313    /* No dirty page as there is zero RAM */
2314    if (!ram_bytes_total()) {
2315        return pages;
2316    }
2317
2318    pss.block = rs->last_seen_block;
2319    pss.page = rs->last_page;
2320    pss.complete_round = false;
2321
2322    if (!pss.block) {
2323        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2324    }
2325
2326    do {
2327        again = true;
2328        found = get_queued_page(rs, &pss);
2329
2330        if (!found) {
2331            /* priority queue empty, so just search for something dirty */
2332            found = find_dirty_block(rs, &pss, &again);
2333        }
2334
2335        if (found) {
2336            pages = ram_save_host_page(rs, &pss, last_stage);
2337        }
2338    } while (!pages && again);
2339
2340    rs->last_seen_block = pss.block;
2341    rs->last_page = pss.page;
2342
2343    return pages;
2344}
2345
2346void acct_update_position(QEMUFile *f, size_t size, bool zero)
2347{
2348    uint64_t pages = size / TARGET_PAGE_SIZE;
2349
2350    if (zero) {
2351        ram_counters.duplicate += pages;
2352    } else {
2353        ram_counters.normal += pages;
2354        ram_counters.transferred += size;
2355        qemu_update_position(f, size);
2356    }
2357}
2358
2359uint64_t ram_bytes_total(void)
2360{
2361    RAMBlock *block;
2362    uint64_t total = 0;
2363
2364    rcu_read_lock();
2365    RAMBLOCK_FOREACH_MIGRATABLE(block) {
2366        total += block->used_length;
2367    }
2368    rcu_read_unlock();
2369    return total;
2370}
2371
2372static void xbzrle_load_setup(void)
2373{
2374    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2375}
2376
2377static void xbzrle_load_cleanup(void)
2378{
2379    g_free(XBZRLE.decoded_buf);
2380    XBZRLE.decoded_buf = NULL;
2381}
2382
2383static void ram_state_cleanup(RAMState **rsp)
2384{
2385    if (*rsp) {
2386        migration_page_queue_free(*rsp);
2387        qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2388        qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2389        g_free(*rsp);
2390        *rsp = NULL;
2391    }
2392}
2393
2394static void xbzrle_cleanup(void)
2395{
2396    XBZRLE_cache_lock();
2397    if (XBZRLE.cache) {
2398        cache_fini(XBZRLE.cache);
2399        g_free(XBZRLE.encoded_buf);
2400        g_free(XBZRLE.current_buf);
2401        g_free(XBZRLE.zero_target_page);
2402        XBZRLE.cache = NULL;
2403        XBZRLE.encoded_buf = NULL;
2404        XBZRLE.current_buf = NULL;
2405        XBZRLE.zero_target_page = NULL;
2406    }
2407    XBZRLE_cache_unlock();
2408}
2409
2410static void ram_save_cleanup(void *opaque)
2411{
2412    RAMState **rsp = opaque;
2413    RAMBlock *block;
2414
2415    /* The caller must hold the iothread lock or be in a bottom half, so
2416     * there is no writing race against this migration_bitmap
2417     */
2418    memory_global_dirty_log_stop();
2419
2420    RAMBLOCK_FOREACH_MIGRATABLE(block) {
2421        g_free(block->bmap);
2422        block->bmap = NULL;
2423        g_free(block->unsentmap);
2424        block->unsentmap = NULL;
2425    }
2426
2427    xbzrle_cleanup();
2428    compress_threads_save_cleanup();
2429    ram_state_cleanup(rsp);
2430}
2431
2432static void ram_state_reset(RAMState *rs)
2433{
2434    rs->last_seen_block = NULL;
2435    rs->last_sent_block = NULL;
2436    rs->last_page = 0;
2437    rs->last_version = ram_list.version;
2438    rs->ram_bulk_stage = true;
2439}
2440
2441#define MAX_WAIT 50 /* ms, half buffered_file limit */
2442
2443/*
2444 * 'expected' is the value you expect the bitmap mostly to be full
2445 * of; it won't bother printing lines that are all this value.
2446 * 'todump' is the bitmap to dump.
2447 */
2448void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2449                           unsigned long pages)
2450{
2451    int64_t cur;
2452    int64_t linelen = 128;
2453    char linebuf[129];
2454
2455    for (cur = 0; cur < pages; cur += linelen) {
2456        int64_t curb;
2457        bool found = false;
2458        /*
2459         * Last line; catch the case where the line length
2460         * is longer than remaining ram
2461         */
2462        if (cur + linelen > pages) {
2463            linelen = pages - cur;
2464        }
2465        for (curb = 0; curb < linelen; curb++) {
2466            bool thisbit = test_bit(cur + curb, todump);
2467            linebuf[curb] = thisbit ? '1' : '.';
2468            found = found || (thisbit != expected);
2469        }
2470        if (found) {
2471            linebuf[curb] = '\0';
2472            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
2473        }
2474    }
2475}
2476
2477/* **** functions for postcopy ***** */
2478
2479void ram_postcopy_migrated_memory_release(MigrationState *ms)
2480{
2481    struct RAMBlock *block;
2482
2483    RAMBLOCK_FOREACH_MIGRATABLE(block) {
2484        unsigned long *bitmap = block->bmap;
2485        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2486        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2487
2488        while (run_start < range) {
2489            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2490            ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
2491                              (run_end - run_start) << TARGET_PAGE_BITS);
2492            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2493        }
2494    }
2495}
2496
2497/**
2498 * postcopy_send_discard_bm_ram: discard a RAMBlock
2499 *
2500 * Returns zero on success
2501 *
2502 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2503 * Note: At this point the 'unsentmap' is the processed bitmap combined
2504 *       with the dirtymap; so a '1' means it's either dirty or unsent.
2505 *
2506 * @ms: current migration state
2507 * @pds: state for postcopy
2508 * @block: RAMBlock whose unsent pages are to be discarded
2510 */
2511static int postcopy_send_discard_bm_ram(MigrationState *ms,
2512                                        PostcopyDiscardState *pds,
2513                                        RAMBlock *block)
2514{
2515    unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2516    unsigned long current;
2517    unsigned long *unsentmap = block->unsentmap;
2518
2519    for (current = 0; current < end; ) {
2520        unsigned long one = find_next_bit(unsentmap, end, current);
2521
2522        if (one <= end) {
2523            unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
2524            unsigned long discard_length;
2525
2526            if (zero >= end) {
2527                discard_length = end - one;
2528            } else {
2529                discard_length = zero - one;
2530            }
2531            if (discard_length) {
2532                postcopy_discard_send_range(ms, pds, one, discard_length);
2533            }
2534            current = one + discard_length;
2535        } else {
2536            current = one;
2537        }
2538    }
2539
2540    return 0;
2541}
2542
2543/**
2544 * postcopy_each_ram_send_discard: discard all RAMBlocks
2545 *
2546 * Returns 0 for success or negative for error
2547 *
2548 * Utility for the outgoing postcopy code.
2549 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2550 *   passing it bitmap indexes and name.
2551 * (qemu_ram_foreach_block ends up passing unscaled lengths
2552 *  which would mean postcopy code would have to deal with target page)
2553 *
2554 * @ms: current migration state
2555 */
2556static int postcopy_each_ram_send_discard(MigrationState *ms)
2557{
2558    struct RAMBlock *block;
2559    int ret;
2560
2561    RAMBLOCK_FOREACH_MIGRATABLE(block) {
2562        PostcopyDiscardState *pds =
2563            postcopy_discard_send_init(ms, block->idstr);
2564
2565        /*
2566         * Postcopy sends chunks of bitmap over the wire, but it
2567         * just needs indexes at this point, avoids it having
2568         * target page specific code.
2569         */
2570        ret = postcopy_send_discard_bm_ram(ms, pds, block);
2571        postcopy_discard_send_finish(ms, pds);
2572        if (ret) {
2573            return ret;
2574        }
2575    }
2576
2577    return 0;
2578}
2579
2580/**
2581 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2582 *
2583 * Helper for postcopy_chunk_hostpages; it's called twice to
2584 * canonicalize the two bitmaps, that are similar, but one is
2585 * inverted.
2586 *
2587 * Postcopy requires that all target pages in a hostpage are dirty or
2588 * clean, not a mix.  This function canonicalizes the bitmaps.
2589 *
2590 * @ms: current migration state
2591 * @unsent_pass: if true we need to canonicalize partially unsent host pages
2592 *               otherwise we need to canonicalize partially dirty host pages
2593 * @block: block that contains the page we want to canonicalize
2594 * @pds: state for postcopy
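 *
 * Worked example (hypothetical sizes): with 2MB host pages and 4KiB
 * target pages, host_ratio is 512; a run starting at target-page index
 * 700 has host_offset 188, so the fixup covers the whole host page at
 * indexes 512..1023, which is re-marked dirty/unsent and, when needed,
 * discarded on the destination.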
2595 */
2596static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
2597                                          RAMBlock *block,
2598                                          PostcopyDiscardState *pds)
2599{
2600    RAMState *rs = ram_state;
2601    unsigned long *bitmap = block->bmap;
2602    unsigned long *unsentmap = block->unsentmap;
2603    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2604    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2605    unsigned long run_start;
2606
2607    if (block->page_size == TARGET_PAGE_SIZE) {
2608        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2609        return;
2610    }
2611
2612    if (unsent_pass) {
2613        /* Find a sent page */
2614        run_start = find_next_zero_bit(unsentmap, pages, 0);
2615    } else {
2616        /* Find a dirty page */
2617        run_start = find_next_bit(bitmap, pages, 0);
2618    }
2619
2620    while (run_start < pages) {
2621        bool do_fixup = false;
2622        unsigned long fixup_start_addr;
2623        unsigned long host_offset;
2624
2625        /*
2626         * If the start of this run of pages is in the middle of a host
2627         * page, then we need to fixup this host page.
2628         */
2629        host_offset = run_start % host_ratio;
2630        if (host_offset) {
2631            do_fixup = true;
2632            run_start -= host_offset;
2633            fixup_start_addr = run_start;
2634            /* For the next pass */
2635            run_start = run_start + host_ratio;
2636        } else {
2637            /* Find the end of this run */
2638            unsigned long run_end;
2639            if (unsent_pass) {
2640                run_end = find_next_bit(unsentmap, pages, run_start + 1);
2641            } else {
2642                run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
2643            }
2644            /*
2645             * If the end isn't at the start of a host page, then the
2646             * run doesn't finish at the end of a host page
2647             * and we need to discard.
2648             */
2649            host_offset = run_end % host_ratio;
2650            if (host_offset) {
2651                do_fixup = true;
2652                fixup_start_addr = run_end - host_offset;
2653                /*
2654                 * This host page has gone, the next loop iteration starts
2655                 * from after the fixup
2656                 */
2657                run_start = fixup_start_addr + host_ratio;
2658            } else {
2659                /*
2660                 * No discards on this iteration, next loop starts from
2661                 * next sent/dirty page
2662                 */
2663                run_start = run_end + 1;
2664            }
2665        }
2666
2667        if (do_fixup) {
2668            unsigned long page;
2669
2670            /* Tell the destination to discard this page */
2671            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
2672                /* For the unsent_pass we:
2673                 *     discard partially sent pages
2674                 * For the !unsent_pass (dirty) we:
2675                 *     discard partially dirty pages that were sent
2676                 *     (any partially sent pages were already discarded
2677                 *     by the previous unsent_pass)
2678                 */
2679                postcopy_discard_send_range(ms, pds, fixup_start_addr,
2680                                            host_ratio);
2681            }
2682
2683            /* Clean up the bitmap */
2684            for (page = fixup_start_addr;
2685                 page < fixup_start_addr + host_ratio; page++) {
2686                /* All pages in this host page are now not sent */
2687                set_bit(page, unsentmap);
2688
2689                /*
2690                 * Remark them as dirty, updating the count for any pages
2691                 * that weren't previously dirty.
2692                 */
2693                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2694            }
2695        }
2696
2697        if (unsent_pass) {
2698            /* Find the next sent page for the next iteration */
2699            run_start = find_next_zero_bit(unsentmap, pages, run_start);
2700        } else {
2701            /* Find the next dirty page for the next iteration */
2702            run_start = find_next_bit(bitmap, pages, run_start);
2703        }
2704    }
2705}
2706
2707/**
2708 * postcopy_chunk_hostpages: discard any partially sent host page
2709 *
2710 * Utility for the outgoing postcopy code.
2711 *
2712 * Discard any partially sent host-page size chunks, mark any partially
2713 * dirty host-page size chunks as all dirty.  In this case the host-page
2714 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2715 *
2716 * Returns zero on success
2717 *
2718 * @ms: current migration state
2719 * @block: block we want to work with
2720 */
2721static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2722{
2723    PostcopyDiscardState *pds =
2724        postcopy_discard_send_init(ms, block->idstr);
2725
2726    /* First pass: Discard all partially sent host pages */
2727    postcopy_chunk_hostpages_pass(ms, true, block, pds);
2728    /*
2729     * Second pass: Ensure that all partially dirty host pages are made
2730     * fully dirty.
2731     */
2732    postcopy_chunk_hostpages_pass(ms, false, block, pds);
2733
2734    postcopy_discard_send_finish(ms, pds);
2735    return 0;
2736}
2737
2738/**
2739 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2740 *
2741 * Returns zero on success
2742 *
2743 * Transmit the set of pages to be discarded after precopy to the target;
2744 * these are pages that:
2745 *     a) Have been previously transmitted but are now dirty again
2746 *     b) Pages that have never been transmitted, this ensures that
2747 *        any pages on the destination that have been mapped by background
2748 *        tasks get discarded (transparent huge pages is the specific concern)
2749 * Hopefully this is pretty sparse
2750 *
2751 * @ms: current migration state
2752 */
2753int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2754{
2755    RAMState *rs = ram_state;
2756    RAMBlock *block;
2757    int ret;
2758
2759    rcu_read_lock();
2760
2761    /* This should be our last sync, the src is now paused */
2762    migration_bitmap_sync(rs);
2763
2764    /* Easiest way to make sure we don't resume in the middle of a host-page */
2765    rs->last_seen_block = NULL;
2766    rs->last_sent_block = NULL;
2767    rs->last_page = 0;
2768
2769    RAMBLOCK_FOREACH_MIGRATABLE(block) {
2770        unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2771        unsigned long *bitmap = block->bmap;
2772        unsigned long *unsentmap = block->unsentmap;
2773
2774        if (!unsentmap) {
2775            /* We don't have a safe way to resize the unsentmap, so
2776             * if the bitmap was resized it will be NULL at this
2777             * point.
2778             */
2779            error_report("migration ram resized during precopy phase");
2780            rcu_read_unlock();
2781            return -EINVAL;
2782        }
2783        /* Deal with TPS != HPS and huge pages */
2784        ret = postcopy_chunk_hostpages(ms, block);
2785        if (ret) {
2786            rcu_read_unlock();
2787            return ret;
2788        }
2789
2790        /*
2791         * Update the unsentmap to be unsentmap = unsentmap | dirty
2792         */
2793        bitmap_or(unsentmap, unsentmap, bitmap, pages);
2794#ifdef DEBUG_POSTCOPY
2795        ram_debug_dump_bitmap(unsentmap, true, pages);
2796#endif
2797    }
2798    trace_ram_postcopy_send_discard_bitmap();
2799
2800    ret = postcopy_each_ram_send_discard(ms);
2801    rcu_read_unlock();
2802
2803    return ret;
2804}
2805
2806/**
2807 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2808 *
2809 * Returns zero on success
2810 *
2811 * @rbname: name of the RAMBlock of the request
2812 * @start: starting offset (in bytes) within the RAMBlock
2813 * @length: length (in bytes) to discard
2815 */
2816int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2817{
2818    int ret = -1;
2819
2820    trace_ram_discard_range(rbname, start, length);
2821
2822    rcu_read_lock();
2823    RAMBlock *rb = qemu_ram_block_by_name(rbname);
2824
2825    if (!rb) {
2826        error_report("ram_discard_range: Failed to find block '%s'", rbname);
2827        goto err;
2828    }
2829
2830    /*
2831     * On source VM, we don't need to update the received bitmap since
2832     * we don't even have one.
2833     */
2834    if (rb->receivedmap) {
2835        bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2836                     length >> qemu_target_page_bits());
2837    }
2838
2839    ret = ram_block_discard_range(rb, start, length);
2840
2841err:
2842    rcu_read_unlock();
2843
2844    return ret;
2845}
2846
2847/*
2848 * For every allocation below, we try not to crash the VM if the
2849 * allocation fails.
2850 */
2851static int xbzrle_init(void)
2852{
2853    Error *local_err = NULL;
2854
2855    if (!migrate_use_xbzrle()) {
2856        return 0;
2857    }
2858
2859    XBZRLE_cache_lock();
2860
2861    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2862    if (!XBZRLE.zero_target_page) {
2863        error_report("%s: Error allocating zero page", __func__);
2864        goto err_out;
2865    }
2866
2867    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2868                              TARGET_PAGE_SIZE, &local_err);
2869    if (!XBZRLE.cache) {
2870        error_report_err(local_err);
2871        goto free_zero_page;
2872    }
2873
2874    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2875    if (!XBZRLE.encoded_buf) {
2876        error_report("%s: Error allocating encoded_buf", __func__);
2877        goto free_cache;
2878    }
2879
2880    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2881    if (!XBZRLE.current_buf) {
2882        error_report("%s: Error allocating current_buf", __func__);
2883        goto free_encoded_buf;
2884    }
2885
2886    /* We are all good */
2887    XBZRLE_cache_unlock();
2888    return 0;
2889
2890free_encoded_buf:
2891    g_free(XBZRLE.encoded_buf);
2892    XBZRLE.encoded_buf = NULL;
2893free_cache:
2894    cache_fini(XBZRLE.cache);
2895    XBZRLE.cache = NULL;
2896free_zero_page:
2897    g_free(XBZRLE.zero_target_page);
2898    XBZRLE.zero_target_page = NULL;
2899err_out:
2900    XBZRLE_cache_unlock();
2901    return -ENOMEM;
2902}
2903
2904static int ram_state_init(RAMState **rsp)
2905{
2906    *rsp = g_try_new0(RAMState, 1);
2907
2908    if (!*rsp) {
2909        error_report("%s: Init ramstate fail", __func__);
2910        return -1;
2911    }
2912
2913    qemu_mutex_init(&(*rsp)->bitmap_mutex);
2914    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2915    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2916
2917    /*
2918     * Count the total number of pages used by ram blocks not including any
2919     * gaps due to alignment or unplugs.
2920     */
2921    (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2922
2923    ram_state_reset(*rsp);
2924
2925    return 0;
2926}
2927
2928static void ram_list_init_bitmaps(void)
2929{
2930    RAMBlock *block;
2931    unsigned long pages;
2932
2933    /* Skip setting bitmap if there is no RAM */
2934    if (ram_bytes_total()) {
2935        RAMBLOCK_FOREACH_MIGRATABLE(block) {
2936            pages = block->max_length >> TARGET_PAGE_BITS;
2937            block->bmap = bitmap_new(pages);
2938            bitmap_set(block->bmap, 0, pages);
2939            if (migrate_postcopy_ram()) {
2940                block->unsentmap = bitmap_new(pages);
2941                bitmap_set(block->unsentmap, 0, pages);
2942            }
2943        }
2944    }
2945}
2946
2947static void ram_init_bitmaps(RAMState *rs)
2948{
2949    /* For memory_global_dirty_log_start below.  */
2950    qemu_mutex_lock_iothread();
2951    qemu_mutex_lock_ramlist();
2952    rcu_read_lock();
2953
2954    ram_list_init_bitmaps();
2955    memory_global_dirty_log_start();
2956    migration_bitmap_sync(rs);
2957
2958    rcu_read_unlock();
2959    qemu_mutex_unlock_ramlist();
2960    qemu_mutex_unlock_iothread();
2961}
2962
2963static int ram_init_all(RAMState **rsp)
2964{
2965    if (ram_state_init(rsp)) {
2966        return -1;
2967    }
2968
2969    if (xbzrle_init()) {
2970        ram_state_cleanup(rsp);
2971        return -1;
2972    }
2973
2974    ram_init_bitmaps(*rsp);
2975
2976    return 0;
2977}
2978
2979static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2980{
2981    RAMBlock *block;
2982    uint64_t pages = 0;
2983
2984    /*
2985     * Postcopy is not using xbzrle/compression, so no need for that.
2986     * Also, since the source is already halted, we don't need to care
2987     * about dirty page logging either.
2988     */
2989
2990    RAMBLOCK_FOREACH_MIGRATABLE(block) {
2991        pages += bitmap_count_one(block->bmap,
2992                                  block->used_length >> TARGET_PAGE_BITS);
2993    }
2994
2995    /* This may not be aligned with current bitmaps. Recalculate. */
2996    rs->migration_dirty_pages = pages;
2997
2998    rs->last_seen_block = NULL;
2999    rs->last_sent_block = NULL;
3000    rs->last_page = 0;
3001    rs->last_version = ram_list.version;
3002    /*
3003     * Disable the bulk stage, otherwise we'll resend the whole RAM no
3004     * matter what we have sent.
3005     */
3006    rs->ram_bulk_stage = false;
3007
3008    /* Update RAMState cache of output QEMUFile */
3009    rs->f = out;
3010
3011    trace_ram_state_resume_prepare(pages);
3012}
3013
3014/*
3015 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
3016 * a long-running RCU critical section.  When rcu-reclaims in the code
3017 * start to become numerous it will be necessary to reduce the
3018 * granularity of these critical sections.
3019 */
3020
3021/**
3022 * ram_save_setup: Setup RAM for migration
3023 *
3024 * Returns zero to indicate success and negative for error
3025 *
3026 * @f: QEMUFile where to send the data
3027 * @opaque: RAMState pointer
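 *
 * The setup section written here is: a be64 of ram_bytes_total() with
 * RAM_SAVE_FLAG_MEM_SIZE set, then for each migratable block its
 * length-prefixed idstr, its used_length and (for postcopy with
 * non-host-sized pages) its page_size, terminated by RAM_SAVE_FLAG_EOS.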
3028 */
3029static int ram_save_setup(QEMUFile *f, void *opaque)
3030{
3031    RAMState **rsp = opaque;
3032    RAMBlock *block;
3033
3034    if (compress_threads_save_setup()) {
3035        return -1;
3036    }
3037
3038    /* migration has already set up the bitmap, reuse it. */
3039    if (!migration_in_colo_state()) {
3040        if (ram_init_all(rsp) != 0) {
3041            compress_threads_save_cleanup();
3042            return -1;
3043        }
3044    }
3045    (*rsp)->f = f;
3046
3047    rcu_read_lock();
3048
3049    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
3050
3051    RAMBLOCK_FOREACH_MIGRATABLE(block) {
3052        qemu_put_byte(f, strlen(block->idstr));
3053        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3054        qemu_put_be64(f, block->used_length);
3055        if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
3056            qemu_put_be64(f, block->page_size);
3057        }
3058    }
3059
3060    rcu_read_unlock();
3061
3062    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3063    ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3064
3065    multifd_send_sync_main();
3066    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3067    qemu_fflush(f);
3068
3069    return 0;
3070}
3071
3072/**
3073 * ram_save_iterate: iterative stage for migration
3074 *
3075 * Returns zero to indicate success and negative for error
3076 *
3077 * @f: QEMUFile where to send the data
3078 * @opaque: RAMState pointer
3079 */
3080static int ram_save_iterate(QEMUFile *f, void *opaque)
3081{
3082    RAMState **temp = opaque;
3083    RAMState *rs = *temp;
3084    int ret;
3085    int i;
3086    int64_t t0;
3087    int done = 0;
3088
3089    if (blk_mig_bulk_active()) {
3090        /* Avoid transferring ram during bulk phase of block migration as
3091         * the bulk phase will usually take a long time and transferring
3092         * ram updates during that time is pointless. */
3093        goto out;
3094    }
3095
3096    rcu_read_lock();
3097    if (ram_list.version != rs->last_version) {
3098        ram_state_reset(rs);
3099    }
3100
3101    /* Read version before ram_list.blocks */
3102    smp_rmb();
3103
3104    ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3105
3106    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3107    i = 0;
3108    while ((ret = qemu_file_rate_limit(f)) == 0 ||
3109            !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3110        int pages;
3111
3112        if (qemu_file_get_error(f)) {
3113            break;
3114        }
3115
3116        pages = ram_find_and_save_block(rs, false);
3117        /* no more pages to send */
3118        if (pages == 0) {
3119            done = 1;
3120            break;
3121        }
3122        rs->iterations++;
3123
3124        /* We want to check in the 1st loop, just in case it was the 1st time
3125           and we had to sync the dirty bitmap.
3126           qemu_clock_get_ns() is a bit expensive, so we only check once
3127           every few iterations.
3128        */
3129        if ((i & 63) == 0) {
3130            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
3131            if (t1 > MAX_WAIT) {
3132                trace_ram_save_iterate_big_wait(t1, i);
3133                break;
3134            }
3135        }
3136        i++;
3137    }
3138    flush_compressed_data(rs);
3139    rcu_read_unlock();
3140
3141    /*
3142     * Must occur before EOS (or any QEMUFile operation)
3143     * because of RDMA protocol.
3144     */
3145    ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3146
3147    multifd_send_sync_main();
3148out:
3149    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3150    qemu_fflush(f);
3151    ram_counters.transferred += 8;
3152
3153    ret = qemu_file_get_error(f);
3154    if (ret < 0) {
3155        return ret;
3156    }
3157
3158    return done;
3159}
3160
3161/**
3162 * ram_save_complete: function called to send the remaining amount of ram
3163 *
3164 * Returns zero to indicate success
3165 *
3166 * Called with iothread lock
3167 *
3168 * @f: QEMUFile where to send the data
3169 * @opaque: RAMState pointer
3170 */
3171static int ram_save_complete(QEMUFile *f, void *opaque)
3172{
3173    RAMState **temp = opaque;
3174    RAMState *rs = *temp;
3175
3176    rcu_read_lock();
3177
3178    if (!migration_in_postcopy()) {
3179        migration_bitmap_sync(rs);
3180    }
3181
3182    ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3183
3184    /* try transferring iterative blocks of memory */
3185
3186    /* flush all remaining blocks regardless of rate limiting */
3187    while (true) {
3188        int pages;
3189
3190        pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3191        /* no more blocks to send */
3192        if (pages == 0) {
3193            break;
3194        }
3195    }
3196
3197    flush_compressed_data(rs);
3198    ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3199
3200    rcu_read_unlock();
3201
3202    multifd_send_sync_main();
3203    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3204    qemu_fflush(f);
3205
3206    return 0;
3207}
3208
3209static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3210                             uint64_t *res_precopy_only,
3211                             uint64_t *res_compatible,
3212                             uint64_t *res_postcopy_only)
3213{
3214    RAMState **temp = opaque;
3215    RAMState *rs = *temp;
3216    uint64_t remaining_size;
3217
3218    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3219
3220    if (!migration_in_postcopy() &&
3221        remaining_size < max_size) {
3222        qemu_mutex_lock_iothread();
3223        rcu_read_lock();
3224        migration_bitmap_sync(rs);
3225        rcu_read_unlock();
3226        qemu_mutex_unlock_iothread();
3227        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3228    }
3229
3230    if (migrate_postcopy_ram()) {
3231        /* We can do postcopy, and all the data is postcopiable */
3232        *res_compatible += remaining_size;
3233    } else {
3234        *res_precopy_only += remaining_size;
3235    }
3236}
3237
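/*
 * load_xbzrle: decode one xbzrle-encoded page from the stream
 *
 * Wire format: one byte of encoding flags (must be ENCODING_FLAG_XBZRLE),
 * a be16 encoded length, then the encoded data, which is applied as a
 * delta on top of the current contents of the page at @host.
 */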
3238static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3239{
3240    unsigned int xh_len;
3241    int xh_flags;
3242    uint8_t *loaded_data;
3243
3244    /* extract RLE header */
3245    xh_flags = qemu_get_byte(f);
3246    xh_len = qemu_get_be16(f);
3247
3248    if (xh_flags != ENCODING_FLAG_XBZRLE) {
3249        error_report("Failed to load XBZRLE page - wrong compression!");
3250        return -1;
3251    }
3252
3253    if (xh_len > TARGET_PAGE_SIZE) {
3254        error_report("Failed to load XBZRLE page - len overflow!");
3255        return -1;
3256    }
3257    loaded_data = XBZRLE.decoded_buf;
3258    /* load data and decode */
3259    /* it can change loaded_data to point to an internal buffer */
3260    qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3261
3262    /* decode RLE */
3263    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3264                             TARGET_PAGE_SIZE) == -1) {
3265        error_report("Failed to load XBZRLE page - decode error!");
3266        return -1;
3267    }
3268
3269    return 0;
3270}
3271
3272/**
3273 * ram_block_from_stream: read a RAMBlock id from the migration stream
3274 *
3275 * Must be called from within a rcu critical section.
3276 *
3277 * Returns a pointer from within the RCU-protected ram_list.
3278 *
3279 * @f: QEMUFile where to read the data from
3280 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3281 */
3282static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3283{
3284    static RAMBlock *block = NULL;
3285    char id[256];
3286    uint8_t len;
3287
3288    if (flags & RAM_SAVE_FLAG_CONTINUE) {
3289        if (!block) {
3290            error_report("Ack, bad migration stream!");
3291            return NULL;
3292        }
3293        return block;
3294    }
3295
3296    len = qemu_get_byte(f);
3297    qemu_get_buffer(f, (uint8_t *)id, len);
3298    id[len] = 0;
3299
3300    block = qemu_ram_block_by_name(id);
3301    if (!block) {
3302        error_report("Can't find block %s", id);
3303        return NULL;
3304    }
3305
3306    if (!qemu_ram_is_migratable(block)) {
3307        error_report("block %s should not be migrated !", id);
3308        return NULL;
3309    }
3310
3311    return block;
3312}
3313
3314static inline void *host_from_ram_block_offset(RAMBlock *block,
3315                                               ram_addr_t offset)
3316{
3317    if (!offset_in_ramblock(block, offset)) {
3318        return NULL;
3319    }
3320
3321    return block->host + offset;
3322}
3323
3324/**
3325 * ram_handle_compressed: handle the zero page case
3326 *
3327 * If a page (or a whole RDMA chunk) has been
3328 * determined to be zero, then zap it.
3329 *
3330 * @host: host address for the zero page
3331 * @ch: what the page is filled from.  We only support zero
3332 * @size: size of the zero page
3333 */
3334void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3335{
3336    if (ch != 0 || !is_zero_range(host, size)) {
3337        memset(host, ch, size);
3338    }
3339}
3340
3341/* return the size after decompression, or negative value on error */
3342static int
3343qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3344                     const uint8_t *source, size_t source_len)
3345{
3346    int err;
3347
3348    err = inflateReset(stream);
3349    if (err != Z_OK) {
3350        return -1;
3351    }
3352
3353    stream->avail_in = source_len;
3354    stream->next_in = (uint8_t *)source;
3355    stream->avail_out = dest_len;
3356    stream->next_out = dest;
3357
3358    err = inflate(stream, Z_NO_FLUSH);
3359    if (err != Z_STREAM_END) {
3360        return -1;
3361    }
3362
3363    return stream->total_out;
3364}
3365
3366static void *do_data_decompress(void *opaque)
3367{
3368    DecompressParam *param = opaque;
3369    unsigned long pagesize;
3370    uint8_t *des;
3371    int len, ret;
3372
3373    qemu_mutex_lock(&param->mutex);
3374    while (!param->quit) {
3375        if (param->des) {
3376            des = param->des;
3377            len = param->len;
3378            param->des = 0;
3379            qemu_mutex_unlock(&param->mutex);
3380
3381            pagesize = TARGET_PAGE_SIZE;
3382
3383            ret = qemu_uncompress_data(&param->stream, des, pagesize,
3384                                       param->compbuf, len);
3385            if (ret < 0 && migrate_get_current()->decompress_error_check) {
3386                error_report("decompress data failed");
3387                qemu_file_set_error(decomp_file, ret);
3388            }
3389
3390            qemu_mutex_lock(&decomp_done_lock);
3391            param->done = true;
3392            qemu_cond_signal(&decomp_done_cond);
3393            qemu_mutex_unlock(&decomp_done_lock);
3394
3395            qemu_mutex_lock(&param->mutex);
3396        } else {
3397            qemu_cond_wait(&param->cond, &param->mutex);
3398        }
3399    }
3400    qemu_mutex_unlock(&param->mutex);
3401
3402    return NULL;
3403}
3404
3405static int wait_for_decompress_done(void)
3406{
3407    int idx, thread_count;
3408
3409    if (!migrate_use_compression()) {
3410        return 0;
3411    }
3412
3413    thread_count = migrate_decompress_threads();
3414    qemu_mutex_lock(&decomp_done_lock);
3415    for (idx = 0; idx < thread_count; idx++) {
3416        while (!decomp_param[idx].done) {
3417            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3418        }
3419    }
3420    qemu_mutex_unlock(&decomp_done_lock);
3421    return qemu_file_get_error(decomp_file);
3422}
3423
3424static void compress_threads_load_cleanup(void)
3425{
3426    int i, thread_count;
3427
3428    if (!migrate_use_compression()) {
3429        return;
3430    }
3431    thread_count = migrate_decompress_threads();
3432    for (i = 0; i < thread_count; i++) {
3433        /*
3434         * we use it as an indicator of whether the thread is
3435         * properly initialized or not
3436         */
3437        if (!decomp_param[i].compbuf) {
3438            break;
3439        }
3440
3441        qemu_mutex_lock(&decomp_param[i].mutex);
3442        decomp_param[i].quit = true;
3443        qemu_cond_signal(&decomp_param[i].cond);
3444        qemu_mutex_unlock(&decomp_param[i].mutex);
3445    }
3446    for (i = 0; i < thread_count; i++) {
3447        if (!decomp_param[i].compbuf) {
3448            break;
3449        }
3450
3451        qemu_thread_join(decompress_threads + i);
3452        qemu_mutex_destroy(&decomp_param[i].mutex);
3453        qemu_cond_destroy(&decomp_param[i].cond);
3454        inflateEnd(&decomp_param[i].stream);
3455        g_free(decomp_param[i].compbuf);
3456        decomp_param[i].compbuf = NULL;
3457    }
3458    g_free(decompress_threads);
3459    g_free(decomp_param);
3460    decompress_threads = NULL;
3461    decomp_param = NULL;
3462    decomp_file = NULL;
3463}
3464
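/**
 * compress_threads_load_setup: start the decompression threads
 *
 * Returns 0 on success and -1 if a zlib stream cannot be initialized;
 * in the latter case the partially created state is cleaned up again.
 *
 * @f: QEMUFile on which decompression errors will be reported
 */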
3465static int compress_threads_load_setup(QEMUFile *f)
3466{
3467    int i, thread_count;
3468
3469    if (!migrate_use_compression()) {
3470        return 0;
3471    }
3472
3473    thread_count = migrate_decompress_threads();
3474    decompress_threads = g_new0(QemuThread, thread_count);
3475    decomp_param = g_new0(DecompressParam, thread_count);
3476    qemu_mutex_init(&decomp_done_lock);
3477    qemu_cond_init(&decomp_done_cond);
3478    decomp_file = f;
3479    for (i = 0; i < thread_count; i++) {
3480        if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3481            goto exit;
3482        }
3483
3484        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3485        qemu_mutex_init(&decomp_param[i].mutex);
3486        qemu_cond_init(&decomp_param[i].cond);
3487        decomp_param[i].done = true;
3488        decomp_param[i].quit = false;
3489        qemu_thread_create(decompress_threads + i, "decompress",
3490                           do_data_decompress, decomp_param + i,
3491                           QEMU_THREAD_JOINABLE);
3492    }
3493    return 0;
3494exit:
3495    compress_threads_load_cleanup();
3496    return -1;
3497}
3498
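/**
 * decompress_data_with_multi_threads: hand a compressed page to a thread
 *
 * Reads @len bytes of compressed data from @f into an idle thread's
 * buffer and wakes that thread up; blocks on decomp_done_cond while all
 * decompression threads are busy.
 *
 * @f: QEMUFile to read the compressed data from
 * @host: host address where the decompressed page will be placed
 * @len: length of the compressed data
 */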
3499static void decompress_data_with_multi_threads(QEMUFile *f,
3500                                               void *host, int len)
3501{
3502    int idx, thread_count;
3503
3504    thread_count = migrate_decompress_threads();
3505    qemu_mutex_lock(&decomp_done_lock);
3506    while (true) {
3507        for (idx = 0; idx < thread_count; idx++) {
3508            if (decomp_param[idx].done) {
3509                decomp_param[idx].done = false;
3510                qemu_mutex_lock(&decomp_param[idx].mutex);
3511                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3512                decomp_param[idx].des = host;
3513                decomp_param[idx].len = len;
3514                qemu_cond_signal(&decomp_param[idx].cond);
3515                qemu_mutex_unlock(&decomp_param[idx].mutex);
3516                break;
3517            }
3518        }
3519        if (idx < thread_count) {
3520            break;
3521        } else {
3522            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3523        }
3524    }
3525    qemu_mutex_unlock(&decomp_done_lock);
3526}
3527
3528/**
3529 * ram_load_setup: Set up RAM for the migration incoming side
3530 *
3531 * Returns zero to indicate success and negative for error
3532 *
3533 * @f: QEMUFile where to receive the data
3534 * @opaque: RAMState pointer
3535 */
3536static int ram_load_setup(QEMUFile *f, void *opaque)
3537{
3538    if (compress_threads_load_setup(f)) {
3539        return -1;
3540    }
3541
3542    xbzrle_load_setup();
3543    ramblock_recv_map_init();
3544    return 0;
3545}
3546
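/**
 * ram_load_cleanup: free the resources used on the incoming side
 *
 * Returns zero; releases the XBZRLE load buffers, the decompression
 * threads and every migratable RAMBlock's receivedmap.
 *
 * @opaque: RAMState pointer
 */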
3547static int ram_load_cleanup(void *opaque)
3548{
3549    RAMBlock *rb;
3550    xbzrle_load_cleanup();
3551    compress_threads_load_cleanup();
3552
3553    RAMBLOCK_FOREACH_MIGRATABLE(rb) {
3554        g_free(rb->receivedmap);
3555        rb->receivedmap = NULL;
3556    }
3557    return 0;
3558}
3559
3560/**
3561 * ram_postcopy_incoming_init: allocate postcopy data structures
3562 *
3563 * Returns 0 for success and negative if there was an error
3564 *
3565 * @mis: current migration incoming state
3566 *
3567 * Allocate the data structures etc. needed by incoming migration with
3568 * postcopy-ram.  postcopy-ram's similarly named
3569 * postcopy_ram_incoming_init() does the work.
3570 */
3571int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3572{
3573    return postcopy_ram_incoming_init(mis);
3574}
3575
3576/**
3577 * ram_load_postcopy: load a page in postcopy case
3578 *
3579 * Returns 0 for success or -errno in case of error
3580 *
3581 * Called in postcopy mode by ram_load().
3582 * rcu_read_lock is taken prior to this being called.
3583 *
3584 * @f: QEMUFile where to receive the data
3585 */
3586static int ram_load_postcopy(QEMUFile *f)
3587{
3588    int flags = 0, ret = 0;
3589    bool place_needed = false;
3590    bool matches_target_page_size = false;
3591    MigrationIncomingState *mis = migration_incoming_get_current();
3592    /* Temporary page that is later 'placed' */
3593    void *postcopy_host_page = postcopy_get_tmp_page(mis);
3594    void *last_host = NULL;
3595    bool all_zero = false;
3596
3597    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3598        ram_addr_t addr;
3599        void *host = NULL;
3600        void *page_buffer = NULL;
3601        void *place_source = NULL;
3602        RAMBlock *block = NULL;
3603        uint8_t ch;
3604
3605        addr = qemu_get_be64(f);
3606
3607        /*
3608          * If there is a QEMUFile error, we should stop here;
3609          * "addr" may be invalid in that case.
3610         */
3611        ret = qemu_file_get_error(f);
3612        if (ret) {
3613            break;
3614        }
3615
3616        flags = addr & ~TARGET_PAGE_MASK;
3617        addr &= TARGET_PAGE_MASK;
3618
3619        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3620        place_needed = false;
3621        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
3622            block = ram_block_from_stream(f, flags);
3623
3624            host = host_from_ram_block_offset(block, addr);
3625            if (!host) {
3626                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3627                ret = -EINVAL;
3628                break;
3629            }
3630            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3631            /*
3632             * Postcopy requires that we place whole host pages atomically;
3633             * these may be huge pages for RAMBlocks that are backed by
3634             * hugetlbfs.
3635             * To make it atomic, the data is read into a temporary page
3636             * that's moved into place later.
3637             * The migration protocol uses (possibly smaller) target pages;
3638             * however the source ensures it always sends all the components
3639             * of a host page in order.
3640             */
3641            page_buffer = postcopy_host_page +
3642                          ((uintptr_t)host & (block->page_size - 1));
3643            /* If all target pages are zero then we can optimise the place */
3644            if (!((uintptr_t)host & (block->page_size - 1))) {
3645                all_zero = true;
3646            } else {
3647                /* not the first target page within the host page */
3648                if (host != (last_host + TARGET_PAGE_SIZE)) {
3649                    error_report("Non-sequential target page %p/%p",
3650                                  host, last_host);
3651                    ret = -EINVAL;
3652                    break;
3653                }
3654            }
3655
3656
3657            /*
3658             * If it's the last part of a host page then we place the host
3659             * page
3660             */
3661            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
3662                                     (block->page_size - 1)) == 0;
3663            place_source = postcopy_host_page;
3664        }
3665        last_host = host;
3666
3667        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3668        case RAM_SAVE_FLAG_ZERO:
3669            ch = qemu_get_byte(f);
3670            memset(page_buffer, ch, TARGET_PAGE_SIZE);
3671            if (ch) {
3672                all_zero = false;
3673            }
3674            break;
3675
3676        case RAM_SAVE_FLAG_PAGE:
3677            all_zero = false;
3678            if (!matches_target_page_size) {
3679                /* For huge pages, we always use the temporary buffer */
3680                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3681            } else {
3682                /*
3683                 * For small pages that match the target page size, we
3684                 * avoid the qemu_file copy.  Instead we directly use
3685                 * the buffer of QEMUFile to place the page.  Note: we
3686                 * cannot do any QEMUFile operation before using that
3687                 * buffer to make sure the buffer is valid when
3688                 * placing the page.
3689                 */
3690                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3691                                         TARGET_PAGE_SIZE);
3692            }
3693            break;
3694        case RAM_SAVE_FLAG_EOS:
3695            /* normal exit */
3696            multifd_recv_sync_main();
3697            break;
3698        default:
3699            error_report("Unknown combination of migration flags: %#x"
3700                         " (postcopy mode)", flags);
3701            ret = -EINVAL;
3702            break;
3703        }
3704
3705        /* Check for any possible file errors */
3706        if (!ret && qemu_file_get_error(f)) {
3707            ret = qemu_file_get_error(f);
3708        }
3709
3710        if (!ret && place_needed) {
3711            /* This gets called at the last target page in the host page */
3712            void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
3713
3714            if (all_zero) {
3715                ret = postcopy_place_page_zero(mis, place_dest,
3716                                               block);
3717            } else {
3718                ret = postcopy_place_page(mis, place_dest,
3719                                          place_source, block);
3720            }
3721        }
3722    }
3723
3724    return ret;
3725}
3726
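/*
 * Returns true once the incoming side has been advised of postcopy,
 * i.e. the postcopy state is between ADVISE and END.
 */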
3727static bool postcopy_is_advised(void)
3728{
3729    PostcopyState ps = postcopy_state_get();
3730    return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3731}
3732
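/*
 * Returns true while the incoming side is actually running in postcopy
 * mode, i.e. the postcopy state is between LISTENING and END.
 */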
3733static bool postcopy_is_running(void)
3734{
3735    PostcopyState ps = postcopy_state_get();
3736    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3737}
3738
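/**
 * ram_load: load RAM sent by the source
 *
 * Returns 0 for success or a negative error code
 *
 * In postcopy mode this delegates to ram_load_postcopy(); otherwise it
 * processes the precopy stream (RAM block list, zero, full, compressed
 * and XBZRLE pages) until RAM_SAVE_FLAG_EOS is seen.
 *
 * @f: QEMUFile where to receive the data
 * @opaque: RAMState pointer
 * @version_id: stream version; only version 4 is accepted
 */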
3739static int ram_load(QEMUFile *f, void *opaque, int version_id)
3740{
3741    int flags = 0, ret = 0, invalid_flags = 0;
3742    static uint64_t seq_iter;
3743    int len = 0;
3744    /*
3745     * If the system is running in postcopy mode, page inserts into host
3746     * memory must be atomic.
3747     */
3748    bool postcopy_running = postcopy_is_running();
3749    /* ADVISE comes earlier; it shows the source has the postcopy capability on */
3750    bool postcopy_advised = postcopy_is_advised();
3751
3752    seq_iter++;
3753
3754    if (version_id != 4) {
3755        ret = -EINVAL;
3756    }
3757
3758    if (!migrate_use_compression()) {
3759        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3760    }
3761    /* This RCU critical section can be very long running.
3762     * If RCU reclamation in this code becomes frequent,
3763     * it will be necessary to reduce the granularity of this
3764     * critical section.
3765     */
3766    rcu_read_lock();
3767
3768    if (postcopy_running) {
3769        ret = ram_load_postcopy(f);
3770    }
3771
3772    while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3773        ram_addr_t addr, total_ram_bytes;
3774        void *host = NULL;
3775        uint8_t ch;
3776
3777        addr = qemu_get_be64(f);
3778        flags = addr & ~TARGET_PAGE_MASK;
3779        addr &= TARGET_PAGE_MASK;
3780
3781        if (flags & invalid_flags) {
3782            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3783                error_report("Received an unexpected compressed page");
3784            }
3785
3786            ret = -EINVAL;
3787            break;
3788        }
3789
3790        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3791                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3792            RAMBlock *block = ram_block_from_stream(f, flags);
3793
3794            host = host_from_ram_block_offset(block, addr);
3795            if (!host) {
3796                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3797                ret = -EINVAL;
3798                break;
3799            }
3800            ramblock_recv_bitmap_set(block, host);
3801            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3802        }
3803
3804        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3805        case RAM_SAVE_FLAG_MEM_SIZE:
3806            /* Synchronize RAM block list */
3807            total_ram_bytes = addr;
3808            while (!ret && total_ram_bytes) {
3809                RAMBlock *block;
3810                char id[256];
3811                ram_addr_t length;
3812
3813                len = qemu_get_byte(f);
3814                qemu_get_buffer(f, (uint8_t *)id, len);
3815                id[len] = 0;
3816                length = qemu_get_be64(f);
3817
3818                block = qemu_ram_block_by_name(id);
3819                if (block && !qemu_ram_is_migratable(block)) {
3820                    error_report("block %s should not be migrated!", id);
3821                    ret = -EINVAL;
3822                } else if (block) {
3823                    if (length != block->used_length) {
3824                        Error *local_err = NULL;
3825
3826                        ret = qemu_ram_resize(block, length,
3827                                              &local_err);
3828                        if (local_err) {
3829                            error_report_err(local_err);
3830                        }
3831                    }
3832                    /* For postcopy we need to check that hugepage sizes match */
3833                    if (postcopy_advised &&
3834                        block->page_size != qemu_host_page_size) {
3835                        uint64_t remote_page_size = qemu_get_be64(f);
3836                        if (remote_page_size != block->page_size) {
3837                            error_report("Mismatched RAM page size %s "
3838                                         "(local) %zu != %" PRIu64,
3839                                         id, block->page_size,
3840                                         remote_page_size);
3841                            ret = -EINVAL;
3842                        }
3843                    }
3844                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3845                                          block->idstr);
3846                } else {
3847                    error_report("Unknown ramblock \"%s\", cannot "
3848                                 "accept migration", id);
3849                    ret = -EINVAL;
3850                }
3851
3852                total_ram_bytes -= length;
3853            }
3854            break;
3855
3856        case RAM_SAVE_FLAG_ZERO:
3857            ch = qemu_get_byte(f);
3858            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3859            break;
3860
3861        case RAM_SAVE_FLAG_PAGE:
3862            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3863            break;
3864
3865        case RAM_SAVE_FLAG_COMPRESS_PAGE:
3866            len = qemu_get_be32(f);
3867            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3868                error_report("Invalid compressed data length: %d", len);
3869                ret = -EINVAL;
3870                break;
3871            }
3872            decompress_data_with_multi_threads(f, host, len);
3873            break;
3874
3875        case RAM_SAVE_FLAG_XBZRLE:
3876            if (load_xbzrle(f, addr, host) < 0) {
3877                error_report("Failed to decompress XBZRLE page at "
3878                             RAM_ADDR_FMT, addr);
3879                ret = -EINVAL;
3880                break;
3881            }
3882            break;
3883        case RAM_SAVE_FLAG_EOS:
3884            /* normal exit */
3885            multifd_recv_sync_main();
3886            break;
3887        default:
3888            if (flags & RAM_SAVE_FLAG_HOOK) {
3889                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3890            } else {
3891                error_report("Unknown combination of migration flags: %#x",
3892                             flags);
3893                ret = -EINVAL;
3894            }
3895        }
3896        if (!ret) {
3897            ret = qemu_file_get_error(f);
3898        }
3899    }
3900
3901    ret |= wait_for_decompress_done();
3902    rcu_read_unlock();
3903    trace_ram_load_complete(ret, seq_iter);
3904    return ret;
3905}
3906
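/* Returns true if postcopy-ram migration of RAM is enabled */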
3907static bool ram_has_postcopy(void *opaque)
3908{
3909    return migrate_postcopy_ram();
3910}
3911
3912/* Sync all the dirty bitmaps with the destination VM.  */
3913static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3914{
3915    RAMBlock *block;
3916    QEMUFile *file = s->to_dst_file;
3917    int ramblock_count = 0;
3918
3919    trace_ram_dirty_bitmap_sync_start();
3920
3921    RAMBLOCK_FOREACH_MIGRATABLE(block) {
3922        qemu_savevm_send_recv_bitmap(file, block->idstr);
3923        trace_ram_dirty_bitmap_request(block->idstr);
3924        ramblock_count++;
3925    }
3926
3927    trace_ram_dirty_bitmap_sync_wait();
3928
3929    /* Wait until all the ramblocks' dirty bitmaps are synced */
3930    while (ramblock_count--) {
3931        qemu_sem_wait(&s->rp_state.rp_sem);
3932    }
3933
3934    trace_ram_dirty_bitmap_sync_complete();
3935
3936    return 0;
3937}
3938
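/*
 * Wake up ram_dirty_bitmap_sync_all(), which waits on rp_sem once per
 * ramblock whose dirty bitmap it has requested from the destination.
 */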
3939static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3940{
3941    qemu_sem_post(&s->rp_state.rp_sem);
3942}
3943
3944/*
3945 * Read the received bitmap and invert it to form the initial dirty bitmap.
3946 * This is only used when a postcopy migration is paused and wants
3947 * to resume from a middle point.
3948 */
3949int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3950{
3951    int ret = -EINVAL;
3952    QEMUFile *file = s->rp_state.from_dst_file;
3953    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3954    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3955    uint64_t size, end_mark;
3956
3957    trace_ram_dirty_bitmap_reload_begin(block->idstr);
3958
3959    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3960        error_report("%s: incorrect state %s", __func__,
3961                     MigrationStatus_str(s->state));
3962        return -EINVAL;
3963    }
3964
3965    /*
3966     * Note: see comments in ramblock_recv_bitmap_send() on why we
3967     * need the endianness conversion and the padding.
3968     */
3969    local_size = ROUND_UP(local_size, 8);
3970
3971    /* Add padding */
3972    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3973
3974    size = qemu_get_be64(file);
3975
3976    /* The size of the bitmap should match that of our ramblock */
3977    if (size != local_size) {
3978        error_report("%s: ramblock '%s' bitmap size mismatch "
3979                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3980                     block->idstr, size, local_size);
3981        ret = -EINVAL;
3982        goto out;
3983    }
3984
3985    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3986    end_mark = qemu_get_be64(file);
3987
3988    ret = qemu_file_get_error(file);
3989    if (ret || size != local_size) {
3990        error_report("%s: read bitmap failed for ramblock '%s': %d"
3991                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3992                     __func__, block->idstr, ret, local_size, size);
3993        ret = -EIO;
3994        goto out;
3995    }
3996
3997    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3998        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
3999                     __func__, block->idstr, end_mark);
4000        ret = -EINVAL;
4001        goto out;
4002    }
4003
4004    /*
4005     * Endianness conversion.  We are in postcopy (though paused);
4006     * the dirty bitmap won't change, so we can modify it directly.
4007     */
4008    bitmap_from_le(block->bmap, le_bitmap, nbits);
4009
4010    /*
4011     * What we received is the "received bitmap".  Invert it to form
4012     * the initial dirty bitmap for this ramblock.
4013     */
4014    bitmap_complement(block->bmap, block->bmap, nbits);
4015
4016    trace_ram_dirty_bitmap_reload_complete(block->idstr);
4017
4018    /*
4019     * We succeeded in syncing the bitmap for the current ramblock.  If
4020     * this is the last one to sync, we need to notify the main send thread.
4021     */
4022    ram_dirty_bitmap_reload_notify(s);
4023
4024    ret = 0;
4025out:
4026    g_free(le_bitmap);
4027    return ret;
4028}
4029
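/**
 * ram_resume_prepare: prepare RAM for a postcopy resume
 *
 * Returns 0 on success or the error from the dirty bitmap sync
 *
 * Re-synchronizes every ramblock's dirty bitmap with the destination,
 * then calls ram_state_resume_prepare() to ready the RAMState for the
 * resumed migration stream.
 *
 * @s: current migration state
 * @opaque: pointer to the RAMState pointer
 */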
4030static int ram_resume_prepare(MigrationState *s, void *opaque)
4031{
4032    RAMState *rs = *(RAMState **)opaque;
4033    int ret;
4034
4035    ret = ram_dirty_bitmap_sync_all(s, rs);
4036    if (ret) {
4037        return ret;
4038    }
4039
4040    ram_state_resume_prepare(rs, s->to_dst_file);
4041
4042    return 0;
4043}
4044
4045static SaveVMHandlers savevm_ram_handlers = {
4046    .save_setup = ram_save_setup,
4047    .save_live_iterate = ram_save_iterate,
4048    .save_live_complete_postcopy = ram_save_complete,
4049    .save_live_complete_precopy = ram_save_complete,
4050    .has_postcopy = ram_has_postcopy,
4051    .save_live_pending = ram_save_pending,
4052    .load_state = ram_load,
4053    .save_cleanup = ram_save_cleanup,
4054    .load_setup = ram_load_setup,
4055    .load_cleanup = ram_load_cleanup,
4056    .resume_prepare = ram_resume_prepare,
4057};
4058
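/*
 * Register the "ram" section with the savevm live-migration machinery,
 * using stream version 4 and the savevm_ram_handlers above; also
 * initializes the XBZRLE lock.
 */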
4059void ram_mig_init(void)
4060{
4061    qemu_mutex_init(&XBZRLE.lock);
4062    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
4063}
4064