qemu/migration/ram.c
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2011-2015 Red Hat Inc
   6 *
   7 * Authors:
   8 *  Juan Quintela <quintela@redhat.com>
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28
  29#include "qemu/osdep.h"
  30#include "cpu.h"
  31#include "qemu/cutils.h"
  32#include "qemu/bitops.h"
  33#include "qemu/bitmap.h"
  34#include "qemu/main-loop.h"
  35#include "xbzrle.h"
  36#include "ram.h"
  37#include "migration.h"
  38#include "migration/register.h"
  39#include "migration/misc.h"
  40#include "qemu-file.h"
  41#include "postcopy-ram.h"
  42#include "page_cache.h"
  43#include "qemu/error-report.h"
  44#include "qapi/error.h"
  45#include "qapi/qapi-types-migration.h"
  46#include "qapi/qapi-events-migration.h"
  47#include "qapi/qmp/qerror.h"
  48#include "trace.h"
  49#include "exec/ram_addr.h"
  50#include "exec/target_page.h"
  51#include "qemu/rcu_queue.h"
  52#include "migration/colo.h"
  53#include "block.h"
  54#include "sysemu/sysemu.h"
  55#include "savevm.h"
  56#include "qemu/iov.h"
  57#include "multifd.h"
  58
  59/***********************************************************/
  60/* ram save/restore */
  61
/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value, and renamed it to avoid
 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
 */
  67
  68#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  69#define RAM_SAVE_FLAG_ZERO     0x02
  70#define RAM_SAVE_FLAG_MEM_SIZE 0x04
  71#define RAM_SAVE_FLAG_PAGE     0x08
  72#define RAM_SAVE_FLAG_EOS      0x10
  73#define RAM_SAVE_FLAG_CONTINUE 0x20
  74#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
  76#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
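
/*
 * These flags are OR'd into the low bits of the 64-bit page offset that
 * save_page_header() puts on the wire, so they must stay below the
 * target page size.
 */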
  77
  78static inline bool is_zero_range(uint8_t *p, uint64_t size)
  79{
  80    return buffer_is_zero(p, size);
  81}
  82
  83XBZRLECacheStats xbzrle_counters;
  84
  85/* struct contains XBZRLE cache and a static page
  86   used by the compression */
  87static struct {
  88    /* buffer used for XBZRLE encoding */
  89    uint8_t *encoded_buf;
  90    /* buffer for storing page content */
  91    uint8_t *current_buf;
  92    /* Cache for XBZRLE, Protected by lock. */
  93    PageCache *cache;
  94    QemuMutex lock;
  95    /* it will store a page full of zeros */
  96    uint8_t *zero_target_page;
  97    /* buffer used for XBZRLE decoding */
  98    uint8_t *decoded_buf;
  99} XBZRLE;
 100
static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}
 112
 113/**
 114 * xbzrle_cache_resize: resize the xbzrle cache
 115 *
 116 * This function is called from qmp_migrate_set_cache_size in main
 117 * thread, possibly while a migration is in progress.  A running
 118 * migration may be using the cache and might finish during this call,
 119 * hence changes to the cache are protected by XBZRLE.lock().
 120 *
 121 * Returns 0 for success or -1 for error
 122 *
 123 * @new_size: new cache size
 124 * @errp: set *errp if the check failed, with reason
 125 */
 126int xbzrle_cache_resize(int64_t new_size, Error **errp)
 127{
 128    PageCache *new_cache;
 129    int64_t ret = 0;
 130
 131    /* Check for truncation */
 132    if (new_size != (size_t)new_size) {
 133        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 134                   "exceeding address space");
 135        return -1;
 136    }
 137
 138    if (new_size == migrate_xbzrle_cache_size()) {
 139        /* nothing to do */
 140        return 0;
 141    }
 142
 143    XBZRLE_cache_lock();
 144
 145    if (XBZRLE.cache != NULL) {
 146        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 147        if (!new_cache) {
 148            ret = -1;
 149            goto out;
 150        }
 151
 152        cache_fini(XBZRLE.cache);
 153        XBZRLE.cache = new_cache;
 154    }
 155out:
 156    XBZRLE_cache_unlock();
 157    return ret;
 158}
 159
 160static bool ramblock_is_ignored(RAMBlock *block)
 161{
 162    return !qemu_ram_is_migratable(block) ||
 163           (migrate_ignore_shared() && qemu_ram_is_shared(block));
 164}
 165
 166/* Should be holding either ram_list.mutex, or the RCU lock. */
 167#define RAMBLOCK_FOREACH_NOT_IGNORED(block)            \
 168    INTERNAL_RAMBLOCK_FOREACH(block)                   \
 169        if (ramblock_is_ignored(block)) {} else
 170
 171#define RAMBLOCK_FOREACH_MIGRATABLE(block)             \
 172    INTERNAL_RAMBLOCK_FOREACH(block)                   \
 173        if (!qemu_ram_is_migratable(block)) {} else
 174
 175#undef RAMBLOCK_FOREACH
 176
 177int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 178{
 179    RAMBlock *block;
 180    int ret = 0;
 181
 182    RCU_READ_LOCK_GUARD();
 183
 184    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 185        ret = func(block, opaque);
 186        if (ret) {
 187            break;
 188        }
 189    }
 190    return ret;
 191}
 192
 193static void ramblock_recv_map_init(void)
 194{
 195    RAMBlock *rb;
 196
 197    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 198        assert(!rb->receivedmap);
 199        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 200    }
 201}
 202
 203int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 204{
 205    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 206                    rb->receivedmap);
 207}
 208
 209bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 210{
 211    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 212}
 213
 214void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 215{
 216    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 217}
 218
 219void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 220                                    size_t nr)
 221{
 222    bitmap_set_atomic(rb->receivedmap,
 223                      ramblock_recv_bitmap_offset(host_addr, rb),
 224                      nr);
 225}
 226
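/*
 * Sentinel value appended after the bitmap itself so that the receiving
 * side can detect a corrupted or truncated bitmap stream.
 */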
 227#define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 228
/*
 * Format: bitmap_size (8 bytes, big endian) + whole_bitmap (N bytes,
 * little endian, padded to a multiple of 8 bytes) + ending marker (8 bytes).
 *
 * Returns >0 (the sent bytes) on success, or <0 on error.
 */
 234int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 235                                  const char *block_name)
 236{
 237    RAMBlock *block = qemu_ram_block_by_name(block_name);
 238    unsigned long *le_bitmap, nbits;
 239    uint64_t size;
 240
 241    if (!block) {
 242        error_report("%s: invalid block name: %s", __func__, block_name);
 243        return -1;
 244    }
 245
 246    nbits = block->used_length >> TARGET_PAGE_BITS;
 247
    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit beforehand.
     */
 253    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 254
    /*
     * Always use little endian when sending the bitmap. This is
     * required when the source and destination VMs are not using the
     * same endianness. (Note: big endian won't work.)
     */
 260    bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 261
 262    /* Size of the bitmap, in bytes */
 263    size = DIV_ROUND_UP(nbits, 8);
 264
 265    /*
 266     * size is always aligned to 8 bytes for 64bit machines, but it
 267     * may not be true for 32bit machines. We need this padding to
 268     * make sure the migration can survive even between 32bit and
 269     * 64bit machines.
 270     */
 271    size = ROUND_UP(size, 8);
 272
 273    qemu_put_be64(file, size);
 274    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
 279    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 280    qemu_fflush(file);
 281
 282    g_free(le_bitmap);
 283
 284    if (qemu_file_get_error(file)) {
 285        return qemu_file_get_error(file);
 286    }
 287
 288    return size + sizeof(size);
 289}
 290
 291/*
 292 * An outstanding page request, on the source, having been received
 293 * and queued
 294 */
 295struct RAMSrcPageRequest {
 296    RAMBlock *rb;
 297    hwaddr    offset;
 298    hwaddr    len;
 299
 300    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 301};
 302
 303/* State of RAM for migration */
 304struct RAMState {
 305    /* QEMUFile used for this migration */
 306    QEMUFile *f;
 307    /* Last block that we have visited searching for dirty pages */
 308    RAMBlock *last_seen_block;
 309    /* Last block from where we have sent data */
 310    RAMBlock *last_sent_block;
 311    /* Last dirty target page we have sent */
 312    ram_addr_t last_page;
 313    /* last ram version we have seen */
 314    uint32_t last_version;
 315    /* We are in the first round */
 316    bool ram_bulk_stage;
 317    /* The free page optimization is enabled */
 318    bool fpo_enabled;
 319    /* How many times we have dirty too many pages */
 320    int dirty_rate_high_cnt;
 321    /* these variables are used for bitmap sync */
 322    /* last time we did a full bitmap_sync */
 323    int64_t time_last_bitmap_sync;
 324    /* bytes transferred at start_time */
 325    uint64_t bytes_xfer_prev;
 326    /* number of dirty pages since start_time */
 327    uint64_t num_dirty_pages_period;
 328    /* xbzrle misses since the beginning of the period */
 329    uint64_t xbzrle_cache_miss_prev;
 330
    /* compression statistics since the beginning of the period */
    /* number of times no free thread was available to compress data */
    uint64_t compress_thread_busy_prev;
    /* number of bytes after compression */
    uint64_t compressed_size_prev;
    /* number of compressed pages */
    uint64_t compress_pages_prev;
 338
 339    /* total handled target pages at the beginning of period */
 340    uint64_t target_page_count_prev;
 341    /* total handled target pages since start */
 342    uint64_t target_page_count;
 343    /* number of dirty bits in the bitmap */
 344    uint64_t migration_dirty_pages;
 345    /* Protects modification of the bitmap and migration dirty pages */
 346    QemuMutex bitmap_mutex;
 347    /* The RAMBlock used in the last src_page_requests */
 348    RAMBlock *last_req_rb;
 349    /* Queue of outstanding page requests from the destination */
 350    QemuMutex src_page_req_mutex;
 351    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 352};
 353typedef struct RAMState RAMState;
 354
 355static RAMState *ram_state;
 356
 357static NotifierWithReturnList precopy_notifier_list;
 358
 359void precopy_infrastructure_init(void)
 360{
 361    notifier_with_return_list_init(&precopy_notifier_list);
 362}
 363
 364void precopy_add_notifier(NotifierWithReturn *n)
 365{
 366    notifier_with_return_list_add(&precopy_notifier_list, n);
 367}
 368
 369void precopy_remove_notifier(NotifierWithReturn *n)
 370{
 371    notifier_with_return_remove(n);
 372}
 373
 374int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 375{
 376    PrecopyNotifyData pnd;
 377    pnd.reason = reason;
 378    pnd.errp = errp;
 379
 380    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 381}
 382
 383void precopy_enable_free_page_optimization(void)
 384{
 385    if (!ram_state) {
 386        return;
 387    }
 388
 389    ram_state->fpo_enabled = true;
 390}
 391
 392uint64_t ram_bytes_remaining(void)
 393{
 394    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 395                       0;
 396}
 397
 398MigrationStats ram_counters;
 399
 400/* used by the search for pages to send */
 401struct PageSearchStatus {
 402    /* Current block being searched */
 403    RAMBlock    *block;
 404    /* Current page to search from */
 405    unsigned long page;
 406    /* Set once we wrap around */
 407    bool         complete_round;
 408};
 409typedef struct PageSearchStatus PageSearchStatus;
 410
 411CompressionStats compression_counters;
 412
 413struct CompressParam {
 414    bool done;
 415    bool quit;
 416    bool zero_page;
 417    QEMUFile *file;
 418    QemuMutex mutex;
 419    QemuCond cond;
 420    RAMBlock *block;
 421    ram_addr_t offset;
 422
 423    /* internally used fields */
 424    z_stream stream;
 425    uint8_t *originbuf;
 426};
 427typedef struct CompressParam CompressParam;
 428
 429struct DecompressParam {
 430    bool done;
 431    bool quit;
 432    QemuMutex mutex;
 433    QemuCond cond;
 434    void *des;
 435    uint8_t *compbuf;
 436    int len;
 437    z_stream stream;
 438};
 439typedef struct DecompressParam DecompressParam;
 440
 441static CompressParam *comp_param;
 442static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used together with comp_done_cond.
 */
 447static QemuMutex comp_done_lock;
 448static QemuCond comp_done_cond;
 449/* The empty QEMUFileOps will be used by file in CompressParam */
 450static const QEMUFileOps empty_ops = { };
 451
 452static QEMUFile *decomp_file;
 453static DecompressParam *decomp_param;
 454static QemuThread *decompress_threads;
 455static QemuMutex decomp_done_lock;
 456static QemuCond decomp_done_cond;
 457
 458static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 459                                 ram_addr_t offset, uint8_t *source_buf);
 460
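/*
 * Compression worker thread.  It waits on param->cond until param->block
 * is set (under param->mutex), compresses that page into param->file, and
 * then publishes the result by setting param->done and param->zero_page
 * under comp_done_lock and signalling comp_done_cond.
 */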
 461static void *do_data_compress(void *opaque)
 462{
 463    CompressParam *param = opaque;
 464    RAMBlock *block;
 465    ram_addr_t offset;
 466    bool zero_page;
 467
 468    qemu_mutex_lock(&param->mutex);
 469    while (!param->quit) {
 470        if (param->block) {
 471            block = param->block;
 472            offset = param->offset;
 473            param->block = NULL;
 474            qemu_mutex_unlock(&param->mutex);
 475
 476            zero_page = do_compress_ram_page(param->file, &param->stream,
 477                                             block, offset, param->originbuf);
 478
 479            qemu_mutex_lock(&comp_done_lock);
 480            param->done = true;
 481            param->zero_page = zero_page;
 482            qemu_cond_signal(&comp_done_cond);
 483            qemu_mutex_unlock(&comp_done_lock);
 484
 485            qemu_mutex_lock(&param->mutex);
 486        } else {
 487            qemu_cond_wait(&param->cond, &param->mutex);
 488        }
 489    }
 490    qemu_mutex_unlock(&param->mutex);
 491
 492    return NULL;
 493}
 494
 495static void compress_threads_save_cleanup(void)
 496{
 497    int i, thread_count;
 498
 499    if (!migrate_use_compression() || !comp_param) {
 500        return;
 501    }
 502
 503    thread_count = migrate_compress_threads();
 504    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator of whether the thread is
         * properly initialized or not
         */
 509        if (!comp_param[i].file) {
 510            break;
 511        }
 512
 513        qemu_mutex_lock(&comp_param[i].mutex);
 514        comp_param[i].quit = true;
 515        qemu_cond_signal(&comp_param[i].cond);
 516        qemu_mutex_unlock(&comp_param[i].mutex);
 517
 518        qemu_thread_join(compress_threads + i);
 519        qemu_mutex_destroy(&comp_param[i].mutex);
 520        qemu_cond_destroy(&comp_param[i].cond);
 521        deflateEnd(&comp_param[i].stream);
 522        g_free(comp_param[i].originbuf);
 523        qemu_fclose(comp_param[i].file);
 524        comp_param[i].file = NULL;
 525    }
 526    qemu_mutex_destroy(&comp_done_lock);
 527    qemu_cond_destroy(&comp_done_cond);
 528    g_free(compress_threads);
 529    g_free(comp_param);
 530    compress_threads = NULL;
 531    comp_param = NULL;
 532}
 533
 534static int compress_threads_save_setup(void)
 535{
 536    int i, thread_count;
 537
 538    if (!migrate_use_compression()) {
 539        return 0;
 540    }
 541    thread_count = migrate_compress_threads();
 542    compress_threads = g_new0(QemuThread, thread_count);
 543    comp_param = g_new0(CompressParam, thread_count);
 544    qemu_cond_init(&comp_done_cond);
 545    qemu_mutex_init(&comp_done_lock);
 546    for (i = 0; i < thread_count; i++) {
 547        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 548        if (!comp_param[i].originbuf) {
 549            goto exit;
 550        }
 551
 552        if (deflateInit(&comp_param[i].stream,
 553                        migrate_compress_level()) != Z_OK) {
 554            g_free(comp_param[i].originbuf);
 555            goto exit;
 556        }
 557
 558        /* comp_param[i].file is just used as a dummy buffer to save data,
 559         * set its ops to empty.
 560         */
 561        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 562        comp_param[i].done = true;
 563        comp_param[i].quit = false;
 564        qemu_mutex_init(&comp_param[i].mutex);
 565        qemu_cond_init(&comp_param[i].cond);
 566        qemu_thread_create(compress_threads + i, "compress",
 567                           do_data_compress, comp_param + i,
 568                           QEMU_THREAD_JOINABLE);
 569    }
 570    return 0;
 571
 572exit:
 573    compress_threads_save_cleanup();
 574    return -1;
 575}
 576
/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @rs: current RAM state
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page;
 *          the lower bits contain flags
 */
 589static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 590                               ram_addr_t offset)
 591{
 592    size_t size, len;
 593
 594    if (block == rs->last_sent_block) {
 595        offset |= RAM_SAVE_FLAG_CONTINUE;
 596    }
 597    qemu_put_be64(f, offset);
 598    size = 8;
 599
 600    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 601        len = strlen(block->idstr);
 602        qemu_put_byte(f, len);
 603        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 604        size += 1 + len;
 605        rs->last_sent_block = block;
 606    }
 607    return size;
 608}
 609
 610/**
 * mig_throttle_guest_down: throttle down the guest
 612 *
 613 * Reduce amount of guest cpu execution to hopefully slow down memory
 614 * writes. If guest dirty memory rate is reduced below the rate at
 615 * which we can transfer pages to the destination then we should be
 616 * able to complete migration. Some workloads dirty memory way too
 617 * fast and will not effectively converge, even with auto-converge.
 618 */
 619static void mig_throttle_guest_down(void)
 620{
 621    MigrationState *s = migrate_get_current();
 622    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 624    int pct_max = s->parameters.max_cpu_throttle;
 625
 626    /* We have not started throttling yet. Let's start it. */
 627    if (!cpu_throttle_active()) {
 628        cpu_throttle_set(pct_initial);
 629    } else {
 630        /* Throttling already on, just increase the rate */
        cpu_throttle_set(MIN(cpu_throttle_get_percentage() + pct_increment,
                         pct_max));
 633    }
 634}
 635
 636/**
 637 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 638 *
 639 * @rs: current RAM state
 640 * @current_addr: address for the zero page
 641 *
 642 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 643 * The important thing is that a stale (not-yet-0'd) page be replaced
 644 * by the new data.
 645 * As a bonus, if the page wasn't in the cache it gets added so that
 646 * when a small write is made into the 0'd page it gets XBZRLE sent.
 647 */
 648static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 649{
 650    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 651        return;
 652    }
 653
 654    /* We don't care if this fails to allocate a new cache page
 655     * as long as it updated an old one */
 656    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 657                 ram_counters.dirty_sync_count);
 658}
 659
 660#define ENCODING_FLAG_XBZRLE 0x1
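
/*
 * On the wire an XBZRLE page consists of the usual page header with
 * RAM_SAVE_FLAG_XBZRLE set, a one byte encoding flag (ENCODING_FLAG_XBZRLE),
 * a two byte encoded length and the encoded data itself; see
 * save_xbzrle_page() below.
 */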
 661
 662/**
 663 * save_xbzrle_page: compress and send current page
 664 *
 665 * Returns: 1 means that we wrote the page
 *          0 means that the page is identical to the one already sent
 667 *          -1 means that xbzrle would be longer than normal
 668 *
 669 * @rs: current RAM state
 670 * @current_data: pointer to the address of the page contents
 671 * @current_addr: addr of the page
 672 * @block: block that contains the page we want to send
 673 * @offset: offset inside the block for the page
 674 * @last_stage: if we are at the completion stage
 675 */
 676static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 677                            ram_addr_t current_addr, RAMBlock *block,
 678                            ram_addr_t offset, bool last_stage)
 679{
 680    int encoded_len = 0, bytes_xbzrle;
 681    uint8_t *prev_cached_page;
 682
 683    if (!cache_is_cached(XBZRLE.cache, current_addr,
 684                         ram_counters.dirty_sync_count)) {
 685        xbzrle_counters.cache_miss++;
 686        if (!last_stage) {
 687            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 688                             ram_counters.dirty_sync_count) == -1) {
 689                return -1;
 690            } else {
 691                /* update *current_data when the page has been
 692                   inserted into cache */
 693                *current_data = get_cached_data(XBZRLE.cache, current_addr);
 694            }
 695        }
 696        return -1;
 697    }
 698
 699    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 700
 701    /* save current buffer into memory */
 702    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 703
 704    /* XBZRLE encoding (if there is no overflow) */
 705    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 706                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 707                                       TARGET_PAGE_SIZE);
 708
 709    /*
 710     * Update the cache contents, so that it corresponds to the data
 711     * sent, in all cases except where we skip the page.
 712     */
 713    if (!last_stage && encoded_len != 0) {
 714        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 715        /*
 716         * In the case where we couldn't compress, ensure that the caller
 717         * sends the data from the cache, since the guest might have
 718         * changed the RAM since we copied it.
 719         */
 720        *current_data = prev_cached_page;
 721    }
 722
 723    if (encoded_len == 0) {
 724        trace_save_xbzrle_page_skipping();
 725        return 0;
 726    } else if (encoded_len == -1) {
 727        trace_save_xbzrle_page_overflow();
 728        xbzrle_counters.overflow++;
 729        return -1;
 730    }
 731
 732    /* Send XBZRLE based compressed page */
 733    bytes_xbzrle = save_page_header(rs, rs->f, block,
 734                                    offset | RAM_SAVE_FLAG_XBZRLE);
 735    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 736    qemu_put_be16(rs->f, encoded_len);
 737    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 738    bytes_xbzrle += encoded_len + 1 + 2;
 739    xbzrle_counters.pages++;
 740    xbzrle_counters.bytes += bytes_xbzrle;
 741    ram_counters.transferred += bytes_xbzrle;
 742
 743    return 1;
 744}
 745
 746/**
 747 * migration_bitmap_find_dirty: find the next dirty page from start
 748 *
 749 * Returns the page offset within memory region of the start of a dirty page
 750 *
 751 * @rs: current RAM state
 752 * @rb: RAMBlock where to search for dirty pages
 753 * @start: page where we start the search
 754 */
 755static inline
 756unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 757                                          unsigned long start)
 758{
 759    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 760    unsigned long *bitmap = rb->bmap;
 761    unsigned long next;
 762
 763    if (ramblock_is_ignored(rb)) {
 764        return size;
 765    }
 766
 767    /*
 768     * When the free page optimization is enabled, we need to check the bitmap
 769     * to send the non-free pages rather than all the pages in the bulk stage.
 770     */
 771    if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
 772        next = start + 1;
 773    } else {
 774        next = find_next_bit(bitmap, size, start);
 775    }
 776
 777    return next;
 778}
 779
 780static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 781                                                RAMBlock *rb,
 782                                                unsigned long page)
 783{
 784    bool ret;
 785
 786    qemu_mutex_lock(&rs->bitmap_mutex);
 787
    /*
     * Clear dirty bitmap if needed.  This _must_ be called before we
     * send any of the pages in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time.  So as long as we are going to send any of
     * the pages in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
 796    if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
 797        uint8_t shift = rb->clear_bmap_shift;
 798        hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
 799        hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
 800
        /*
         * CLEAR_BITMAP_SHIFT_MIN should always guarantee this.  It also
         * makes things easier, since the start address of the small
         * chunk will always be aligned to 64 pages, so the bitmap will
         * always be aligned to unsigned long.  We should even be able
         * to remove this restriction, but it is kept for now.
         */
 809        assert(shift >= 6);
 810        trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 811        memory_region_clear_dirty_bitmap(rb->mr, start, size);
 812    }
 813
 814    ret = test_and_clear_bit(page, rb->bmap);
 815
 816    if (ret) {
 817        rs->migration_dirty_pages--;
 818    }
 819    qemu_mutex_unlock(&rs->bitmap_mutex);
 820
 821    return ret;
 822}
 823
 824/* Called with RCU critical section */
 825static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 826{
 827    rs->migration_dirty_pages +=
 828        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length,
 829                                              &rs->num_dirty_pages_period);
 830}
 831
 832/**
 833 * ram_pagesize_summary: calculate all the pagesizes of a VM
 834 *
 835 * Returns a summary bitmap of the page sizes of all RAMBlocks
 836 *
 837 * For VMs with just normal pages this is equivalent to the host page
 838 * size. If it's got some huge pages then it's the OR of all the
 839 * different page sizes.
 840 */
 841uint64_t ram_pagesize_summary(void)
 842{
 843    RAMBlock *block;
 844    uint64_t summary = 0;
 845
 846    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 847        summary |= block->page_size;
 848    }
 849
 850    return summary;
 851}
 852
 853uint64_t ram_get_total_transferred_pages(void)
 854{
 855    return  ram_counters.normal + ram_counters.duplicate +
 856                compression_counters.pages + xbzrle_counters.pages;
 857}
 858
 859static void migration_update_rates(RAMState *rs, int64_t end_time)
 860{
 861    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
 862    double compressed_size;
 863
 864    /* calculate period counters */
 865    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 866                / (end_time - rs->time_last_bitmap_sync);
 867
 868    if (!page_count) {
 869        return;
 870    }
 871
 872    if (migrate_use_xbzrle()) {
 873        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
 874            rs->xbzrle_cache_miss_prev) / page_count;
 875        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 876    }
 877
 878    if (migrate_use_compression()) {
 879        compression_counters.busy_rate = (double)(compression_counters.busy -
 880            rs->compress_thread_busy_prev) / page_count;
 881        rs->compress_thread_busy_prev = compression_counters.busy;
 882
 883        compressed_size = compression_counters.compressed_size -
 884                          rs->compressed_size_prev;
 885        if (compressed_size) {
 886            double uncompressed_size = (compression_counters.pages -
 887                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;
 888
 889            /* Compression-Ratio = Uncompressed-size / Compressed-size */
 890            compression_counters.compression_rate =
 891                                        uncompressed_size / compressed_size;
 892
 893            rs->compress_pages_prev = compression_counters.pages;
 894            rs->compressed_size_prev = compression_counters.compressed_size;
 895        }
 896    }
 897}
 898
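/*
 * Decide whether auto-converge throttling should kick in: if, for two
 * consecutive sync periods, the guest dirtied more bytes than the
 * configured percentage of what was transferred, start or increase
 * CPU throttling.
 */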
 899static void migration_trigger_throttle(RAMState *rs)
 900{
 901    MigrationState *s = migrate_get_current();
 902    uint64_t threshold = s->parameters.throttle_trigger_threshold;
 903
 904    uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
 905    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
 906    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
 907
 908    /* During block migration the auto-converge logic incorrectly detects
 909     * that ram migration makes no progress. Avoid this by disabling the
 910     * throttling logic during the bulk phase of block migration. */
 911    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
 912        /* The following detection logic can be refined later. For now:
 913           Check to see if the ratio between dirtied bytes and the approx.
 914           amount of bytes that just got transferred since the last time
 915           we were in this routine reaches the threshold. If that happens
 916           twice, start or increase throttling. */
 917
 918        if ((bytes_dirty_period > bytes_dirty_threshold) &&
 919            (++rs->dirty_rate_high_cnt >= 2)) {
 920            trace_migration_throttle();
 921            rs->dirty_rate_high_cnt = 0;
 922            mig_throttle_guest_down();
 923        }
 924    }
 925}
 926
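/*
 * Synchronize the dirty bitmap from the memory core into every
 * RAMBlock's bmap, update the dirty-page counters and, at most once a
 * second, refresh the rate statistics and re-evaluate throttling.
 */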
 927static void migration_bitmap_sync(RAMState *rs)
 928{
 929    RAMBlock *block;
 930    int64_t end_time;
 931
 932    ram_counters.dirty_sync_count++;
 933
 934    if (!rs->time_last_bitmap_sync) {
 935        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 936    }
 937
 938    trace_migration_bitmap_sync_start();
 939    memory_global_dirty_log_sync();
 940
 941    qemu_mutex_lock(&rs->bitmap_mutex);
 942    WITH_RCU_READ_LOCK_GUARD() {
 943        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 944            ramblock_sync_dirty_bitmap(rs, block);
 945        }
 946        ram_counters.remaining = ram_bytes_remaining();
 947    }
 948    qemu_mutex_unlock(&rs->bitmap_mutex);
 949
 950    memory_global_after_dirty_log_sync();
 951    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
 952
 953    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 954
    /* more than 1 second = 1000 milliseconds */
 956    if (end_time > rs->time_last_bitmap_sync + 1000) {
 957        migration_trigger_throttle(rs);
 958
 959        migration_update_rates(rs, end_time);
 960
 961        rs->target_page_count_prev = rs->target_page_count;
 962
 963        /* reset period counters */
 964        rs->time_last_bitmap_sync = end_time;
 965        rs->num_dirty_pages_period = 0;
 966        rs->bytes_xfer_prev = ram_counters.transferred;
 967    }
 968    if (migrate_use_events()) {
 969        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
 970    }
 971}
 972
 973static void migration_bitmap_sync_precopy(RAMState *rs)
 974{
 975    Error *local_err = NULL;
 976
 977    /*
 978     * The current notifier usage is just an optimization to migration, so we
 979     * don't stop the normal migration process in the error case.
 980     */
 981    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
 982        error_report_err(local_err);
 983        local_err = NULL;
 984    }
 985
 986    migration_bitmap_sync(rs);
 987
 988    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
 989        error_report_err(local_err);
 990    }
 991}
 992
 993/**
 994 * save_zero_page_to_file: send the zero page to the file
 995 *
 * Returns the size of the data written to the file, or 0 if the page is
 * not a zero page
 998 *
 999 * @rs: current RAM state
1000 * @file: the file where the data is saved
1001 * @block: block that contains the page we want to send
1002 * @offset: offset inside the block for the page
1003 */
1004static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1005                                  RAMBlock *block, ram_addr_t offset)
1006{
1007    uint8_t *p = block->host + offset;
1008    int len = 0;
1009
1010    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1011        len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1012        qemu_put_byte(file, 0);
1013        len += 1;
1014    }
1015    return len;
1016}
1017
1018/**
1019 * save_zero_page: send the zero page to the stream
1020 *
 * Returns the number of pages written, or -1 if the page is not zero.
1022 *
1023 * @rs: current RAM state
1024 * @block: block that contains the page we want to send
1025 * @offset: offset inside the block for the page
1026 */
1027static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1028{
1029    int len = save_zero_page_to_file(rs, rs->f, block, offset);
1030
1031    if (len) {
1032        ram_counters.duplicate++;
1033        ram_counters.transferred += len;
1034        return 1;
1035    }
1036    return -1;
1037}
1038
1039static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1040{
1041    if (!migrate_release_ram() || !migration_in_postcopy()) {
1042        return;
1043    }
1044
1045    ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1046}
1047
1048/*
1049 * @pages: the number of pages written by the control path,
1050 *        < 0 - error
1051 *        > 0 - number of pages written
1052 *
 * Return true if the page has been saved, otherwise false is returned.
1054 */
1055static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1056                              int *pages)
1057{
1058    uint64_t bytes_xmit = 0;
1059    int ret;
1060
1061    *pages = -1;
1062    ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1063                                &bytes_xmit);
1064    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1065        return false;
1066    }
1067
1068    if (bytes_xmit) {
1069        ram_counters.transferred += bytes_xmit;
1070        *pages = 1;
1071    }
1072
1073    if (ret == RAM_SAVE_CONTROL_DELAYED) {
1074        return true;
1075    }
1076
1077    if (bytes_xmit > 0) {
1078        ram_counters.normal++;
1079    } else if (bytes_xmit == 0) {
1080        ram_counters.duplicate++;
1081    }
1082
1083    return true;
1084}
1085
1086/*
1087 * directly send the page to the stream
1088 *
1089 * Returns the number of pages written.
1090 *
1091 * @rs: current RAM state
1092 * @block: block that contains the page we want to send
1093 * @offset: offset inside the block for the page
1094 * @buf: the page to be sent
 * @async: send the page asynchronously
1096 */
1097static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1098                            uint8_t *buf, bool async)
1099{
1100    ram_counters.transferred += save_page_header(rs, rs->f, block,
1101                                                 offset | RAM_SAVE_FLAG_PAGE);
1102    if (async) {
        qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &&
                              migration_in_postcopy());
1106    } else {
1107        qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1108    }
1109    ram_counters.transferred += TARGET_PAGE_SIZE;
1110    ram_counters.normal++;
1111    return 1;
1112}
1113
1114/**
1115 * ram_save_page: send the given page to the stream
1116 *
1117 * Returns the number of pages written.
1118 *          < 0 - error
1119 *          >=0 - Number of pages written - this might legally be 0
1120 *                if xbzrle noticed the page was the same.
1121 *
1122 * @rs: current RAM state
 * @pss: data about the page we want to send
1125 * @last_stage: if we are at the completion stage
1126 */
1127static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1128{
1129    int pages = -1;
1130    uint8_t *p;
1131    bool send_async = true;
1132    RAMBlock *block = pss->block;
1133    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1134    ram_addr_t current_addr = block->offset + offset;
1135
1136    p = block->host + offset;
1137    trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1138
1139    XBZRLE_cache_lock();
1140    if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1141        migrate_use_xbzrle()) {
1142        pages = save_xbzrle_page(rs, &p, current_addr, block,
1143                                 offset, last_stage);
1144        if (!last_stage) {
1145            /* Can't send this cached data async, since the cache page
1146             * might get updated before it gets to the wire
1147             */
1148            send_async = false;
1149        }
1150    }
1151
1152    /* XBZRLE overflow or normal page */
1153    if (pages == -1) {
1154        pages = save_normal_page(rs, block, offset, p, send_async);
1155    }
1156
1157    XBZRLE_cache_unlock();
1158
1159    return pages;
1160}
1161
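/*
 * Queue the page on the multifd channels; it is accounted as a normal
 * page once queued, and any queueing error is propagated to the caller.
 */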
1162static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1163                                 ram_addr_t offset)
1164{
1165    if (multifd_queue_page(rs->f, block, offset) < 0) {
1166        return -1;
1167    }
1168    ram_counters.normal++;
1169
1170    return 1;
1171}
1172
1173static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1174                                 ram_addr_t offset, uint8_t *source_buf)
1175{
1176    RAMState *rs = ram_state;
1177    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1178    bool zero_page = false;
1179    int ret;
1180
1181    if (save_zero_page_to_file(rs, f, block, offset)) {
1182        zero_page = true;
1183        goto exit;
1184    }
1185
1186    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1187
    /*
     * copy it to an internal buffer to avoid it being modified by the
     * VM, so that we can catch errors during compression and
     * decompression
     */
1193    memcpy(source_buf, p, TARGET_PAGE_SIZE);
1194    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1195    if (ret < 0) {
1196        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1197        error_report("compressed data failed!");
1198        return false;
1199    }
1200
1201exit:
1202    ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1203    return zero_page;
1204}
1205
1206static void
1207update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1208{
1209    ram_counters.transferred += bytes_xmit;
1210
1211    if (param->zero_page) {
1212        ram_counters.duplicate++;
1213        return;
1214    }
1215
1216    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1217    compression_counters.compressed_size += bytes_xmit - 8;
1218    compression_counters.pages++;
1219}
1220
1221static bool save_page_use_compression(RAMState *rs);
1222
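/*
 * Wait for every compression thread to finish its current page and
 * flush whatever it has buffered into the migration stream.
 */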
1223static void flush_compressed_data(RAMState *rs)
1224{
1225    int idx, len, thread_count;
1226
1227    if (!save_page_use_compression(rs)) {
1228        return;
1229    }
1230    thread_count = migrate_compress_threads();
1231
1232    qemu_mutex_lock(&comp_done_lock);
1233    for (idx = 0; idx < thread_count; idx++) {
1234        while (!comp_param[idx].done) {
1235            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1236        }
1237    }
1238    qemu_mutex_unlock(&comp_done_lock);
1239
1240    for (idx = 0; idx < thread_count; idx++) {
1241        qemu_mutex_lock(&comp_param[idx].mutex);
1242        if (!comp_param[idx].quit) {
1243            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            /*
             * it's safe to fetch zero_page without holding comp_done_lock
             * as there is no further request submitted to the thread,
             * i.e., the thread should be waiting for a request at this point.
             */
1249            update_compress_thread_counts(&comp_param[idx], len);
1250        }
1251        qemu_mutex_unlock(&comp_param[idx].mutex);
1252    }
1253}
1254
1255static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1256                                       ram_addr_t offset)
1257{
1258    param->block = block;
1259    param->offset = offset;
1260}
1261
1262static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1263                                           ram_addr_t offset)
1264{
1265    int idx, thread_count, bytes_xmit = -1, pages = -1;
1266    bool wait = migrate_compress_wait_thread();
1267
1268    thread_count = migrate_compress_threads();
1269    qemu_mutex_lock(&comp_done_lock);
1270retry:
1271    for (idx = 0; idx < thread_count; idx++) {
1272        if (comp_param[idx].done) {
1273            comp_param[idx].done = false;
1274            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1275            qemu_mutex_lock(&comp_param[idx].mutex);
1276            set_compress_params(&comp_param[idx], block, offset);
1277            qemu_cond_signal(&comp_param[idx].cond);
1278            qemu_mutex_unlock(&comp_param[idx].mutex);
1279            pages = 1;
1280            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1281            break;
1282        }
1283    }
1284
    /*
     * wait for a free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as a normal
     * page.
     */
1289    if (pages < 0 && wait) {
1290        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1291        goto retry;
1292    }
1293    qemu_mutex_unlock(&comp_done_lock);
1294
1295    return pages;
1296}
1297
1298/**
1299 * find_dirty_block: find the next dirty page and update any state
1300 * associated with the search process.
1301 *
1302 * Returns true if a page is found
1303 *
1304 * @rs: current RAM state
1305 * @pss: data about the state of the current dirty page scan
1306 * @again: set to false if the search has scanned the whole of RAM
1307 */
1308static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1309{
1310    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1311    if (pss->complete_round && pss->block == rs->last_seen_block &&
1312        pss->page >= rs->last_page) {
1313        /*
1314         * We've been once around the RAM and haven't found anything.
1315         * Give up.
1316         */
1317        *again = false;
1318        return false;
1319    }
1320    if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1321        >= pss->block->used_length) {
1322        /* Didn't find anything in this RAM Block */
1323        pss->page = 0;
1324        pss->block = QLIST_NEXT_RCU(pss->block, next);
1325        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using the data compression at
             * this point. In theory, xbzrle can do better than compression.
             */
1335            flush_compressed_data(rs);
1336
1337            /* Hit the end of the list */
1338            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1339            /* Flag that we've looped */
1340            pss->complete_round = true;
1341            rs->ram_bulk_stage = false;
1342        }
1343        /* Didn't find anything this time, but try again on the new block */
1344        *again = true;
1345        return false;
1346    } else {
1347        /* Can go around again, but... */
1348        *again = true;
1349        /* We've found something so probably don't need to */
1350        return true;
1351    }
1352}
1353
1354/**
 * unqueue_page: gets a page off the queue
1356 *
1357 * Helper for 'get_queued_page' - gets a page off the queue
1358 *
1359 * Returns the block of the page (or NULL if none available)
1360 *
1361 * @rs: current RAM state
1362 * @offset: used to return the offset within the RAMBlock
1363 */
1364static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1365{
1366    RAMBlock *block = NULL;
1367
1368    if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1369        return NULL;
1370    }
1371
1372    qemu_mutex_lock(&rs->src_page_req_mutex);
1373    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1374        struct RAMSrcPageRequest *entry =
1375                                QSIMPLEQ_FIRST(&rs->src_page_requests);
1376        block = entry->rb;
1377        *offset = entry->offset;
1378
1379        if (entry->len > TARGET_PAGE_SIZE) {
1380            entry->len -= TARGET_PAGE_SIZE;
1381            entry->offset += TARGET_PAGE_SIZE;
1382        } else {
1383            memory_region_unref(block->mr);
1384            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1385            g_free(entry);
1386            migration_consume_urgent_request();
1387        }
1388    }
1389    qemu_mutex_unlock(&rs->src_page_req_mutex);
1390
1391    return block;
1392}
1393
1394/**
1395 * get_queued_page: unqueue a page from the postcopy requests
1396 *
1397 * Skips pages that are already sent (!dirty)
1398 *
1399 * Returns true if a queued page is found
1400 *
1401 * @rs: current RAM state
1402 * @pss: data about the state of the current dirty page scan
1403 */
1404static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1405{
1406    RAMBlock  *block;
1407    ram_addr_t offset;
1408    bool dirty;
1409
1410    do {
1411        block = unqueue_page(rs, &offset);
1412        /*
1413         * We're sending this page, and since it's postcopy nothing else
1414         * will dirty it, and we must make sure it doesn't get sent again
1415         * even if this queue request was received after the background
1416         * search already sent it.
1417         */
1418        if (block) {
1419            unsigned long page;
1420
1421            page = offset >> TARGET_PAGE_BITS;
1422            dirty = test_bit(page, block->bmap);
1423            if (!dirty) {
1424                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1425                                                page);
1426            } else {
1427                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1428            }
1429        }
1430
1431    } while (block && !dirty);
1432
1433    if (block) {
        /*
         * As soon as we start servicing pages out of order, we have to
         * kill the bulk stage, since the bulk stage assumes
         * (in migration_bitmap_find_dirty) that every page is dirty,
         * and that's no longer true.
         */
1440        rs->ram_bulk_stage = false;
1441
1442        /*
1443         * We want the background search to continue from the queued page
1444         * since the guest is likely to want other pages near to the page
1445         * it just requested.
1446         */
1447        pss->block = block;
1448        pss->page = offset >> TARGET_PAGE_BITS;
1449
        /*
         * This unqueued page would break the "one round" check, even
         * though that is really rare.
         */
1454        pss->complete_round = false;
1455    }
1456
1457    return !!block;
1458}
1459
1460/**
1461 * migration_page_queue_free: drop any remaining pages in the ram
1462 * request queue
1463 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left.  In case there are any pages left, we drop them.
1466 *
1467 */
1468static void migration_page_queue_free(RAMState *rs)
1469{
1470    struct RAMSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration it might have some entries left over.
     */
1474    RCU_READ_LOCK_GUARD();
1475    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1476        memory_region_unref(mspr->rb->mr);
1477        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1478        g_free(mspr);
1479    }
1480}
1481
1482/**
1483 * ram_save_queue_pages: queue the page for transmission
1484 *
1485 * A request from postcopy destination for example.
1486 *
1487 * Returns zero on success or negative on error
1488 *
 * @rbname: Name of the RAMBlock of the request. NULL means the
 *          same as the last one.
1491 * @start: starting address from the start of the RAMBlock
1492 * @len: length (in bytes) to send
1493 */
1494int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1495{
1496    RAMBlock *ramblock;
1497    RAMState *rs = ram_state;
1498
1499    ram_counters.postcopy_requests++;
1500    RCU_READ_LOCK_GUARD();
1501
1502    if (!rbname) {
1503        /* Reuse last RAMBlock */
1504        ramblock = rs->last_req_rb;
1505
1506        if (!ramblock) {
1507            /*
1508             * Shouldn't happen, we can't reuse the last RAMBlock if
1509             * it's the 1st request.
1510             */
1511            error_report("ram_save_queue_pages no previous block");
1512            return -1;
1513        }
1514    } else {
1515        ramblock = qemu_ram_block_by_name(rbname);
1516
1517        if (!ramblock) {
1518            /* We shouldn't be asked for a non-existent RAMBlock */
1519            error_report("ram_save_queue_pages no block '%s'", rbname);
1520            return -1;
1521        }
1522        rs->last_req_rb = ramblock;
1523    }
1524    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (start + len > ramblock->used_length) {
1526        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1527                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1528                     __func__, start, len, ramblock->used_length);
1529        return -1;
1530    }
1531
1532    struct RAMSrcPageRequest *new_entry =
1533        g_malloc0(sizeof(struct RAMSrcPageRequest));
1534    new_entry->rb = ramblock;
1535    new_entry->offset = start;
1536    new_entry->len = len;
1537
1538    memory_region_ref(ramblock->mr);
1539    qemu_mutex_lock(&rs->src_page_req_mutex);
1540    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1541    migration_make_urgent_request();
1542    qemu_mutex_unlock(&rs->src_page_req_mutex);
1543
1544    return 0;
1545}
1546
1547static bool save_page_use_compression(RAMState *rs)
1548{
1549    if (!migrate_use_compression()) {
1550        return false;
1551    }
1552
    /*
     * If xbzrle is on, stop using the data compression after the first
     * round of migration even if compression is enabled. In theory,
     * xbzrle can do better than compression.
     */
1558    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1559        return true;
1560    }
1561
1562    return false;
1563}
1564
/*
 * try to compress the page before posting it out; return true if the page
 * has been properly handled by compression, otherwise it needs other
 * paths to handle it
 */
1570static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1571{
1572    if (!save_page_use_compression(rs)) {
1573        return false;
1574    }
1575
    /*
     * When starting the process of a new block, the first page of
     * the block should be sent out before other pages in the same
     * block, and all the pages in the last block should have been sent
     * out.  Keeping this order is important, because the 'cont' flag
     * is used to avoid resending the block name.
     *
     * We post the first page as a normal page since compression will
     * take too much CPU resource.
     */
1586    if (block != rs->last_sent_block) {
1587        flush_compressed_data(rs);
1588        return false;
1589    }
1590
1591    if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1592        return true;
1593    }
1594
1595    compression_counters.busy++;
1596    return false;
1597}
1598
1599/**
1600 * ram_save_target_page: save one target page
1601 *
1602 * Returns the number of pages written
1603 *
1604 * @rs: current RAM state
1605 * @pss: data about the page we want to send
1606 * @last_stage: if we are at the completion stage
1607 */
1608static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1609                                bool last_stage)
1610{
1611    RAMBlock *block = pss->block;
1612    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1613    int res;
1614
1615    if (control_save_page(rs, block, offset, &res)) {
1616        return res;
1617    }
1618
1619    if (save_compress_page(rs, block, offset)) {
1620        return 1;
1621    }
1622
1623    res = save_zero_page(rs, block, offset);
1624    if (res > 0) {
1625        /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1626         * page would be stale
1627         */
1628        if (!save_page_use_compression(rs)) {
1629            XBZRLE_cache_lock();
1630            xbzrle_cache_zero_page(rs, block->offset + offset);
1631            XBZRLE_cache_unlock();
1632        }
1633        ram_release_pages(block->idstr, offset, res);
1634        return res;
1635    }
1636
1637    /*
1638     * Do not use multifd for:
1639     * 1. Compression as the first page in the new block should be posted out
1640     *    before sending the compressed page
1641     * 2. In postcopy as one whole host page should be placed
1642     */
1643    if (!save_page_use_compression(rs) && migrate_use_multifd()
1644        && !migration_in_postcopy()) {
1645        return ram_save_multifd_page(rs, block, offset);
1646    }
1647
1648    return ram_save_page(rs, pss, last_stage);
1649}
1650
1651/**
1652 * ram_save_host_page: save a whole host page
1653 *
1654 * Starting at *offset send pages up to the end of the current host
1655 * page. It's valid for the initial offset to point into the middle of
1656 * a host page, in which case the remainder of the host page is sent.
1657 * Only dirty target pages are sent. Note that the host page size may
1658 * be a huge page for this block.
1659 * The saving stops at the boundary of the used_length of the block
1660 * if the RAMBlock isn't a multiple of the host page size.
1661 *
1662 * Returns the number of pages written or negative on error
1663 *
1664 * @rs: current RAM state
1666 * @pss: data about the page we want to send
1667 * @last_stage: if we are at the completion stage
1668 */
1669static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1670                              bool last_stage)
1671{
1672    int tmppages, pages = 0;
1673    size_t pagesize_bits =
1674        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1675
1676    if (ramblock_is_ignored(pss->block)) {
1677        error_report("block %s should not be migrated !", pss->block->idstr);
1678        return 0;
1679    }
1680
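        /*
         * Walk the target pages that make up this host page, sending only
         * the dirty ones; stop once we cross the host-page boundary or run
         * past the block's used_length.
         */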
1681    do {
1682        /* Check if the page is dirty and, if it is, send it */
1683        if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1684            pss->page++;
1685            continue;
1686        }
1687
1688        tmppages = ram_save_target_page(rs, pss, last_stage);
1689        if (tmppages < 0) {
1690            return tmppages;
1691        }
1692
1693        pages += tmppages;
1694        pss->page++;
1695        /* Allow rate limiting to happen in the middle of huge pages */
1696        migration_rate_limit();
1697    } while ((pss->page & (pagesize_bits - 1)) &&
1698             offset_in_ramblock(pss->block,
1699                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
1700
1701    /* The offset we leave with is the last one we looked at */
1702    pss->page--;
1703    return pages;
1704}
1705
1706/**
1707 * ram_find_and_save_block: finds a dirty page and sends it to f
1708 *
1709 * Called within an RCU critical section.
1710 *
1711 * Returns the number of pages written where zero means no dirty pages,
1712 * or negative on error
1713 *
1714 * @rs: current RAM state
1715 * @last_stage: if we are at the completion stage
1716 *
1717 * On systems where host-page-size > target-page-size it will send all the
1718 * pages in a host page that are dirty.
1719 */
1720
1721static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1722{
1723    PageSearchStatus pss;
1724    int pages = 0;
1725    bool again, found;
1726
1727    /* No dirty pages as there is zero RAM */
1728    if (!ram_bytes_total()) {
1729        return pages;
1730    }
1731
1732    pss.block = rs->last_seen_block;
1733    pss.page = rs->last_page;
1734    pss.complete_round = false;
1735
1736    if (!pss.block) {
1737        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1738    }
1739
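        /*
         * Loop until a dirty page has been sent or the search wraps with
         * nothing left to send.  Queued page requests (urgent requests,
         * e.g. from postcopy) take priority over the linear scan for
         * dirty blocks.
         */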
1740    do {
1741        again = true;
1742        found = get_queued_page(rs, &pss);
1743
1744        if (!found) {
1745            /* priority queue empty, so just search for something dirty */
1746            found = find_dirty_block(rs, &pss, &again);
1747        }
1748
1749        if (found) {
1750            pages = ram_save_host_page(rs, &pss, last_stage);
1751        }
1752    } while (!pages && again);
1753
1754    rs->last_seen_block = pss.block;
1755    rs->last_page = pss.page;
1756
1757    return pages;
1758}
1759
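    /*
     * Account for pages transferred outside the regular RAM save path
     * (e.g. by a transport hook such as RDMA): bump the duplicate/normal
     * page counters and, for non-zero pages, the transferred byte count
     * and the file position.
     */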
1760void acct_update_position(QEMUFile *f, size_t size, bool zero)
1761{
1762    uint64_t pages = size / TARGET_PAGE_SIZE;
1763
1764    if (zero) {
1765        ram_counters.duplicate += pages;
1766    } else {
1767        ram_counters.normal += pages;
1768        ram_counters.transferred += size;
1769        qemu_update_position(f, size);
1770    }
1771}
1772
1773static uint64_t ram_bytes_total_common(bool count_ignored)
1774{
1775    RAMBlock *block;
1776    uint64_t total = 0;
1777
1778    RCU_READ_LOCK_GUARD();
1779
1780    if (count_ignored) {
1781        RAMBLOCK_FOREACH_MIGRATABLE(block) {
1782            total += block->used_length;
1783        }
1784    } else {
1785        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1786            total += block->used_length;
1787        }
1788    }
1789    return total;
1790}
1791
1792uint64_t ram_bytes_total(void)
1793{
1794    return ram_bytes_total_common(false);
1795}
1796
1797static void xbzrle_load_setup(void)
1798{
1799    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1800}
1801
1802static void xbzrle_load_cleanup(void)
1803{
1804    g_free(XBZRLE.decoded_buf);
1805    XBZRLE.decoded_buf = NULL;
1806}
1807
1808static void ram_state_cleanup(RAMState **rsp)
1809{
1810    if (*rsp) {
1811        migration_page_queue_free(*rsp);
1812        qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1813        qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1814        g_free(*rsp);
1815        *rsp = NULL;
1816    }
1817}
1818
1819static void xbzrle_cleanup(void)
1820{
1821    XBZRLE_cache_lock();
1822    if (XBZRLE.cache) {
1823        cache_fini(XBZRLE.cache);
1824        g_free(XBZRLE.encoded_buf);
1825        g_free(XBZRLE.current_buf);
1826        g_free(XBZRLE.zero_target_page);
1827        XBZRLE.cache = NULL;
1828        XBZRLE.encoded_buf = NULL;
1829        XBZRLE.current_buf = NULL;
1830        XBZRLE.zero_target_page = NULL;
1831    }
1832    XBZRLE_cache_unlock();
1833}
1834
1835static void ram_save_cleanup(void *opaque)
1836{
1837    RAMState **rsp = opaque;
1838    RAMBlock *block;
1839
1840    /* The caller must hold the iothread lock or be in a bottom half, so
1841     * there is no race writing against the migration bitmap
1842     */
1843    memory_global_dirty_log_stop();
1844
1845    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1846        g_free(block->clear_bmap);
1847        block->clear_bmap = NULL;
1848        g_free(block->bmap);
1849        block->bmap = NULL;
1850    }
1851
1852    xbzrle_cleanup();
1853    compress_threads_save_cleanup();
1854    ram_state_cleanup(rsp);
1855}
1856
1857static void ram_state_reset(RAMState *rs)
1858{
1859    rs->last_seen_block = NULL;
1860    rs->last_sent_block = NULL;
1861    rs->last_page = 0;
1862    rs->last_version = ram_list.version;
1863    rs->ram_bulk_stage = true;
1864    rs->fpo_enabled = false;
1865}
1866
1867#define MAX_WAIT 50 /* ms, half buffered_file limit */
1868
1869/*
1870 * 'expected' is the value you expect the bitmap mostly to be full
1871 * of; it won't bother printing lines that are all this value.
1872 * If 'todump' is null the migration bitmap is dumped.
1873 */
1874void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1875                           unsigned long pages)
1876{
1877    int64_t cur;
1878    int64_t linelen = 128;
1879    char linebuf[129];
1880
1881    for (cur = 0; cur < pages; cur += linelen) {
1882        int64_t curb;
1883        bool found = false;
1884        /*
1885         * Last line; catch the case where the line length
1886         * is longer than remaining ram
1887         */
1888        if (cur + linelen > pages) {
1889            linelen = pages - cur;
1890        }
1891        for (curb = 0; curb < linelen; curb++) {
1892            bool thisbit = test_bit(cur + curb, todump);
1893            linebuf[curb] = thisbit ? '1' : '.';
1894            found = found || (thisbit != expected);
1895        }
1896        if (found) {
1897            linebuf[curb] = '\0';
1898            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1899        }
1900    }
1901}
1902
1903/* **** functions for postcopy ***** */
1904
1905void ram_postcopy_migrated_memory_release(MigrationState *ms)
1906{
1907    struct RAMBlock *block;
1908
1909    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1910        unsigned long *bitmap = block->bmap;
1911        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1912        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1913
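            /*
             * Walk runs of clear bits (pages no longer marked dirty, i.e.
             * already sent) and discard the corresponding ranges on the
             * source.
             */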
1914        while (run_start < range) {
1915            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1916            ram_discard_range(block->idstr,
1917                              ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1918                              ((ram_addr_t)(run_end - run_start))
1919                                << TARGET_PAGE_BITS);
1920            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1921        }
1922    }
1923}
1924
1925/**
1926 * postcopy_send_discard_bm_ram: discard a RAMBlock
1927 *
1928 * Returns zero on success
1929 *
1930 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1931 *
1932 * @ms: current migration state
1933 * @block: RAMBlock to discard
1934 */
1935static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
1936{
1937    unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1938    unsigned long current;
1939    unsigned long *bitmap = block->bmap;
1940
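        /*
         * Scan the bitmap as runs of set (dirty) bits; each run [one, zero)
         * is sent to the destination as a single discard range.
         */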
1941    for (current = 0; current < end; ) {
1942        unsigned long one = find_next_bit(bitmap, end, current);
1943        unsigned long zero, discard_length;
1944
1945        if (one >= end) {
1946            break;
1947        }
1948
1949        zero = find_next_zero_bit(bitmap, end, one + 1);
1950
1951        if (zero >= end) {
1952            discard_length = end - one;
1953        } else {
1954            discard_length = zero - one;
1955        }
1956        postcopy_discard_send_range(ms, one, discard_length);
1957        current = one + discard_length;
1958    }
1959
1960    return 0;
1961}
1962
1963/**
1964 * postcopy_each_ram_send_discard: discard all RAMBlocks
1965 *
1966 * Returns 0 for success or negative for error
1967 *
1968 * Utility for the outgoing postcopy code.
1969 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1970 *   passing it bitmap indexes and name.
1971 * (qemu_ram_foreach_block ends up passing unscaled lengths,
1972 *  which would mean the postcopy code would have to deal with target page size)
1973 *
1974 * @ms: current migration state
1975 */
1976static int postcopy_each_ram_send_discard(MigrationState *ms)
1977{
1978    struct RAMBlock *block;
1979    int ret;
1980
1981    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1982        postcopy_discard_send_init(ms, block->idstr);
1983
1984        /*
1985         * Postcopy sends chunks of bitmap over the wire, but it
1986         * just needs indexes at this point; this avoids it needing
1987         * target-page-specific code.
1988         */
1989        ret = postcopy_send_discard_bm_ram(ms, block);
1990        postcopy_discard_send_finish(ms);
1991        if (ret) {
1992            return ret;
1993        }
1994    }
1995
1996    return 0;
1997}
1998
1999/**
2000 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2001 *
2002 * Helper for postcopy_chunk_hostpages; it canonicalizes the dirty
2003 * bitmap of a single RAMBlock so that it can be expressed in whole
2004 * host pages.
2005 *
2006 * Postcopy requires that all target pages in a host page are dirty or
2007 * clean, not a mix.  This function canonicalizes the bitmap.
2008 *
2009 * @ms: current migration state
2010 * @block: block that contains the page we want to canonicalize
2011 */
2012static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2013{
2014    RAMState *rs = ram_state;
2015    unsigned long *bitmap = block->bmap;
2016    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2017    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2018    unsigned long run_start;
2019
2020    if (block->page_size == TARGET_PAGE_SIZE) {
2021        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2022        return;
2023    }
2024
2025    /* Find a dirty page */
2026    run_start = find_next_bit(bitmap, pages, 0);
2027
2028    while (run_start < pages) {
2029
2030        /*
2031         * If the start of this run of pages is in the middle of a host
2032         * page, then we need to fixup this host page.
2033         */
2034        if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2035            /* Find the end of this run */
2036            run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2037            /*
2038             * If the end isn't at the start of a host page, then the
2039             * run doesn't finish at the end of a host page
2040             * and we need to discard.
2041             */
2042        }
2043
2044        if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2045            unsigned long page;
2046            unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2047                                                             host_ratio);
2048            run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2049
2050            /* Clean up the bitmap */
2051            for (page = fixup_start_addr;
2052                 page < fixup_start_addr + host_ratio; page++) {
2053                /*
2054                 * Remark them as dirty, updating the count for any pages
2055                 * that weren't previously dirty.
2056                 */
2057                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2058            }
2059        }
2060
2061        /* Find the next dirty page for the next iteration */
2062        run_start = find_next_bit(bitmap, pages, run_start);
2063    }
2064}
2065
2066/**
2067 * postcopy_chunk_hostpages: discard any partially sent host page
2068 *
2069 * Utility for the outgoing postcopy code.
2070 *
2071 * Discard any partially sent host-page size chunks, mark any partially
2072 * dirty host-page size chunks as all dirty.  In this case the host-page
2073 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2074 *
2075 * Returns zero on success
2076 *
2077 * @ms: current migration state
2078 * @block: block we want to work with
2079 */
2080static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2081{
2082    postcopy_discard_send_init(ms, block->idstr);
2083
2084    /*
2085     * Ensure that all partially dirty host pages are made fully dirty.
2086     */
2087    postcopy_chunk_hostpages_pass(ms, block);
2088
2089    postcopy_discard_send_finish(ms);
2090    return 0;
2091}
2092
2093/**
2094 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2095 *
2096 * Returns zero on success
2097 *
2098 * Transmit the set of pages to be discarded after precopy to the target;
2099 * these are pages that:
2100 *     a) Have been previously transmitted but are now dirty again
2101 *     b) Have never been transmitted; this ensures that
2102 *        any pages on the destination that have been mapped by background
2103 *        tasks get discarded (transparent huge pages are the specific concern)
2104 * Hopefully this is pretty sparse.
2105 *
2106 * @ms: current migration state
2107 */
2108int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2109{
2110    RAMState *rs = ram_state;
2111    RAMBlock *block;
2112    int ret;
2113
2114    RCU_READ_LOCK_GUARD();
2115
2116    /* This should be our last sync, the src is now paused */
2117    migration_bitmap_sync(rs);
2118
2119    /* Easiest way to make sure we don't resume in the middle of a host-page */
2120    rs->last_seen_block = NULL;
2121    rs->last_sent_block = NULL;
2122    rs->last_page = 0;
2123
2124    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2125        /* Deal with TPS != HPS and huge pages */
2126        ret = postcopy_chunk_hostpages(ms, block);
2127        if (ret) {
2128            return ret;
2129        }
2130
2131#ifdef DEBUG_POSTCOPY
2132        ram_debug_dump_bitmap(block->bmap, true,
2133                              block->used_length >> TARGET_PAGE_BITS);
2134#endif
2135    }
2136    trace_ram_postcopy_send_discard_bitmap();
2137
2138    ret = postcopy_each_ram_send_discard(ms);
2139
2140    return ret;
2141}
2142
2143/**
2144 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2145 *
2146 * Returns zero on success
2147 *
2148 * @rbname: name of the RAMBlock of the request. NULL means the
2149 *          same as the last one.
2150 * @start: RAMBlock starting page
2151 * @length: RAMBlock size
2152 */
2153int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2154{
2155    trace_ram_discard_range(rbname, start, length);
2156
2157    RCU_READ_LOCK_GUARD();
2158    RAMBlock *rb = qemu_ram_block_by_name(rbname);
2159
2160    if (!rb) {
2161        error_report("ram_discard_range: Failed to find block '%s'", rbname);
2162        return -1;
2163    }
2164
2165    /*
2166     * On the source VM, we don't need to update the received bitmap since
2167     * we don't even have one.
2168     */
2169    if (rb->receivedmap) {
2170        bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2171                     length >> qemu_target_page_bits());
2172    }
2173
2174    return ram_block_discard_range(rb, start, length);
2175}
2176
2177/*
2178 * For every allocation, we try not to crash the VM if the
2179 * allocation fails.
2180 */
2181static int xbzrle_init(void)
2182{
2183    Error *local_err = NULL;
2184
2185    if (!migrate_use_xbzrle()) {
2186        return 0;
2187    }
2188
2189    XBZRLE_cache_lock();
2190
2191    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2192    if (!XBZRLE.zero_target_page) {
2193        error_report("%s: Error allocating zero page", __func__);
2194        goto err_out;
2195    }
2196
2197    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2198                              TARGET_PAGE_SIZE, &local_err);
2199    if (!XBZRLE.cache) {
2200        error_report_err(local_err);
2201        goto free_zero_page;
2202    }
2203
2204    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2205    if (!XBZRLE.encoded_buf) {
2206        error_report("%s: Error allocating encoded_buf", __func__);
2207        goto free_cache;
2208    }
2209
2210    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2211    if (!XBZRLE.current_buf) {
2212        error_report("%s: Error allocating current_buf", __func__);
2213        goto free_encoded_buf;
2214    }
2215
2216    /* We are all good */
2217    XBZRLE_cache_unlock();
2218    return 0;
2219
2220free_encoded_buf:
2221    g_free(XBZRLE.encoded_buf);
2222    XBZRLE.encoded_buf = NULL;
2223free_cache:
2224    cache_fini(XBZRLE.cache);
2225    XBZRLE.cache = NULL;
2226free_zero_page:
2227    g_free(XBZRLE.zero_target_page);
2228    XBZRLE.zero_target_page = NULL;
2229err_out:
2230    XBZRLE_cache_unlock();
2231    return -ENOMEM;
2232}
2233
2234static int ram_state_init(RAMState **rsp)
2235{
2236    *rsp = g_try_new0(RAMState, 1);
2237
2238    if (!*rsp) {
2239        error_report("%s: Init ramstate fail", __func__);
2240        return -1;
2241    }
2242
2243    qemu_mutex_init(&(*rsp)->bitmap_mutex);
2244    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2245    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2246
2247    /*
2248     * Count the total number of pages used by ram blocks not including any
2249     * gaps due to alignment or unplugs.
2250     * This must match the initial values of the dirty bitmap.
2251     */
2252    (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2253    ram_state_reset(*rsp);
2254
2255    return 0;
2256}
2257
2258static void ram_list_init_bitmaps(void)
2259{
2260    MigrationState *ms = migrate_get_current();
2261    RAMBlock *block;
2262    unsigned long pages;
2263    uint8_t shift;
2264
2265    /* Skip setting bitmap if there is no RAM */
2266    if (ram_bytes_total()) {
2267        shift = ms->clear_bitmap_shift;
2268        if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2269            error_report("clear_bitmap_shift (%u) too big, using "
2270                         "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2271            shift = CLEAR_BITMAP_SHIFT_MAX;
2272        } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2273            error_report("clear_bitmap_shift (%u) too small, using "
2274                         "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2275            shift = CLEAR_BITMAP_SHIFT_MIN;
2276        }
2277
2278        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2279            pages = block->max_length >> TARGET_PAGE_BITS;
2280            /*
2281             * The initial dirty bitmap for migration must be set with all
2282             * ones to make sure we'll migrate every guest RAM page to the
2283             * destination.
2284             * Here we set RAMBlock.bmap all to 1 because, when restarting
2285             * a new migration after a failed one, ram_list.
2286             * dirty_memory[DIRTY_MEMORY_MIGRATION] may not cover the whole
2287             * guest memory.
2288             */
2289            block->bmap = bitmap_new(pages);
2290            bitmap_set(block->bmap, 0, pages);
2291            block->clear_bmap_shift = shift;
2292            block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2293        }
2294    }
2295}
2296
2297static void ram_init_bitmaps(RAMState *rs)
2298{
2299    /* For memory_global_dirty_log_start below.  */
2300    qemu_mutex_lock_iothread();
2301    qemu_mutex_lock_ramlist();
2302
2303    WITH_RCU_READ_LOCK_GUARD() {
2304        ram_list_init_bitmaps();
2305        memory_global_dirty_log_start();
2306        migration_bitmap_sync_precopy(rs);
2307    }
2308    qemu_mutex_unlock_ramlist();
2309    qemu_mutex_unlock_iothread();
2310}
2311
2312static int ram_init_all(RAMState **rsp)
2313{
2314    if (ram_state_init(rsp)) {
2315        return -1;
2316    }
2317
2318    if (xbzrle_init()) {
2319        ram_state_cleanup(rsp);
2320        return -1;
2321    }
2322
2323    ram_init_bitmaps(*rsp);
2324
2325    return 0;
2326}
2327
2328static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2329{
2330    RAMBlock *block;
2331    uint64_t pages = 0;
2332
2333    /*
2334     * Postcopy is not using xbzrle/compression, so no need for that.
2335     * Also, since the source is already halted, we don't need to care
2336     * about dirty page logging either.
2337     */
2338
2339    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2340        pages += bitmap_count_one(block->bmap,
2341                                  block->used_length >> TARGET_PAGE_BITS);
2342    }
2343
2344    /* This may not be aligned with current bitmaps. Recalculate. */
2345    rs->migration_dirty_pages = pages;
2346
2347    rs->last_seen_block = NULL;
2348    rs->last_sent_block = NULL;
2349    rs->last_page = 0;
2350    rs->last_version = ram_list.version;
2351    /*
2352     * Disable the bulk stage, otherwise we'll resend the whole RAM no
2353     * matter what we have sent.
2354     */
2355    rs->ram_bulk_stage = false;
2356
2357    /* Update RAMState cache of output QEMUFile */
2358    rs->f = out;
2359
2360    trace_ram_state_resume_prepare(pages);
2361}
2362
2363/*
2364 * This function clears bits of the free pages reported by the caller from the
2365 * migration dirty bitmap. @addr is the host address corresponding to the
2366 * start of the continuous guest free pages, and @len is the total bytes of
2367 * those pages.
2368 */
2369void qemu_guest_free_page_hint(void *addr, size_t len)
2370{
2371    RAMBlock *block;
2372    ram_addr_t offset;
2373    size_t used_len, start, npages;
2374    MigrationState *s = migrate_get_current();
2375
2376    /* This function is currently expected to be used during live migration */
2377    if (!migration_is_setup_or_active(s->state)) {
2378        return;
2379    }
2380
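        /*
         * Walk the hinted range chunk by chunk, clamping each chunk to the
         * RAMBlock that contains it, then drop the corresponding bits from
         * the dirty bitmap under bitmap_mutex.
         */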
2381    for (; len > 0; len -= used_len, addr += used_len) {
2382        block = qemu_ram_block_from_host(addr, false, &offset);
2383        if (unlikely(!block || offset >= block->used_length)) {
2384            /*
2385             * The implementation might not support RAMBlock resize during
2386             * live migration, but it could happen in theory with future
2387             * updates. So we add a check here to capture that case.
2388             */
2389            error_report_once("%s unexpected error", __func__);
2390            return;
2391        }
2392
2393        if (len <= block->used_length - offset) {
2394            used_len = len;
2395        } else {
2396            used_len = block->used_length - offset;
2397        }
2398
2399        start = offset >> TARGET_PAGE_BITS;
2400        npages = used_len >> TARGET_PAGE_BITS;
2401
2402        qemu_mutex_lock(&ram_state->bitmap_mutex);
2403        ram_state->migration_dirty_pages -=
2404                      bitmap_count_one_with_offset(block->bmap, start, npages);
2405        bitmap_clear(block->bmap, start, npages);
2406        qemu_mutex_unlock(&ram_state->bitmap_mutex);
2407    }
2408}
2409
2410/*
2411 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2412 * a long-running RCU critical section.  When RCU reclaims in the code
2413 * start to become numerous it will be necessary to reduce the
2414 * granularity of these critical sections.
2415 */
2416
2417/**
2418 * ram_save_setup: Setup RAM for migration
2419 *
2420 * Returns zero to indicate success and negative for error
2421 *
2422 * @f: QEMUFile where to send the data
2423 * @opaque: RAMState pointer
2424 */
2425static int ram_save_setup(QEMUFile *f, void *opaque)
2426{
2427    RAMState **rsp = opaque;
2428    RAMBlock *block;
2429
2430    if (compress_threads_save_setup()) {
2431        return -1;
2432    }
2433
2434    /* migration has already set up the bitmap, reuse it. */
2435    if (!migration_in_colo_state()) {
2436        if (ram_init_all(rsp) != 0) {
2437            compress_threads_save_cleanup();
2438            return -1;
2439        }
2440    }
2441    (*rsp)->f = f;
2442
2443    WITH_RCU_READ_LOCK_GUARD() {
2444        qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2445
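            /*
             * Describe every migratable RAMBlock up front: its name and
             * used length, plus its page size when postcopy may need it and
             * its address when ignore-shared is enabled.
             */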
2446        RAMBLOCK_FOREACH_MIGRATABLE(block) {
2447            qemu_put_byte(f, strlen(block->idstr));
2448            qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2449            qemu_put_be64(f, block->used_length);
2450            if (migrate_postcopy_ram() && block->page_size !=
2451                                          qemu_host_page_size) {
2452                qemu_put_be64(f, block->page_size);
2453            }
2454            if (migrate_ignore_shared()) {
2455                qemu_put_be64(f, block->mr->addr);
2456            }
2457        }
2458    }
2459
2460    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2461    ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2462
2463    multifd_send_sync_main(f);
2464    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2465    qemu_fflush(f);
2466
2467    return 0;
2468}
2469
2470/**
2471 * ram_save_iterate: iterative stage for migration
2472 *
2473 * Returns zero to indicate success and negative for error
2474 *
2475 * @f: QEMUFile where to send the data
2476 * @opaque: RAMState pointer
2477 */
2478static int ram_save_iterate(QEMUFile *f, void *opaque)
2479{
2480    RAMState **temp = opaque;
2481    RAMState *rs = *temp;
2482    int ret = 0;
2483    int i;
2484    int64_t t0;
2485    int done = 0;
2486
2487    if (blk_mig_bulk_active()) {
2488        /* Avoid transferring ram during bulk phase of block migration as
2489         * the bulk phase will usually take a long time and transferring
2490         * ram updates during that time is pointless. */
2491        goto out;
2492    }
2493
2494    WITH_RCU_READ_LOCK_GUARD() {
2495        if (ram_list.version != rs->last_version) {
2496            ram_state_reset(rs);
2497        }
2498
2499        /* Read version before ram_list.blocks */
2500        smp_rmb();
2501
2502        ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2503
2504        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2505        i = 0;
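            /*
             * Keep sending pages while the rate limiter allows it, or while
             * urgent queued page requests are still pending.
             */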
2506        while ((ret = qemu_file_rate_limit(f)) == 0 ||
2507                !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2508            int pages;
2509
2510            if (qemu_file_get_error(f)) {
2511                break;
2512            }
2513
2514            pages = ram_find_and_save_block(rs, false);
2515            /* no more pages to send */
2516            if (pages == 0) {
2517                done = 1;
2518                break;
2519            }
2520
2521            if (pages < 0) {
2522                qemu_file_set_error(f, pages);
2523                break;
2524            }
2525
2526            rs->target_page_count += pages;
2527
2528            /*
2529             * During postcopy, it is necessary to make sure one whole host
2530             * page is sent in one chunk.
2531             */
2532            if (migrate_postcopy_ram()) {
2533                flush_compressed_data(rs);
2534            }
2535
2536            /*
2537             * We want to check on the first loop iteration, just in case it
2538             * was the first time and we had to sync the dirty bitmap.
2539             * qemu_clock_get_ns() is a bit expensive, so we only check every
2540             * few iterations.
2541             */
2542            if ((i & 63) == 0) {
2543                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2544                              1000000;
2545                if (t1 > MAX_WAIT) {
2546                    trace_ram_save_iterate_big_wait(t1, i);
2547                    break;
2548                }
2549            }
2550            i++;
2551        }
2552    }
2553
2554    /*
2555     * Must occur before EOS (or any QEMUFile operation)
2556     * because of RDMA protocol.
2557     */
2558    ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2559
2560out:
2561    if (ret >= 0
2562        && migration_is_setup_or_active(migrate_get_current()->state)) {
2563        multifd_send_sync_main(rs->f);
2564        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2565        qemu_fflush(f);
2566        ram_counters.transferred += 8;
2567
2568        ret = qemu_file_get_error(f);
2569    }
2570    if (ret < 0) {
2571        return ret;
2572    }
2573
2574    return done;
2575}
2576
2577/**
2578 * ram_save_complete: function called to send the remaining amount of ram
2579 *
2580 * Returns zero to indicate success or negative on error
2581 *
2582 * Called with iothread lock
2583 *
2584 * @f: QEMUFile where to send the data
2585 * @opaque: RAMState pointer
2586 */
2587static int ram_save_complete(QEMUFile *f, void *opaque)
2588{
2589    RAMState **temp = opaque;
2590    RAMState *rs = *temp;
2591    int ret = 0;
2592
2593    WITH_RCU_READ_LOCK_GUARD() {
2594        if (!migration_in_postcopy()) {
2595            migration_bitmap_sync_precopy(rs);
2596        }
2597
2598        ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2599
2600        /* try transferring iterative blocks of memory */
2601
2602        /* flush all remaining blocks regardless of rate limiting */
2603        while (true) {
2604            int pages;
2605
2606            pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2607            /* no more blocks to send */
2608            if (pages == 0) {
2609                break;
2610            }
2611            if (pages < 0) {
2612                ret = pages;
2613                break;
2614            }
2615        }
2616
2617        flush_compressed_data(rs);
2618        ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2619    }
2620
2621    if (ret >= 0) {
2622        multifd_send_sync_main(rs->f);
2623        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2624        qemu_fflush(f);
2625    }
2626
2627    return ret;
2628}
2629
2630static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2631                             uint64_t *res_precopy_only,
2632                             uint64_t *res_compatible,
2633                             uint64_t *res_postcopy_only)
2634{
2635    RAMState **temp = opaque;
2636    RAMState *rs = *temp;
2637    uint64_t remaining_size;
2638
2639    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2640
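        /*
         * If the estimate is below the threshold, resync the dirty bitmap
         * (under the iothread lock) to refine it before reporting.
         */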
2641    if (!migration_in_postcopy() &&
2642        remaining_size < max_size) {
2643        qemu_mutex_lock_iothread();
2644        WITH_RCU_READ_LOCK_GUARD() {
2645            migration_bitmap_sync_precopy(rs);
2646        }
2647        qemu_mutex_unlock_iothread();
2648        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2649    }
2650
2651    if (migrate_postcopy_ram()) {
2652        /* We can do postcopy, and all the data is postcopiable */
2653        *res_compatible += remaining_size;
2654    } else {
2655        *res_precopy_only += remaining_size;
2656    }
2657}
2658
2659static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2660{
2661    unsigned int xh_len;
2662    int xh_flags;
2663    uint8_t *loaded_data;
2664
2665    /* extract RLE header */
2666    xh_flags = qemu_get_byte(f);
2667    xh_len = qemu_get_be16(f);
2668
2669    if (xh_flags != ENCODING_FLAG_XBZRLE) {
2670        error_report("Failed to load XBZRLE page - wrong compression!");
2671        return -1;
2672    }
2673
2674    if (xh_len > TARGET_PAGE_SIZE) {
2675        error_report("Failed to load XBZRLE page - len overflow!");
2676        return -1;
2677    }
2678    loaded_data = XBZRLE.decoded_buf;
2679    /* load data and decode */
2680    /* it can change loaded_data to point to an internal buffer */
2681    qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2682
2683    /* decode RLE */
2684    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2685                             TARGET_PAGE_SIZE) == -1) {
2686        error_report("Failed to load XBZRLE page - decode error!");
2687        return -1;
2688    }
2689
2690    return 0;
2691}
2692
2693/**
2694 * ram_block_from_stream: read a RAMBlock id from the migration stream
2695 *
2696 * Must be called from within an RCU critical section.
2697 *
2698 * Returns a pointer from within the RCU-protected ram_list.
2699 *
2700 * @f: QEMUFile where to read the data from
2701 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2702 */
2703static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2704{
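        /* Cached across calls; reused when RAM_SAVE_FLAG_CONTINUE is set */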
2705    static RAMBlock *block = NULL;
2706    char id[256];
2707    uint8_t len;
2708
2709    if (flags & RAM_SAVE_FLAG_CONTINUE) {
2710        if (!block) {
2711            error_report("Ack, bad migration stream!");
2712            return NULL;
2713        }
2714        return block;
2715    }
2716
2717    len = qemu_get_byte(f);
2718    qemu_get_buffer(f, (uint8_t *)id, len);
2719    id[len] = 0;
2720
2721    block = qemu_ram_block_by_name(id);
2722    if (!block) {
2723        error_report("Can't find block %s", id);
2724        return NULL;
2725    }
2726
2727    if (ramblock_is_ignored(block)) {
2728        error_report("block %s should not be migrated !", id);
2729        return NULL;
2730    }
2731
2732    return block;
2733}
2734
2735static inline void *host_from_ram_block_offset(RAMBlock *block,
2736                                               ram_addr_t offset)
2737{
2738    if (!offset_in_ramblock(block, offset)) {
2739        return NULL;
2740    }
2741
2742    return block->host + offset;
2743}
2744
2745static inline void *colo_cache_from_block_offset(RAMBlock *block,
2746                             ram_addr_t offset, bool record_bitmap)
2747{
2748    if (!offset_in_ramblock(block, offset)) {
2749        return NULL;
2750    }
2751    if (!block->colo_cache) {
2752        error_report("%s: colo_cache is NULL in block :%s",
2753                     __func__, block->idstr);
2754        return NULL;
2755    }
2756
2757    /*
2758     * During a COLO checkpoint, we need a bitmap of these migrated pages.
2759     * It helps us decide which pages in the RAM cache should be flushed
2760     * into the VM's RAM later.
2761     */
2762    if (record_bitmap &&
2763        !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
2764        ram_state->migration_dirty_pages++;
2765    }
2766    return block->colo_cache + offset;
2767}
2768
2769/**
2770 * ram_handle_compressed: handle the zero page case
2771 *
2772 * If a page (or a whole RDMA chunk) has been
2773 * determined to be zero, then zap it.
2774 *
2775 * @host: host address for the zero page
2776 * @ch: what the page is filled from.  We only support zero
2777 * @size: size of the zero page
2778 */
2779void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2780{
2781    if (ch != 0 || !is_zero_range(host, size)) {
2782        memset(host, ch, size);
2783    }
2784}
2785
2786/* return the size after decompression, or negative value on error */
2787static int
2788qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2789                     const uint8_t *source, size_t source_len)
2790{
2791    int err;
2792
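        /* Reset the per-thread zlib stream so it can be reused for this page */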
2793    err = inflateReset(stream);
2794    if (err != Z_OK) {
2795        return -1;
2796    }
2797
2798    stream->avail_in = source_len;
2799    stream->next_in = (uint8_t *)source;
2800    stream->avail_out = dest_len;
2801    stream->next_out = dest;
2802
2803    err = inflate(stream, Z_NO_FLUSH);
2804    if (err != Z_STREAM_END) {
2805        return -1;
2806    }
2807
2808    return stream->total_out;
2809}
2810
2811static void *do_data_decompress(void *opaque)
2812{
2813    DecompressParam *param = opaque;
2814    unsigned long pagesize;
2815    uint8_t *des;
2816    int len, ret;
2817
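        /*
         * Worker loop: wait for a page to be handed to us, inflate it
         * directly into guest memory, then flag ourselves as done so the
         * dispatcher can pick this thread again.
         */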
2818    qemu_mutex_lock(&param->mutex);
2819    while (!param->quit) {
2820        if (param->des) {
2821            des = param->des;
2822            len = param->len;
2823            param->des = 0;
2824            qemu_mutex_unlock(&param->mutex);
2825
2826            pagesize = TARGET_PAGE_SIZE;
2827
2828            ret = qemu_uncompress_data(&param->stream, des, pagesize,
2829                                       param->compbuf, len);
2830            if (ret < 0 && migrate_get_current()->decompress_error_check) {
2831                error_report("decompress data failed");
2832                qemu_file_set_error(decomp_file, ret);
2833            }
2834
2835            qemu_mutex_lock(&decomp_done_lock);
2836            param->done = true;
2837            qemu_cond_signal(&decomp_done_cond);
2838            qemu_mutex_unlock(&decomp_done_lock);
2839
2840            qemu_mutex_lock(&param->mutex);
2841        } else {
2842            qemu_cond_wait(&param->cond, &param->mutex);
2843        }
2844    }
2845    qemu_mutex_unlock(&param->mutex);
2846
2847    return NULL;
2848}
2849
2850static int wait_for_decompress_done(void)
2851{
2852    int idx, thread_count;
2853
2854    if (!migrate_use_compression()) {
2855        return 0;
2856    }
2857
2858    thread_count = migrate_decompress_threads();
2859    qemu_mutex_lock(&decomp_done_lock);
2860    for (idx = 0; idx < thread_count; idx++) {
2861        while (!decomp_param[idx].done) {
2862            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2863        }
2864    }
2865    qemu_mutex_unlock(&decomp_done_lock);
2866    return qemu_file_get_error(decomp_file);
2867}
2868
2869static void compress_threads_load_cleanup(void)
2870{
2871    int i, thread_count;
2872
2873    if (!migrate_use_compression()) {
2874        return;
2875    }
2876    thread_count = migrate_decompress_threads();
2877    for (i = 0; i < thread_count; i++) {
2878        /*
2879         * We use it as an indicator of whether the thread is
2880         * properly initialized or not
2881         */
2882        if (!decomp_param[i].compbuf) {
2883            break;
2884        }
2885
2886        qemu_mutex_lock(&decomp_param[i].mutex);
2887        decomp_param[i].quit = true;
2888        qemu_cond_signal(&decomp_param[i].cond);
2889        qemu_mutex_unlock(&decomp_param[i].mutex);
2890    }
2891    for (i = 0; i < thread_count; i++) {
2892        if (!decomp_param[i].compbuf) {
2893            break;
2894        }
2895
2896        qemu_thread_join(decompress_threads + i);
2897        qemu_mutex_destroy(&decomp_param[i].mutex);
2898        qemu_cond_destroy(&decomp_param[i].cond);
2899        inflateEnd(&decomp_param[i].stream);
2900        g_free(decomp_param[i].compbuf);
2901        decomp_param[i].compbuf = NULL;
2902    }
2903    g_free(decompress_threads);
2904    g_free(decomp_param);
2905    decompress_threads = NULL;
2906    decomp_param = NULL;
2907    decomp_file = NULL;
2908}
2909
2910static int compress_threads_load_setup(QEMUFile *f)
2911{
2912    int i, thread_count;
2913
2914    if (!migrate_use_compression()) {
2915        return 0;
2916    }
2917
2918    thread_count = migrate_decompress_threads();
2919    decompress_threads = g_new0(QemuThread, thread_count);
2920    decomp_param = g_new0(DecompressParam, thread_count);
2921    qemu_mutex_init(&decomp_done_lock);
2922    qemu_cond_init(&decomp_done_cond);
2923    decomp_file = f;
2924    for (i = 0; i < thread_count; i++) {
2925        if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2926            goto exit;
2927        }
2928
2929        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2930        qemu_mutex_init(&decomp_param[i].mutex);
2931        qemu_cond_init(&decomp_param[i].cond);
2932        decomp_param[i].done = true;
2933        decomp_param[i].quit = false;
2934        qemu_thread_create(decompress_threads + i, "decompress",
2935                           do_data_decompress, decomp_param + i,
2936                           QEMU_THREAD_JOINABLE);
2937    }
2938    return 0;
2939exit:
2940    compress_threads_load_cleanup();
2941    return -1;
2942}
2943
2944static void decompress_data_with_multi_threads(QEMUFile *f,
2945                                               void *host, int len)
2946{
2947    int idx, thread_count;
2948
2949    thread_count = migrate_decompress_threads();
2950    qemu_mutex_lock(&decomp_done_lock);
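        /*
         * Find an idle decompression thread and hand it the compressed
         * page; if every thread is busy, wait for one to signal completion.
         */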
2951    while (true) {
2952        for (idx = 0; idx < thread_count; idx++) {
2953            if (decomp_param[idx].done) {
2954                decomp_param[idx].done = false;
2955                qemu_mutex_lock(&decomp_param[idx].mutex);
2956                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2957                decomp_param[idx].des = host;
2958                decomp_param[idx].len = len;
2959                qemu_cond_signal(&decomp_param[idx].cond);
2960                qemu_mutex_unlock(&decomp_param[idx].mutex);
2961                break;
2962            }
2963        }
2964        if (idx < thread_count) {
2965            break;
2966        } else {
2967            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2968        }
2969    }
2970    qemu_mutex_unlock(&decomp_done_lock);
2971}
2972
2973/*
2974 * colo cache: this is for the secondary VM, we cache the whole
2975 * memory of the secondary VM.  The global lock needs to be held
2976 * to call this helper.
2977 */
2978int colo_init_ram_cache(void)
2979{
2980    RAMBlock *block;
2981
2982    WITH_RCU_READ_LOCK_GUARD() {
2983        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2984            block->colo_cache = qemu_anon_ram_alloc(block->used_length,
2985                                                    NULL,
2986                                                    false);
2987            if (!block->colo_cache) {
2988                error_report("%s: Can't alloc memory for COLO cache of block %s,"
2989                             "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
2990                             block->used_length);
2991                RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2992                    if (block->colo_cache) {
2993                        qemu_anon_ram_free(block->colo_cache, block->used_length);
2994                        block->colo_cache = NULL;
2995                    }
2996                }
2997                return -errno;
2998            }
2999        }
3000    }
3001
3002    /*
3003     * Record the dirty pages that were sent by the PVM; we use this dirty
3004     * bitmap to decide which pages in the cache should be flushed into the
3005     * SVM's RAM. Here we use the same name 'ram_bitmap' as for migration.
3006     */
3007    if (ram_bytes_total()) {
3008        RAMBlock *block;
3009
3010        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3011            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3012            block->bmap = bitmap_new(pages);
3013        }
3014    }
3015
3016    ram_state_init(&ram_state);
3017    return 0;
3018}
3019
3020/* TODO: duplicated with ram_init_bitmaps */
3021void colo_incoming_start_dirty_log(void)
3022{
3023    RAMBlock *block = NULL;
3024    /* For memory_global_dirty_log_start below. */
3025    qemu_mutex_lock_iothread();
3026    qemu_mutex_lock_ramlist();
3027
3028    memory_global_dirty_log_sync();
3029    WITH_RCU_READ_LOCK_GUARD() {
3030        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3031            ramblock_sync_dirty_bitmap(ram_state, block);
3032            /* Discard this dirty bitmap record */
3033            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3034        }
3035        memory_global_dirty_log_start();
3036    }
3037    ram_state->migration_dirty_pages = 0;
3038    qemu_mutex_unlock_ramlist();
3039    qemu_mutex_unlock_iothread();
3040}
3041
3042/* The global lock needs to be held to call this helper */
3043void colo_release_ram_cache(void)
3044{
3045    RAMBlock *block;
3046
3047    memory_global_dirty_log_stop();
3048    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3049        g_free(block->bmap);
3050        block->bmap = NULL;
3051    }
3052
3053    WITH_RCU_READ_LOCK_GUARD() {
3054        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3055            if (block->colo_cache) {
3056                qemu_anon_ram_free(block->colo_cache, block->used_length);
3057                block->colo_cache = NULL;
3058            }
3059        }
3060    }
3061    ram_state_cleanup(&ram_state);
3062}
3063
3064/**
3065 * ram_load_setup: Setup RAM for migration incoming side
3066 *
3067 * Returns zero to indicate success and negative for error
3068 *
3069 * @f: QEMUFile where to receive the data
3070 * @opaque: RAMState pointer
3071 */
3072static int ram_load_setup(QEMUFile *f, void *opaque)
3073{
3074    if (compress_threads_load_setup(f)) {
3075        return -1;
3076    }
3077
3078    xbzrle_load_setup();
3079    ramblock_recv_map_init();
3080
3081    return 0;
3082}
3083
3084static int ram_load_cleanup(void *opaque)
3085{
3086    RAMBlock *rb;
3087
3088    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3089        qemu_ram_block_writeback(rb);
3090    }
3091
3092    xbzrle_load_cleanup();
3093    compress_threads_load_cleanup();
3094
3095    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3096        g_free(rb->receivedmap);
3097        rb->receivedmap = NULL;
3098    }
3099
3100    return 0;
3101}
3102
3103/**
3104 * ram_postcopy_incoming_init: allocate postcopy data structures
3105 *
3106 * Returns 0 for success and negative if there was one error
3107 *
3108 * @mis: current migration incoming state
3109 *
3110 * Allocate data structures etc needed by incoming migration with
3111 * postcopy-ram. postcopy-ram's similarly named
3112 * postcopy_ram_incoming_init does the work.
3113 */
3114int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3115{
3116    return postcopy_ram_incoming_init(mis);
3117}
3118
3119/**
3120 * ram_load_postcopy: load a page in postcopy case
3121 *
3122 * Returns 0 for success or -errno in case of error
3123 *
3124 * Called in postcopy mode by ram_load().
3125 * rcu_read_lock is taken prior to this being called.
3126 *
3127 * @f: QEMUFile where to send the data
3128 */
3129static int ram_load_postcopy(QEMUFile *f)
3130{
3131    int flags = 0, ret = 0;
3132    bool place_needed = false;
3133    bool matches_target_page_size = false;
3134    MigrationIncomingState *mis = migration_incoming_get_current();
3135    /* Temporary page that is later 'placed' */
3136    void *postcopy_host_page = mis->postcopy_tmp_page;
3137    void *this_host = NULL;
3138    bool all_zero = false;
3139    int target_pages = 0;
3140
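        /*
         * Each iteration consumes one target page from the stream.  Data is
         * gathered (usually into the temporary host page) until a whole
         * host page is available, and only then placed atomically into the
         * guest's memory.
         */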
3141    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3142        ram_addr_t addr;
3143        void *host = NULL;
3144        void *page_buffer = NULL;
3145        void *place_source = NULL;
3146        RAMBlock *block = NULL;
3147        uint8_t ch;
3148        int len;
3149
3150        addr = qemu_get_be64(f);
3151
3152        /*
3153         * If there is a QEMUFile error, we should stop here, since
3154         * "addr" may then be invalid
3155         */
3156        ret = qemu_file_get_error(f);
3157        if (ret) {
3158            break;
3159        }
3160
3161        flags = addr & ~TARGET_PAGE_MASK;
3162        addr &= TARGET_PAGE_MASK;
3163
3164        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3165        place_needed = false;
3166        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3167                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3168            block = ram_block_from_stream(f, flags);
3169
3170            host = host_from_ram_block_offset(block, addr);
3171            if (!host) {
3172                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3173                ret = -EINVAL;
3174                break;
3175            }
3176            target_pages++;
3177            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3178            /*
3179             * Postcopy requires that we place whole host pages atomically;
3180             * these may be huge pages for RAMBlocks that are backed by
3181             * hugetlbfs.
3182             * To make it atomic, the data is read into a temporary page
3183             * that's moved into place later.
3184             * The migration protocol uses, possibly smaller, target pages;
3185             * however the source ensures it always sends all the components
3186             * of a host page in one chunk.
3187             */
3188            page_buffer = postcopy_host_page +
3189                          ((uintptr_t)host & (block->page_size - 1));
3190            /* If all target pages are zero then we can optimise the placement */
3191            if (target_pages == 1) {
3192                all_zero = true;
3193                this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3194                                                    block->page_size);
3195            } else {
3196                /* not the 1st TP within the HP */
3197                if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3198                    (uintptr_t)this_host) {
3199                    error_report("Non-same host page %p/%p",
3200                                  host, this_host);
3201                    ret = -EINVAL;
3202                    break;
3203                }
3204            }
3205
3206            /*
3207             * If it's the last part of a host page then we place the host
3208             * page
3209             */
3210            if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3211                place_needed = true;
3212                target_pages = 0;
3213            }
3214            place_source = postcopy_host_page;
3215        }
3216
3217        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3218        case RAM_SAVE_FLAG_ZERO:
3219            ch = qemu_get_byte(f);
3220            /*
3221             * We can skip setting page_buffer when
3222             * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3223             */
3224            if (ch || !matches_target_page_size) {
3225                memset(page_buffer, ch, TARGET_PAGE_SIZE);
3226            }
3227            if (ch) {
3228                all_zero = false;
3229            }
3230            break;
3231
3232        case RAM_SAVE_FLAG_PAGE:
3233            all_zero = false;
3234            if (!matches_target_page_size) {
3235                /* For huge pages, we always use temporary buffer */
3236                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3237            } else {
3238                /*
3239                 * For small pages that match the target page size, we
3240                 * avoid the qemu_file copy.  Instead we directly use
3241                 * the buffer of QEMUFile to place the page.  Note: we
3242                 * cannot do any QEMUFile operation before using that
3243                 * buffer to make sure the buffer is valid when
3244                 * placing the page.
3245                 */
3246                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3247                                         TARGET_PAGE_SIZE);
3248            }
3249            break;
3250        case RAM_SAVE_FLAG_COMPRESS_PAGE:
3251            all_zero = false;
3252            len = qemu_get_be32(f);
3253            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3254                error_report("Invalid compressed data length: %d", len);
3255                ret = -EINVAL;
3256                break;
3257            }
3258            decompress_data_with_multi_threads(f, page_buffer, len);
3259            break;
3260
3261        case RAM_SAVE_FLAG_EOS:
3262            /* normal exit */
3263            multifd_recv_sync_main();
3264            break;
3265        default:
3266            error_report("Unknown combination of migration flags: %#x"
3267                         " (postcopy mode)", flags);
3268            ret = -EINVAL;
3269            break;
3270        }
3271
3272        /* Got the whole host page, wait for decompress before placing. */
3273        if (place_needed) {
3274            ret |= wait_for_decompress_done();
3275        }
3276
3277        /* Detect for any possible file errors */
3278        if (!ret && qemu_file_get_error(f)) {
3279            ret = qemu_file_get_error(f);
3280        }
3281
3282        if (!ret && place_needed) {
3283            /* This gets called at the last target page in the host page */
3284            void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3285                                                       block->page_size);
3286
3287            if (all_zero) {
3288                ret = postcopy_place_page_zero(mis, place_dest,
3289                                               block);
3290            } else {
3291                ret = postcopy_place_page(mis, place_dest,
3292                                          place_source, block);
3293            }
3294        }
3295    }
3296
3297    return ret;
3298}
3299
3300static bool postcopy_is_advised(void)
3301{
3302    PostcopyState ps = postcopy_state_get();
3303    return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3304}
3305
3306static bool postcopy_is_running(void)
3307{
3308    PostcopyState ps = postcopy_state_get();
3309    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3310}
3311
3312/*
3313 * Flush content of RAM cache into SVM's memory.
3314 * Only flush the pages that were dirtied by the PVM or the SVM, or both.
3315 */
3316static void colo_flush_ram_cache(void)
3317{
3318    RAMBlock *block = NULL;
3319    void *dst_host;
3320    void *src_host;
3321    unsigned long offset = 0;
3322
3323    memory_global_dirty_log_sync();
3324    WITH_RCU_READ_LOCK_GUARD() {
3325        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3326            ramblock_sync_dirty_bitmap(ram_state, block);
3327        }
3328    }
3329
3330    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3331    WITH_RCU_READ_LOCK_GUARD() {
3332        block = QLIST_FIRST_RCU(&ram_list.blocks);
3333
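            /*
             * Walk every RAMBlock, copying each dirty page from the COLO
             * cache into the SVM's memory and clearing its dirty bit.
             */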
3334        while (block) {
3335            offset = migration_bitmap_find_dirty(ram_state, block, offset);
3336
3337            if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3338                >= block->used_length) {
3339                offset = 0;
3340                block = QLIST_NEXT_RCU(block, next);
3341            } else {
3342                migration_bitmap_clear_dirty(ram_state, block, offset);
3343                dst_host = block->host
3344                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3345                src_host = block->colo_cache
3346                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3347                memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3348            }
3349        }
3350    }
3351    trace_colo_flush_ram_cache_end();
3352}
3353
3354/**
3355 * ram_load_precopy: load pages in precopy case
3356 *
3357 * Returns 0 for success or -errno in case of error
3358 *
3359 * Called in precopy mode by ram_load().
3360 * rcu_read_lock is taken prior to this being called.
3361 *
3362 * @f: QEMUFile where to send the data
3363 */
3364static int ram_load_precopy(QEMUFile *f)
3365{
3366    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3367    /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
3368    bool postcopy_advised = postcopy_is_advised();
3369    if (!migrate_use_compression()) {
3370        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3371    }
3372
3373    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3374        ram_addr_t addr, total_ram_bytes;
3375        void *host = NULL, *host_bak = NULL;
3376        uint8_t ch;
3377
3378        /*
3379         * Yield periodically to let the main loop run, but an iteration
3380         * of the main loop is expensive, so only do it every so often
3381         */
3382        if ((i & 32767) == 0 && qemu_in_coroutine()) {
3383            aio_co_schedule(qemu_get_current_aio_context(),
3384                            qemu_coroutine_self());
3385            qemu_coroutine_yield();
3386        }
3387        i++;
3388
3389        addr = qemu_get_be64(f);
3390        flags = addr & ~TARGET_PAGE_MASK;
3391        addr &= TARGET_PAGE_MASK;
3392
3393        if (flags & invalid_flags) {
3394            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3395                error_report("Received an unexpected compressed page");
3396            }
3397
3398            ret = -EINVAL;
3399            break;
3400        }
3401
3402        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3403                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3404            RAMBlock *block = ram_block_from_stream(f, flags);
3405
3406            host = host_from_ram_block_offset(block, addr);
3407            /*
3408             * After entering the COLO stage we should not load pages into
3409             * the SVM's memory directly; they go into colo_cache first.
3410             * NOTE: we need to keep a copy of the SVM's RAM in colo_cache.
3411             * Previously all of this memory was copied during the COLO
3412             * preparation stage, with the VM stopped, which was very
3413             * time-consuming.  Instead we back up every page during the
3414             * migration while COLO is enabled; this slows the migration a
3415             * little, but it clearly reduces the downtime that copying all
3416             * of the SVM's memory in the preparation stage used to cause.
3417             */
3418            if (migration_incoming_colo_enabled()) {
3419                if (migration_incoming_in_colo_state()) {
3420                    /* In COLO stage, put all pages into cache temporarily */
3421                    host = colo_cache_from_block_offset(block, addr, true);
3422                } else {
3423                    /*
3424                     * In the migration stage but before the COLO stage,
3425                     * put all pages into both the cache and the SVM's memory.
3426                     */
3427                    host_bak = colo_cache_from_block_offset(block, addr, false);
3428                }
3429            }
3430            if (!host) {
3431                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3432                ret = -EINVAL;
3433                break;
3434            }
3435            if (!migration_incoming_in_colo_state()) {
3436                ramblock_recv_bitmap_set(block, host);
3437            }
3438
3439            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3440        }
3441
3442        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3443        case RAM_SAVE_FLAG_MEM_SIZE:
3444            /* Synchronize RAM block list */
3445            total_ram_bytes = addr;
3446            while (!ret && total_ram_bytes) {
3447                RAMBlock *block;
3448                char id[256];
3449                ram_addr_t length;
3450
3451                len = qemu_get_byte(f);
3452                qemu_get_buffer(f, (uint8_t *)id, len);
3453                id[len] = 0;
3454                length = qemu_get_be64(f);
3455
3456                block = qemu_ram_block_by_name(id);
3457                if (block && !qemu_ram_is_migratable(block)) {
3458                    error_report("block %s should not be migrated!", id);
3459                    ret = -EINVAL;
3460                } else if (block) {
3461                    if (length != block->used_length) {
3462                        Error *local_err = NULL;
3463
3464                        ret = qemu_ram_resize(block, length,
3465                                              &local_err);
3466                        if (local_err) {
3467                            error_report_err(local_err);
3468                        }
3469                    }
3470                    /* For postcopy we need to check that hugepage sizes match */
3471                    if (postcopy_advised &&
3472                        block->page_size != qemu_host_page_size) {
3473                        uint64_t remote_page_size = qemu_get_be64(f);
3474                        if (remote_page_size != block->page_size) {
3475                            error_report("Mismatched RAM page size %s "
3476                                         "(local) %zd != %" PRId64,
3477                                         id, block->page_size,
3478                                         remote_page_size);
3479                            ret = -EINVAL;
3480                        }
3481                    }
3482                    if (migrate_ignore_shared()) {
3483                        hwaddr addr = qemu_get_be64(f);
3484                        if (ramblock_is_ignored(block) &&
3485                            block->mr->addr != addr) {
3486                            error_report("Mismatched GPAs for block %s "
3487                                         "%" PRId64 " != %" PRId64,
3488                                         id, (uint64_t)addr,
3489                                         (uint64_t)block->mr->addr);
3490                            ret = -EINVAL;
3491                        }
3492                    }
3493                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3494                                          block->idstr);
3495                } else {
3496                    error_report("Unknown ramblock \"%s\", cannot "
3497                                 "accept migration", id);
3498                    ret = -EINVAL;
3499                }
3500
3501                total_ram_bytes -= length;
3502            }
3503            break;
3504
3505        case RAM_SAVE_FLAG_ZERO:
3506            ch = qemu_get_byte(f);
3507            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3508            break;
3509
3510        case RAM_SAVE_FLAG_PAGE:
3511            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3512            break;
3513
3514        case RAM_SAVE_FLAG_COMPRESS_PAGE:
3515            len = qemu_get_be32(f);
3516            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3517                error_report("Invalid compressed data length: %d", len);
3518                ret = -EINVAL;
3519                break;
3520            }
3521            decompress_data_with_multi_threads(f, host, len);
3522            break;
3523
3524        case RAM_SAVE_FLAG_XBZRLE:
3525            if (load_xbzrle(f, addr, host) < 0) {
3526                error_report("Failed to decompress XBZRLE page at "
3527                             RAM_ADDR_FMT, addr);
3528                ret = -EINVAL;
3529                break;
3530            }
3531            break;
3532        case RAM_SAVE_FLAG_EOS:
3533            /* normal exit */
3534            multifd_recv_sync_main();
3535            break;
3536        default:
3537            if (flags & RAM_SAVE_FLAG_HOOK) {
3538                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3539            } else {
3540                error_report("Unknown combination of migration flags: %#x",
3541                             flags);
3542                ret = -EINVAL;
3543            }
3544        }
3545        if (!ret) {
3546            ret = qemu_file_get_error(f);
3547        }
3548        if (!ret && host_bak) {
3549            memcpy(host_bak, host, TARGET_PAGE_SIZE);
3550        }
3551    }
3552
3553    ret |= wait_for_decompress_done();
3554    return ret;
3555}
3556
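/*
 * Illustrative sketch (not part of the original code and not called
 * anywhere): ram_load_precopy() above reads one big-endian 64-bit word
 * per chunk.  The page-aligned guest address and the RAM_SAVE_FLAG_*
 * bits share that word, because the flags only occupy bits below
 * TARGET_PAGE_MASK.  The hypothetical helper below splits the word the
 * same way the loop does.
 */
static inline void ram_load_split_header_sketch(uint64_t header,
                                                ram_addr_t *addr,
                                                int *flags)
{
    *flags = header & ~TARGET_PAGE_MASK; /* e.g. RAM_SAVE_FLAG_PAGE */
    *addr  = header & TARGET_PAGE_MASK;  /* page-aligned guest address */
}
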
3557static int ram_load(QEMUFile *f, void *opaque, int version_id)
3558{
3559    int ret = 0;
3560    static uint64_t seq_iter;
3561    /*
3562     * If the system is running in postcopy mode, page inserts to host
3563     * memory must be atomic
3564     */
3565    bool postcopy_running = postcopy_is_running();
3566
3567    seq_iter++;
3568
3569    if (version_id != 4) {
3570        return -EINVAL;
3571    }
3572
3573    /*
3574     * This RCU critical section can be very long running.
3575     * If RCU reclamation in this code ever becomes frequent,
3576     * it will be necessary to reduce the granularity of this
3577     * critical section.
3578     */
3579    WITH_RCU_READ_LOCK_GUARD() {
3580        if (postcopy_running) {
3581            ret = ram_load_postcopy(f);
3582        } else {
3583            ret = ram_load_precopy(f);
3584        }
3585    }
3586    trace_ram_load_complete(ret, seq_iter);
3587
3588    if (!ret && migration_incoming_in_colo_state()) {
3589        colo_flush_ram_cache();
3590    }
3591    return ret;
3592}
3593
3594static bool ram_has_postcopy(void *opaque)
3595{
3596    RAMBlock *rb;
3597    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3598        if (ramblock_is_pmem(rb)) {
3599            info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
3600                        "is not supported now!", rb->idstr, rb->host);
3601            return false;
3602        }
3603    }
3604
3605    return migrate_postcopy_ram();
3606}
3607
3608/* Sync all the dirty bitmaps with the destination VM. */
3609static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3610{
3611    RAMBlock *block;
3612    QEMUFile *file = s->to_dst_file;
3613    int ramblock_count = 0;
3614
3615    trace_ram_dirty_bitmap_sync_start();
3616
3617    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3618        qemu_savevm_send_recv_bitmap(file, block->idstr);
3619        trace_ram_dirty_bitmap_request(block->idstr);
3620        ramblock_count++;
3621    }
3622
3623    trace_ram_dirty_bitmap_sync_wait();
3624
3625    /* Wait until all of the ramblocks' dirty bitmaps have been synced */
3626    while (ramblock_count--) {
3627        qemu_sem_wait(&s->rp_state.rp_sem);
3628    }
3629
3630    trace_ram_dirty_bitmap_sync_complete();
3631
3632    return 0;
3633}
3634
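/*
 * Illustrative sketch (not part of the original code and not called
 * anywhere): the wait loop in ram_dirty_bitmap_sync_all() above is a
 * plain counting handshake.  One recv-bitmap request is sent per
 * RAMBlock, and the return path posts the same semaphore once per
 * reloaded bitmap (see ram_dirty_bitmap_reload_notify() below), so
 * waiting "count" times means every block has been acknowledged.  The
 * helper name is hypothetical.
 */
static inline void wait_for_n_acks_sketch(QemuSemaphore *sem, int count)
{
    while (count--) {
        qemu_sem_wait(sem);
    }
}
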
3635static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3636{
3637    qemu_sem_post(&s->rp_state.rp_sem);
3638}
3639
3640/*
3641 * Read the received bitmap and invert it to form the initial dirty bitmap.
3642 * This is only used when a postcopy migration has been paused and wants
3643 * to resume from a middle point.
3644 */
3645int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3646{
3647    int ret = -EINVAL;
3648    QEMUFile *file = s->rp_state.from_dst_file;
3649    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3650    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3651    uint64_t size, end_mark;
3652
3653    trace_ram_dirty_bitmap_reload_begin(block->idstr);
3654
3655    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3656        error_report("%s: incorrect state %s", __func__,
3657                     MigrationStatus_str(s->state));
3658        return -EINVAL;
3659    }
3660
3661    /*
3662     * Note: see the comments in ramblock_recv_bitmap_send() on why we
3663     * need the endianness conversion and the padding.
3664     */
3665    local_size = ROUND_UP(local_size, 8);
3666
3667    /* Add padding */
3668    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3669
3670    size = qemu_get_be64(file);
3671
3672    /* The size of the bitmap should match our ramblock */
3673    if (size != local_size) {
3674        error_report("%s: ramblock '%s' bitmap size mismatch "
3675                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3676                     block->idstr, size, local_size);
3677        ret = -EINVAL;
3678        goto out;
3679    }
3680
3681    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3682    end_mark = qemu_get_be64(file);
3683
3684    ret = qemu_file_get_error(file);
3685    if (ret || size != local_size) {
3686        error_report("%s: read bitmap failed for ramblock '%s': %d"
3687                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3688                     __func__, block->idstr, ret, local_size, size);
3689        ret = -EIO;
3690        goto out;
3691    }
3692
3693    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3694        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
3695                     __func__, block->idstr, end_mark);
3696        ret = -EINVAL;
3697        goto out;
3698    }
3699
3700    /*
3701     * Endianness conversion.  We are in postcopy (though paused), so
3702     * the dirty bitmap won't change and we can modify it directly.
3703     */
3704    bitmap_from_le(block->bmap, le_bitmap, nbits);
3705
3706    /*
3707     * What we received is the "received bitmap".  Invert it to form the
3708     * initial dirty bitmap for this ramblock.
3709     */
3710    bitmap_complement(block->bmap, block->bmap, nbits);
3711
3712    trace_ram_dirty_bitmap_reload_complete(block->idstr);
3713
3714    /*
3715     * We succeeded in syncing the bitmap for the current ramblock.  If this
3716     * is the last one to sync, we need to notify the main send thread.
3717     */
3718    ram_dirty_bitmap_reload_notify(s);
3719
3720    ret = 0;
3721out:
3722    g_free(le_bitmap);
3723    return ret;
3724}
3725
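/*
 * Illustrative sketch (not part of the original code and not called
 * anywhere): the expected on-wire bitmap size that
 * ram_dirty_bitmap_reload() above checks against.  The sender transmits
 * one bit per target page, packed into bytes and padded up to a multiple
 * of 8 bytes so both ends can treat the buffer as an array of
 * little-endian longs.  The helper name is hypothetical.
 */
static inline uint64_t recv_bitmap_wire_size_sketch(RAMBlock *block)
{
    unsigned long nbits = block->used_length >> TARGET_PAGE_BITS;

    /* Bits -> bytes, then round up to an 8-byte boundary, as above */
    return ROUND_UP(DIV_ROUND_UP(nbits, 8), 8);
}
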
3726static int ram_resume_prepare(MigrationState *s, void *opaque)
3727{
3728    RAMState *rs = *(RAMState **)opaque;
3729    int ret;
3730
3731    ret = ram_dirty_bitmap_sync_all(s, rs);
3732    if (ret) {
3733        return ret;
3734    }
3735
3736    ram_state_resume_prepare(rs, s->to_dst_file);
3737
3738    return 0;
3739}
3740
3741static SaveVMHandlers savevm_ram_handlers = {
3742    .save_setup = ram_save_setup,
3743    .save_live_iterate = ram_save_iterate,
3744    .save_live_complete_postcopy = ram_save_complete,
3745    .save_live_complete_precopy = ram_save_complete,
3746    .has_postcopy = ram_has_postcopy,
3747    .save_live_pending = ram_save_pending,
3748    .load_state = ram_load,
3749    .save_cleanup = ram_save_cleanup,
3750    .load_setup = ram_load_setup,
3751    .load_cleanup = ram_load_cleanup,
3752    .resume_prepare = ram_resume_prepare,
3753};
3754
3755void ram_mig_init(void)
3756{
3757    qemu_mutex_init(&XBZRLE.lock);
3758    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
3759}
3760