qemu/migration/ram.c
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2011-2015 Red Hat Inc
   6 *
   7 * Authors:
   8 *  Juan Quintela <quintela@redhat.com>
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28
  29#include "qemu/osdep.h"
  30#include "qemu/cutils.h"
  31#include "qemu/bitops.h"
  32#include "qemu/bitmap.h"
  33#include "qemu/main-loop.h"
  34#include "xbzrle.h"
  35#include "ram.h"
  36#include "migration.h"
  37#include "migration/register.h"
  38#include "migration/misc.h"
  39#include "qemu-file.h"
  40#include "postcopy-ram.h"
  41#include "page_cache.h"
  42#include "qemu/error-report.h"
  43#include "qapi/error.h"
  44#include "qapi/qapi-types-migration.h"
  45#include "qapi/qapi-events-migration.h"
  46#include "qapi/qmp/qerror.h"
  47#include "trace.h"
  48#include "exec/ram_addr.h"
  49#include "exec/target_page.h"
  50#include "qemu/rcu_queue.h"
  51#include "migration/colo.h"
  52#include "block.h"
  53#include "sysemu/cpu-throttle.h"
  54#include "savevm.h"
  55#include "qemu/iov.h"
  56#include "multifd.h"
  57#include "sysemu/runstate.h"
  58
  59#if defined(__linux__)
  60#include "qemu/userfaultfd.h"
  61#endif /* defined(__linux__) */
  62
  63/***********************************************************/
  64/* ram save/restore */
  65
   66/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
   67 * worked for pages that were filled with the same char.  We switched
   68 * it to only search for the zero value.  And to avoid confusion with
   69 * RAM_SAVE_FLAG_COMPRESS_PAGE it was renamed.
   70 */
  71
  72#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  73#define RAM_SAVE_FLAG_ZERO     0x02
  74#define RAM_SAVE_FLAG_MEM_SIZE 0x04
  75#define RAM_SAVE_FLAG_PAGE     0x08
  76#define RAM_SAVE_FLAG_EOS      0x10
  77#define RAM_SAVE_FLAG_CONTINUE 0x20
  78#define RAM_SAVE_FLAG_XBZRLE   0x40
   79/* 0x80 is reserved in migration.h; start with 0x100 next */
  80#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  81
  82static inline bool is_zero_range(uint8_t *p, uint64_t size)
  83{
  84    return buffer_is_zero(p, size);
  85}
  86
  87XBZRLECacheStats xbzrle_counters;
  88
   89/* struct containing the XBZRLE cache and a static page
   90   used for compression */
  91static struct {
  92    /* buffer used for XBZRLE encoding */
  93    uint8_t *encoded_buf;
  94    /* buffer for storing page content */
  95    uint8_t *current_buf;
  96    /* Cache for XBZRLE, Protected by lock. */
  97    PageCache *cache;
  98    QemuMutex lock;
  99    /* it will store a page full of zeros */
 100    uint8_t *zero_target_page;
 101    /* buffer used for XBZRLE decoding */
 102    uint8_t *decoded_buf;
 103} XBZRLE;
 104
 105static void XBZRLE_cache_lock(void)
 106{
 107    if (migrate_use_xbzrle()) {
 108        qemu_mutex_lock(&XBZRLE.lock);
 109    }
 110}
 111
 112static void XBZRLE_cache_unlock(void)
 113{
 114    if (migrate_use_xbzrle()) {
 115        qemu_mutex_unlock(&XBZRLE.lock);
 116    }
 117}
 118
 119/**
 120 * xbzrle_cache_resize: resize the xbzrle cache
 121 *
  122 * This function is called from migrate_params_apply in the main
  123 * thread, possibly while a migration is in progress.  A running
  124 * migration may be using the cache and might finish during this call,
  125 * hence changes to the cache are protected by XBZRLE.lock.
 126 *
 127 * Returns 0 for success or -1 for error
 128 *
 129 * @new_size: new cache size
  130 * @errp: set *errp on failure, with the reason
 131 */
 132int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 133{
 134    PageCache *new_cache;
 135    int64_t ret = 0;
 136
 137    /* Check for truncation */
 138    if (new_size != (size_t)new_size) {
 139        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 140                   "exceeding address space");
 141        return -1;
 142    }
 143
 144    if (new_size == migrate_xbzrle_cache_size()) {
 145        /* nothing to do */
 146        return 0;
 147    }
 148
 149    XBZRLE_cache_lock();
 150
 151    if (XBZRLE.cache != NULL) {
 152        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 153        if (!new_cache) {
 154            ret = -1;
 155            goto out;
 156        }
 157
 158        cache_fini(XBZRLE.cache);
 159        XBZRLE.cache = new_cache;
 160    }
 161out:
 162    XBZRLE_cache_unlock();
 163    return ret;
 164}
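
/*
 * Illustrative sketch (added note, not part of the original source): how a
 * caller such as migrate_params_apply might drive the resize above.  The
 * 'new_size' variable is hypothetical; only xbzrle_cache_resize() and
 * error_report_err() are taken from the existing code base.
 *
 *     Error *local_err = NULL;
 *
 *     if (xbzrle_cache_resize(new_size, &local_err) < 0) {
 *         error_report_err(local_err);
 *     }
 */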
 165
 166bool ramblock_is_ignored(RAMBlock *block)
 167{
 168    return !qemu_ram_is_migratable(block) ||
 169           (migrate_ignore_shared() && qemu_ram_is_shared(block));
 170}
 171
 172#undef RAMBLOCK_FOREACH
 173
 174int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 175{
 176    RAMBlock *block;
 177    int ret = 0;
 178
 179    RCU_READ_LOCK_GUARD();
 180
 181    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 182        ret = func(block, opaque);
 183        if (ret) {
 184            break;
 185        }
 186    }
 187    return ret;
 188}
 189
 190static void ramblock_recv_map_init(void)
 191{
 192    RAMBlock *rb;
 193
 194    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 195        assert(!rb->receivedmap);
 196        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 197    }
 198}
 199
 200int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 201{
 202    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 203                    rb->receivedmap);
 204}
 205
 206bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 207{
 208    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 209}
 210
 211void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 212{
 213    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 214}
 215
 216void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 217                                    size_t nr)
 218{
 219    bitmap_set_atomic(rb->receivedmap,
 220                      ramblock_recv_bitmap_offset(host_addr, rb),
 221                      nr);
 222}
 223
 224#define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 225
 226/*
 227 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 228 *
  229 * Returns the number of bytes sent (>0) on success, or <0 on error.
 230 */
 231int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 232                                  const char *block_name)
 233{
 234    RAMBlock *block = qemu_ram_block_by_name(block_name);
 235    unsigned long *le_bitmap, nbits;
 236    uint64_t size;
 237
 238    if (!block) {
 239        error_report("%s: invalid block name: %s", __func__, block_name);
 240        return -1;
 241    }
 242
 243    nbits = block->postcopy_length >> TARGET_PAGE_BITS;
 244
 245    /*
 246     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 247     * machines we may need 4 more bytes for padding (see below
  248     * comment). So extend it a bit beforehand.
 249     */
 250    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 251
 252    /*
 253     * Always use little endian when sending the bitmap. This is
  254     * required when the source and destination VMs are not using the
 255     * same endianness. (Note: big endian won't work.)
 256     */
 257    bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 258
 259    /* Size of the bitmap, in bytes */
 260    size = DIV_ROUND_UP(nbits, 8);
 261
 262    /*
 263     * size is always aligned to 8 bytes for 64bit machines, but it
 264     * may not be true for 32bit machines. We need this padding to
 265     * make sure the migration can survive even between 32bit and
 266     * 64bit machines.
 267     */
 268    size = ROUND_UP(size, 8);
 269
 270    qemu_put_be64(file, size);
 271    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 272    /*
 273     * Mark as an end, in case the middle part is screwed up due to
 274     * some "mysterious" reason.
 275     */
 276    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 277    qemu_fflush(file);
 278
 279    g_free(le_bitmap);
 280
 281    if (qemu_file_get_error(file)) {
 282        return qemu_file_get_error(file);
 283    }
 284
 285    return size + sizeof(size);
 286}
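
/*
 * Illustrative sketch (added note, not part of the original source): the
 * peer is expected to consume the stream produced above in the same order,
 * i.e. bitmap size, bitmap payload, ending marker.  Variable names below
 * are hypothetical.
 *
 *     uint64_t size = qemu_get_be64(file);
 *     qemu_get_buffer(file, (uint8_t *)le_bitmap, size);
 *     if (qemu_get_be64(file) != RAMBLOCK_RECV_BITMAP_ENDING) {
 *         error_report("%s: ramblock bitmap end mark mismatch", __func__);
 *     }
 */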
 287
 288/*
 289 * An outstanding page request, on the source, having been received
 290 * and queued
 291 */
 292struct RAMSrcPageRequest {
 293    RAMBlock *rb;
 294    hwaddr    offset;
 295    hwaddr    len;
 296
 297    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 298};
 299
 300/* State of RAM for migration */
 301struct RAMState {
 302    /* QEMUFile used for this migration */
 303    QEMUFile *f;
 304    /* UFFD file descriptor, used in 'write-tracking' migration */
 305    int uffdio_fd;
 306    /* Last block that we have visited searching for dirty pages */
 307    RAMBlock *last_seen_block;
 308    /* Last block from where we have sent data */
 309    RAMBlock *last_sent_block;
 310    /* Last dirty target page we have sent */
 311    ram_addr_t last_page;
 312    /* last ram version we have seen */
 313    uint32_t last_version;
 314    /* How many times we have dirty too many pages */
 315    int dirty_rate_high_cnt;
 316    /* these variables are used for bitmap sync */
 317    /* last time we did a full bitmap_sync */
 318    int64_t time_last_bitmap_sync;
 319    /* bytes transferred at start_time */
 320    uint64_t bytes_xfer_prev;
 321    /* number of dirty pages since start_time */
 322    uint64_t num_dirty_pages_period;
 323    /* xbzrle misses since the beginning of the period */
 324    uint64_t xbzrle_cache_miss_prev;
 325    /* Amount of xbzrle pages since the beginning of the period */
 326    uint64_t xbzrle_pages_prev;
 327    /* Amount of xbzrle encoded bytes since the beginning of the period */
 328    uint64_t xbzrle_bytes_prev;
 329    /* Start using XBZRLE (e.g., after the first round). */
 330    bool xbzrle_enabled;
 331
 332    /* compression statistics since the beginning of the period */
  333    /* number of times there was no free thread to compress data */
  334    uint64_t compress_thread_busy_prev;
  335    /* number of bytes after compression */
  336    uint64_t compressed_size_prev;
  337    /* number of compressed pages */
 338    uint64_t compress_pages_prev;
 339
 340    /* total handled target pages at the beginning of period */
 341    uint64_t target_page_count_prev;
 342    /* total handled target pages since start */
 343    uint64_t target_page_count;
 344    /* number of dirty bits in the bitmap */
 345    uint64_t migration_dirty_pages;
 346    /* Protects modification of the bitmap and migration dirty pages */
 347    QemuMutex bitmap_mutex;
 348    /* The RAMBlock used in the last src_page_requests */
 349    RAMBlock *last_req_rb;
 350    /* Queue of outstanding page requests from the destination */
 351    QemuMutex src_page_req_mutex;
 352    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 353};
 354typedef struct RAMState RAMState;
 355
 356static RAMState *ram_state;
 357
 358static NotifierWithReturnList precopy_notifier_list;
 359
 360void precopy_infrastructure_init(void)
 361{
 362    notifier_with_return_list_init(&precopy_notifier_list);
 363}
 364
 365void precopy_add_notifier(NotifierWithReturn *n)
 366{
 367    notifier_with_return_list_add(&precopy_notifier_list, n);
 368}
 369
 370void precopy_remove_notifier(NotifierWithReturn *n)
 371{
 372    notifier_with_return_remove(n);
 373}
 374
 375int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 376{
 377    PrecopyNotifyData pnd;
 378    pnd.reason = reason;
 379    pnd.errp = errp;
 380
 381    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 382}
 383
 384uint64_t ram_bytes_remaining(void)
 385{
 386    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 387                       0;
 388}
 389
 390MigrationStats ram_counters;
 391
 392/* used by the search for pages to send */
 393struct PageSearchStatus {
 394    /* Current block being searched */
 395    RAMBlock    *block;
 396    /* Current page to search from */
 397    unsigned long page;
 398    /* Set once we wrap around */
 399    bool         complete_round;
 400};
 401typedef struct PageSearchStatus PageSearchStatus;
 402
 403CompressionStats compression_counters;
 404
 405struct CompressParam {
 406    bool done;
 407    bool quit;
 408    bool zero_page;
 409    QEMUFile *file;
 410    QemuMutex mutex;
 411    QemuCond cond;
 412    RAMBlock *block;
 413    ram_addr_t offset;
 414
 415    /* internally used fields */
 416    z_stream stream;
 417    uint8_t *originbuf;
 418};
 419typedef struct CompressParam CompressParam;
 420
 421struct DecompressParam {
 422    bool done;
 423    bool quit;
 424    QemuMutex mutex;
 425    QemuCond cond;
 426    void *des;
 427    uint8_t *compbuf;
 428    int len;
 429    z_stream stream;
 430};
 431typedef struct DecompressParam DecompressParam;
 432
 433static CompressParam *comp_param;
 434static QemuThread *compress_threads;
 435/* comp_done_cond is used to wake up the migration thread when
 436 * one of the compression threads has finished the compression.
  437 * comp_done_lock is used together with comp_done_cond.
 438 */
 439static QemuMutex comp_done_lock;
 440static QemuCond comp_done_cond;
 441/* The empty QEMUFileOps will be used by file in CompressParam */
 442static const QEMUFileOps empty_ops = { };
 443
 444static QEMUFile *decomp_file;
 445static DecompressParam *decomp_param;
 446static QemuThread *decompress_threads;
 447static QemuMutex decomp_done_lock;
 448static QemuCond decomp_done_cond;
 449
 450static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 451                                 ram_addr_t offset, uint8_t *source_buf);
 452
 453static void *do_data_compress(void *opaque)
 454{
 455    CompressParam *param = opaque;
 456    RAMBlock *block;
 457    ram_addr_t offset;
 458    bool zero_page;
 459
 460    qemu_mutex_lock(&param->mutex);
 461    while (!param->quit) {
 462        if (param->block) {
 463            block = param->block;
 464            offset = param->offset;
 465            param->block = NULL;
 466            qemu_mutex_unlock(&param->mutex);
 467
 468            zero_page = do_compress_ram_page(param->file, &param->stream,
 469                                             block, offset, param->originbuf);
 470
 471            qemu_mutex_lock(&comp_done_lock);
 472            param->done = true;
 473            param->zero_page = zero_page;
 474            qemu_cond_signal(&comp_done_cond);
 475            qemu_mutex_unlock(&comp_done_lock);
 476
 477            qemu_mutex_lock(&param->mutex);
 478        } else {
 479            qemu_cond_wait(&param->cond, &param->mutex);
 480        }
 481    }
 482    qemu_mutex_unlock(&param->mutex);
 483
 484    return NULL;
 485}
 486
 487static void compress_threads_save_cleanup(void)
 488{
 489    int i, thread_count;
 490
 491    if (!migrate_use_compression() || !comp_param) {
 492        return;
 493    }
 494
 495    thread_count = migrate_compress_threads();
 496    for (i = 0; i < thread_count; i++) {
 497        /*
  498         * we use it as an indicator which shows if the thread is
 499         * properly init'd or not
 500         */
 501        if (!comp_param[i].file) {
 502            break;
 503        }
 504
 505        qemu_mutex_lock(&comp_param[i].mutex);
 506        comp_param[i].quit = true;
 507        qemu_cond_signal(&comp_param[i].cond);
 508        qemu_mutex_unlock(&comp_param[i].mutex);
 509
 510        qemu_thread_join(compress_threads + i);
 511        qemu_mutex_destroy(&comp_param[i].mutex);
 512        qemu_cond_destroy(&comp_param[i].cond);
 513        deflateEnd(&comp_param[i].stream);
 514        g_free(comp_param[i].originbuf);
 515        qemu_fclose(comp_param[i].file);
 516        comp_param[i].file = NULL;
 517    }
 518    qemu_mutex_destroy(&comp_done_lock);
 519    qemu_cond_destroy(&comp_done_cond);
 520    g_free(compress_threads);
 521    g_free(comp_param);
 522    compress_threads = NULL;
 523    comp_param = NULL;
 524}
 525
 526static int compress_threads_save_setup(void)
 527{
 528    int i, thread_count;
 529
 530    if (!migrate_use_compression()) {
 531        return 0;
 532    }
 533    thread_count = migrate_compress_threads();
 534    compress_threads = g_new0(QemuThread, thread_count);
 535    comp_param = g_new0(CompressParam, thread_count);
 536    qemu_cond_init(&comp_done_cond);
 537    qemu_mutex_init(&comp_done_lock);
 538    for (i = 0; i < thread_count; i++) {
 539        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 540        if (!comp_param[i].originbuf) {
 541            goto exit;
 542        }
 543
 544        if (deflateInit(&comp_param[i].stream,
 545                        migrate_compress_level()) != Z_OK) {
 546            g_free(comp_param[i].originbuf);
 547            goto exit;
 548        }
 549
 550        /* comp_param[i].file is just used as a dummy buffer to save data,
 551         * set its ops to empty.
 552         */
 553        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
 554        comp_param[i].done = true;
 555        comp_param[i].quit = false;
 556        qemu_mutex_init(&comp_param[i].mutex);
 557        qemu_cond_init(&comp_param[i].cond);
 558        qemu_thread_create(compress_threads + i, "compress",
 559                           do_data_compress, comp_param + i,
 560                           QEMU_THREAD_JOINABLE);
 561    }
 562    return 0;
 563
 564exit:
 565    compress_threads_save_cleanup();
 566    return -1;
 567}
 568
 569/**
 570 * save_page_header: write page header to wire
 571 *
 572 * If this is the 1st block, it also writes the block identification
 573 *
 574 * Returns the number of bytes written
 575 *
 576 * @f: QEMUFile where to send the data
 577 * @block: block that contains the page we want to send
 578 * @offset: offset inside the block for the page
 579 *          in the lower bits, it contains flags
 580 */
 581static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 582                               ram_addr_t offset)
 583{
 584    size_t size, len;
 585
 586    if (block == rs->last_sent_block) {
 587        offset |= RAM_SAVE_FLAG_CONTINUE;
 588    }
 589    qemu_put_be64(f, offset);
 590    size = 8;
 591
 592    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 593        len = strlen(block->idstr);
 594        qemu_put_byte(f, len);
 595        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 596        size += 1 + len;
 597        rs->last_sent_block = block;
 598    }
 599    return size;
 600}
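
/*
 * Descriptive note (added, not part of the original source): the header
 * written above has the following wire layout:
 *
 *     8 bytes : page offset within the block, ORed with RAM_SAVE_FLAG_* bits
 *     only when RAM_SAVE_FLAG_CONTINUE is clear:
 *         1 byte  : strlen(block->idstr)
 *         N bytes : block->idstr (not NUL-terminated on the wire)
 */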
 601
 602/**
 603 * mig_throttle_guest_down: throttle down the guest
 604 *
 605 * Reduce amount of guest cpu execution to hopefully slow down memory
 606 * writes. If guest dirty memory rate is reduced below the rate at
 607 * which we can transfer pages to the destination then we should be
 608 * able to complete migration. Some workloads dirty memory way too
 609 * fast and will not effectively converge, even with auto-converge.
 610 */
 611static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 612                                    uint64_t bytes_dirty_threshold)
 613{
 614    MigrationState *s = migrate_get_current();
 615    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 616    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 617    bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 618    int pct_max = s->parameters.max_cpu_throttle;
 619
 620    uint64_t throttle_now = cpu_throttle_get_percentage();
 621    uint64_t cpu_now, cpu_ideal, throttle_inc;
 622
 623    /* We have not started throttling yet. Let's start it. */
 624    if (!cpu_throttle_active()) {
 625        cpu_throttle_set(pct_initial);
 626    } else {
 627        /* Throttling already on, just increase the rate */
 628        if (!pct_tailslow) {
 629            throttle_inc = pct_increment;
 630        } else {
  631            /* Compute the ideal CPU percentage used by the guest, which
  632             * would make the dirty rate match the dirty rate threshold. */
 633            cpu_now = 100 - throttle_now;
 634            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 635                        bytes_dirty_period);
 636            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 637        }
 638        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 639    }
 640}
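
/*
 * Worked example (added note, with illustrative numbers): with
 * cpu-throttle-tailslow enabled, a current throttle of 20% leaves
 * cpu_now = 80.  If the guest dirtied twice as many bytes as could be
 * transferred (bytes_dirty_threshold / bytes_dirty_period = 0.5), then
 * cpu_ideal = 80 * 0.5 = 40 and the throttle grows by
 * MIN(80 - 40, cpu_throttle_increment) instead of the full increment.
 */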
 641
 642/**
 643 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 644 *
 645 * @rs: current RAM state
 646 * @current_addr: address for the zero page
 647 *
 648 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 649 * The important thing is that a stale (not-yet-0'd) page be replaced
 650 * by the new data.
 651 * As a bonus, if the page wasn't in the cache it gets added so that
 652 * when a small write is made into the 0'd page it gets XBZRLE sent.
 653 */
 654static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 655{
 656    if (!rs->xbzrle_enabled) {
 657        return;
 658    }
 659
 660    /* We don't care if this fails to allocate a new cache page
  661     * as long as it updates an old one */
 662    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 663                 ram_counters.dirty_sync_count);
 664}
 665
 666#define ENCODING_FLAG_XBZRLE 0x1
 667
 668/**
 669 * save_xbzrle_page: compress and send current page
 670 *
 671 * Returns: 1 means that we wrote the page
 672 *          0 means that page is identical to the one already sent
 673 *          -1 means that xbzrle would be longer than normal
 674 *
 675 * @rs: current RAM state
 676 * @current_data: pointer to the address of the page contents
 677 * @current_addr: addr of the page
 678 * @block: block that contains the page we want to send
 679 * @offset: offset inside the block for the page
 680 * @last_stage: if we are at the completion stage
 681 */
 682static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 683                            ram_addr_t current_addr, RAMBlock *block,
 684                            ram_addr_t offset, bool last_stage)
 685{
 686    int encoded_len = 0, bytes_xbzrle;
 687    uint8_t *prev_cached_page;
 688
 689    if (!cache_is_cached(XBZRLE.cache, current_addr,
 690                         ram_counters.dirty_sync_count)) {
 691        xbzrle_counters.cache_miss++;
 692        if (!last_stage) {
 693            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 694                             ram_counters.dirty_sync_count) == -1) {
 695                return -1;
 696            } else {
 697                /* update *current_data when the page has been
 698                   inserted into cache */
 699                *current_data = get_cached_data(XBZRLE.cache, current_addr);
 700            }
 701        }
 702        return -1;
 703    }
 704
 705    /*
 706     * Reaching here means the page has hit the xbzrle cache, no matter what
 707     * encoding result it is (normal encoding, overflow or skipping the page),
 708     * count the page as encoded. This is used to calculate the encoding rate.
 709     *
 710     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 711     * 2nd page turns out to be skipped (i.e. no new bytes written to the
 712     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 713     * skipped page included. In this way, the encoding rate can tell if the
 714     * guest page is good for xbzrle encoding.
 715     */
 716    xbzrle_counters.pages++;
 717    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 718
 719    /* save current buffer into memory */
 720    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 721
 722    /* XBZRLE encoding (if there is no overflow) */
 723    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 724                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 725                                       TARGET_PAGE_SIZE);
 726
 727    /*
 728     * Update the cache contents, so that it corresponds to the data
 729     * sent, in all cases except where we skip the page.
 730     */
 731    if (!last_stage && encoded_len != 0) {
 732        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 733        /*
 734         * In the case where we couldn't compress, ensure that the caller
 735         * sends the data from the cache, since the guest might have
 736         * changed the RAM since we copied it.
 737         */
 738        *current_data = prev_cached_page;
 739    }
 740
 741    if (encoded_len == 0) {
 742        trace_save_xbzrle_page_skipping();
 743        return 0;
 744    } else if (encoded_len == -1) {
 745        trace_save_xbzrle_page_overflow();
 746        xbzrle_counters.overflow++;
 747        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 748        return -1;
 749    }
 750
 751    /* Send XBZRLE based compressed page */
 752    bytes_xbzrle = save_page_header(rs, rs->f, block,
 753                                    offset | RAM_SAVE_FLAG_XBZRLE);
 754    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 755    qemu_put_be16(rs->f, encoded_len);
 756    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 757    bytes_xbzrle += encoded_len + 1 + 2;
 758    /*
 759     * Like compressed_size (please see update_compress_thread_counts),
 760     * the xbzrle encoded bytes don't count the 8 byte header with
 761     * RAM_SAVE_FLAG_CONTINUE.
 762     */
 763    xbzrle_counters.bytes += bytes_xbzrle - 8;
 764    ram_counters.transferred += bytes_xbzrle;
 765
 766    return 1;
 767}
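
/*
 * Descriptive note (added, not part of the original source): an XBZRLE page
 * sent above looks like this on the wire:
 *
 *     page header with RAM_SAVE_FLAG_XBZRLE set (see save_page_header())
 *     1 byte  : ENCODING_FLAG_XBZRLE
 *     2 bytes : encoded_len, big endian
 *     N bytes : the XBZRLE-encoded delta against the cached copy
 */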
 768
 769/**
 770 * migration_bitmap_find_dirty: find the next dirty page from start
 771 *
 772 * Returns the page offset within memory region of the start of a dirty page
 773 *
 774 * @rs: current RAM state
 775 * @rb: RAMBlock where to search for dirty pages
 776 * @start: page where we start the search
 777 */
 778static inline
 779unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 780                                          unsigned long start)
 781{
 782    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 783    unsigned long *bitmap = rb->bmap;
 784
 785    if (ramblock_is_ignored(rb)) {
 786        return size;
 787    }
 788
 789    return find_next_bit(bitmap, size, start);
 790}
 791
 792static void migration_clear_memory_region_dirty_bitmap(RAMState *rs,
 793                                                       RAMBlock *rb,
 794                                                       unsigned long page)
 795{
 796    uint8_t shift;
 797    hwaddr size, start;
 798
 799    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
 800        return;
 801    }
 802
 803    shift = rb->clear_bmap_shift;
 804    /*
 805     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 806     * can make things easier sometimes since then start address
 807     * of the small chunk will always be 64 pages aligned so the
 808     * bitmap will always be aligned to unsigned long. We should
 809     * even be able to remove this restriction but I'm simply
 810     * keeping it.
 811     */
 812    assert(shift >= 6);
 813
 814    size = 1ULL << (TARGET_PAGE_BITS + shift);
 815    start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
 816    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 817    memory_region_clear_dirty_bitmap(rb->mr, start, size);
 818}
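
/*
 * Worked example (added note, assuming a 4K target page and a clear_bmap
 * shift of 18): one clear_bmap bit then covers 1 << (12 + 18) bytes, i.e.
 * 1GB of guest memory, and 'start' is the page address rounded down to the
 * enclosing 1GB boundary before the dirty log is cleared for that chunk.
 */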
 819
 820static void
 821migration_clear_memory_region_dirty_bitmap_range(RAMState *rs,
 822                                                 RAMBlock *rb,
 823                                                 unsigned long start,
 824                                                 unsigned long npages)
 825{
 826    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
 827    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
 828    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
 829
 830    /*
 831     * Clear pages from start to start + npages - 1, so the end boundary is
 832     * exclusive.
 833     */
 834    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
 835        migration_clear_memory_region_dirty_bitmap(rs, rb, i);
 836    }
 837}
 838
 839static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 840                                                RAMBlock *rb,
 841                                                unsigned long page)
 842{
 843    bool ret;
 844
 845    /*
 846     * Clear dirty bitmap if needed.  This _must_ be called before we
 847     * send any of the page in the chunk because we need to make sure
 848     * we can capture further page content changes when we sync dirty
 849     * log the next time.  So as long as we are going to send any of
 850     * the page in the chunk we clear the remote dirty bitmap for all.
 851     * Clearing it earlier won't be a problem, but too late will.
 852     */
 853    migration_clear_memory_region_dirty_bitmap(rs, rb, page);
 854
 855    ret = test_and_clear_bit(page, rb->bmap);
 856    if (ret) {
 857        rs->migration_dirty_pages--;
 858    }
 859
 860    return ret;
 861}
 862
 863/* Called with RCU critical section */
 864static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 865{
 866    uint64_t new_dirty_pages =
 867        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
 868
 869    rs->migration_dirty_pages += new_dirty_pages;
 870    rs->num_dirty_pages_period += new_dirty_pages;
 871}
 872
 873/**
 874 * ram_pagesize_summary: calculate all the pagesizes of a VM
 875 *
 876 * Returns a summary bitmap of the page sizes of all RAMBlocks
 877 *
 878 * For VMs with just normal pages this is equivalent to the host page
 879 * size. If it's got some huge pages then it's the OR of all the
 880 * different page sizes.
 881 */
 882uint64_t ram_pagesize_summary(void)
 883{
 884    RAMBlock *block;
 885    uint64_t summary = 0;
 886
 887    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 888        summary |= block->page_size;
 889    }
 890
 891    return summary;
 892}
 893
 894uint64_t ram_get_total_transferred_pages(void)
 895{
 896    return  ram_counters.normal + ram_counters.duplicate +
 897                compression_counters.pages + xbzrle_counters.pages;
 898}
 899
 900static void migration_update_rates(RAMState *rs, int64_t end_time)
 901{
 902    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
 903    double compressed_size;
 904
 905    /* calculate period counters */
 906    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 907                / (end_time - rs->time_last_bitmap_sync);
 908
 909    if (!page_count) {
 910        return;
 911    }
 912
 913    if (migrate_use_xbzrle()) {
 914        double encoded_size, unencoded_size;
 915
 916        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
 917            rs->xbzrle_cache_miss_prev) / page_count;
 918        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 919        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
 920                         TARGET_PAGE_SIZE;
 921        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
 922        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
 923            xbzrle_counters.encoding_rate = 0;
 924        } else {
 925            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
 926        }
 927        rs->xbzrle_pages_prev = xbzrle_counters.pages;
 928        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
 929    }
 930
 931    if (migrate_use_compression()) {
 932        compression_counters.busy_rate = (double)(compression_counters.busy -
 933            rs->compress_thread_busy_prev) / page_count;
 934        rs->compress_thread_busy_prev = compression_counters.busy;
 935
 936        compressed_size = compression_counters.compressed_size -
 937                          rs->compressed_size_prev;
 938        if (compressed_size) {
 939            double uncompressed_size = (compression_counters.pages -
 940                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;
 941
 942            /* Compression-Ratio = Uncompressed-size / Compressed-size */
 943            compression_counters.compression_rate =
 944                                        uncompressed_size / compressed_size;
 945
 946            rs->compress_pages_prev = compression_counters.pages;
 947            rs->compressed_size_prev = compression_counters.compressed_size;
 948        }
 949    }
 950}
 951
 952static void migration_trigger_throttle(RAMState *rs)
 953{
 954    MigrationState *s = migrate_get_current();
 955    uint64_t threshold = s->parameters.throttle_trigger_threshold;
 956
 957    uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
 958    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
 959    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
 960
 961    /* During block migration the auto-converge logic incorrectly detects
 962     * that ram migration makes no progress. Avoid this by disabling the
 963     * throttling logic during the bulk phase of block migration. */
 964    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
 965        /* The following detection logic can be refined later. For now:
 966           Check to see if the ratio between dirtied bytes and the approx.
 967           amount of bytes that just got transferred since the last time
 968           we were in this routine reaches the threshold. If that happens
 969           twice, start or increase throttling. */
 970
 971        if ((bytes_dirty_period > bytes_dirty_threshold) &&
 972            (++rs->dirty_rate_high_cnt >= 2)) {
 973            trace_migration_throttle();
 974            rs->dirty_rate_high_cnt = 0;
 975            mig_throttle_guest_down(bytes_dirty_period,
 976                                    bytes_dirty_threshold);
 977        }
 978    }
 979}
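
/*
 * Worked example (added note, assuming a throttle-trigger-threshold of 50):
 * if roughly 100MB were transferred since the last sync, then
 * bytes_dirty_threshold = 100MB * 50 / 100 = 50MB.  Dirtying more than
 * 50MB within the same period on two consecutive syncs calls
 * mig_throttle_guest_down().
 */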
 980
 981static void migration_bitmap_sync(RAMState *rs)
 982{
 983    RAMBlock *block;
 984    int64_t end_time;
 985
 986    ram_counters.dirty_sync_count++;
 987
 988    if (!rs->time_last_bitmap_sync) {
 989        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 990    }
 991
 992    trace_migration_bitmap_sync_start();
 993    memory_global_dirty_log_sync();
 994
 995    qemu_mutex_lock(&rs->bitmap_mutex);
 996    WITH_RCU_READ_LOCK_GUARD() {
 997        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 998            ramblock_sync_dirty_bitmap(rs, block);
 999        }
1000        ram_counters.remaining = ram_bytes_remaining();
1001    }
1002    qemu_mutex_unlock(&rs->bitmap_mutex);
1003
1004    memory_global_after_dirty_log_sync();
1005    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1006
1007    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1008
 1009    /* more than 1 second = 1000 milliseconds */
1010    if (end_time > rs->time_last_bitmap_sync + 1000) {
1011        migration_trigger_throttle(rs);
1012
1013        migration_update_rates(rs, end_time);
1014
1015        rs->target_page_count_prev = rs->target_page_count;
1016
1017        /* reset period counters */
1018        rs->time_last_bitmap_sync = end_time;
1019        rs->num_dirty_pages_period = 0;
1020        rs->bytes_xfer_prev = ram_counters.transferred;
1021    }
1022    if (migrate_use_events()) {
1023        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1024    }
1025}
1026
1027static void migration_bitmap_sync_precopy(RAMState *rs)
1028{
1029    Error *local_err = NULL;
1030
1031    /*
 1032     * The current notifier usage is just an optimization for migration, so we
1033     * don't stop the normal migration process in the error case.
1034     */
1035    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1036        error_report_err(local_err);
1037        local_err = NULL;
1038    }
1039
1040    migration_bitmap_sync(rs);
1041
1042    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1043        error_report_err(local_err);
1044    }
1045}
1046
1047/**
1048 * save_zero_page_to_file: send the zero page to the file
1049 *
1050 * Returns the size of data written to the file, 0 means the page is not
1051 * a zero page
1052 *
1053 * @rs: current RAM state
1054 * @file: the file where the data is saved
1055 * @block: block that contains the page we want to send
1056 * @offset: offset inside the block for the page
1057 */
1058static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1059                                  RAMBlock *block, ram_addr_t offset)
1060{
1061    uint8_t *p = block->host + offset;
1062    int len = 0;
1063
1064    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1065        len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1066        qemu_put_byte(file, 0);
1067        len += 1;
1068    }
1069    return len;
1070}
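
/*
 * Descriptive note (added, not part of the original source): a zero page
 * costs just a header plus a single byte on the wire:
 *
 *     page header with RAM_SAVE_FLAG_ZERO set (see save_page_header())
 *     1 byte : 0
 */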
1071
1072/**
1073 * save_zero_page: send the zero page to the stream
1074 *
1075 * Returns the number of pages written.
1076 *
1077 * @rs: current RAM state
1078 * @block: block that contains the page we want to send
1079 * @offset: offset inside the block for the page
1080 */
1081static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1082{
1083    int len = save_zero_page_to_file(rs, rs->f, block, offset);
1084
1085    if (len) {
1086        ram_counters.duplicate++;
1087        ram_counters.transferred += len;
1088        return 1;
1089    }
1090    return -1;
1091}
1092
1093static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1094{
1095    if (!migrate_release_ram() || !migration_in_postcopy()) {
1096        return;
1097    }
1098
1099    ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1100}
1101
1102/*
1103 * @pages: the number of pages written by the control path,
1104 *        < 0 - error
1105 *        > 0 - number of pages written
1106 *
 1107 * Returns true if the page has been saved, otherwise false.
1108 */
1109static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1110                              int *pages)
1111{
1112    uint64_t bytes_xmit = 0;
1113    int ret;
1114
1115    *pages = -1;
1116    ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1117                                &bytes_xmit);
1118    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1119        return false;
1120    }
1121
1122    if (bytes_xmit) {
1123        ram_counters.transferred += bytes_xmit;
1124        *pages = 1;
1125    }
1126
1127    if (ret == RAM_SAVE_CONTROL_DELAYED) {
1128        return true;
1129    }
1130
1131    if (bytes_xmit > 0) {
1132        ram_counters.normal++;
1133    } else if (bytes_xmit == 0) {
1134        ram_counters.duplicate++;
1135    }
1136
1137    return true;
1138}
1139
1140/*
1141 * directly send the page to the stream
1142 *
1143 * Returns the number of pages written.
1144 *
1145 * @rs: current RAM state
1146 * @block: block that contains the page we want to send
1147 * @offset: offset inside the block for the page
1148 * @buf: the page to be sent
 1149 * @async: send the page asynchronously
1150 */
1151static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1152                            uint8_t *buf, bool async)
1153{
1154    ram_counters.transferred += save_page_header(rs, rs->f, block,
1155                                                 offset | RAM_SAVE_FLAG_PAGE);
1156    if (async) {
1157        qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
 1158                              migrate_release_ram() &&
1159                              migration_in_postcopy());
1160    } else {
1161        qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1162    }
1163    ram_counters.transferred += TARGET_PAGE_SIZE;
1164    ram_counters.normal++;
1165    return 1;
1166}
1167
1168/**
1169 * ram_save_page: send the given page to the stream
1170 *
1171 * Returns the number of pages written.
1172 *          < 0 - error
1173 *          >=0 - Number of pages written - this might legally be 0
1174 *                if xbzrle noticed the page was the same.
1175 *
1176 * @rs: current RAM state
1177 * @block: block that contains the page we want to send
1178 * @offset: offset inside the block for the page
1179 * @last_stage: if we are at the completion stage
1180 */
1181static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1182{
1183    int pages = -1;
1184    uint8_t *p;
1185    bool send_async = true;
1186    RAMBlock *block = pss->block;
1187    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1188    ram_addr_t current_addr = block->offset + offset;
1189
1190    p = block->host + offset;
1191    trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1192
1193    XBZRLE_cache_lock();
1194    if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1195        pages = save_xbzrle_page(rs, &p, current_addr, block,
1196                                 offset, last_stage);
1197        if (!last_stage) {
1198            /* Can't send this cached data async, since the cache page
1199             * might get updated before it gets to the wire
1200             */
1201            send_async = false;
1202        }
1203    }
1204
1205    /* XBZRLE overflow or normal page */
1206    if (pages == -1) {
1207        pages = save_normal_page(rs, block, offset, p, send_async);
1208    }
1209
1210    XBZRLE_cache_unlock();
1211
1212    return pages;
1213}
1214
1215static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1216                                 ram_addr_t offset)
1217{
1218    if (multifd_queue_page(rs->f, block, offset) < 0) {
1219        return -1;
1220    }
1221    ram_counters.normal++;
1222
1223    return 1;
1224}
1225
1226static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1227                                 ram_addr_t offset, uint8_t *source_buf)
1228{
1229    RAMState *rs = ram_state;
1230    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1231    bool zero_page = false;
1232    int ret;
1233
1234    if (save_zero_page_to_file(rs, f, block, offset)) {
1235        zero_page = true;
1236        goto exit;
1237    }
1238
1239    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1240
1241    /*
 1242     * copy it to an internal buffer to avoid it being modified by the VM,
 1243     * so that we can catch errors during compression and
 1244     * decompression
1245     */
1246    memcpy(source_buf, p, TARGET_PAGE_SIZE);
1247    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1248    if (ret < 0) {
1249        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1250        error_report("compressed data failed!");
1251        return false;
1252    }
1253
1254exit:
1255    ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1256    return zero_page;
1257}
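
/*
 * Descriptive note (added, not part of the original source): unless the page
 * was all zeroes, the compression thread emits a page header with
 * RAM_SAVE_FLAG_COMPRESS_PAGE set, followed by the zlib-compressed page as
 * written by qemu_put_compression_data().
 */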
1258
1259static void
1260update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1261{
1262    ram_counters.transferred += bytes_xmit;
1263
1264    if (param->zero_page) {
1265        ram_counters.duplicate++;
1266        return;
1267    }
1268
1269    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1270    compression_counters.compressed_size += bytes_xmit - 8;
1271    compression_counters.pages++;
1272}
1273
1274static bool save_page_use_compression(RAMState *rs);
1275
1276static void flush_compressed_data(RAMState *rs)
1277{
1278    int idx, len, thread_count;
1279
1280    if (!save_page_use_compression(rs)) {
1281        return;
1282    }
1283    thread_count = migrate_compress_threads();
1284
1285    qemu_mutex_lock(&comp_done_lock);
1286    for (idx = 0; idx < thread_count; idx++) {
1287        while (!comp_param[idx].done) {
1288            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1289        }
1290    }
1291    qemu_mutex_unlock(&comp_done_lock);
1292
1293    for (idx = 0; idx < thread_count; idx++) {
1294        qemu_mutex_lock(&comp_param[idx].mutex);
1295        if (!comp_param[idx].quit) {
1296            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1297            /*
1298             * it's safe to fetch zero_page without holding comp_done_lock
1299             * as there is no further request submitted to the thread,
 1300             * i.e., the thread should be waiting for a request at this point.
1301             */
1302            update_compress_thread_counts(&comp_param[idx], len);
1303        }
1304        qemu_mutex_unlock(&comp_param[idx].mutex);
1305    }
1306}
1307
1308static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1309                                       ram_addr_t offset)
1310{
1311    param->block = block;
1312    param->offset = offset;
1313}
1314
1315static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1316                                           ram_addr_t offset)
1317{
1318    int idx, thread_count, bytes_xmit = -1, pages = -1;
1319    bool wait = migrate_compress_wait_thread();
1320
1321    thread_count = migrate_compress_threads();
1322    qemu_mutex_lock(&comp_done_lock);
1323retry:
1324    for (idx = 0; idx < thread_count; idx++) {
1325        if (comp_param[idx].done) {
1326            comp_param[idx].done = false;
1327            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1328            qemu_mutex_lock(&comp_param[idx].mutex);
1329            set_compress_params(&comp_param[idx], block, offset);
1330            qemu_cond_signal(&comp_param[idx].cond);
1331            qemu_mutex_unlock(&comp_param[idx].mutex);
1332            pages = 1;
1333            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1334            break;
1335        }
1336    }
1337
1338    /*
 1339     * wait for a free thread if the user specifies 'compress-wait-thread',
 1340     * otherwise we will post the page out in the main thread as a normal page.
1341     */
1342    if (pages < 0 && wait) {
1343        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1344        goto retry;
1345    }
1346    qemu_mutex_unlock(&comp_done_lock);
1347
1348    return pages;
1349}
1350
1351/**
1352 * find_dirty_block: find the next dirty page and update any state
1353 * associated with the search process.
1354 *
1355 * Returns true if a page is found
1356 *
1357 * @rs: current RAM state
1358 * @pss: data about the state of the current dirty page scan
1359 * @again: set to false if the search has scanned the whole of RAM
1360 */
1361static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1362{
1363    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1364    if (pss->complete_round && pss->block == rs->last_seen_block &&
1365        pss->page >= rs->last_page) {
1366        /*
1367         * We've been once around the RAM and haven't found anything.
1368         * Give up.
1369         */
1370        *again = false;
1371        return false;
1372    }
1373    if (!offset_in_ramblock(pss->block,
1374                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1375        /* Didn't find anything in this RAM Block */
1376        pss->page = 0;
1377        pss->block = QLIST_NEXT_RCU(pss->block, next);
1378        if (!pss->block) {
1379            /*
 1380             * If memory migration starts over, we will meet a dirtied page
 1381             * which may still exist in the compression threads' ring, so we
 1382             * should flush the compressed data to make sure the new page
 1383             * is not overwritten by the old one at the destination.
 1384             *
 1385             * Also, if xbzrle is on, stop using data compression at this
 1386             * point. In theory, xbzrle can do better than compression.
1387             */
1388            flush_compressed_data(rs);
1389
1390            /* Hit the end of the list */
1391            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1392            /* Flag that we've looped */
1393            pss->complete_round = true;
1394            /* After the first round, enable XBZRLE. */
1395            if (migrate_use_xbzrle()) {
1396                rs->xbzrle_enabled = true;
1397            }
1398        }
1399        /* Didn't find anything this time, but try again on the new block */
1400        *again = true;
1401        return false;
1402    } else {
1403        /* Can go around again, but... */
1404        *again = true;
1405        /* We've found something so probably don't need to */
1406        return true;
1407    }
1408}
1409
1410/**
 1411 * unqueue_page: gets a page off the queue
1412 *
1413 * Helper for 'get_queued_page' - gets a page off the queue
1414 *
1415 * Returns the block of the page (or NULL if none available)
1416 *
1417 * @rs: current RAM state
1418 * @offset: used to return the offset within the RAMBlock
1419 */
1420static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1421{
1422    RAMBlock *block = NULL;
1423
1424    if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1425        return NULL;
1426    }
1427
1428    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1429    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1430        struct RAMSrcPageRequest *entry =
1431                                QSIMPLEQ_FIRST(&rs->src_page_requests);
1432        block = entry->rb;
1433        *offset = entry->offset;
1434
1435        if (entry->len > TARGET_PAGE_SIZE) {
1436            entry->len -= TARGET_PAGE_SIZE;
1437            entry->offset += TARGET_PAGE_SIZE;
1438        } else {
1439            memory_region_unref(block->mr);
1440            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1441            g_free(entry);
1442            migration_consume_urgent_request();
1443        }
1444    }
1445
1446    return block;
1447}
1448
1449#if defined(__linux__)
1450/**
1451 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1452 *   is found, return RAM block pointer and page offset
1453 *
1454 * Returns pointer to the RAMBlock containing faulting page,
1455 *   NULL if no write faults are pending
1456 *
1457 * @rs: current RAM state
1458 * @offset: page offset from the beginning of the block
1459 */
1460static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1461{
1462    struct uffd_msg uffd_msg;
1463    void *page_address;
1464    RAMBlock *block;
1465    int res;
1466
1467    if (!migrate_background_snapshot()) {
1468        return NULL;
1469    }
1470
1471    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1472    if (res <= 0) {
1473        return NULL;
1474    }
1475
1476    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1477    block = qemu_ram_block_from_host(page_address, false, offset);
1478    assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1479    return block;
1480}
1481
1482/**
1483 * ram_save_release_protection: release UFFD write protection after
1484 *   a range of pages has been saved
1485 *
1486 * @rs: current RAM state
1487 * @pss: page-search-status structure
1488 * @start_page: index of the first page in the range relative to pss->block
1489 *
1490 * Returns 0 on success, negative value in case of an error
1491*/
1492static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1493        unsigned long start_page)
1494{
1495    int res = 0;
1496
1497    /* Check if page is from UFFD-managed region. */
1498    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1499        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1500        uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1501
1502        /* Flush async buffers before un-protect. */
1503        qemu_fflush(rs->f);
1504        /* Un-protect memory range. */
1505        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1506                false, false);
1507    }
1508
1509    return res;
1510}
1511
1512/* ram_write_tracking_available: check if kernel supports required UFFD features
1513 *
 1514 * Returns true if supported, false otherwise
1515 */
1516bool ram_write_tracking_available(void)
1517{
1518    uint64_t uffd_features;
1519    int res;
1520
1521    res = uffd_query_features(&uffd_features);
1522    return (res == 0 &&
1523            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1524}
1525
1526/* ram_write_tracking_compatible: check if guest configuration is
1527 *   compatible with 'write-tracking'
1528 *
1529 * Returns true if compatible, false otherwise
1530 */
1531bool ram_write_tracking_compatible(void)
1532{
1533    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1534    int uffd_fd;
1535    RAMBlock *block;
1536    bool ret = false;
1537
1538    /* Open UFFD file descriptor */
1539    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1540    if (uffd_fd < 0) {
1541        return false;
1542    }
1543
1544    RCU_READ_LOCK_GUARD();
1545
1546    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1547        uint64_t uffd_ioctls;
1548
1549        /* Nothing to do with read-only and MMIO-writable regions */
1550        if (block->mr->readonly || block->mr->rom_device) {
1551            continue;
1552        }
1553        /* Try to register block memory via UFFD-IO to track writes */
1554        if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1555                UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1556            goto out;
1557        }
1558        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1559            goto out;
1560        }
1561    }
1562    ret = true;
1563
1564out:
1565    uffd_close_fd(uffd_fd);
1566    return ret;
1567}
1568
1569/*
1570 * ram_block_populate_pages: populate memory in the RAM block by reading
 1571 *   a byte from the beginning of each page.
1572 *
 1573 * Since it's solely used for the userfault_fd WP feature, here we just
 1574 *   hardcode the page size to qemu_real_host_page_size.
1575 *
1576 * @block: RAM block to populate
1577 */
1578static void ram_block_populate_pages(RAMBlock *block)
1579{
1580    char *ptr = (char *) block->host;
1581
1582    for (ram_addr_t offset = 0; offset < block->used_length;
1583            offset += qemu_real_host_page_size) {
1584        char tmp = *(ptr + offset);
1585
1586        /* Don't optimize the read out */
1587        asm volatile("" : "+r" (tmp));
1588    }
1589}
1590
1591/*
1592 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1593 */
1594void ram_write_tracking_prepare(void)
1595{
1596    RAMBlock *block;
1597
1598    RCU_READ_LOCK_GUARD();
1599
1600    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1601        /* Nothing to do with read-only and MMIO-writable regions */
1602        if (block->mr->readonly || block->mr->rom_device) {
1603            continue;
1604        }
1605
1606        /*
1607         * Populate pages of the RAM block before enabling userfault_fd
1608         * write protection.
1609         *
1610         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1611         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1612         * pages with pte_none() entries in page table.
1613         */
1614        ram_block_populate_pages(block);
1615    }
1616}
1617
1618/*
1619 * ram_write_tracking_start: start UFFD-WP memory tracking
1620 *
1621 * Returns 0 for success or negative value in case of error
1622 */
1623int ram_write_tracking_start(void)
1624{
1625    int uffd_fd;
1626    RAMState *rs = ram_state;
1627    RAMBlock *block;
1628
1629    /* Open UFFD file descriptor */
1630    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1631    if (uffd_fd < 0) {
1632        return uffd_fd;
1633    }
1634    rs->uffdio_fd = uffd_fd;
1635
1636    RCU_READ_LOCK_GUARD();
1637
1638    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1639        /* Nothing to do for read-only and MMIO-writable (ROM device) regions */
1640        if (block->mr->readonly || block->mr->rom_device) {
1641            continue;
1642        }
1643
1644        /* Register block memory with UFFD to track writes */
1645        if (uffd_register_memory(rs->uffdio_fd, block->host,
1646                block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1647            goto fail;
1648        }
1649        /* Apply UFFD write protection to the block memory range */
1650        if (uffd_change_protection(rs->uffdio_fd, block->host,
1651                block->max_length, true, false)) {
1652            goto fail;
1653        }
1654        block->flags |= RAM_UF_WRITEPROTECT;
1655        memory_region_ref(block->mr);
1656
1657        trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1658                block->host, block->max_length);
1659    }
1660
1661    return 0;
1662
1663fail:
1664    error_report("ram_write_tracking_start() failed: restoring initial memory state");
1665
1666    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1667        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1668            continue;
1669        }
1670        /*
1671         * If some memory block failed to be write-protected, remove
1672         * protection from and unregister all blocks that did succeed.
1673         */
1674        uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1675                false, false);
1676        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1677        /* Cleanup flags and remove reference */
1678        block->flags &= ~RAM_UF_WRITEPROTECT;
1679        memory_region_unref(block->mr);
1680    }
1681
1682    uffd_close_fd(uffd_fd);
1683    rs->uffdio_fd = -1;
1684    return -1;
1685}
1686
1687/**
1688 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1689 */
1690void ram_write_tracking_stop(void)
1691{
1692    RAMState *rs = ram_state;
1693    RAMBlock *block;
1694
1695    RCU_READ_LOCK_GUARD();
1696
1697    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1698        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1699            continue;
1700        }
1701        /* Remove protection and unregister all affected RAM blocks */
1702        uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1703                false, false);
1704        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1705
1706        trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1707                block->host, block->max_length);
1708
1709        /* Cleanup flags and remove reference */
1710        block->flags &= ~RAM_UF_WRITEPROTECT;
1711        memory_region_unref(block->mr);
1712    }
1713
1714    /* Finally close UFFD file descriptor */
1715    uffd_close_fd(rs->uffdio_fd);
1716    rs->uffdio_fd = -1;
1717}
1718
1719#else
1720/* No target OS support, stubs just fail or ignore */
1721
1722static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1723{
1724    (void) rs;
1725    (void) offset;
1726
1727    return NULL;
1728}
1729
1730static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1731        unsigned long start_page)
1732{
1733    (void) rs;
1734    (void) pss;
1735    (void) start_page;
1736
1737    return 0;
1738}
1739
1740bool ram_write_tracking_available(void)
1741{
1742    return false;
1743}
1744
1745bool ram_write_tracking_compatible(void)
1746{
1747    assert(0);
1748    return false;
1749}
1750
1751int ram_write_tracking_start(void)
1752{
1753    assert(0);
1754    return -1;
1755}
1756
1757void ram_write_tracking_stop(void)
1758{
1759    assert(0);
1760}
1761#endif /* defined(__linux__) */
1762
1763/**
1764 * get_queued_page: unqueue a page from the postcopy requests
1765 *
1766 * Skips pages that have already been sent (!dirty)
1767 *
1768 * Returns true if a queued page is found
1769 *
1770 * @rs: current RAM state
1771 * @pss: data about the state of the current dirty page scan
1772 */
1773static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1774{
1775    RAMBlock  *block;
1776    ram_addr_t offset;
1777    bool dirty;
1778
1779    do {
1780        block = unqueue_page(rs, &offset);
1781        /*
1782         * We're sending this page, and since it's postcopy nothing else
1783         * will dirty it, and we must make sure it doesn't get sent again
1784         * even if this queue request was received after the background
1785         * search already sent it.
1786         */
1787        if (block) {
1788            unsigned long page;
1789
1790            page = offset >> TARGET_PAGE_BITS;
1791            dirty = test_bit(page, block->bmap);
1792            if (!dirty) {
1793                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1794                                                page);
1795            } else {
1796                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1797            }
1798        }
1799
1800    } while (block && !dirty);
1801
1802    if (!block) {
1803        /*
1804         * Poll write faults too if background snapshot is enabled; that's
1805         * when vCPUs may be blocked on write-protected pages.
1806         */
1807        block = poll_fault_page(rs, &offset);
1808    }
1809
1810    if (block) {
1811        /*
1812         * We want the background search to continue from the queued page
1813         * since the guest is likely to want other pages near to the page
1814         * it just requested.
1815         */
1816        pss->block = block;
1817        pss->page = offset >> TARGET_PAGE_BITS;
1818
1819        /*
1820         * This unqueued page would break the "one round" check, even if
1821         * it is really rare.
1822         */
1823        pss->complete_round = false;
1824    }
1825
1826    return !!block;
1827}
1828
1829/**
1830 * migration_page_queue_free: drop any remaining pages in the ram
1831 * request queue
1832 *
1833 * It should be empty at the end anyway, but in error cases there may
1834 * be some left.  If any pages are left, we drop them.
1835 *
1836 */
1837static void migration_page_queue_free(RAMState *rs)
1838{
1839    struct RAMSrcPageRequest *mspr, *next_mspr;
1840    /* This queue should generally be empty - but a failed
1841     * migration might leave some entries behind.
1842     */
1843    RCU_READ_LOCK_GUARD();
1844    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1845        memory_region_unref(mspr->rb->mr);
1846        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1847        g_free(mspr);
1848    }
1849}
1850
1851/**
1852 * ram_save_queue_pages: queue the page for transmission
1853 *
1854 * A request from postcopy destination for example.
1855 *
1856 * Returns zero on success or negative on error
1857 *
1858 * @rbname: Name of the RAMBlock of the request. NULL means the
1859 *          same as the last one.
1860 * @start: starting address from the start of the RAMBlock
1861 * @len: length (in bytes) to send
1862 */
1863int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1864{
1865    RAMBlock *ramblock;
1866    RAMState *rs = ram_state;
1867
1868    ram_counters.postcopy_requests++;
1869    RCU_READ_LOCK_GUARD();
1870
1871    if (!rbname) {
1872        /* Reuse last RAMBlock */
1873        ramblock = rs->last_req_rb;
1874
1875        if (!ramblock) {
1876            /*
1877             * Shouldn't happen, we can't reuse the last RAMBlock if
1878             * it's the 1st request.
1879             */
1880            error_report("ram_save_queue_pages no previous block");
1881            return -1;
1882        }
1883    } else {
1884        ramblock = qemu_ram_block_by_name(rbname);
1885
1886        if (!ramblock) {
1887            /* We shouldn't be asked for a non-existent RAMBlock */
1888            error_report("ram_save_queue_pages no block '%s'", rbname);
1889            return -1;
1890        }
1891        rs->last_req_rb = ramblock;
1892    }
1893    trace_ram_save_queue_pages(ramblock->idstr, start, len);
1894    if (!offset_in_ramblock(ramblock, start + len - 1)) {
1895        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1896                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1897                     __func__, start, len, ramblock->used_length);
1898        return -1;
1899    }
1900
1901    struct RAMSrcPageRequest *new_entry =
1902        g_malloc0(sizeof(struct RAMSrcPageRequest));
1903    new_entry->rb = ramblock;
1904    new_entry->offset = start;
1905    new_entry->len = len;
1906
1907    memory_region_ref(ramblock->mr);
1908    qemu_mutex_lock(&rs->src_page_req_mutex);
1909    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1910    migration_make_urgent_request();
1911    qemu_mutex_unlock(&rs->src_page_req_mutex);
1912
1913    return 0;
1914}
1915
1916static bool save_page_use_compression(RAMState *rs)
1917{
1918    if (!migrate_use_compression()) {
1919        return false;
1920    }
1921
1922    /*
1923     * If xbzrle is enabled (e.g., after first round of migration), stop
1924     * using the data compression. In theory, xbzrle can do better than
1925     * compression.
1926     */
1927    if (rs->xbzrle_enabled) {
1928        return false;
1929    }
1930
1931    return true;
1932}
1933
1934/*
1935 * Try to compress the page before posting it out.  Return true if the page
1936 * has been properly handled by compression; otherwise other paths need
1937 * to handle it.
1938 */
1939static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1940{
1941    if (!save_page_use_compression(rs)) {
1942        return false;
1943    }
1944
1945    /*
1946     * When starting the process of a new block, the first page of
1947     * the block should be sent out before other pages in the same
1948     * block, and all the pages in the last block should have been sent
1949     * out already; keeping this order is important, because the 'cont'
1950     * flag is used to avoid resending the block name.
1951     *
1952     * We post the first page as a normal page since compression will
1953     * take much CPU resource.
1954     */
1955    if (block != rs->last_sent_block) {
1956        flush_compressed_data(rs);
1957        return false;
1958    }
1959
1960    if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1961        return true;
1962    }
1963
1964    compression_counters.busy++;
1965    return false;
1966}
1967
1968/**
1969 * ram_save_target_page: save one target page
1970 *
1971 * Returns the number of pages written
1972 *
1973 * @rs: current RAM state
1974 * @pss: data about the page we want to send
1975 * @last_stage: if we are at the completion stage
1976 */
1977static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1978                                bool last_stage)
1979{
1980    RAMBlock *block = pss->block;
1981    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1982    int res;
1983
1984    if (control_save_page(rs, block, offset, &res)) {
1985        return res;
1986    }
1987
1988    if (save_compress_page(rs, block, offset)) {
1989        return 1;
1990    }
1991
1992    res = save_zero_page(rs, block, offset);
1993    if (res > 0) {
1994        /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1995         * page would be stale
1996         */
1997        if (!save_page_use_compression(rs)) {
1998            XBZRLE_cache_lock();
1999            xbzrle_cache_zero_page(rs, block->offset + offset);
2000            XBZRLE_cache_unlock();
2001        }
2002        ram_release_pages(block->idstr, offset, res);
2003        return res;
2004    }
2005
2006    /*
2007     * Do not use multifd:
2008     * 1. with compression, as the first page of a new block should be
2009     *    posted out before sending the compressed pages
2010     * 2. in postcopy, as one whole host page should be placed
2011     */
2012    if (!save_page_use_compression(rs) && migrate_use_multifd()
2013        && !migration_in_postcopy()) {
2014        return ram_save_multifd_page(rs, block, offset);
2015    }
2016
2017    return ram_save_page(rs, pss, last_stage);
2018}
2019
2020/**
2021 * ram_save_host_page: save a whole host page
2022 *
2023 * Starting at *offset send pages up to the end of the current host
2024 * page. It's valid for the initial offset to point into the middle of
2025 * a host page in which case the remainder of the hostpage is sent.
2026 * Only dirty target pages are sent. Note that the host page size may
2027 * be a huge page for this block.
2028 * The saving stops at the boundary of the used_length of the block
2029 * if the RAMBlock isn't a multiple of the host page size.
2030 *
2031 * Returns the number of pages written or negative on error
2032 *
2033 * @rs: current RAM state
2034 * @ms: current migration state
2035 * @pss: data about the page we want to send
2036 * @last_stage: if we are at the completion stage
2037 */
2038static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2039                              bool last_stage)
2040{
2041    int tmppages, pages = 0;
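        /*
         * pagesize_bits: number of target pages per host page of this block;
         * hostpage_boundary: first target page index past the host page
         * that contains pss->page.
         */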
2042    size_t pagesize_bits =
2043        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2044    unsigned long hostpage_boundary =
2045        QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2046    unsigned long start_page = pss->page;
2047    int res;
2048
2049    if (ramblock_is_ignored(pss->block)) {
2050        error_report("block %s should not be migrated !", pss->block->idstr);
2051        return 0;
2052    }
2053
2054    do {
2055        /* Check if the page is dirty and if so send it */
2056        if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2057            tmppages = ram_save_target_page(rs, pss, last_stage);
2058            if (tmppages < 0) {
2059                return tmppages;
2060            }
2061
2062            pages += tmppages;
2063            /*
2064             * Allow rate limiting to happen in the middle of huge pages if
2065             * something is sent in the current iteration.
2066             */
2067            if (pagesize_bits > 1 && tmppages > 0) {
2068                migration_rate_limit();
2069            }
2070        }
2071        pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2072    } while ((pss->page < hostpage_boundary) &&
2073             offset_in_ramblock(pss->block,
2074                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2075    /* The offset we leave with is the min boundary of host page and block */
2076    pss->page = MIN(pss->page, hostpage_boundary) - 1;
2077
2078    res = ram_save_release_protection(rs, pss, start_page);
2079    return (res < 0 ? res : pages);
2080}
2081
2082/**
2083 * ram_find_and_save_block: finds a dirty page and sends it to f
2084 *
2085 * Called within an RCU critical section.
2086 *
2087 * Returns the number of pages written where zero means no dirty pages,
2088 * or negative on error
2089 *
2090 * @rs: current RAM state
2091 * @last_stage: if we are at the completion stage
2092 *
2093 * On systems where host-page-size > target-page-size it will send all the
2094 * pages in a host page that are dirty.
2095 */
2096
2097static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2098{
2099    PageSearchStatus pss;
2100    int pages = 0;
2101    bool again, found;
2102
2103    /* No dirty page as there is zero RAM */
2104    if (!ram_bytes_total()) {
2105        return pages;
2106    }
2107
2108    pss.block = rs->last_seen_block;
2109    pss.page = rs->last_page;
2110    pss.complete_round = false;
2111
2112    if (!pss.block) {
2113        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2114    }
2115
2116    do {
2117        again = true;
2118        found = get_queued_page(rs, &pss);
2119
2120        if (!found) {
2121            /* priority queue empty, so just search for something dirty */
2122            found = find_dirty_block(rs, &pss, &again);
2123        }
2124
2125        if (found) {
2126            pages = ram_save_host_page(rs, &pss, last_stage);
2127        }
2128    } while (!pages && again);
2129
2130    rs->last_seen_block = pss.block;
2131    rs->last_page = pss.page;
2132
2133    return pages;
2134}
2135
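    /*
     * acct_update_position: account @size bytes as migrated RAM; zero pages
     * only bump the duplicate counter, other pages bump the normal and
     * transferred counters and advance the QEMUFile position.
     */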
2136void acct_update_position(QEMUFile *f, size_t size, bool zero)
2137{
2138    uint64_t pages = size / TARGET_PAGE_SIZE;
2139
2140    if (zero) {
2141        ram_counters.duplicate += pages;
2142    } else {
2143        ram_counters.normal += pages;
2144        ram_counters.transferred += size;
2145        qemu_update_position(f, size);
2146    }
2147}
2148
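    /*
     * ram_bytes_total_common: sum of used_length over RAM blocks; when
     * @count_ignored is set, blocks otherwise ignored for migration are
     * counted as well.
     */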
2149static uint64_t ram_bytes_total_common(bool count_ignored)
2150{
2151    RAMBlock *block;
2152    uint64_t total = 0;
2153
2154    RCU_READ_LOCK_GUARD();
2155
2156    if (count_ignored) {
2157        RAMBLOCK_FOREACH_MIGRATABLE(block) {
2158            total += block->used_length;
2159        }
2160    } else {
2161        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2162            total += block->used_length;
2163        }
2164    }
2165    return total;
2166}
2167
2168uint64_t ram_bytes_total(void)
2169{
2170    return ram_bytes_total_common(false);
2171}
2172
2173static void xbzrle_load_setup(void)
2174{
2175    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2176}
2177
2178static void xbzrle_load_cleanup(void)
2179{
2180    g_free(XBZRLE.decoded_buf);
2181    XBZRLE.decoded_buf = NULL;
2182}
2183
2184static void ram_state_cleanup(RAMState **rsp)
2185{
2186    if (*rsp) {
2187        migration_page_queue_free(*rsp);
2188        qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2189        qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2190        g_free(*rsp);
2191        *rsp = NULL;
2192    }
2193}
2194
2195static void xbzrle_cleanup(void)
2196{
2197    XBZRLE_cache_lock();
2198    if (XBZRLE.cache) {
2199        cache_fini(XBZRLE.cache);
2200        g_free(XBZRLE.encoded_buf);
2201        g_free(XBZRLE.current_buf);
2202        g_free(XBZRLE.zero_target_page);
2203        XBZRLE.cache = NULL;
2204        XBZRLE.encoded_buf = NULL;
2205        XBZRLE.current_buf = NULL;
2206        XBZRLE.zero_target_page = NULL;
2207    }
2208    XBZRLE_cache_unlock();
2209}
2210
2211static void ram_save_cleanup(void *opaque)
2212{
2213    RAMState **rsp = opaque;
2214    RAMBlock *block;
2215
2216    /* We don't use dirty log with background snapshots */
2217    if (!migrate_background_snapshot()) {
2218        /* The caller holds the iothread lock or is in a bottom half, so
2219         * there is no write race against the migration bitmap.
2220         */
2221        memory_global_dirty_log_stop();
2222    }
2223
2224    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2225        g_free(block->clear_bmap);
2226        block->clear_bmap = NULL;
2227        g_free(block->bmap);
2228        block->bmap = NULL;
2229    }
2230
2231    xbzrle_cleanup();
2232    compress_threads_save_cleanup();
2233    ram_state_cleanup(rsp);
2234}
2235
2236static void ram_state_reset(RAMState *rs)
2237{
2238    rs->last_seen_block = NULL;
2239    rs->last_sent_block = NULL;
2240    rs->last_page = 0;
2241    rs->last_version = ram_list.version;
2242    rs->xbzrle_enabled = false;
2243}
2244
2245#define MAX_WAIT 50 /* ms, half buffered_file limit */
2246
2247/*
2248 * 'todump' is the bitmap to dump; 'expected' is the value you expect the
2249 * bitmap to be mostly full of - lines that consist entirely of this value
2250 * are not printed.
2251 */
2252void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2253                           unsigned long pages)
2254{
2255    int64_t cur;
2256    int64_t linelen = 128;
2257    char linebuf[129];
2258
2259    for (cur = 0; cur < pages; cur += linelen) {
2260        int64_t curb;
2261        bool found = false;
2262        /*
2263         * Last line; catch the case where the line length
2264         * is longer than remaining ram
2265         */
2266        if (cur + linelen > pages) {
2267            linelen = pages - cur;
2268        }
2269        for (curb = 0; curb < linelen; curb++) {
2270            bool thisbit = test_bit(cur + curb, todump);
2271            linebuf[curb] = thisbit ? '1' : '.';
2272            found = found || (thisbit != expected);
2273        }
2274        if (found) {
2275            linebuf[curb] = '\0';
2276            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
2277        }
2278    }
2279}
2280
2281/* **** functions for postcopy ***** */
2282
2283void ram_postcopy_migrated_memory_release(MigrationState *ms)
2284{
2285    struct RAMBlock *block;
2286
2287    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2288        unsigned long *bitmap = block->bmap;
2289        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2290        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2291
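            /*
             * Clear bits mark pages that have already been migrated;
             * discard those ranges from the local RAMBlock.
             */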
2292        while (run_start < range) {
2293            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2294            ram_discard_range(block->idstr,
2295                              ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2296                              ((ram_addr_t)(run_end - run_start))
2297                                << TARGET_PAGE_BITS);
2298            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2299        }
2300    }
2301}
2302
2303/**
2304 * postcopy_send_discard_bm_ram: discard a RAMBlock
2305 *
2306 * Returns zero on success
2307 *
2308 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2309 *
2310 * @ms: current migration state
2311 * @block: RAMBlock to discard
2312 */
2313static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2314{
2315    unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2316    unsigned long current;
2317    unsigned long *bitmap = block->bmap;
2318
2319    for (current = 0; current < end; ) {
2320        unsigned long one = find_next_bit(bitmap, end, current);
2321        unsigned long zero, discard_length;
2322
2323        if (one >= end) {
2324            break;
2325        }
2326
2327        zero = find_next_zero_bit(bitmap, end, one + 1);
2328
2329        if (zero >= end) {
2330            discard_length = end - one;
2331        } else {
2332            discard_length = zero - one;
2333        }
2334        postcopy_discard_send_range(ms, one, discard_length);
2335        current = one + discard_length;
2336    }
2337
2338    return 0;
2339}
2340
2341/**
2342 * postcopy_each_ram_send_discard: discard all RAMBlocks
2343 *
2344 * Returns 0 for success or negative for error
2345 *
2346 * Utility for the outgoing postcopy code.
2347 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2348 *   passing it bitmap indexes and name.
2349 * (qemu_ram_foreach_block ends up passing unscaled lengths, which would
2350 *  mean the postcopy code would have to deal with the target page size)
2351 *
2352 * @ms: current migration state
2353 */
2354static int postcopy_each_ram_send_discard(MigrationState *ms)
2355{
2356    struct RAMBlock *block;
2357    int ret;
2358
2359    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2360        postcopy_discard_send_init(ms, block->idstr);
2361
2362        /*
2363         * Postcopy sends chunks of bitmap over the wire, but it
2364         * just needs indexes at this point, avoids it having
2365         * target page specific code.
2366         */
2367        ret = postcopy_send_discard_bm_ram(ms, block);
2368        postcopy_discard_send_finish(ms);
2369        if (ret) {
2370            return ret;
2371        }
2372    }
2373
2374    return 0;
2375}
2376
2377/**
2378 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2379 *
2380 * Helper for postcopy_chunk_hostpages; it's called twice to
2381 * canonicalize the two bitmaps, that are similar, but one is
2382 * inverted.
2383 *
2384 * Postcopy requires that all target pages in a hostpage are dirty or
2385 * clean, not a mix.  This function canonicalizes the bitmaps.
2386 *
2387 * @ms: current migration state
2388 * @block: block that contains the page we want to canonicalize
2389 */
2390static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2391{
2392    RAMState *rs = ram_state;
2393    unsigned long *bitmap = block->bmap;
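        /* host_ratio: number of target pages per host page of this block */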
2394    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2395    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2396    unsigned long run_start;
2397
2398    if (block->page_size == TARGET_PAGE_SIZE) {
2399        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2400        return;
2401    }
2402
2403    /* Find a dirty page */
2404    run_start = find_next_bit(bitmap, pages, 0);
2405
2406    while (run_start < pages) {
2407
2408        /*
2409         * If this run of dirty pages starts or ends in the middle of a
2410         * host page, then we need to fix up that host page.
2411         */
2412        if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2413            /* Find the end of this run */
2414            run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2415            /*
2416             * If the end isn't at the start of a host page, then the
2417             * run doesn't finish at the end of a host page
2418             * and we need to fix it up below.
2419             */
2420        }
2421
2422        if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2423            unsigned long page;
2424            unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2425                                                             host_ratio);
2426            run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2427
2428            /* Clean up the bitmap */
2429            for (page = fixup_start_addr;
2430                 page < fixup_start_addr + host_ratio; page++) {
2431                /*
2432                 * Remark them as dirty, updating the count for any pages
2433                 * that weren't previously dirty.
2434                 */
2435                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2436            }
2437        }
2438
2439        /* Find the next dirty page for the next iteration */
2440        run_start = find_next_bit(bitmap, pages, run_start);
2441    }
2442}
2443
2444/**
2445 * postcopy_chunk_hostpages: discard any partially sent host page
2446 *
2447 * Utility for the outgoing postcopy code.
2448 *
2449 * Discard any partially sent host-page size chunks, mark any partially
2450 * dirty host-page size chunks as all dirty.  In this case the host-page
2451 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2452 *
2453 * Returns zero on success
2454 *
2455 * @ms: current migration state
2456 * @block: block we want to work with
2457 */
2458static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2459{
2460    postcopy_discard_send_init(ms, block->idstr);
2461
2462    /*
2463     * Ensure that all partially dirty host pages are made fully dirty.
2464     */
2465    postcopy_chunk_hostpages_pass(ms, block);
2466
2467    postcopy_discard_send_finish(ms);
2468    return 0;
2469}
2470
2471/**
2472 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2473 *
2474 * Returns zero on success
2475 *
2476 * Transmit the set of pages to be discarded after precopy to the target;
2477 * these are pages that:
2478 *     a) have been previously transmitted but are now dirty again
2479 *     b) have never been transmitted; this ensures that any pages on the
2480 *        destination that have been mapped by background tasks get
2481 *        discarded (transparent huge pages are the specific concern)
2482 * Hopefully this is pretty sparse.
2483 *
2484 * @ms: current migration state
2485 */
2486int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2487{
2488    RAMState *rs = ram_state;
2489    RAMBlock *block;
2490    int ret;
2491
2492    RCU_READ_LOCK_GUARD();
2493
2494    /* This should be our last sync, the src is now paused */
2495    migration_bitmap_sync(rs);
2496
2497    /* Easiest way to make sure we don't resume in the middle of a host-page */
2498    rs->last_seen_block = NULL;
2499    rs->last_sent_block = NULL;
2500    rs->last_page = 0;
2501
2502    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2503        /* Deal with TPS != HPS and huge pages */
2504        ret = postcopy_chunk_hostpages(ms, block);
2505        if (ret) {
2506            return ret;
2507        }
2508
2509#ifdef DEBUG_POSTCOPY
2510        ram_debug_dump_bitmap(block->bmap, true,
2511                              block->used_length >> TARGET_PAGE_BITS);
2512#endif
2513    }
2514    trace_ram_postcopy_send_discard_bitmap();
2515
2516    return postcopy_each_ram_send_discard(ms);
2517}
2518
2519/**
2520 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2521 *
2522 * Returns zero on success
2523 *
2524 * @rbname: name of the RAMBlock of the request. NULL means the
2525 *          same as the last one.
2526 * @start: byte offset within the RAMBlock
2527 * @length: length in bytes to discard
2528 */
2529int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2530{
2531    trace_ram_discard_range(rbname, start, length);
2532
2533    RCU_READ_LOCK_GUARD();
2534    RAMBlock *rb = qemu_ram_block_by_name(rbname);
2535
2536    if (!rb) {
2537        error_report("ram_discard_range: Failed to find block '%s'", rbname);
2538        return -1;
2539    }
2540
2541    /*
2542     * On source VM, we don't need to update the received bitmap since
2543     * we don't even have one.
2544     */
2545    if (rb->receivedmap) {
2546        bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2547                     length >> qemu_target_page_bits());
2548    }
2549
2550    return ram_block_discard_range(rb, start, length);
2551}
2552
2553/*
2554 * For every allocation we use the g_try_* variants so that an
2555 * allocation failure does not crash the VM.
2556 */
2557static int xbzrle_init(void)
2558{
2559    Error *local_err = NULL;
2560
2561    if (!migrate_use_xbzrle()) {
2562        return 0;
2563    }
2564
2565    XBZRLE_cache_lock();
2566
2567    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2568    if (!XBZRLE.zero_target_page) {
2569        error_report("%s: Error allocating zero page", __func__);
2570        goto err_out;
2571    }
2572
2573    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2574                              TARGET_PAGE_SIZE, &local_err);
2575    if (!XBZRLE.cache) {
2576        error_report_err(local_err);
2577        goto free_zero_page;
2578    }
2579
2580    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2581    if (!XBZRLE.encoded_buf) {
2582        error_report("%s: Error allocating encoded_buf", __func__);
2583        goto free_cache;
2584    }
2585
2586    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2587    if (!XBZRLE.current_buf) {
2588        error_report("%s: Error allocating current_buf", __func__);
2589        goto free_encoded_buf;
2590    }
2591
2592    /* We are all good */
2593    XBZRLE_cache_unlock();
2594    return 0;
2595
2596free_encoded_buf:
2597    g_free(XBZRLE.encoded_buf);
2598    XBZRLE.encoded_buf = NULL;
2599free_cache:
2600    cache_fini(XBZRLE.cache);
2601    XBZRLE.cache = NULL;
2602free_zero_page:
2603    g_free(XBZRLE.zero_target_page);
2604    XBZRLE.zero_target_page = NULL;
2605err_out:
2606    XBZRLE_cache_unlock();
2607    return -ENOMEM;
2608}
2609
2610static int ram_state_init(RAMState **rsp)
2611{
2612    *rsp = g_try_new0(RAMState, 1);
2613
2614    if (!*rsp) {
2615        error_report("%s: Init ramstate fail", __func__);
2616        return -1;
2617    }
2618
2619    qemu_mutex_init(&(*rsp)->bitmap_mutex);
2620    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2621    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2622
2623    /*
2624     * Count the total number of pages used by ram blocks not including any
2625     * gaps due to alignment or unplugs.
2626     * This must match the initial values of the dirty bitmap.
2627     */
2628    (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2629    ram_state_reset(*rsp);
2630
2631    return 0;
2632}
2633
2634static void ram_list_init_bitmaps(void)
2635{
2636    MigrationState *ms = migrate_get_current();
2637    RAMBlock *block;
2638    unsigned long pages;
2639    uint8_t shift;
2640
2641    /* Skip setting bitmap if there is no RAM */
2642    if (ram_bytes_total()) {
2643        shift = ms->clear_bitmap_shift;
2644        if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2645            error_report("clear_bitmap_shift (%u) too big, using "
2646                         "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2647            shift = CLEAR_BITMAP_SHIFT_MAX;
2648        } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2649            error_report("clear_bitmap_shift (%u) too small, using "
2650                         "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2651            shift = CLEAR_BITMAP_SHIFT_MIN;
2652        }
2653
2654        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2655            pages = block->max_length >> TARGET_PAGE_BITS;
2656            /*
2657             * The initial dirty bitmap for migration must be set with all
2658             * ones to make sure we'll migrate every guest RAM page to the
2659             * destination.
2660             * Here we set RAMBlock.bmap all to 1 because when restarting
2661             * migration after a failed one, ram_list.
2662             * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't cover the whole
2663             * guest memory.
2664             */
2665            block->bmap = bitmap_new(pages);
2666            bitmap_set(block->bmap, 0, pages);
2667            block->clear_bmap_shift = shift;
2668            block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2669        }
2670    }
2671}
2672
2673static void ram_init_bitmaps(RAMState *rs)
2674{
2675    /* For memory_global_dirty_log_start below.  */
2676    qemu_mutex_lock_iothread();
2677    qemu_mutex_lock_ramlist();
2678
2679    WITH_RCU_READ_LOCK_GUARD() {
2680        ram_list_init_bitmaps();
2681        /* We don't use dirty log with background snapshots */
2682        if (!migrate_background_snapshot()) {
2683            memory_global_dirty_log_start();
2684            migration_bitmap_sync_precopy(rs);
2685        }
2686    }
2687    qemu_mutex_unlock_ramlist();
2688    qemu_mutex_unlock_iothread();
2689}
2690
2691static int ram_init_all(RAMState **rsp)
2692{
2693    if (ram_state_init(rsp)) {
2694        return -1;
2695    }
2696
2697    if (xbzrle_init()) {
2698        ram_state_cleanup(rsp);
2699        return -1;
2700    }
2701
2702    ram_init_bitmaps(*rsp);
2703
2704    return 0;
2705}
2706
2707static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2708{
2709    RAMBlock *block;
2710    uint64_t pages = 0;
2711
2712    /*
2713     * Postcopy is not using xbzrle/compression, so no need for that.
2714     * Also, since the source is already halted, we don't need to care
2715     * about dirty page logging either.
2716     */
2717
2718    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2719        pages += bitmap_count_one(block->bmap,
2720                                  block->used_length >> TARGET_PAGE_BITS);
2721    }
2722
2723    /* This may not be aligned with current bitmaps. Recalculate. */
2724    rs->migration_dirty_pages = pages;
2725
2726    ram_state_reset(rs);
2727
2728    /* Update RAMState cache of output QEMUFile */
2729    rs->f = out;
2730
2731    trace_ram_state_resume_prepare(pages);
2732}
2733
2734/*
2735 * This function clears bits of the free pages reported by the caller from the
2736 * migration dirty bitmap. @addr is the host address corresponding to the
2737 * start of the continuous guest free pages, and @len is the total bytes of
2738 * those pages.
2739 */
2740void qemu_guest_free_page_hint(void *addr, size_t len)
2741{
2742    RAMBlock *block;
2743    ram_addr_t offset;
2744    size_t used_len, start, npages;
2745    MigrationState *s = migrate_get_current();
2746
2747    /* This function is currently expected to be used during live migration */
2748    if (!migration_is_setup_or_active(s->state)) {
2749        return;
2750    }
2751
2752    for (; len > 0; len -= used_len, addr += used_len) {
2753        block = qemu_ram_block_from_host(addr, false, &offset);
2754        if (unlikely(!block || offset >= block->used_length)) {
2755            /*
2756             * The implementation might not support RAMBlock resize during
2757             * live migration, but it could happen in theory with future
2758             * updates. So we add a check here to capture that case.
2759             */
2760            error_report_once("%s unexpected error", __func__);
2761            return;
2762        }
2763
2764        if (len <= block->used_length - offset) {
2765            used_len = len;
2766        } else {
2767            used_len = block->used_length - offset;
2768        }
2769
2770        start = offset >> TARGET_PAGE_BITS;
2771        npages = used_len >> TARGET_PAGE_BITS;
2772
2773        qemu_mutex_lock(&ram_state->bitmap_mutex);
2774        /*
2775         * The skipped free pages are equivalent to having been sent, from
2776         * clear_bmap's perspective, so clear the bits from the memory region
2777         * bitmap which are initially set. Otherwise those skipped pages will
2778         * be sent in the next round after syncing from the memory region bitmap.
2779         */
2780        migration_clear_memory_region_dirty_bitmap_range(ram_state, block,
2781                                                         start, npages);
2782        ram_state->migration_dirty_pages -=
2783                      bitmap_count_one_with_offset(block->bmap, start, npages);
2784        bitmap_clear(block->bmap, start, npages);
2785        qemu_mutex_unlock(&ram_state->bitmap_mutex);
2786    }
2787}
2788
2789/*
2790 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
2791 * long-running RCU critical section.  When rcu-reclaims in the code
2792 * start to become numerous it will be necessary to reduce the
2793 * granularity of these critical sections.
2794 */
2795
2796/**
2797 * ram_save_setup: Setup RAM for migration
2798 *
2799 * Returns zero to indicate success and negative for error
2800 *
2801 * @f: QEMUFile where to send the data
2802 * @opaque: RAMState pointer
2803 */
2804static int ram_save_setup(QEMUFile *f, void *opaque)
2805{
2806    RAMState **rsp = opaque;
2807    RAMBlock *block;
2808
2809    if (compress_threads_save_setup()) {
2810        return -1;
2811    }
2812
2813    /* migration has already setup the bitmap, reuse it. */
2814    if (!migration_in_colo_state()) {
2815        if (ram_init_all(rsp) != 0) {
2816            compress_threads_save_cleanup();
2817            return -1;
2818        }
2819    }
2820    (*rsp)->f = f;
2821
2822    WITH_RCU_READ_LOCK_GUARD() {
2823        qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2824
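            /*
             * Per-block header: idstr length and name, used_length, then
             * optionally the page size and the memory region address.
             */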
2825        RAMBLOCK_FOREACH_MIGRATABLE(block) {
2826            qemu_put_byte(f, strlen(block->idstr));
2827            qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2828            qemu_put_be64(f, block->used_length);
2829            if (migrate_postcopy_ram() && block->page_size !=
2830                                          qemu_host_page_size) {
2831                qemu_put_be64(f, block->page_size);
2832            }
2833            if (migrate_ignore_shared()) {
2834                qemu_put_be64(f, block->mr->addr);
2835            }
2836        }
2837    }
2838
2839    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2840    ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2841
2842    multifd_send_sync_main(f);
2843    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2844    qemu_fflush(f);
2845
2846    return 0;
2847}
2848
2849/**
2850 * ram_save_iterate: iterative stage for migration
2851 *
2852 * Returns zero to indicate success and negative for error
2853 *
2854 * @f: QEMUFile where to send the data
2855 * @opaque: RAMState pointer
2856 */
2857static int ram_save_iterate(QEMUFile *f, void *opaque)
2858{
2859    RAMState **temp = opaque;
2860    RAMState *rs = *temp;
2861    int ret = 0;
2862    int i;
2863    int64_t t0;
2864    int done = 0;
2865
2866    if (blk_mig_bulk_active()) {
2867        /* Avoid transferring ram during bulk phase of block migration as
2868         * the bulk phase will usually take a long time and transferring
2869         * ram updates during that time is pointless. */
2870        goto out;
2871    }
2872
2873    /*
2874     * We'll hold this lock a little bit long, but it's okay for two reasons.
2875     * Firstly, the only other thread that could possibly take it is the one
2876     * calling qemu_guest_free_page_hint(), which should be rare; secondly, see
2877     * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
2878     * guarantees that we release it on a regular basis.
2879     */
2880    qemu_mutex_lock(&rs->bitmap_mutex);
2881    WITH_RCU_READ_LOCK_GUARD() {
2882        if (ram_list.version != rs->last_version) {
2883            ram_state_reset(rs);
2884        }
2885
2886        /* Read version before ram_list.blocks */
2887        smp_rmb();
2888
2889        ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2890
2891        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2892        i = 0;
2893        while ((ret = qemu_file_rate_limit(f)) == 0 ||
2894                !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2895            int pages;
2896
2897            if (qemu_file_get_error(f)) {
2898                break;
2899            }
2900
2901            pages = ram_find_and_save_block(rs, false);
2902            /* no more pages to send */
2903            if (pages == 0) {
2904                done = 1;
2905                break;
2906            }
2907
2908            if (pages < 0) {
2909                qemu_file_set_error(f, pages);
2910                break;
2911            }
2912
2913            rs->target_page_count += pages;
2914
2915            /*
2916             * During postcopy, it is necessary to make sure one whole host
2917             * page is sent in one chunk.
2918             */
2919            if (migrate_postcopy_ram()) {
2920                flush_compressed_data(rs);
2921            }
2922
2923            /*
2924             * We want to check on the first iteration, just in case it was
2925             * the first time and we had to sync the dirty bitmap.
2926             * qemu_clock_get_ns() is a bit expensive, so we only check once
2927             * every few iterations.
2928             */
2929            if ((i & 63) == 0) {
2930                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2931                              1000000;
2932                if (t1 > MAX_WAIT) {
2933                    trace_ram_save_iterate_big_wait(t1, i);
2934                    break;
2935                }
2936            }
2937            i++;
2938        }
2939    }
2940    qemu_mutex_unlock(&rs->bitmap_mutex);
2941
2942    /*
2943     * Must occur before EOS (or any QEMUFile operation)
2944     * because of RDMA protocol.
2945     */
2946    ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2947
2948out:
2949    if (ret >= 0
2950        && migration_is_setup_or_active(migrate_get_current()->state)) {
2951        multifd_send_sync_main(rs->f);
2952        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2953        qemu_fflush(f);
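            /* Account for the 8-byte RAM_SAVE_FLAG_EOS marker written above */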
2954        ram_counters.transferred += 8;
2955
2956        ret = qemu_file_get_error(f);
2957    }
2958    if (ret < 0) {
2959        return ret;
2960    }
2961
2962    return done;
2963}
2964
2965/**
2966 * ram_save_complete: function called to send the remaining amount of ram
2967 *
2968 * Returns zero to indicate success or negative on error
2969 *
2970 * Called with iothread lock
2971 *
2972 * @f: QEMUFile where to send the data
2973 * @opaque: RAMState pointer
2974 */
2975static int ram_save_complete(QEMUFile *f, void *opaque)
2976{
2977    RAMState **temp = opaque;
2978    RAMState *rs = *temp;
2979    int ret = 0;
2980
2981    WITH_RCU_READ_LOCK_GUARD() {
2982        if (!migration_in_postcopy()) {
2983            migration_bitmap_sync_precopy(rs);
2984        }
2985
2986        ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2987
2988        /* try transferring iterative blocks of memory */
2989
2990        /* flush all remaining blocks regardless of rate limiting */
2991        while (true) {
2992            int pages;
2993
2994            pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2995            /* no more blocks to send */
2996            if (pages == 0) {
2997                break;
2998            }
2999            if (pages < 0) {
3000                ret = pages;
3001                break;
3002            }
3003        }
3004
3005        flush_compressed_data(rs);
3006        ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3007    }
3008
3009    if (ret >= 0) {
3010        multifd_send_sync_main(rs->f);
3011        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3012        qemu_fflush(f);
3013    }
3014
3015    return ret;
3016}
3017
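    /*
     * ram_save_pending: estimate the remaining dirty RAM in bytes.  Outside
     * of postcopy the dirty bitmap is re-synced once the estimate drops
     * below @max_size; the result is reported as postcopy-compatible when
     * postcopy RAM is enabled and as precopy-only otherwise.
     */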
3018static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3019                             uint64_t *res_precopy_only,
3020                             uint64_t *res_compatible,
3021                             uint64_t *res_postcopy_only)
3022{
3023    RAMState **temp = opaque;
3024    RAMState *rs = *temp;
3025    uint64_t remaining_size;
3026
3027    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3028
3029    if (!migration_in_postcopy() &&
3030        remaining_size < max_size) {
3031        qemu_mutex_lock_iothread();
3032        WITH_RCU_READ_LOCK_GUARD() {
3033            migration_bitmap_sync_precopy(rs);
3034        }
3035        qemu_mutex_unlock_iothread();
3036        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3037    }
3038
3039    if (migrate_postcopy_ram()) {
3040        /* We can do postcopy, and all the data is postcopiable */
3041        *res_compatible += remaining_size;
3042    } else {
3043        *res_precopy_only += remaining_size;
3044    }
3045}
3046
3047static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3048{
3049    unsigned int xh_len;
3050    int xh_flags;
3051    uint8_t *loaded_data;
3052
3053    /* extract RLE header */
3054    xh_flags = qemu_get_byte(f);
3055    xh_len = qemu_get_be16(f);
3056
3057    if (xh_flags != ENCODING_FLAG_XBZRLE) {
3058        error_report("Failed to load XBZRLE page - wrong compression!");
3059        return -1;
3060    }
3061
3062    if (xh_len > TARGET_PAGE_SIZE) {
3063        error_report("Failed to load XBZRLE page - len overflow!");
3064        return -1;
3065    }
3066    loaded_data = XBZRLE.decoded_buf;
3067    /* load data and decode */
3068    /* it can change loaded_data to point to an internal buffer */
3069    qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3070
3071    /* decode RLE */
3072    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3073                             TARGET_PAGE_SIZE) == -1) {
3074        error_report("Failed to load XBZRLE page - decode error!");
3075        return -1;
3076    }
3077
3078    return 0;
3079}
3080
3081/**
3082 * ram_block_from_stream: read a RAMBlock id from the migration stream
3083 *
3084 * Must be called from within a rcu critical section.
3085 *
3086 * Returns a pointer from within the RCU-protected ram_list.
3087 *
3088 * @f: QEMUFile where to read the data from
3089 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3090 */
3091static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3092{
3093    static RAMBlock *block;
3094    char id[256];
3095    uint8_t len;
3096
3097    if (flags & RAM_SAVE_FLAG_CONTINUE) {
3098        if (!block) {
3099            error_report("Ack, bad migration stream!");
3100            return NULL;
3101        }
3102        return block;
3103    }
3104
3105    len = qemu_get_byte(f);
3106    qemu_get_buffer(f, (uint8_t *)id, len);
3107    id[len] = 0;
3108
3109    block = qemu_ram_block_by_name(id);
3110    if (!block) {
3111        error_report("Can't find block %s", id);
3112        return NULL;
3113    }
3114
3115    if (ramblock_is_ignored(block)) {
3116        error_report("block %s should not be migrated !", id);
3117        return NULL;
3118    }
3119
3120    return block;
3121}
3122
3123static inline void *host_from_ram_block_offset(RAMBlock *block,
3124                                               ram_addr_t offset)
3125{
3126    if (!offset_in_ramblock(block, offset)) {
3127        return NULL;
3128    }
3129
3130    return block->host + offset;
3131}
3132
3133static void *host_page_from_ram_block_offset(RAMBlock *block,
3134                                             ram_addr_t offset)
3135{
3136    /* Note: Explicitly no check against offset_in_ramblock(). */
3137    return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3138                                   block->page_size);
3139}
3140
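    /* Offset of the host address (block->host + offset) within its host page */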
3141static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3142                                                         ram_addr_t offset)
3143{
3144    return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3145}
3146
3147static inline void *colo_cache_from_block_offset(RAMBlock *block,
3148                             ram_addr_t offset, bool record_bitmap)
3149{
3150    if (!offset_in_ramblock(block, offset)) {
3151        return NULL;
3152    }
3153    if (!block->colo_cache) {
3154        error_report("%s: colo_cache is NULL in block :%s",
3155                     __func__, block->idstr);
3156        return NULL;
3157    }
3158
3159    /*
3160     * During a COLO checkpoint, we need a bitmap of these migrated pages.
3161     * It helps us decide which pages in the RAM cache should be flushed
3162     * into the VM's RAM later.
3163     */
3164    if (record_bitmap &&
3165        !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3166        ram_state->migration_dirty_pages++;
3167    }
3168    return block->colo_cache + offset;
3169}
3170
3171/**
3172 * ram_handle_compressed: handle the zero page case
3173 *
3174 * If a page (or a whole RDMA chunk) has been
3175 * determined to be zero, then zap it.
3176 *
3177 * @host: host address for the zero page
3178 * @ch: what the page is filled with.  We only support zero
3179 * @size: size of the zero page
3180 */
3181void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3182{
3183    if (ch != 0 || !is_zero_range(host, size)) {
3184        memset(host, ch, size);
3185    }
3186}
3187
3188/* return the size after decompression, or negative value on error */
3189static int
3190qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3191                     const uint8_t *source, size_t source_len)
3192{
3193    int err;
3194
3195    err = inflateReset(stream);
3196    if (err != Z_OK) {
3197        return -1;
3198    }
3199
3200    stream->avail_in = source_len;
3201    stream->next_in = (uint8_t *)source;
3202    stream->avail_out = dest_len;
3203    stream->next_out = dest;
3204
3205    err = inflate(stream, Z_NO_FLUSH);
3206    if (err != Z_STREAM_END) {
3207        return -1;
3208    }
3209
3210    return stream->total_out;
3211}
3212
3213static void *do_data_decompress(void *opaque)
3214{
3215    DecompressParam *param = opaque;
3216    unsigned long pagesize;
3217    uint8_t *des;
3218    int len, ret;
3219
3220    qemu_mutex_lock(&param->mutex);
3221    while (!param->quit) {
3222        if (param->des) {
3223            des = param->des;
3224            len = param->len;
3225            param->des = 0;
3226            qemu_mutex_unlock(&param->mutex);
3227
3228            pagesize = TARGET_PAGE_SIZE;
3229
3230            ret = qemu_uncompress_data(&param->stream, des, pagesize,
3231                                       param->compbuf, len);
3232            if (ret < 0 && migrate_get_current()->decompress_error_check) {
3233                error_report("decompress data failed");
3234                qemu_file_set_error(decomp_file, ret);
3235            }
3236
3237            qemu_mutex_lock(&decomp_done_lock);
3238            param->done = true;
3239            qemu_cond_signal(&decomp_done_cond);
3240            qemu_mutex_unlock(&decomp_done_lock);
3241
3242            qemu_mutex_lock(&param->mutex);
3243        } else {
3244            qemu_cond_wait(&param->cond, &param->mutex);
3245        }
3246    }
3247    qemu_mutex_unlock(&param->mutex);
3248
3249    return NULL;
3250}
3251
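    /*
     * Wait until every decompression thread reports done, then return any
     * error recorded on the decompression QEMUFile.
     */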
3252static int wait_for_decompress_done(void)
3253{
3254    int idx, thread_count;
3255
3256    if (!migrate_use_compression()) {
3257        return 0;
3258    }
3259
3260    thread_count = migrate_decompress_threads();
3261    qemu_mutex_lock(&decomp_done_lock);
3262    for (idx = 0; idx < thread_count; idx++) {
3263        while (!decomp_param[idx].done) {
3264            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3265        }
3266    }
3267    qemu_mutex_unlock(&decomp_done_lock);
3268    return qemu_file_get_error(decomp_file);
3269}
3270
3271static void compress_threads_load_cleanup(void)
3272{
3273    int i, thread_count;
3274
3275    if (!migrate_use_compression()) {
3276        return;
3277    }
3278    thread_count = migrate_decompress_threads();
3279    for (i = 0; i < thread_count; i++) {
3280        /*
3281         * We use it as an indicator of whether the thread has been
3282         * properly initialized or not
3283         */
3284        if (!decomp_param[i].compbuf) {
3285            break;
3286        }
3287
3288        qemu_mutex_lock(&decomp_param[i].mutex);
3289        decomp_param[i].quit = true;
3290        qemu_cond_signal(&decomp_param[i].cond);
3291        qemu_mutex_unlock(&decomp_param[i].mutex);
3292    }
3293    for (i = 0; i < thread_count; i++) {
3294        if (!decomp_param[i].compbuf) {
3295            break;
3296        }
3297
3298        qemu_thread_join(decompress_threads + i);
3299        qemu_mutex_destroy(&decomp_param[i].mutex);
3300        qemu_cond_destroy(&decomp_param[i].cond);
3301        inflateEnd(&decomp_param[i].stream);
3302        g_free(decomp_param[i].compbuf);
3303        decomp_param[i].compbuf = NULL;
3304    }
3305    g_free(decompress_threads);
3306    g_free(decomp_param);
3307    decompress_threads = NULL;
3308    decomp_param = NULL;
3309    decomp_file = NULL;
3310}
3311
3312static int compress_threads_load_setup(QEMUFile *f)
3313{
3314    int i, thread_count;
3315
3316    if (!migrate_use_compression()) {
3317        return 0;
3318    }
3319
3320    thread_count = migrate_decompress_threads();
3321    decompress_threads = g_new0(QemuThread, thread_count);
3322    decomp_param = g_new0(DecompressParam, thread_count);
3323    qemu_mutex_init(&decomp_done_lock);
3324    qemu_cond_init(&decomp_done_cond);
3325    decomp_file = f;
3326    for (i = 0; i < thread_count; i++) {
3327        if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3328            goto exit;
3329        }
3330
3331        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3332        qemu_mutex_init(&decomp_param[i].mutex);
3333        qemu_cond_init(&decomp_param[i].cond);
3334        decomp_param[i].done = true;
3335        decomp_param[i].quit = false;
3336        qemu_thread_create(decompress_threads + i, "decompress",
3337                           do_data_decompress, decomp_param + i,
3338                           QEMU_THREAD_JOINABLE);
3339    }
3340    return 0;
3341exit:
3342    compress_threads_load_cleanup();
3343    return -1;
3344}
3345
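    /*
     * Hand @len compressed bytes destined for @host to an idle decompression
     * thread, waiting on decomp_done_cond until one becomes available.
     */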
3346static void decompress_data_with_multi_threads(QEMUFile *f,
3347                                               void *host, int len)
3348{
3349    int idx, thread_count;
3350
3351    thread_count = migrate_decompress_threads();
3352    QEMU_LOCK_GUARD(&decomp_done_lock);
3353    while (true) {
3354        for (idx = 0; idx < thread_count; idx++) {
3355            if (decomp_param[idx].done) {
3356                decomp_param[idx].done = false;
3357                qemu_mutex_lock(&decomp_param[idx].mutex);
3358                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3359                decomp_param[idx].des = host;
3360                decomp_param[idx].len = len;
3361                qemu_cond_signal(&decomp_param[idx].cond);
3362                qemu_mutex_unlock(&decomp_param[idx].mutex);
3363                break;
3364            }
3365        }
3366        if (idx < thread_count) {
3367            break;
3368        } else {
3369            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3370        }
3371    }
3372}
3373
3374static void colo_init_ram_state(void)
3375{
3376    ram_state_init(&ram_state);
3377}
3378
3379/*
3380 * colo cache: this is for the secondary VM; we cache the whole
3381 * memory of the secondary VM.  The global lock must be held when
3382 * calling this helper.
3383 */
3384int colo_init_ram_cache(void)
3385{
3386    RAMBlock *block;
3387
3388    WITH_RCU_READ_LOCK_GUARD() {
3389        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3390            block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3391                                                    NULL, false, false);
3392            if (!block->colo_cache) {
3393                error_report("%s: Can't alloc memory for COLO cache of block %s,"
3394                             " size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3395                             block->used_length);
3396                RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3397                    if (block->colo_cache) {
3398                        qemu_anon_ram_free(block->colo_cache, block->used_length);
3399                        block->colo_cache = NULL;
3400                    }
3401                }
3402                return -errno;
3403            }
3404        }
3405    }
3406
3407    /*
3408     * Record the dirty pages sent by the PVM; this dirty bitmap is used to
3409     * decide which pages in the cache should be flushed into the SVM's RAM.
3410     * Here we use the same name 'ram_bitmap' as for migration.
3411     */
3412    if (ram_bytes_total()) {
3413        RAMBlock *block;
3414
3415        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3416            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3417            block->bmap = bitmap_new(pages);
3418        }
3419    }
3420
3421    colo_init_ram_state();
3422    return 0;
3423}
3424
3425/* TODO: duplicated with ram_init_bitmaps */
3426void colo_incoming_start_dirty_log(void)
3427{
3428    RAMBlock *block = NULL;
3429    /* For memory_global_dirty_log_start below. */
3430    qemu_mutex_lock_iothread();
3431    qemu_mutex_lock_ramlist();
3432
3433    memory_global_dirty_log_sync();
3434    WITH_RCU_READ_LOCK_GUARD() {
3435        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3436            ramblock_sync_dirty_bitmap(ram_state, block);
3437            /* Discard this dirty bitmap record */
3438            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3439        }
3440        memory_global_dirty_log_start();
3441    }
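    /*
     * The per-block dirty bitmaps were cleared above, so reset the dirty
     * page count as well.
     */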
3442    ram_state->migration_dirty_pages = 0;
3443    qemu_mutex_unlock_ramlist();
3444    qemu_mutex_unlock_iothread();
3445}
3446
3447/* The global lock must be held when calling this helper */
3448void colo_release_ram_cache(void)
3449{
3450    RAMBlock *block;
3451
3452    memory_global_dirty_log_stop();
3453    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3454        g_free(block->bmap);
3455        block->bmap = NULL;
3456    }
3457
3458    WITH_RCU_READ_LOCK_GUARD() {
3459        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3460            if (block->colo_cache) {
3461                qemu_anon_ram_free(block->colo_cache, block->used_length);
3462                block->colo_cache = NULL;
3463            }
3464        }
3465    }
3466    ram_state_cleanup(&ram_state);
3467}
3468
3469/**
3470 * ram_load_setup: Setup RAM for migration incoming side
3471 *
3472 * Returns zero to indicate success and negative for error
3473 *
3474 * @f: QEMUFile where to receive the data
3475 * @opaque: RAMState pointer
3476 */
3477static int ram_load_setup(QEMUFile *f, void *opaque)
3478{
3479    if (compress_threads_load_setup(f)) {
3480        return -1;
3481    }
3482
3483    xbzrle_load_setup();
3484    ramblock_recv_map_init();
3485
3486    return 0;
3487}
3488
3489static int ram_load_cleanup(void *opaque)
3490{
3491    RAMBlock *rb;
3492
3493    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3494        qemu_ram_block_writeback(rb);
3495    }
3496
3497    xbzrle_load_cleanup();
3498    compress_threads_load_cleanup();
3499
3500    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3501        g_free(rb->receivedmap);
3502        rb->receivedmap = NULL;
3503    }
3504
3505    return 0;
3506}
3507
3508/**
3509 * ram_postcopy_incoming_init: allocate postcopy data structures
3510 *
3511 * Returns 0 for success and negative if there was one error
3512 *
3513 * @mis: current migration incoming state
3514 *
3515 * Allocate data structures etc needed by incoming migration with
3516 * postcopy-ram. postcopy-ram's similarly named
3517 * postcopy_ram_incoming_init does the work.
3518 */
3519int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3520{
3521    return postcopy_ram_incoming_init(mis);
3522}
3523
3524/**
3525 * ram_load_postcopy: load a page in postcopy case
3526 *
3527 * Returns 0 for success or -errno in case of error
3528 *
3529 * Called in postcopy mode by ram_load().
3530 * rcu_read_lock is taken prior to this being called.
3531 *
3532 * @f: QEMUFile to receive the data from
3533 */
3534static int ram_load_postcopy(QEMUFile *f)
3535{
3536    int flags = 0, ret = 0;
3537    bool place_needed = false;
3538    bool matches_target_page_size = false;
3539    MigrationIncomingState *mis = migration_incoming_get_current();
3540    /* Temporary page that is later 'placed' */
3541    void *postcopy_host_page = mis->postcopy_tmp_page;
3542    void *host_page = NULL;
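    /*
     * all_zero and target_pages track state across the target pages that
     * make up a single host page; the whole host page is placed at once.
     */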
3543    bool all_zero = true;
3544    int target_pages = 0;
3545
3546    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3547        ram_addr_t addr;
3548        void *page_buffer = NULL;
3549        void *place_source = NULL;
3550        RAMBlock *block = NULL;
3551        uint8_t ch;
3552        int len;
3553
3554        addr = qemu_get_be64(f);
3555
3556        /*
3557         * If there is a qemu file error, we should stop here; "addr"
3558         * may be invalid.
3559         */
3560        ret = qemu_file_get_error(f);
3561        if (ret) {
3562            break;
3563        }
3564
3565        flags = addr & ~TARGET_PAGE_MASK;
3566        addr &= TARGET_PAGE_MASK;
3567
3568        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3569        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3570                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3571            block = ram_block_from_stream(f, flags);
3572            if (!block) {
3573                ret = -EINVAL;
3574                break;
3575            }
3576
3577            /*
3578             * Relying on used_length is racy and can result in false positives.
3579             * We might place pages beyond used_length in case RAM was shrunk
3580             * while in postcopy, which is fine - trying to place via
3581             * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3582             */
3583            if (!block->host || addr >= block->postcopy_length) {
3584                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3585                ret = -EINVAL;
3586                break;
3587            }
3588            target_pages++;
3589            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3590            /*
3591             * Postcopy requires that we place whole host pages atomically;
3592             * these may be huge pages for RAMBlocks that are backed by
3593             * hugetlbfs.
3594             * To make it atomic, the data is read into a temporary page
3595             * that's moved into place later.
3596             * The migration protocol uses, possibly smaller, target pages;
3597             * however, the source ensures it always sends all the components
3598             * of a host page in one chunk.
3599             */
3600            page_buffer = postcopy_host_page +
3601                          host_page_offset_from_ram_block_offset(block, addr);
3602            /* If all target pages are zero then we can optimise the placement */
3603            if (target_pages == 1) {
3604                host_page = host_page_from_ram_block_offset(block, addr);
3605            } else if (host_page != host_page_from_ram_block_offset(block,
3606                                                                    addr)) {
3607                /* not the first target page within the host page */
3608                error_report("Non-same host page %p/%p", host_page,
3609                             host_page_from_ram_block_offset(block, addr));
3610                ret = -EINVAL;
3611                break;
3612            }
3613
3614            /*
3615             * If it's the last part of a host page then we place the host
3616             * page
3617             */
3618            if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3619                place_needed = true;
3620            }
3621            place_source = postcopy_host_page;
3622        }
3623
3624        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3625        case RAM_SAVE_FLAG_ZERO:
3626            ch = qemu_get_byte(f);
3627            /*
3628             * We can skip setting page_buffer when this is a zero page
3629             * and (block->page_size == TARGET_PAGE_SIZE).
3630             */
3631            if (ch || !matches_target_page_size) {
3632                memset(page_buffer, ch, TARGET_PAGE_SIZE);
3633            }
3634            if (ch) {
3635                all_zero = false;
3636            }
3637            break;
3638
3639        case RAM_SAVE_FLAG_PAGE:
3640            all_zero = false;
3641            if (!matches_target_page_size) {
3642                /* For huge pages, we always use temporary buffer */
3643                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3644            } else {
3645                /*
3646                 * For small pages that match the target page size, we
3647                 * avoid the qemu_file copy.  Instead we directly use
3648                 * the buffer of QEMUFile to place the page.  Note: we
3649                 * cannot do any QEMUFile operation before using that
3650                 * buffer to make sure the buffer is valid when
3651                 * placing the page.
3652                 */
3653                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3654                                         TARGET_PAGE_SIZE);
3655            }
3656            break;
3657        case RAM_SAVE_FLAG_COMPRESS_PAGE:
3658            all_zero = false;
3659            len = qemu_get_be32(f);
3660            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3661                error_report("Invalid compressed data length: %d", len);
3662                ret = -EINVAL;
3663                break;
3664            }
3665            decompress_data_with_multi_threads(f, page_buffer, len);
3666            break;
3667
3668        case RAM_SAVE_FLAG_EOS:
3669            /* normal exit */
3670            multifd_recv_sync_main();
3671            break;
3672        default:
3673            error_report("Unknown combination of migration flags: 0x%x"
3674                         " (postcopy mode)", flags);
3675            ret = -EINVAL;
3676            break;
3677        }
3678
3679        /* Got the whole host page, wait for decompress before placing. */
3680        if (place_needed) {
3681            ret |= wait_for_decompress_done();
3682        }
3683
3684        /* Detect any possible file errors */
3685        if (!ret && qemu_file_get_error(f)) {
3686            ret = qemu_file_get_error(f);
3687        }
3688
3689        if (!ret && place_needed) {
3690            if (all_zero) {
3691                ret = postcopy_place_page_zero(mis, host_page, block);
3692            } else {
3693                ret = postcopy_place_page(mis, host_page, place_source,
3694                                          block);
3695            }
3696            place_needed = false;
3697            target_pages = 0;
3698            /* Assume we have a zero page until we detect something different */
3699            all_zero = true;
3700        }
3701    }
3702
3703    return ret;
3704}
3705
3706static bool postcopy_is_advised(void)
3707{
3708    PostcopyState ps = postcopy_state_get();
3709    return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3710}
3711
3712static bool postcopy_is_running(void)
3713{
3714    PostcopyState ps = postcopy_state_get();
3715    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3716}
3717
3718/*
3719 * Flush the content of the RAM cache into the SVM's memory.
3720 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3721 */
3722void colo_flush_ram_cache(void)
3723{
3724    RAMBlock *block = NULL;
3725    void *dst_host;
3726    void *src_host;
3727    unsigned long offset = 0;
3728
3729    memory_global_dirty_log_sync();
3730    qemu_mutex_lock(&ram_state->bitmap_mutex);
3731    WITH_RCU_READ_LOCK_GUARD() {
3732        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3733            ramblock_sync_dirty_bitmap(ram_state, block);
3734        }
3735    }
3736
3737    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3738    WITH_RCU_READ_LOCK_GUARD() {
3739        block = QLIST_FIRST_RCU(&ram_list.blocks);
3740
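        /*
         * Walk every block's dirty bitmap; each dirty page is copied from
         * the colo_cache back into the SVM's memory.
         */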
3741        while (block) {
3742            offset = migration_bitmap_find_dirty(ram_state, block, offset);
3743
3744            if (!offset_in_ramblock(block,
3745                                    ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3746                offset = 0;
3747                block = QLIST_NEXT_RCU(block, next);
3748            } else {
3749                migration_bitmap_clear_dirty(ram_state, block, offset);
3750                dst_host = block->host
3751                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3752                src_host = block->colo_cache
3753                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3754                memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3755            }
3756        }
3757    }
3758    trace_colo_flush_ram_cache_end();
3759    qemu_mutex_unlock(&ram_state->bitmap_mutex);
3760}
3761
3762/**
3763 * ram_load_precopy: load pages in precopy case
3764 *
3765 * Returns 0 for success or -errno in case of error
3766 *
3767 * Called in precopy mode by ram_load().
3768 * rcu_read_lock is taken prior to this being called.
3769 *
3770 * @f: QEMUFile to receive the data from
3771 */
3772static int ram_load_precopy(QEMUFile *f)
3773{
3774    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3775    /* ADVISE is earlier; it shows that the source has the postcopy capability enabled */
3776    bool postcopy_advised = postcopy_is_advised();
3777    if (!migrate_use_compression()) {
3778        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3779    }
3780
3781    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3782        ram_addr_t addr, total_ram_bytes;
3783        void *host = NULL, *host_bak = NULL;
3784        uint8_t ch;
3785
3786        /*
3787         * Yield periodically to let the main loop run, but an iteration
3788         * of the main loop is expensive, so only do it every so often.
3789         */
3790        if ((i & 32767) == 0 && qemu_in_coroutine()) {
3791            aio_co_schedule(qemu_get_current_aio_context(),
3792                            qemu_coroutine_self());
3793            qemu_coroutine_yield();
3794        }
3795        i++;
3796
3797        addr = qemu_get_be64(f);
3798        flags = addr & ~TARGET_PAGE_MASK;
3799        addr &= TARGET_PAGE_MASK;
3800
3801        if (flags & invalid_flags) {
3802            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3803                error_report("Received an unexpected compressed page");
3804            }
3805
3806            ret = -EINVAL;
3807            break;
3808        }
3809
3810        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3811                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3812            RAMBlock *block = ram_block_from_stream(f, flags);
3813
3814            host = host_from_ram_block_offset(block, addr);
3815            /*
3816             * After entering the COLO stage, we should not load pages into
3817             * the SVM's memory directly; we put them into colo_cache first.
3818             * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3819             * Previously, we copied all of this memory in the COLO preparing
3820             * stage, which required stopping the VM and was time-consuming.
3821             * Here we optimize it by backing up every page during the
3822             * migration process while COLO is enabled.  Although this slows
3823             * the migration down slightly, it clearly reduces the downtime of
3824             * backing up all of the SVM's memory in the COLO preparing stage.
3825             */
3826            if (migration_incoming_colo_enabled()) {
3827                if (migration_incoming_in_colo_state()) {
3828                    /* In COLO stage, put all pages into cache temporarily */
3829                    host = colo_cache_from_block_offset(block, addr, true);
3830                } else {
3831                    /*
3832                     * In the migration stage but before the COLO stage,
3833                     * put all pages into both the cache and the SVM's memory.
3834                     */
3835                    host_bak = colo_cache_from_block_offset(block, addr, false);
3836                }
3837            }
3838            if (!host) {
3839                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3840                ret = -EINVAL;
3841                break;
3842            }
3843            if (!migration_incoming_in_colo_state()) {
3844                ramblock_recv_bitmap_set(block, host);
3845            }
3846
3847            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3848        }
3849
3850        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3851        case RAM_SAVE_FLAG_MEM_SIZE:
3852            /* Synchronize RAM block list */
3853            total_ram_bytes = addr;
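            /*
             * Each block record consists of a one-byte id length, the id
             * string and a be64 used length; depending on the negotiated
             * capabilities it may be followed by the remote page size
             * and/or the block's GPA.
             */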
3854            while (!ret && total_ram_bytes) {
3855                RAMBlock *block;
3856                char id[256];
3857                ram_addr_t length;
3858
3859                len = qemu_get_byte(f);
3860                qemu_get_buffer(f, (uint8_t *)id, len);
3861                id[len] = 0;
3862                length = qemu_get_be64(f);
3863
3864                block = qemu_ram_block_by_name(id);
3865                if (block && !qemu_ram_is_migratable(block)) {
3866                    error_report("block %s should not be migrated!", id);
3867                    ret = -EINVAL;
3868                } else if (block) {
3869                    if (length != block->used_length) {
3870                        Error *local_err = NULL;
3871
3872                        ret = qemu_ram_resize(block, length,
3873                                              &local_err);
3874                        if (local_err) {
3875                            error_report_err(local_err);
3876                        }
3877                    }
3878                    /* For postcopy we need to check hugepage sizes match */
3879                    if (postcopy_advised && migrate_postcopy_ram() &&
3880                        block->page_size != qemu_host_page_size) {
3881                        uint64_t remote_page_size = qemu_get_be64(f);
3882                        if (remote_page_size != block->page_size) {
3883                            error_report("Mismatched RAM page size %s "
3884                                         "(local) %zd != %" PRId64,
3885                                         id, block->page_size,
3886                                         remote_page_size);
3887                            ret = -EINVAL;
3888                        }
3889                    }
3890                    if (migrate_ignore_shared()) {
3891                        hwaddr addr = qemu_get_be64(f);
3892                        if (ramblock_is_ignored(block) &&
3893                            block->mr->addr != addr) {
3894                            error_report("Mismatched GPAs for block %s "
3895                                         "%" PRId64 " != %" PRId64,
3896                                         id, (uint64_t)addr,
3897                                         (uint64_t)block->mr->addr);
3898                            ret = -EINVAL;
3899                        }
3900                    }
3901                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3902                                          block->idstr);
3903                } else {
3904                    error_report("Unknown ramblock \"%s\", cannot "
3905                                 "accept migration", id);
3906                    ret = -EINVAL;
3907                }
3908
3909                total_ram_bytes -= length;
3910            }
3911            break;
3912
3913        case RAM_SAVE_FLAG_ZERO:
3914            ch = qemu_get_byte(f);
3915            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3916            break;
3917
3918        case RAM_SAVE_FLAG_PAGE:
3919            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3920            break;
3921
3922        case RAM_SAVE_FLAG_COMPRESS_PAGE:
3923            len = qemu_get_be32(f);
3924            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3925                error_report("Invalid compressed data length: %d", len);
3926                ret = -EINVAL;
3927                break;
3928            }
3929            decompress_data_with_multi_threads(f, host, len);
3930            break;
3931
3932        case RAM_SAVE_FLAG_XBZRLE:
3933            if (load_xbzrle(f, addr, host) < 0) {
3934                error_report("Failed to decompress XBZRLE page at "
3935                             RAM_ADDR_FMT, addr);
3936                ret = -EINVAL;
3937                break;
3938            }
3939            break;
3940        case RAM_SAVE_FLAG_EOS:
3941            /* normal exit */
3942            multifd_recv_sync_main();
3943            break;
3944        default:
3945            if (flags & RAM_SAVE_FLAG_HOOK) {
3946                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3947            } else {
3948                error_report("Unknown combination of migration flags: 0x%x",
3949                             flags);
3950                ret = -EINVAL;
3951            }
3952        }
3953        if (!ret) {
3954            ret = qemu_file_get_error(f);
3955        }
3956        if (!ret && host_bak) {
3957            memcpy(host_bak, host, TARGET_PAGE_SIZE);
3958        }
3959    }
3960
3961    ret |= wait_for_decompress_done();
3962    return ret;
3963}
3964
3965static int ram_load(QEMUFile *f, void *opaque, int version_id)
3966{
3967    int ret = 0;
3968    static uint64_t seq_iter;
3969    /*
3970     * If the system is running in postcopy mode, page inserts into host
3971     * memory must be atomic.
3972     */
3973    bool postcopy_running = postcopy_is_running();
3974
3975    seq_iter++;
3976
3977    if (version_id != 4) {
3978        return -EINVAL;
3979    }
3980
3981    /*
3982     * This RCU critical section can be very long running.
3983     * When RCU reclaims in the code start to become numerous,
3984     * it will be necessary to reduce the granularity of this
3985     * critical section.
3986     */
3987    WITH_RCU_READ_LOCK_GUARD() {
3988        if (postcopy_running) {
3989            ret = ram_load_postcopy(f);
3990        } else {
3991            ret = ram_load_precopy(f);
3992        }
3993    }
3994    trace_ram_load_complete(ret, seq_iter);
3995
3996    return ret;
3997}
3998
3999static bool ram_has_postcopy(void *opaque)
4000{
4001    RAMBlock *rb;
4002    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4003        if (ramblock_is_pmem(rb)) {
4004            info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4005                        "is not supported now!", rb->idstr, rb->host);
4006            return false;
4007        }
4008    }
4009
4010    return migrate_postcopy_ram();
4011}
4012
4013/* Sync all the dirty bitmaps with the destination VM.  */
4014static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4015{
4016    RAMBlock *block;
4017    QEMUFile *file = s->to_dst_file;
4018    int ramblock_count = 0;
4019
4020    trace_ram_dirty_bitmap_sync_start();
4021
4022    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4023        qemu_savevm_send_recv_bitmap(file, block->idstr);
4024        trace_ram_dirty_bitmap_request(block->idstr);
4025        ramblock_count++;
4026    }
4027
4028    trace_ram_dirty_bitmap_sync_wait();
4029
4030    /* Wait until all the ramblocks' dirty bitmaps are synced */
4031    while (ramblock_count--) {
4032        qemu_sem_wait(&s->rp_state.rp_sem);
4033    }
4034
4035    trace_ram_dirty_bitmap_sync_complete();
4036
4037    return 0;
4038}
4039
4040static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4041{
4042    qemu_sem_post(&s->rp_state.rp_sem);
4043}
4044
4045/*
4046 * Read the received bitmap and invert it to form the initial dirty bitmap.
4047 * This is only used when a postcopy migration is paused and wants to
4048 * resume from a middle point.
4049 */
4050int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4051{
4052    int ret = -EINVAL;
4053    /* from_dst_file is always valid because we're within rp_thread */
4054    QEMUFile *file = s->rp_state.from_dst_file;
4055    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4056    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4057    uint64_t size, end_mark;
4058
4059    trace_ram_dirty_bitmap_reload_begin(block->idstr);
4060
4061    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4062        error_report("%s: incorrect state %s", __func__,
4063                     MigrationStatus_str(s->state));
4064        return -EINVAL;
4065    }
4066
4067    /*
4068     * Note: see comments in ramblock_recv_bitmap_send() on why we
4069     * need the endianness conversion and the padding.
4070     */
4071    local_size = ROUND_UP(local_size, 8);
4072
4073    /* Add paddings */
4074    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4075
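    /*
     * The bitmap on the wire is: a be64 size, the little-endian bitmap
     * data itself, and a be64 end mark (RAMBLOCK_RECV_BITMAP_ENDING).
     */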
4076    size = qemu_get_be64(file);
4077
4078    /* The size of the bitmap should match our ramblock */
4079    if (size != local_size) {
4080        error_report("%s: ramblock '%s' bitmap size mismatch "
4081                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4082                     block->idstr, size, local_size);
4083        ret = -EINVAL;
4084        goto out;
4085    }
4086
4087    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4088    end_mark = qemu_get_be64(file);
4089
4090    ret = qemu_file_get_error(file);
4091    if (ret || size != local_size) {
4092        error_report("%s: read bitmap failed for ramblock '%s': %d"
4093                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4094                     __func__, block->idstr, ret, local_size, size);
4095        ret = -EIO;
4096        goto out;
4097    }
4098
4099    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4100        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4101                     __func__, block->idstr, end_mark);
4102        ret = -EINVAL;
4103        goto out;
4104    }
4105
4106    /*
4107     * Endianness conversion. We are in postcopy (though paused).
4108     * The dirty bitmap won't change. We can directly modify it.
4109     */
4110    bitmap_from_le(block->bmap, le_bitmap, nbits);
4111
4112    /*
4113     * What we received is the "received bitmap". Invert it to form the
4114     * initial dirty bitmap for this ramblock.
4115     */
4116    bitmap_complement(block->bmap, block->bmap, nbits);
4117
4118    trace_ram_dirty_bitmap_reload_complete(block->idstr);
4119
4120    /*
4121     * We succeeded in syncing the bitmap for the current ramblock. If this
4122     * is the last one to sync, we need to notify the main send thread.
4123     */
4124    ram_dirty_bitmap_reload_notify(s);
4125
4126    ret = 0;
4127out:
4128    g_free(le_bitmap);
4129    return ret;
4130}
4131
4132static int ram_resume_prepare(MigrationState *s, void *opaque)
4133{
4134    RAMState *rs = *(RAMState **)opaque;
4135    int ret;
4136
4137    ret = ram_dirty_bitmap_sync_all(s, rs);
4138    if (ret) {
4139        return ret;
4140    }
4141
4142    ram_state_resume_prepare(rs, s->to_dst_file);
4143
4144    return 0;
4145}
4146
4147static SaveVMHandlers savevm_ram_handlers = {
4148    .save_setup = ram_save_setup,
4149    .save_live_iterate = ram_save_iterate,
4150    .save_live_complete_postcopy = ram_save_complete,
4151    .save_live_complete_precopy = ram_save_complete,
4152    .has_postcopy = ram_has_postcopy,
4153    .save_live_pending = ram_save_pending,
4154    .load_state = ram_load,
4155    .save_cleanup = ram_save_cleanup,
4156    .load_setup = ram_load_setup,
4157    .load_cleanup = ram_load_cleanup,
4158    .resume_prepare = ram_resume_prepare,
4159};
4160
4161static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4162                                      size_t old_size, size_t new_size)
4163{
4164    PostcopyState ps = postcopy_state_get();
4165    ram_addr_t offset;
4166    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4167    Error *err = NULL;
4168
4169    if (ramblock_is_ignored(rb)) {
4170        return;
4171    }
4172
4173    if (!migration_is_idle()) {
4174        /*
4175         * Precopy code on the source cannot deal with the size of RAM blocks
4176         * changing at random points in time - especially after sending the
4177         * RAM block sizes in the migration stream, they must no longer change.
4178         * Abort and indicate a proper reason.
4179         */
4180        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4181        migrate_set_error(migrate_get_current(), err);
4182        error_free(err);
4183        migration_cancel();
4184    }
4185
4186    switch (ps) {
4187    case POSTCOPY_INCOMING_ADVISE:
4188        /*
4189         * Update what ram_postcopy_incoming_init()->init_range() does at the
4190         * time postcopy was advised. Syncing RAM blocks with the source will
4191         * result in RAM resizes.
4192         */
4193        if (old_size < new_size) {
4194            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4195                error_report("RAM block '%s' discard of resized RAM failed",
4196                             rb->idstr);
4197            }
4198        }
4199        rb->postcopy_length = new_size;
4200        break;
4201    case POSTCOPY_INCOMING_NONE:
4202    case POSTCOPY_INCOMING_RUNNING:
4203    case POSTCOPY_INCOMING_END:
4204        /*
4205         * Once our guest is running, postcopy no longer cares about
4206         * resizes. When growing, the new memory was not available on the
4207         * source, so no handler is needed.
4208         */
4209        break;
4210    default:
4211        error_report("RAM block '%s' resized during postcopy state: %d",
4212                     rb->idstr, ps);
4213        exit(-1);
4214    }
4215}
4216
4217static RAMBlockNotifier ram_mig_ram_notifier = {
4218    .ram_block_resized = ram_mig_ram_block_resized,
4219};
4220
4221void ram_mig_init(void)
4222{
4223    qemu_mutex_init(&XBZRLE.lock);
4224    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4225    ram_block_notifier_add(&ram_mig_ram_notifier);
4226}
4227