qemu/migration/ram.c
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2011-2015 Red Hat Inc
   6 *
   7 * Authors:
   8 *  Juan Quintela <quintela@redhat.com>
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28
  29#include "qemu/osdep.h"
  30#include "qemu/cutils.h"
  31#include "qemu/bitops.h"
  32#include "qemu/bitmap.h"
  33#include "qemu/madvise.h"
  34#include "qemu/main-loop.h"
  35#include "io/channel-null.h"
  36#include "xbzrle.h"
  37#include "ram.h"
  38#include "migration.h"
  39#include "migration/register.h"
  40#include "migration/misc.h"
  41#include "qemu-file.h"
  42#include "postcopy-ram.h"
  43#include "page_cache.h"
  44#include "qemu/error-report.h"
  45#include "qapi/error.h"
  46#include "qapi/qapi-types-migration.h"
  47#include "qapi/qapi-events-migration.h"
  48#include "qapi/qmp/qerror.h"
  49#include "trace.h"
  50#include "exec/ram_addr.h"
  51#include "exec/target_page.h"
  52#include "qemu/rcu_queue.h"
  53#include "migration/colo.h"
  54#include "block.h"
  55#include "sysemu/cpu-throttle.h"
  56#include "savevm.h"
  57#include "qemu/iov.h"
  58#include "multifd.h"
  59#include "sysemu/runstate.h"
  60
  61#include "hw/boards.h" /* for machine_dump_guest_core() */
  62
  63#if defined(__linux__)
  64#include "qemu/userfaultfd.h"
  65#endif /* defined(__linux__) */
  66
  67/***********************************************************/
  68/* ram save/restore */
  69
   70/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
   71 * worked for pages that were filled with the same char.  We switched
   72 * it to only search for the zero value, and renamed it to avoid
   73 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
   74 */
  75
  76#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  77#define RAM_SAVE_FLAG_ZERO     0x02
  78#define RAM_SAVE_FLAG_MEM_SIZE 0x04
  79#define RAM_SAVE_FLAG_PAGE     0x08
  80#define RAM_SAVE_FLAG_EOS      0x10
  81#define RAM_SAVE_FLAG_CONTINUE 0x20
  82#define RAM_SAVE_FLAG_XBZRLE   0x40
   83/* 0x80 is reserved in migration.h; start with 0x100 next */
  84#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
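/*
 * Illustrative sketch (not part of the stream code): the flags above are
 * OR'ed into the low bits of the page offset written by save_page_header(),
 * so a single be64 word carries both.  A reader would split them roughly
 * like this:
 *
 *     uint64_t addr  = qemu_get_be64(f);
 *     uint64_t flags = addr & ~TARGET_PAGE_MASK;   // low, sub-page bits
 *     addr          &= TARGET_PAGE_MASK;           // page-aligned offset
 *
 * This assumes every flag value stays below TARGET_PAGE_BITS, which holds
 * for the values defined above on any supported target page size.
 */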
  85
  86XBZRLECacheStats xbzrle_counters;
  87
   88/* struct that contains the XBZRLE cache and a static page
   89   used by compression */
  90static struct {
  91    /* buffer used for XBZRLE encoding */
  92    uint8_t *encoded_buf;
  93    /* buffer for storing page content */
  94    uint8_t *current_buf;
  95    /* Cache for XBZRLE, Protected by lock. */
  96    PageCache *cache;
  97    QemuMutex lock;
  98    /* it will store a page full of zeros */
  99    uint8_t *zero_target_page;
 100    /* buffer used for XBZRLE decoding */
 101    uint8_t *decoded_buf;
 102} XBZRLE;
 103
 104static void XBZRLE_cache_lock(void)
 105{
 106    if (migrate_use_xbzrle()) {
 107        qemu_mutex_lock(&XBZRLE.lock);
 108    }
 109}
 110
 111static void XBZRLE_cache_unlock(void)
 112{
 113    if (migrate_use_xbzrle()) {
 114        qemu_mutex_unlock(&XBZRLE.lock);
 115    }
 116}
 117
 118/**
 119 * xbzrle_cache_resize: resize the xbzrle cache
 120 *
  121 * This function is called from migrate_params_apply in the main
  122 * thread, possibly while a migration is in progress.  A running
  123 * migration may be using the cache and might finish during this call,
  124 * hence changes to the cache are protected by XBZRLE.lock.
 125 *
 126 * Returns 0 for success or -1 for error
 127 *
 128 * @new_size: new cache size
 129 * @errp: set *errp if the check failed, with reason
 130 */
 131int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 132{
 133    PageCache *new_cache;
 134    int64_t ret = 0;
 135
 136    /* Check for truncation */
 137    if (new_size != (size_t)new_size) {
 138        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 139                   "exceeding address space");
 140        return -1;
 141    }
 142
 143    if (new_size == migrate_xbzrle_cache_size()) {
 144        /* nothing to do */
 145        return 0;
 146    }
 147
 148    XBZRLE_cache_lock();
 149
 150    if (XBZRLE.cache != NULL) {
 151        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 152        if (!new_cache) {
 153            ret = -1;
 154            goto out;
 155        }
 156
 157        cache_fini(XBZRLE.cache);
 158        XBZRLE.cache = new_cache;
 159    }
 160out:
 161    XBZRLE_cache_unlock();
 162    return ret;
 163}
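/*
 * Example of the truncation check above (a sketch, not exercised by this
 * file): on a 32-bit host, size_t is 32 bits wide, so a requested cache
 * size of 5 GiB (0x140000000) becomes 0x40000000 after the cast and
 * "new_size != (size_t)new_size" triggers, returning -1 with an
 * explanatory error instead of silently shrinking the cache.
 */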
 164
 165bool ramblock_is_ignored(RAMBlock *block)
 166{
 167    return !qemu_ram_is_migratable(block) ||
 168           (migrate_ignore_shared() && qemu_ram_is_shared(block));
 169}
 170
 171#undef RAMBLOCK_FOREACH
 172
 173int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 174{
 175    RAMBlock *block;
 176    int ret = 0;
 177
 178    RCU_READ_LOCK_GUARD();
 179
 180    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 181        ret = func(block, opaque);
 182        if (ret) {
 183            break;
 184        }
 185    }
 186    return ret;
 187}
 188
 189static void ramblock_recv_map_init(void)
 190{
 191    RAMBlock *rb;
 192
 193    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 194        assert(!rb->receivedmap);
 195        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 196    }
 197}
 198
 199int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 200{
 201    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 202                    rb->receivedmap);
 203}
 204
 205bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 206{
 207    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 208}
 209
 210void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 211{
 212    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 213}
 214
 215void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 216                                    size_t nr)
 217{
 218    bitmap_set_atomic(rb->receivedmap,
 219                      ramblock_recv_bitmap_offset(host_addr, rb),
 220                      nr);
 221}
 222
 223#define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 224
 225/*
 226 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 227 *
 228 * Returns >0 if success with sent bytes, or <0 if error.
 229 */
 230int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 231                                  const char *block_name)
 232{
 233    RAMBlock *block = qemu_ram_block_by_name(block_name);
 234    unsigned long *le_bitmap, nbits;
 235    uint64_t size;
 236
 237    if (!block) {
 238        error_report("%s: invalid block name: %s", __func__, block_name);
 239        return -1;
 240    }
 241
 242    nbits = block->postcopy_length >> TARGET_PAGE_BITS;
 243
 244    /*
 245     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 246     * machines we may need 4 more bytes for padding (see below
  247     * comment). So extend it a bit beforehand.
 248     */
 249    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 250
 251    /*
 252     * Always use little endian when sending the bitmap. This is
  253     * required when the source and destination VMs are not using the
 254     * same endianness. (Note: big endian won't work.)
 255     */
 256    bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 257
 258    /* Size of the bitmap, in bytes */
 259    size = DIV_ROUND_UP(nbits, 8);
 260
 261    /*
 262     * size is always aligned to 8 bytes for 64bit machines, but it
 263     * may not be true for 32bit machines. We need this padding to
 264     * make sure the migration can survive even between 32bit and
 265     * 64bit machines.
 266     */
 267    size = ROUND_UP(size, 8);
 268
 269    qemu_put_be64(file, size);
 270    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 271    /*
 272     * Mark as an end, in case the middle part is screwed up due to
 273     * some "mysterious" reason.
 274     */
 275    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 276    qemu_fflush(file);
 277
 278    g_free(le_bitmap);
 279
 280    if (qemu_file_get_error(file)) {
 281        return qemu_file_get_error(file);
 282    }
 283
 284    return size + sizeof(size);
 285}
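/*
 * Sketch of what the peer is expected to do with the stream produced above
 * (illustrative only; the real reader lives elsewhere in the migration
 * code):
 *
 *     uint64_t size = qemu_get_be64(file);          // padded bitmap size
 *     qemu_get_buffer(file, (uint8_t *)le_bitmap, size);
 *     uint64_t end_mark = qemu_get_be64(file);
 *     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
 *         // stream corrupted, abort the resume
 *     }
 *
 * The 8-byte padding added above guarantees that "size" comes out the same
 * on 32-bit and 64-bit hosts, so this sequence is host-width independent.
 */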
 286
 287/*
 288 * An outstanding page request, on the source, having been received
 289 * and queued
 290 */
 291struct RAMSrcPageRequest {
 292    RAMBlock *rb;
 293    hwaddr    offset;
 294    hwaddr    len;
 295
 296    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 297};
 298
 299typedef struct {
 300    /*
 301     * Cached ramblock/offset values if preempted.  They're only meaningful if
 302     * preempted==true below.
 303     */
 304    RAMBlock *ram_block;
 305    unsigned long ram_page;
 306    /*
 307     * Whether a postcopy preemption just happened.  Will be reset after
 308     * precopy recovered to background migration.
 309     */
 310    bool preempted;
 311} PostcopyPreemptState;
 312
 313/* State of RAM for migration */
 314struct RAMState {
 315    /* QEMUFile used for this migration */
 316    QEMUFile *f;
 317    /* UFFD file descriptor, used in 'write-tracking' migration */
 318    int uffdio_fd;
 319    /* Last block that we have visited searching for dirty pages */
 320    RAMBlock *last_seen_block;
 321    /* Last block from where we have sent data */
 322    RAMBlock *last_sent_block;
 323    /* Last dirty target page we have sent */
 324    ram_addr_t last_page;
 325    /* last ram version we have seen */
 326    uint32_t last_version;
 327    /* How many times we have dirty too many pages */
 328    int dirty_rate_high_cnt;
 329    /* these variables are used for bitmap sync */
 330    /* last time we did a full bitmap_sync */
 331    int64_t time_last_bitmap_sync;
 332    /* bytes transferred at start_time */
 333    uint64_t bytes_xfer_prev;
 334    /* number of dirty pages since start_time */
 335    uint64_t num_dirty_pages_period;
 336    /* xbzrle misses since the beginning of the period */
 337    uint64_t xbzrle_cache_miss_prev;
 338    /* Amount of xbzrle pages since the beginning of the period */
 339    uint64_t xbzrle_pages_prev;
 340    /* Amount of xbzrle encoded bytes since the beginning of the period */
 341    uint64_t xbzrle_bytes_prev;
 342    /* Start using XBZRLE (e.g., after the first round). */
 343    bool xbzrle_enabled;
 344    /* Are we on the last stage of migration */
 345    bool last_stage;
 346    /* compression statistics since the beginning of the period */
  347    /* number of times there was no free thread to compress data */
 348    uint64_t compress_thread_busy_prev;
  349    /* amount of bytes after compression */
 350    uint64_t compressed_size_prev;
 351    /* amount of compressed pages */
 352    uint64_t compress_pages_prev;
 353
 354    /* total handled target pages at the beginning of period */
 355    uint64_t target_page_count_prev;
 356    /* total handled target pages since start */
 357    uint64_t target_page_count;
 358    /* number of dirty bits in the bitmap */
 359    uint64_t migration_dirty_pages;
 360    /* Protects modification of the bitmap and migration dirty pages */
 361    QemuMutex bitmap_mutex;
 362    /* The RAMBlock used in the last src_page_requests */
 363    RAMBlock *last_req_rb;
 364    /* Queue of outstanding page requests from the destination */
 365    QemuMutex src_page_req_mutex;
 366    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 367
  368    /* Postcopy preemption information */
 369    PostcopyPreemptState postcopy_preempt_state;
 370    /*
 371     * Current channel we're using on src VM.  Only valid if postcopy-preempt
 372     * is enabled.
 373     */
 374    unsigned int postcopy_channel;
 375};
 376typedef struct RAMState RAMState;
 377
 378static RAMState *ram_state;
 379
 380static NotifierWithReturnList precopy_notifier_list;
 381
 382static void postcopy_preempt_reset(RAMState *rs)
 383{
 384    memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState));
 385}
 386
  387/* Whether postcopy has queued requests */
 388static bool postcopy_has_request(RAMState *rs)
 389{
 390    return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
 391}
 392
 393void precopy_infrastructure_init(void)
 394{
 395    notifier_with_return_list_init(&precopy_notifier_list);
 396}
 397
 398void precopy_add_notifier(NotifierWithReturn *n)
 399{
 400    notifier_with_return_list_add(&precopy_notifier_list, n);
 401}
 402
 403void precopy_remove_notifier(NotifierWithReturn *n)
 404{
 405    notifier_with_return_remove(n);
 406}
 407
 408int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 409{
 410    PrecopyNotifyData pnd;
 411    pnd.reason = reason;
 412    pnd.errp = errp;
 413
 414    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 415}
 416
 417uint64_t ram_bytes_remaining(void)
 418{
 419    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 420                       0;
 421}
 422
 423MigrationStats ram_counters;
 424
 425static void ram_transferred_add(uint64_t bytes)
 426{
 427    if (runstate_is_running()) {
 428        ram_counters.precopy_bytes += bytes;
 429    } else if (migration_in_postcopy()) {
 430        ram_counters.postcopy_bytes += bytes;
 431    } else {
 432        ram_counters.downtime_bytes += bytes;
 433    }
 434    ram_counters.transferred += bytes;
 435}
 436
 437void dirty_sync_missed_zero_copy(void)
 438{
 439    ram_counters.dirty_sync_missed_zero_copy++;
 440}
 441
 442/* used by the search for pages to send */
 443struct PageSearchStatus {
 444    /* Current block being searched */
 445    RAMBlock    *block;
 446    /* Current page to search from */
 447    unsigned long page;
 448    /* Set once we wrap around */
 449    bool         complete_round;
 450    /*
 451     * [POSTCOPY-ONLY] Whether current page is explicitly requested by
 452     * postcopy.  When set, the request is "urgent" because the dest QEMU
 453     * threads are waiting for us.
 454     */
 455    bool         postcopy_requested;
 456    /*
 457     * [POSTCOPY-ONLY] The target channel to use to send current page.
 458     *
  459     * Note: This may _not_ match the value in postcopy_requested
  460     * above. Imagine the case where the postcopy request is exactly
  461     * the page that we are in the middle of sending during precopy. In
  462     * this case we'll have postcopy_requested set to true but the
  463     * target channel will be the precopy channel (so that we don't
  464     * split-brain on that specific page, since the precopy channel
  465     * already contains part of that page's data).
  466     *
  467     * Besides that specific use case, postcopy_target_channel should
  468     * always be equal to postcopy_requested, because by default we send
  469     * postcopy pages via the postcopy preempt channel.
 470     */
 471    bool         postcopy_target_channel;
 472};
 473typedef struct PageSearchStatus PageSearchStatus;
 474
 475CompressionStats compression_counters;
 476
 477struct CompressParam {
 478    bool done;
 479    bool quit;
 480    bool zero_page;
 481    QEMUFile *file;
 482    QemuMutex mutex;
 483    QemuCond cond;
 484    RAMBlock *block;
 485    ram_addr_t offset;
 486
 487    /* internally used fields */
 488    z_stream stream;
 489    uint8_t *originbuf;
 490};
 491typedef struct CompressParam CompressParam;
 492
 493struct DecompressParam {
 494    bool done;
 495    bool quit;
 496    QemuMutex mutex;
 497    QemuCond cond;
 498    void *des;
 499    uint8_t *compbuf;
 500    int len;
 501    z_stream stream;
 502};
 503typedef struct DecompressParam DecompressParam;
 504
 505static CompressParam *comp_param;
 506static QemuThread *compress_threads;
 507/* comp_done_cond is used to wake up the migration thread when
 508 * one of the compression threads has finished the compression.
  509 * comp_done_lock is used together with comp_done_cond.
 510 */
 511static QemuMutex comp_done_lock;
 512static QemuCond comp_done_cond;
 513
 514static QEMUFile *decomp_file;
 515static DecompressParam *decomp_param;
 516static QemuThread *decompress_threads;
 517static QemuMutex decomp_done_lock;
 518static QemuCond decomp_done_cond;
 519
 520static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 521                                 ram_addr_t offset, uint8_t *source_buf);
 522
 523static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
 524                                     bool postcopy_requested);
 525
 526static void *do_data_compress(void *opaque)
 527{
 528    CompressParam *param = opaque;
 529    RAMBlock *block;
 530    ram_addr_t offset;
 531    bool zero_page;
 532
 533    qemu_mutex_lock(&param->mutex);
 534    while (!param->quit) {
 535        if (param->block) {
 536            block = param->block;
 537            offset = param->offset;
 538            param->block = NULL;
 539            qemu_mutex_unlock(&param->mutex);
 540
 541            zero_page = do_compress_ram_page(param->file, &param->stream,
 542                                             block, offset, param->originbuf);
 543
 544            qemu_mutex_lock(&comp_done_lock);
 545            param->done = true;
 546            param->zero_page = zero_page;
 547            qemu_cond_signal(&comp_done_cond);
 548            qemu_mutex_unlock(&comp_done_lock);
 549
 550            qemu_mutex_lock(&param->mutex);
 551        } else {
 552            qemu_cond_wait(&param->cond, &param->mutex);
 553        }
 554    }
 555    qemu_mutex_unlock(&param->mutex);
 556
 557    return NULL;
 558}
 559
 560static void compress_threads_save_cleanup(void)
 561{
 562    int i, thread_count;
 563
 564    if (!migrate_use_compression() || !comp_param) {
 565        return;
 566    }
 567
 568    thread_count = migrate_compress_threads();
 569    for (i = 0; i < thread_count; i++) {
 570        /*
  571         * we use it as an indicator which shows whether the thread is
 572         * properly init'd or not
 573         */
 574        if (!comp_param[i].file) {
 575            break;
 576        }
 577
 578        qemu_mutex_lock(&comp_param[i].mutex);
 579        comp_param[i].quit = true;
 580        qemu_cond_signal(&comp_param[i].cond);
 581        qemu_mutex_unlock(&comp_param[i].mutex);
 582
 583        qemu_thread_join(compress_threads + i);
 584        qemu_mutex_destroy(&comp_param[i].mutex);
 585        qemu_cond_destroy(&comp_param[i].cond);
 586        deflateEnd(&comp_param[i].stream);
 587        g_free(comp_param[i].originbuf);
 588        qemu_fclose(comp_param[i].file);
 589        comp_param[i].file = NULL;
 590    }
 591    qemu_mutex_destroy(&comp_done_lock);
 592    qemu_cond_destroy(&comp_done_cond);
 593    g_free(compress_threads);
 594    g_free(comp_param);
 595    compress_threads = NULL;
 596    comp_param = NULL;
 597}
 598
 599static int compress_threads_save_setup(void)
 600{
 601    int i, thread_count;
 602
 603    if (!migrate_use_compression()) {
 604        return 0;
 605    }
 606    thread_count = migrate_compress_threads();
 607    compress_threads = g_new0(QemuThread, thread_count);
 608    comp_param = g_new0(CompressParam, thread_count);
 609    qemu_cond_init(&comp_done_cond);
 610    qemu_mutex_init(&comp_done_lock);
 611    for (i = 0; i < thread_count; i++) {
 612        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 613        if (!comp_param[i].originbuf) {
 614            goto exit;
 615        }
 616
 617        if (deflateInit(&comp_param[i].stream,
 618                        migrate_compress_level()) != Z_OK) {
 619            g_free(comp_param[i].originbuf);
 620            goto exit;
 621        }
 622
 623        /* comp_param[i].file is just used as a dummy buffer to save data,
 624         * set its ops to empty.
 625         */
 626        comp_param[i].file = qemu_file_new_output(
 627            QIO_CHANNEL(qio_channel_null_new()));
 628        comp_param[i].done = true;
 629        comp_param[i].quit = false;
 630        qemu_mutex_init(&comp_param[i].mutex);
 631        qemu_cond_init(&comp_param[i].cond);
 632        qemu_thread_create(compress_threads + i, "compress",
 633                           do_data_compress, comp_param + i,
 634                           QEMU_THREAD_JOINABLE);
 635    }
 636    return 0;
 637
 638exit:
 639    compress_threads_save_cleanup();
 640    return -1;
 641}
 642
 643/**
 644 * save_page_header: write page header to wire
 645 *
 646 * If this is the 1st block, it also writes the block identification
 647 *
 648 * Returns the number of bytes written
 649 *
 650 * @f: QEMUFile where to send the data
 651 * @block: block that contains the page we want to send
 652 * @offset: offset inside the block for the page
 653 *          in the lower bits, it contains flags
 654 */
 655static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 656                               ram_addr_t offset)
 657{
 658    size_t size, len;
 659
 660    if (block == rs->last_sent_block) {
 661        offset |= RAM_SAVE_FLAG_CONTINUE;
 662    }
 663    qemu_put_be64(f, offset);
 664    size = 8;
 665
 666    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 667        len = strlen(block->idstr);
 668        qemu_put_byte(f, len);
 669        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 670        size += 1 + len;
 671        rs->last_sent_block = block;
 672    }
 673    return size;
 674}
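/*
 * Wire layout produced by save_page_header(), shown here for reference:
 *
 *   first page of a block:   be64(offset | flags) u8(len) idstr[len]
 *   subsequent pages:        be64(offset | flags | RAM_SAVE_FLAG_CONTINUE)
 *
 * i.e. the return value is 8 bytes for the common "continue" case and
 * 8 + 1 + strlen(idstr) bytes whenever the block changes.
 */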
 675
 676/**
 677 * mig_throttle_guest_down: throttle down the guest
 678 *
  679 * Reduce the amount of guest CPU execution to hopefully slow down memory
  680 * writes. If the guest dirty memory rate is reduced below the rate at
 681 * which we can transfer pages to the destination then we should be
 682 * able to complete migration. Some workloads dirty memory way too
 683 * fast and will not effectively converge, even with auto-converge.
 684 */
 685static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 686                                    uint64_t bytes_dirty_threshold)
 687{
 688    MigrationState *s = migrate_get_current();
 689    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 690    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 691    bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 692    int pct_max = s->parameters.max_cpu_throttle;
 693
 694    uint64_t throttle_now = cpu_throttle_get_percentage();
 695    uint64_t cpu_now, cpu_ideal, throttle_inc;
 696
 697    /* We have not started throttling yet. Let's start it. */
 698    if (!cpu_throttle_active()) {
 699        cpu_throttle_set(pct_initial);
 700    } else {
 701        /* Throttling already on, just increase the rate */
 702        if (!pct_tailslow) {
 703            throttle_inc = pct_increment;
 704        } else {
 705            /* Compute the ideal CPU percentage used by Guest, which may
 706             * make the dirty rate match the dirty rate threshold. */
 707            cpu_now = 100 - throttle_now;
 708            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 709                        bytes_dirty_period);
 710            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 711        }
 712        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 713    }
 714}
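/*
 * Worked example of the "tailslow" branch above (numbers are illustrative):
 * with the throttle currently at 40%, cpu_now = 100 - 40 = 60.  If the
 * guest dirtied twice as many bytes as we transferred during the period
 * (bytes_dirty_threshold / bytes_dirty_period = 0.5), then
 * cpu_ideal = 60 * 0.5 = 30 and throttle_inc = MIN(60 - 30, pct_increment),
 * so the throttle is raised towards 70% but never by more than the
 * configured cpu-throttle-increment per step, and never above pct_max.
 */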
 715
 716void mig_throttle_counter_reset(void)
 717{
 718    RAMState *rs = ram_state;
 719
 720    rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 721    rs->num_dirty_pages_period = 0;
 722    rs->bytes_xfer_prev = ram_counters.transferred;
 723}
 724
 725/**
 726 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 727 *
 728 * @rs: current RAM state
 729 * @current_addr: address for the zero page
 730 *
 731 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 732 * The important thing is that a stale (not-yet-0'd) page be replaced
 733 * by the new data.
 734 * As a bonus, if the page wasn't in the cache it gets added so that
 735 * when a small write is made into the 0'd page it gets XBZRLE sent.
 736 */
 737static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 738{
 739    if (!rs->xbzrle_enabled) {
 740        return;
 741    }
 742
 743    /* We don't care if this fails to allocate a new cache page
  744     * as long as it updates an old one */
 745    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 746                 ram_counters.dirty_sync_count);
 747}
 748
 749#define ENCODING_FLAG_XBZRLE 0x1
 750
 751/**
 752 * save_xbzrle_page: compress and send current page
 753 *
 754 * Returns: 1 means that we wrote the page
 755 *          0 means that page is identical to the one already sent
 756 *          -1 means that xbzrle would be longer than normal
 757 *
 758 * @rs: current RAM state
 759 * @current_data: pointer to the address of the page contents
 760 * @current_addr: addr of the page
 761 * @block: block that contains the page we want to send
 762 * @offset: offset inside the block for the page
 763 */
 764static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 765                            ram_addr_t current_addr, RAMBlock *block,
 766                            ram_addr_t offset)
 767{
 768    int encoded_len = 0, bytes_xbzrle;
 769    uint8_t *prev_cached_page;
 770
 771    if (!cache_is_cached(XBZRLE.cache, current_addr,
 772                         ram_counters.dirty_sync_count)) {
 773        xbzrle_counters.cache_miss++;
 774        if (!rs->last_stage) {
 775            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 776                             ram_counters.dirty_sync_count) == -1) {
 777                return -1;
 778            } else {
 779                /* update *current_data when the page has been
 780                   inserted into cache */
 781                *current_data = get_cached_data(XBZRLE.cache, current_addr);
 782            }
 783        }
 784        return -1;
 785    }
 786
 787    /*
 788     * Reaching here means the page has hit the xbzrle cache, no matter what
 789     * encoding result it is (normal encoding, overflow or skipping the page),
 790     * count the page as encoded. This is used to calculate the encoding rate.
 791     *
 792     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 793     * 2nd page turns out to be skipped (i.e. no new bytes written to the
 794     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 795     * skipped page included. In this way, the encoding rate can tell if the
 796     * guest page is good for xbzrle encoding.
 797     */
 798    xbzrle_counters.pages++;
 799    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 800
 801    /* save current buffer into memory */
 802    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 803
 804    /* XBZRLE encoding (if there is no overflow) */
 805    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 806                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 807                                       TARGET_PAGE_SIZE);
 808
 809    /*
 810     * Update the cache contents, so that it corresponds to the data
 811     * sent, in all cases except where we skip the page.
 812     */
 813    if (!rs->last_stage && encoded_len != 0) {
 814        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 815        /*
 816         * In the case where we couldn't compress, ensure that the caller
 817         * sends the data from the cache, since the guest might have
 818         * changed the RAM since we copied it.
 819         */
 820        *current_data = prev_cached_page;
 821    }
 822
 823    if (encoded_len == 0) {
 824        trace_save_xbzrle_page_skipping();
 825        return 0;
 826    } else if (encoded_len == -1) {
 827        trace_save_xbzrle_page_overflow();
 828        xbzrle_counters.overflow++;
 829        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 830        return -1;
 831    }
 832
 833    /* Send XBZRLE based compressed page */
 834    bytes_xbzrle = save_page_header(rs, rs->f, block,
 835                                    offset | RAM_SAVE_FLAG_XBZRLE);
 836    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 837    qemu_put_be16(rs->f, encoded_len);
 838    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 839    bytes_xbzrle += encoded_len + 1 + 2;
 840    /*
 841     * Like compressed_size (please see update_compress_thread_counts),
 842     * the xbzrle encoded bytes don't count the 8 byte header with
 843     * RAM_SAVE_FLAG_CONTINUE.
 844     */
 845    xbzrle_counters.bytes += bytes_xbzrle - 8;
 846    ram_transferred_add(bytes_xbzrle);
 847
 848    return 1;
 849}
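/*
 * For reference, the bytes emitted for one XBZRLE page above are:
 *
 *   save_page_header(..., offset | RAM_SAVE_FLAG_XBZRLE)
 *   u8   ENCODING_FLAG_XBZRLE
 *   be16 encoded_len
 *   encoded_buf[encoded_len]
 *
 * which is why bytes_xbzrle is incremented by encoded_len + 1 + 2, and why
 * xbzrle_counters.bytes subtracts the 8-byte header to stay comparable
 * with the compressed-size accounting.
 */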
 850
 851/**
 852 * migration_bitmap_find_dirty: find the next dirty page from start
 853 *
 854 * Returns the page offset within memory region of the start of a dirty page
 855 *
 856 * @rs: current RAM state
 857 * @rb: RAMBlock where to search for dirty pages
 858 * @start: page where we start the search
 859 */
 860static inline
 861unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 862                                          unsigned long start)
 863{
 864    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 865    unsigned long *bitmap = rb->bmap;
 866
 867    if (ramblock_is_ignored(rb)) {
 868        return size;
 869    }
 870
 871    return find_next_bit(bitmap, size, start);
 872}
 873
 874static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
 875                                                       unsigned long page)
 876{
 877    uint8_t shift;
 878    hwaddr size, start;
 879
 880    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
 881        return;
 882    }
 883
 884    shift = rb->clear_bmap_shift;
 885    /*
  886     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... it
  887     * can make things easier sometimes since the start address
  888     * of the small chunk will then always be aligned to 64 pages, so
  889     * the bitmap will always be aligned to unsigned long. We should
  890     * even be able to remove this restriction, but I'm simply
  891     * keeping it.
 892     */
 893    assert(shift >= 6);
 894
 895    size = 1ULL << (TARGET_PAGE_BITS + shift);
 896    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
 897    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 898    memory_region_clear_dirty_bitmap(rb->mr, start, size);
 899}
 900
 901static void
 902migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
 903                                                 unsigned long start,
 904                                                 unsigned long npages)
 905{
 906    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
 907    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
 908    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
 909
 910    /*
 911     * Clear pages from start to start + npages - 1, so the end boundary is
 912     * exclusive.
 913     */
 914    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
 915        migration_clear_memory_region_dirty_bitmap(rb, i);
 916    }
 917}
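/*
 * Example of the chunk rounding above (a sketch): with clear_bmap_shift = 6
 * the chunk size is 64 pages.  Clearing start = 100, npages = 200 gives
 * chunk_start = 64 and chunk_end = 320, so chunks 64, 128, 192 and 256 are
 * passed to migration_clear_memory_region_dirty_bitmap(), covering the
 * whole requested range with whole chunks only.
 */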
 918
 919/*
  920 * colo_bitmap_find_dirty: find contiguous dirty pages from start
  921 *
  922 * Returns the page offset within the memory region of the start of the
  923 * contiguous dirty pages
 924 *
 925 * @rs: current RAM state
 926 * @rb: RAMBlock where to search for dirty pages
 927 * @start: page where we start the search
 928 * @num: the number of contiguous dirty pages
 929 */
 930static inline
 931unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 932                                     unsigned long start, unsigned long *num)
 933{
 934    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 935    unsigned long *bitmap = rb->bmap;
 936    unsigned long first, next;
 937
 938    *num = 0;
 939
 940    if (ramblock_is_ignored(rb)) {
 941        return size;
 942    }
 943
 944    first = find_next_bit(bitmap, size, start);
 945    if (first >= size) {
 946        return first;
 947    }
 948    next = find_next_zero_bit(bitmap, size, first + 1);
 949    assert(next >= first);
 950    *num = next - first;
 951    return first;
 952}
 953
 954static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 955                                                RAMBlock *rb,
 956                                                unsigned long page)
 957{
 958    bool ret;
 959
 960    /*
  961     * Clear the dirty bitmap if needed.  This _must_ be called before we
  962     * send any of the pages in the chunk, because we need to make sure
  963     * we can capture further page content changes when we sync the dirty
  964     * log the next time.  So as long as we are going to send any of
  965     * the pages in the chunk, we clear the remote dirty bitmap for all.
  966     * Clearing it earlier won't be a problem, but doing it too late will.
 967     */
 968    migration_clear_memory_region_dirty_bitmap(rb, page);
 969
 970    ret = test_and_clear_bit(page, rb->bmap);
 971    if (ret) {
 972        rs->migration_dirty_pages--;
 973    }
 974
 975    return ret;
 976}
 977
 978static void dirty_bitmap_clear_section(MemoryRegionSection *section,
 979                                       void *opaque)
 980{
 981    const hwaddr offset = section->offset_within_region;
 982    const hwaddr size = int128_get64(section->size);
 983    const unsigned long start = offset >> TARGET_PAGE_BITS;
 984    const unsigned long npages = size >> TARGET_PAGE_BITS;
 985    RAMBlock *rb = section->mr->ram_block;
 986    uint64_t *cleared_bits = opaque;
 987
 988    /*
 989     * We don't grab ram_state->bitmap_mutex because we expect to run
 990     * only when starting migration or during postcopy recovery where
 991     * we don't have concurrent access.
 992     */
 993    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
 994        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
 995    }
 996    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
 997    bitmap_clear(rb->bmap, start, npages);
 998}
 999
1000/*
1001 * Exclude all dirty pages from migration that fall into a discarded range as
1002 * managed by a RamDiscardManager responsible for the mapped memory region of
1003 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1004 *
1005 * Discarded pages ("logically unplugged") have undefined content and must
1006 * not get migrated, because even reading these pages for migration might
1007 * result in undesired behavior.
1008 *
1009 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1010 *
1011 * Note: The result is only stable while migrating (precopy/postcopy).
1012 */
1013static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1014{
1015    uint64_t cleared_bits = 0;
1016
1017    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1018        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1019        MemoryRegionSection section = {
1020            .mr = rb->mr,
1021            .offset_within_region = 0,
1022            .size = int128_make64(qemu_ram_get_used_length(rb)),
1023        };
1024
1025        ram_discard_manager_replay_discarded(rdm, &section,
1026                                             dirty_bitmap_clear_section,
1027                                             &cleared_bits);
1028    }
1029    return cleared_bits;
1030}
1031
1032/*
1033 * Check if a host-page aligned page falls into a discarded range as managed by
1034 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1035 *
1036 * Note: The result is only stable while migrating (precopy/postcopy).
1037 */
1038bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1039{
1040    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1041        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1042        MemoryRegionSection section = {
1043            .mr = rb->mr,
1044            .offset_within_region = start,
1045            .size = int128_make64(qemu_ram_pagesize(rb)),
1046        };
1047
1048        return !ram_discard_manager_is_populated(rdm, &section);
1049    }
1050    return false;
1051}
1052
1053/* Called with RCU critical section */
1054static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1055{
1056    uint64_t new_dirty_pages =
1057        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1058
1059    rs->migration_dirty_pages += new_dirty_pages;
1060    rs->num_dirty_pages_period += new_dirty_pages;
1061}
1062
1063/**
1064 * ram_pagesize_summary: calculate all the pagesizes of a VM
1065 *
1066 * Returns a summary bitmap of the page sizes of all RAMBlocks
1067 *
1068 * For VMs with just normal pages this is equivalent to the host page
1069 * size. If it's got some huge pages then it's the OR of all the
1070 * different page sizes.
1071 */
1072uint64_t ram_pagesize_summary(void)
1073{
1074    RAMBlock *block;
1075    uint64_t summary = 0;
1076
1077    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1078        summary |= block->page_size;
1079    }
1080
1081    return summary;
1082}
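/*
 * Example (illustrative): a guest backed by normal 4 KiB pages plus one
 * 2 MiB hugepage-backed RAMBlock yields summary = 0x1000 | 0x200000, so a
 * single bit test tells the consumer whether any huge pages are in use.
 */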
1083
1084uint64_t ram_get_total_transferred_pages(void)
1085{
1086    return  ram_counters.normal + ram_counters.duplicate +
1087                compression_counters.pages + xbzrle_counters.pages;
1088}
1089
1090static void migration_update_rates(RAMState *rs, int64_t end_time)
1091{
1092    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1093    double compressed_size;
1094
1095    /* calculate period counters */
1096    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1097                / (end_time - rs->time_last_bitmap_sync);
1098
1099    if (!page_count) {
1100        return;
1101    }
1102
1103    if (migrate_use_xbzrle()) {
1104        double encoded_size, unencoded_size;
1105
1106        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1107            rs->xbzrle_cache_miss_prev) / page_count;
1108        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1109        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1110                         TARGET_PAGE_SIZE;
1111        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1112        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1113            xbzrle_counters.encoding_rate = 0;
1114        } else {
1115            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1116        }
1117        rs->xbzrle_pages_prev = xbzrle_counters.pages;
1118        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1119    }
1120
1121    if (migrate_use_compression()) {
1122        compression_counters.busy_rate = (double)(compression_counters.busy -
1123            rs->compress_thread_busy_prev) / page_count;
1124        rs->compress_thread_busy_prev = compression_counters.busy;
1125
1126        compressed_size = compression_counters.compressed_size -
1127                          rs->compressed_size_prev;
1128        if (compressed_size) {
1129            double uncompressed_size = (compression_counters.pages -
1130                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1131
1132            /* Compression-Ratio = Uncompressed-size / Compressed-size */
1133            compression_counters.compression_rate =
1134                                        uncompressed_size / compressed_size;
1135
1136            rs->compress_pages_prev = compression_counters.pages;
1137            rs->compressed_size_prev = compression_counters.compressed_size;
1138        }
1139    }
1140}
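/*
 * Example of the rates computed above (illustrative numbers): if a sync
 * period saw 1000 xbzrle pages (4 MiB of guest data at 4 KiB pages) encoded
 * into 1 MiB, encoding_rate = 4.0; if 200 of 1000 handled target pages
 * missed the cache, cache_miss_rate = 0.2.  An encoding rate near (or
 * below) 1.0 is a sign that the guest memory is not xbzrle friendly.
 */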
1141
1142static void migration_trigger_throttle(RAMState *rs)
1143{
1144    MigrationState *s = migrate_get_current();
1145    uint64_t threshold = s->parameters.throttle_trigger_threshold;
1146
1147    uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1148    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1149    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1150
1151    /* During block migration the auto-converge logic incorrectly detects
1152     * that ram migration makes no progress. Avoid this by disabling the
1153     * throttling logic during the bulk phase of block migration. */
1154    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1155        /* The following detection logic can be refined later. For now:
1156           Check to see if the ratio between dirtied bytes and the approx.
1157           amount of bytes that just got transferred since the last time
1158           we were in this routine reaches the threshold. If that happens
1159           twice, start or increase throttling. */
1160
1161        if ((bytes_dirty_period > bytes_dirty_threshold) &&
1162            (++rs->dirty_rate_high_cnt >= 2)) {
1163            trace_migration_throttle();
1164            rs->dirty_rate_high_cnt = 0;
1165            mig_throttle_guest_down(bytes_dirty_period,
1166                                    bytes_dirty_threshold);
1167        }
1168    }
1169}
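/*
 * Worked example of the trigger above (numbers are illustrative): with
 * throttle-trigger-threshold = 50 and 1 GiB transferred during the period,
 * bytes_dirty_threshold = 512 MiB.  If the guest dirtied more than that in
 * two consecutive periods, mig_throttle_guest_down() is invoked; a single
 * spike is ignored because dirty_rate_high_cnt must reach 2 first.
 */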
1170
1171static void migration_bitmap_sync(RAMState *rs)
1172{
1173    RAMBlock *block;
1174    int64_t end_time;
1175
1176    ram_counters.dirty_sync_count++;
1177
1178    if (!rs->time_last_bitmap_sync) {
1179        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1180    }
1181
1182    trace_migration_bitmap_sync_start();
1183    memory_global_dirty_log_sync();
1184
1185    qemu_mutex_lock(&rs->bitmap_mutex);
1186    WITH_RCU_READ_LOCK_GUARD() {
1187        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1188            ramblock_sync_dirty_bitmap(rs, block);
1189        }
1190        ram_counters.remaining = ram_bytes_remaining();
1191    }
1192    qemu_mutex_unlock(&rs->bitmap_mutex);
1193
1194    memory_global_after_dirty_log_sync();
1195    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1196
1197    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1198
 1199    /* more than 1 second = 1000 milliseconds */
1200    if (end_time > rs->time_last_bitmap_sync + 1000) {
1201        migration_trigger_throttle(rs);
1202
1203        migration_update_rates(rs, end_time);
1204
1205        rs->target_page_count_prev = rs->target_page_count;
1206
1207        /* reset period counters */
1208        rs->time_last_bitmap_sync = end_time;
1209        rs->num_dirty_pages_period = 0;
1210        rs->bytes_xfer_prev = ram_counters.transferred;
1211    }
1212    if (migrate_use_events()) {
1213        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1214    }
1215}
1216
1217static void migration_bitmap_sync_precopy(RAMState *rs)
1218{
1219    Error *local_err = NULL;
1220
1221    /*
1222     * The current notifier usage is just an optimization to migration, so we
1223     * don't stop the normal migration process in the error case.
1224     */
1225    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1226        error_report_err(local_err);
1227        local_err = NULL;
1228    }
1229
1230    migration_bitmap_sync(rs);
1231
1232    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1233        error_report_err(local_err);
1234    }
1235}
1236
1237static void ram_release_page(const char *rbname, uint64_t offset)
1238{
1239    if (!migrate_release_ram() || !migration_in_postcopy()) {
1240        return;
1241    }
1242
1243    ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1244}
1245
1246/**
1247 * save_zero_page_to_file: send the zero page to the file
1248 *
 1249 * Returns the size of the data written to the file, or 0 if the page is
 1250 * not a zero page
1251 *
1252 * @rs: current RAM state
1253 * @file: the file where the data is saved
1254 * @block: block that contains the page we want to send
1255 * @offset: offset inside the block for the page
1256 */
1257static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1258                                  RAMBlock *block, ram_addr_t offset)
1259{
1260    uint8_t *p = block->host + offset;
1261    int len = 0;
1262
1263    if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1264        len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1265        qemu_put_byte(file, 0);
1266        len += 1;
1267        ram_release_page(block->idstr, offset);
1268    }
1269    return len;
1270}
1271
1272/**
1273 * save_zero_page: send the zero page to the stream
1274 *
1275 * Returns the number of pages written.
1276 *
1277 * @rs: current RAM state
1278 * @block: block that contains the page we want to send
1279 * @offset: offset inside the block for the page
1280 */
1281static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1282{
1283    int len = save_zero_page_to_file(rs, rs->f, block, offset);
1284
1285    if (len) {
1286        ram_counters.duplicate++;
1287        ram_transferred_add(len);
1288        return 1;
1289    }
1290    return -1;
1291}
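/*
 * For reference, a zero page costs save_page_header() plus one literal
 * zero byte on the wire (see save_zero_page_to_file() above), i.e. 9 bytes
 * in the RAM_SAVE_FLAG_CONTINUE case, versus a full TARGET_PAGE_SIZE
 * payload for a normal page.
 */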
1292
1293/*
1294 * @pages: the number of pages written by the control path,
1295 *        < 0 - error
1296 *        > 0 - number of pages written
1297 *
 1298 * Return true if the page has been saved, otherwise false is returned.
1299 */
1300static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1301                              int *pages)
1302{
1303    uint64_t bytes_xmit = 0;
1304    int ret;
1305
1306    *pages = -1;
1307    ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1308                                &bytes_xmit);
1309    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1310        return false;
1311    }
1312
1313    if (bytes_xmit) {
1314        ram_transferred_add(bytes_xmit);
1315        *pages = 1;
1316    }
1317
1318    if (ret == RAM_SAVE_CONTROL_DELAYED) {
1319        return true;
1320    }
1321
1322    if (bytes_xmit > 0) {
1323        ram_counters.normal++;
1324    } else if (bytes_xmit == 0) {
1325        ram_counters.duplicate++;
1326    }
1327
1328    return true;
1329}
1330
1331/*
1332 * directly send the page to the stream
1333 *
1334 * Returns the number of pages written.
1335 *
1336 * @rs: current RAM state
1337 * @block: block that contains the page we want to send
1338 * @offset: offset inside the block for the page
1339 * @buf: the page to be sent
 1340 * @async: send the page asynchronously
1341 */
1342static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1343                            uint8_t *buf, bool async)
1344{
1345    ram_transferred_add(save_page_header(rs, rs->f, block,
1346                                         offset | RAM_SAVE_FLAG_PAGE));
1347    if (async) {
1348        qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1349                              migrate_release_ram() &&
1350                              migration_in_postcopy());
1351    } else {
1352        qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1353    }
1354    ram_transferred_add(TARGET_PAGE_SIZE);
1355    ram_counters.normal++;
1356    return 1;
1357}
1358
1359/**
1360 * ram_save_page: send the given page to the stream
1361 *
1362 * Returns the number of pages written.
1363 *          < 0 - error
1364 *          >=0 - Number of pages written - this might legally be 0
1365 *                if xbzrle noticed the page was the same.
1366 *
1367 * @rs: current RAM state
1368 * @block: block that contains the page we want to send
1369 * @offset: offset inside the block for the page
1370 */
1371static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1372{
1373    int pages = -1;
1374    uint8_t *p;
1375    bool send_async = true;
1376    RAMBlock *block = pss->block;
1377    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1378    ram_addr_t current_addr = block->offset + offset;
1379
1380    p = block->host + offset;
1381    trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1382
1383    XBZRLE_cache_lock();
1384    if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1385        pages = save_xbzrle_page(rs, &p, current_addr, block,
1386                                 offset);
1387        if (!rs->last_stage) {
1388            /* Can't send this cached data async, since the cache page
1389             * might get updated before it gets to the wire
1390             */
1391            send_async = false;
1392        }
1393    }
1394
1395    /* XBZRLE overflow or normal page */
1396    if (pages == -1) {
1397        pages = save_normal_page(rs, block, offset, p, send_async);
1398    }
1399
1400    XBZRLE_cache_unlock();
1401
1402    return pages;
1403}
1404
1405static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1406                                 ram_addr_t offset)
1407{
1408    if (multifd_queue_page(rs->f, block, offset) < 0) {
1409        return -1;
1410    }
1411    ram_counters.normal++;
1412
1413    return 1;
1414}
1415
1416static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1417                                 ram_addr_t offset, uint8_t *source_buf)
1418{
1419    RAMState *rs = ram_state;
1420    uint8_t *p = block->host + offset;
1421    int ret;
1422
1423    if (save_zero_page_to_file(rs, f, block, offset)) {
1424        return true;
1425    }
1426
1427    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1428
1429    /*
 1430     * copy it to an internal buffer to avoid it being modified by the VM,
 1431     * so that we can catch any error during compression and
 1432     * decompression
1433     */
1434    memcpy(source_buf, p, TARGET_PAGE_SIZE);
1435    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1436    if (ret < 0) {
1437        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1438        error_report("compressed data failed!");
1439    }
1440    return false;
1441}
1442
1443static void
1444update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1445{
1446    ram_transferred_add(bytes_xmit);
1447
1448    if (param->zero_page) {
1449        ram_counters.duplicate++;
1450        return;
1451    }
1452
1453    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1454    compression_counters.compressed_size += bytes_xmit - 8;
1455    compression_counters.pages++;
1456}
1457
1458static bool save_page_use_compression(RAMState *rs);
1459
1460static void flush_compressed_data(RAMState *rs)
1461{
1462    int idx, len, thread_count;
1463
1464    if (!save_page_use_compression(rs)) {
1465        return;
1466    }
1467    thread_count = migrate_compress_threads();
1468
1469    qemu_mutex_lock(&comp_done_lock);
1470    for (idx = 0; idx < thread_count; idx++) {
1471        while (!comp_param[idx].done) {
1472            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1473        }
1474    }
1475    qemu_mutex_unlock(&comp_done_lock);
1476
1477    for (idx = 0; idx < thread_count; idx++) {
1478        qemu_mutex_lock(&comp_param[idx].mutex);
1479        if (!comp_param[idx].quit) {
1480            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1481            /*
1482             * it's safe to fetch zero_page without holding comp_done_lock
1483             * as there is no further request submitted to the thread,
 1484             * i.e., the thread should be waiting for a request at this point.
1485             */
1486            update_compress_thread_counts(&comp_param[idx], len);
1487        }
1488        qemu_mutex_unlock(&comp_param[idx].mutex);
1489    }
1490}
1491
1492static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1493                                       ram_addr_t offset)
1494{
1495    param->block = block;
1496    param->offset = offset;
1497}
1498
1499static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1500                                           ram_addr_t offset)
1501{
1502    int idx, thread_count, bytes_xmit = -1, pages = -1;
1503    bool wait = migrate_compress_wait_thread();
1504
1505    thread_count = migrate_compress_threads();
1506    qemu_mutex_lock(&comp_done_lock);
1507retry:
1508    for (idx = 0; idx < thread_count; idx++) {
1509        if (comp_param[idx].done) {
1510            comp_param[idx].done = false;
1511            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1512            qemu_mutex_lock(&comp_param[idx].mutex);
1513            set_compress_params(&comp_param[idx], block, offset);
1514            qemu_cond_signal(&comp_param[idx].cond);
1515            qemu_mutex_unlock(&comp_param[idx].mutex);
1516            pages = 1;
1517            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1518            break;
1519        }
1520    }
1521
1522    /*
 1523     * wait for a free thread if the user specifies 'compress-wait-thread',
 1524     * otherwise we will post the page out in the main thread as a normal page.
1525     */
1526    if (pages < 0 && wait) {
1527        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1528        goto retry;
1529    }
1530    qemu_mutex_unlock(&comp_done_lock);
1531
1532    return pages;
1533}
1534
1535/**
1536 * find_dirty_block: find the next dirty page and update any state
1537 * associated with the search process.
1538 *
1539 * Returns true if a page is found
1540 *
1541 * @rs: current RAM state
1542 * @pss: data about the state of the current dirty page scan
1543 * @again: set to false if the search has scanned the whole of RAM
1544 */
1545static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1546{
1547    /*
1548     * This is not a postcopy requested page, mark it "not urgent", and use
1549     * precopy channel to send it.
1550     */
1551    pss->postcopy_requested = false;
1552    pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
1553
1554    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1555    if (pss->complete_round && pss->block == rs->last_seen_block &&
1556        pss->page >= rs->last_page) {
1557        /*
1558         * We've been once around the RAM and haven't found anything.
1559         * Give up.
1560         */
1561        *again = false;
1562        return false;
1563    }
1564    if (!offset_in_ramblock(pss->block,
1565                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1566        /* Didn't find anything in this RAM Block */
1567        pss->page = 0;
1568        pss->block = QLIST_NEXT_RCU(pss->block, next);
1569        if (!pss->block) {
1570            /*
1571             * If memory migration starts over, we will meet a dirtied page
 1572             * which may still exist in the compression threads' ring, so we
1573             * should flush the compressed data to make sure the new page
1574             * is not overwritten by the old one in the destination.
1575             *
 1576             * Also, if xbzrle is on, stop using data compression at this
1577             * point. In theory, xbzrle can do better than compression.
1578             */
1579            flush_compressed_data(rs);
1580
1581            /* Hit the end of the list */
1582            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1583            /* Flag that we've looped */
1584            pss->complete_round = true;
1585            /* After the first round, enable XBZRLE. */
1586            if (migrate_use_xbzrle()) {
1587                rs->xbzrle_enabled = true;
1588            }
1589        }
1590        /* Didn't find anything this time, but try again on the new block */
1591        *again = true;
1592        return false;
1593    } else {
1594        /* Can go around again, but... */
1595        *again = true;
1596        /* We've found something so probably don't need to */
1597        return true;
1598    }
1599}
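
/*
 * Illustrative walk-through of the scan above: the search resumes from
 * (pss->block, pss->page), moves to the next RAMBlock once the current
 * one has no further dirty bits, wraps to the first block when the list
 * is exhausted (setting complete_round), and only gives up when a full
 * round brings it back past last_seen_block/last_page without finding
 * anything dirty.
 */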
1600
1601/**
1602 * unqueue_page: gets a page off the queue
1603 *
1604 * Helper for 'get_queued_page' - gets a page off the queue
1605 *
1606 * Returns the block of the page (or NULL if none available)
1607 *
1608 * @rs: current RAM state
1609 * @offset: used to return the offset within the RAMBlock
1610 */
1611static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1612{
1613    struct RAMSrcPageRequest *entry;
1614    RAMBlock *block = NULL;
1615
1616    if (!postcopy_has_request(rs)) {
1617        return NULL;
1618    }
1619
1620    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1621
1622    /*
1623     * This should _never_ change even after we take the lock, because no one
1624     * should be taking anything off the request list other than us.
1625     */
1626    assert(postcopy_has_request(rs));
1627
1628    entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1629    block = entry->rb;
1630    *offset = entry->offset;
1631
1632    if (entry->len > TARGET_PAGE_SIZE) {
1633        entry->len -= TARGET_PAGE_SIZE;
1634        entry->offset += TARGET_PAGE_SIZE;
1635    } else {
1636        memory_region_unref(block->mr);
1637        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1638        g_free(entry);
1639        migration_consume_urgent_request();
1640    }
1641
1642    return block;
1643}
1644
1645#if defined(__linux__)
1646/**
1647 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1648 *   is found, return RAM block pointer and page offset
1649 *
1650 * Returns pointer to the RAMBlock containing faulting page,
1651 *   NULL if no write faults are pending
1652 *
1653 * @rs: current RAM state
1654 * @offset: page offset from the beginning of the block
1655 */
1656static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1657{
1658    struct uffd_msg uffd_msg;
1659    void *page_address;
1660    RAMBlock *block;
1661    int res;
1662
1663    if (!migrate_background_snapshot()) {
1664        return NULL;
1665    }
1666
1667    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1668    if (res <= 0) {
1669        return NULL;
1670    }
1671
1672    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1673    block = qemu_ram_block_from_host(page_address, false, offset);
1674    assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1675    return block;
1676}
1677
1678/**
1679 * ram_save_release_protection: release UFFD write protection after
1680 *   a range of pages has been saved
1681 *
1682 * @rs: current RAM state
1683 * @pss: page-search-status structure
1684 * @start_page: index of the first page in the range relative to pss->block
1685 *
1686 * Returns 0 on success, negative value in case of an error
1687 */
1688static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1689        unsigned long start_page)
1690{
1691    int res = 0;
1692
1693    /* Check if page is from UFFD-managed region. */
1694    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1695        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1696        uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1697
1698        /* Flush async buffers before un-protect. */
1699        qemu_fflush(rs->f);
1700        /* Un-protect memory range. */
1701        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1702                false, false);
1703    }
1704
1705    return res;
1706}
1707
1708/* ram_write_tracking_available: check if the kernel supports required UFFD features
1709 *
1710 * Returns true if supported, false otherwise
1711 */
1712bool ram_write_tracking_available(void)
1713{
1714    uint64_t uffd_features;
1715    int res;
1716
1717    res = uffd_query_features(&uffd_features);
1718    return (res == 0 &&
1719            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1720}
1721
1722/* ram_write_tracking_compatible: check if guest configuration is
1723 *   compatible with 'write-tracking'
1724 *
1725 * Returns true if compatible, false otherwise
1726 */
1727bool ram_write_tracking_compatible(void)
1728{
1729    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1730    int uffd_fd;
1731    RAMBlock *block;
1732    bool ret = false;
1733
1734    /* Open UFFD file descriptor */
1735    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1736    if (uffd_fd < 0) {
1737        return false;
1738    }
1739
1740    RCU_READ_LOCK_GUARD();
1741
1742    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1743        uint64_t uffd_ioctls;
1744
1745        /* Nothing to do with read-only and MMIO-writable regions */
1746        if (block->mr->readonly || block->mr->rom_device) {
1747            continue;
1748        }
1749        /* Try to register block memory via UFFD-IO to track writes */
1750        if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1751                UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1752            goto out;
1753        }
1754        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1755            goto out;
1756        }
1757    }
1758    ret = true;
1759
1760out:
1761    uffd_close_fd(uffd_fd);
1762    return ret;
1763}
1764
1765static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1766                                       ram_addr_t size)
1767{
1768    /*
1769     * We read one byte of each page; this will preallocate page tables if
1770     * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1771     * where no page was populated yet. This might require adaptation when
1772     * supporting other mappings, like shmem.
1773     */
1774    for (; offset < size; offset += block->page_size) {
1775        char tmp = *((char *)block->host + offset);
1776
1777        /* Don't optimize the read out */
1778        asm volatile("" : "+r" (tmp));
1779    }
1780}
1781
1782static inline int populate_read_section(MemoryRegionSection *section,
1783                                        void *opaque)
1784{
1785    const hwaddr size = int128_get64(section->size);
1786    hwaddr offset = section->offset_within_region;
1787    RAMBlock *block = section->mr->ram_block;
1788
1789    populate_read_range(block, offset, size);
1790    return 0;
1791}
1792
1793/*
1794 * ram_block_populate_read: preallocate page tables and populate pages in the
1795 *   RAM block by reading a byte of each page.
1796 *
1797 * Since it's solely used for the userfault_fd WP feature, here we just
1798 *   hardcode the page size to qemu_real_host_page_size.
1799 *
1800 * @rb: RAM block to populate
1801 */
1802static void ram_block_populate_read(RAMBlock *rb)
1803{
1804    /*
1805     * Skip populating all pages that fall into a discarded range as managed by
1806     * a RamDiscardManager responsible for the mapped memory region of the
1807     * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1808     * must not get populated automatically. We don't have to track
1809     * modifications via userfaultfd WP reliably, because these pages will
1810     * not be part of the migration stream either way -- see
1811     * ramblock_dirty_bitmap_exclude_discarded_pages().
1812     *
1813     * Note: The result is only stable while migrating (precopy/postcopy).
1814     */
1815    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1816        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1817        MemoryRegionSection section = {
1818            .mr = rb->mr,
1819            .offset_within_region = 0,
1820            .size = rb->mr->size,
1821        };
1822
1823        ram_discard_manager_replay_populated(rdm, &section,
1824                                             populate_read_section, NULL);
1825    } else {
1826        populate_read_range(rb, 0, rb->used_length);
1827    }
1828}
1829
1830/*
1831 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1832 */
1833void ram_write_tracking_prepare(void)
1834{
1835    RAMBlock *block;
1836
1837    RCU_READ_LOCK_GUARD();
1838
1839    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1840        /* Nothing to do with read-only and MMIO-writable regions */
1841        if (block->mr->readonly || block->mr->rom_device) {
1842            continue;
1843        }
1844
1845        /*
1846         * Populate pages of the RAM block before enabling userfault_fd
1847         * write protection.
1848         *
1849         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1850         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1851         * pages with pte_none() entries in page table.
1852         */
1853        ram_block_populate_read(block);
1854    }
1855}
1856
1857/*
1858 * ram_write_tracking_start: start UFFD-WP memory tracking
1859 *
1860 * Returns 0 for success or negative value in case of error
1861 */
1862int ram_write_tracking_start(void)
1863{
1864    int uffd_fd;
1865    RAMState *rs = ram_state;
1866    RAMBlock *block;
1867
1868    /* Open UFFD file descriptor */
1869    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1870    if (uffd_fd < 0) {
1871        return uffd_fd;
1872    }
1873    rs->uffdio_fd = uffd_fd;
1874
1875    RCU_READ_LOCK_GUARD();
1876
1877    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1878        /* Nothing to do with read-only and MMIO-writable regions */
1879        if (block->mr->readonly || block->mr->rom_device) {
1880            continue;
1881        }
1882
1883        /* Register block memory with UFFD to track writes */
1884        if (uffd_register_memory(rs->uffdio_fd, block->host,
1885                block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1886            goto fail;
1887        }
1888        /* Apply UFFD write protection to the block memory range */
1889        if (uffd_change_protection(rs->uffdio_fd, block->host,
1890                block->max_length, true, false)) {
1891            goto fail;
1892        }
1893        block->flags |= RAM_UF_WRITEPROTECT;
1894        memory_region_ref(block->mr);
1895
1896        trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1897                block->host, block->max_length);
1898    }
1899
1900    return 0;
1901
1902fail:
1903    error_report("ram_write_tracking_start() failed: restoring initial memory state");
1904
1905    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1906        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1907            continue;
1908        }
1909        /*
1910         * In case some memory block failed to be write-protected,
1911         * remove protection and unregister all RAM blocks that succeeded
1912         */
1913        uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1914                false, false);
1915        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1916        /* Cleanup flags and remove reference */
1917        block->flags &= ~RAM_UF_WRITEPROTECT;
1918        memory_region_unref(block->mr);
1919    }
1920
1921    uffd_close_fd(uffd_fd);
1922    rs->uffdio_fd = -1;
1923    return -1;
1924}
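
/*
 * Rough flow sketch (how these helpers are expected to be used together):
 * ram_write_tracking_available() and ram_write_tracking_compatible() gate
 * the background-snapshot capability, ram_write_tracking_prepare()
 * pre-populates page tables so UFFDIO_WRITEPROTECT doesn't skip pte_none()
 * entries, ram_write_tracking_start() registers every block and arms write
 * protection, and ram_write_tracking_stop() tears it all down again.
 */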
1925
1926/**
1927 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1928 */
1929void ram_write_tracking_stop(void)
1930{
1931    RAMState *rs = ram_state;
1932    RAMBlock *block;
1933
1934    RCU_READ_LOCK_GUARD();
1935
1936    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1937        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1938            continue;
1939        }
1940        /* Remove protection and unregister all affected RAM blocks */
1941        uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1942                false, false);
1943        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1944
1945        trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1946                block->host, block->max_length);
1947
1948        /* Cleanup flags and remove reference */
1949        block->flags &= ~RAM_UF_WRITEPROTECT;
1950        memory_region_unref(block->mr);
1951    }
1952
1953    /* Finally close UFFD file descriptor */
1954    uffd_close_fd(rs->uffdio_fd);
1955    rs->uffdio_fd = -1;
1956}
1957
1958#else
1959/* No target OS support, stubs just fail or ignore */
1960
1961static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1962{
1963    (void) rs;
1964    (void) offset;
1965
1966    return NULL;
1967}
1968
1969static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1970        unsigned long start_page)
1971{
1972    (void) rs;
1973    (void) pss;
1974    (void) start_page;
1975
1976    return 0;
1977}
1978
1979bool ram_write_tracking_available(void)
1980{
1981    return false;
1982}
1983
1984bool ram_write_tracking_compatible(void)
1985{
1986    assert(0);
1987    return false;
1988}
1989
1990int ram_write_tracking_start(void)
1991{
1992    assert(0);
1993    return -1;
1994}
1995
1996void ram_write_tracking_stop(void)
1997{
1998    assert(0);
1999}
2000#endif /* defined(__linux__) */
2001
2002/*
2003 * Check whether two addr/offset of the ramblock fall onto the same host huge
2004 * page.  Returns true if so, false otherwise.
2005 */
2006static bool offset_on_same_huge_page(RAMBlock *rb, uint64_t addr1,
2007                                     uint64_t addr2)
2008{
2009    size_t page_size = qemu_ram_pagesize(rb);
2010
2011    addr1 = ROUND_DOWN(addr1, page_size);
2012    addr2 = ROUND_DOWN(addr2, page_size);
2013
2014    return addr1 == addr2;
2015}
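
/*
 * Worked example (made-up addresses, assuming a 2MiB huge page backing):
 * addr1 = 0x200123 and addr2 = 0x3ff000 both round down to 0x200000, so
 * they share a host huge page and the function returns true; with
 * addr2 = 0x400000 it would return false.
 */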
2016
2017/*
2018 * Check whether a previously preempted precopy huge page contains the
2019 * currently requested page.  Returns true if so, false otherwise.
2020 *
2021 * This should happen very rarely, because it means that while we were sending
2022 * during background migration for postcopy we were sending exactly the page
2023 * that some vcpu got faulted on on the dest node.  When it happens, we probably
2024 * don't need to do much but drop the request, because we know that right after
2025 * we restore the precopy stream it'll be serviced.  It'll slightly affect the
2026 * order in which postcopy requests are serviced (e.g. it'll be the same as if
2027 * we moved the current request to the end of the queue), but it shouldn't be a
2028 * big deal.  The most important thing is that we can _never_ try to send a
2029 * partially-sent huge page on the POSTCOPY channel again, otherwise that huge
2030 * page will get "split brain" on two channels (PRECOPY, POSTCOPY).
2031 */
2032static bool postcopy_preempted_contains(RAMState *rs, RAMBlock *block,
2033                                        ram_addr_t offset)
2034{
2035    PostcopyPreemptState *state = &rs->postcopy_preempt_state;
2036
2037    /* No preemption at all? */
2038    if (!state->preempted) {
2039        return false;
2040    }
2041
2042    /* Not even the same ramblock? */
2043    if (state->ram_block != block) {
2044        return false;
2045    }
2046
2047    return offset_on_same_huge_page(block, offset,
2048                                    state->ram_page << TARGET_PAGE_BITS);
2049}
2050
2051/**
2052 * get_queued_page: unqueue a page from the postcopy requests
2053 *
2054 * Skips pages that are already sent (!dirty)
2055 *
2056 * Returns true if a queued page is found
2057 *
2058 * @rs: current RAM state
2059 * @pss: data about the state of the current dirty page scan
2060 */
2061static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2062{
2063    RAMBlock  *block;
2064    ram_addr_t offset;
2065    bool dirty;
2066
2067    do {
2068        block = unqueue_page(rs, &offset);
2069        /*
2070         * We're sending this page, and since it's postcopy nothing else
2071         * will dirty it, and we must make sure it doesn't get sent again
2072         * even if this queue request was received after the background
2073         * search already sent it.
2074         */
2075        if (block) {
2076            unsigned long page;
2077
2078            page = offset >> TARGET_PAGE_BITS;
2079            dirty = test_bit(page, block->bmap);
2080            if (!dirty) {
2081                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2082                                                page);
2083            } else {
2084                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2085            }
2086        }
2087
2088    } while (block && !dirty);
2089
2090    if (block) {
2091        /* See comment above postcopy_preempted_contains() */
2092        if (postcopy_preempted_contains(rs, block, offset)) {
2093            trace_postcopy_preempt_hit(block->idstr, offset);
2094            /*
2095             * If what we preempted previously was exactly what we're
2096             * requesting right now, restore the preempted precopy
2097             * immediately, boosting its priority as it's requested by
2098             * postcopy.
2099             */
2100            postcopy_preempt_restore(rs, pss, true);
2101            return true;
2102        }
2103    } else {
2104        /*
2105         * Poll write faults too if background snapshot is enabled; that's
2106         * when vcpus get blocked by the write-protected pages.
2107         */
2108        block = poll_fault_page(rs, &offset);
2109    }
2110
2111    if (block) {
2112        /*
2113         * We want the background search to continue from the queued page
2114         * since the guest is likely to want other pages near to the page
2115         * it just requested.
2116         */
2117        pss->block = block;
2118        pss->page = offset >> TARGET_PAGE_BITS;
2119
2120        /*
2121         * This unqueued page would break the "one round" check, even if it
2122         * is really rare.
2123         */
2124        pss->complete_round = false;
2125        /* Mark it an urgent request, meanwhile using POSTCOPY channel */
2126        pss->postcopy_requested = true;
2127        pss->postcopy_target_channel = RAM_CHANNEL_POSTCOPY;
2128    }
2129
2130    return !!block;
2131}
2132
2133/**
2134 * migration_page_queue_free: drop any remaining pages in the ram
2135 * request queue
2136 *
2137 * It should be empty at the end anyway, but in error cases there may
2138 * be some left.  in case that there is any page left, we drop it.
2139 *
2140 */
2141static void migration_page_queue_free(RAMState *rs)
2142{
2143    struct RAMSrcPageRequest *mspr, *next_mspr;
2144    /* This queue should generally be empty - but in the case of a failed
2145     * migration it might have some leftover entries.
2146     */
2147    RCU_READ_LOCK_GUARD();
2148    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2149        memory_region_unref(mspr->rb->mr);
2150        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2151        g_free(mspr);
2152    }
2153}
2154
2155/**
2156 * ram_save_queue_pages: queue the page for transmission
2157 *
2158 * A request from postcopy destination for example.
2159 *
2160 * Returns zero on success or negative on error
2161 *
2162 * @rbname: Name of the RAMBlock of the request. NULL means the
2163 *          same as the last one.
2164 * @start: starting address from the start of the RAMBlock
2165 * @len: length (in bytes) to send
2166 */
2167int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2168{
2169    RAMBlock *ramblock;
2170    RAMState *rs = ram_state;
2171
2172    ram_counters.postcopy_requests++;
2173    RCU_READ_LOCK_GUARD();
2174
2175    if (!rbname) {
2176        /* Reuse last RAMBlock */
2177        ramblock = rs->last_req_rb;
2178
2179        if (!ramblock) {
2180            /*
2181             * Shouldn't happen, we can't reuse the last RAMBlock if
2182             * it's the 1st request.
2183             */
2184            error_report("ram_save_queue_pages no previous block");
2185            return -1;
2186        }
2187    } else {
2188        ramblock = qemu_ram_block_by_name(rbname);
2189
2190        if (!ramblock) {
2191            /* We shouldn't be asked for a non-existent RAMBlock */
2192            error_report("ram_save_queue_pages no block '%s'", rbname);
2193            return -1;
2194        }
2195        rs->last_req_rb = ramblock;
2196    }
2197    trace_ram_save_queue_pages(ramblock->idstr, start, len);
2198    if (!offset_in_ramblock(ramblock, start + len - 1)) {
2199        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2200                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2201                     __func__, start, len, ramblock->used_length);
2202        return -1;
2203    }
2204
2205    struct RAMSrcPageRequest *new_entry =
2206        g_new0(struct RAMSrcPageRequest, 1);
2207    new_entry->rb = ramblock;
2208    new_entry->offset = start;
2209    new_entry->len = len;
2210
2211    memory_region_ref(ramblock->mr);
2212    qemu_mutex_lock(&rs->src_page_req_mutex);
2213    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2214    migration_make_urgent_request();
2215    qemu_mutex_unlock(&rs->src_page_req_mutex);
2216
2217    return 0;
2218}
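
/*
 * Usage sketch (hypothetical values): a postcopy fault on one target page
 * of the "pc.ram" block at offset 0x200000 would reach this function
 * roughly as
 *
 *     ram_save_queue_pages("pc.ram", 0x200000, TARGET_PAGE_SIZE);
 *
 * queueing the range on rs->src_page_requests and flagging an urgent
 * request for the migration thread.
 */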
2219
2220static bool save_page_use_compression(RAMState *rs)
2221{
2222    if (!migrate_use_compression()) {
2223        return false;
2224    }
2225
2226    /*
2227     * If xbzrle is enabled (e.g., after the first round of migration), stop
2228     * using the data compression. In theory, xbzrle can do better than
2229     * compression.
2230     */
2231    if (rs->xbzrle_enabled) {
2232        return false;
2233    }
2234
2235    return true;
2236}
2237
2238/*
2239 * Try to compress the page before posting it out; return true if the page
2240 * has been properly handled by compression, otherwise it needs other
2241 * paths to handle it.
2242 */
2243static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2244{
2245    if (!save_page_use_compression(rs)) {
2246        return false;
2247    }
2248
2249    /*
2250     * When starting the process of a new block, the first page of
2251     * the block should be sent out before other pages in the same
2252     * block, and all the pages in the last block should have been sent
2253     * out.  Keeping this order is important, because the 'cont' flag
2254     * is used to avoid resending the block name.
2255     *
2256     * We post the first page as a normal page since compression will
2257     * take much CPU resource.
2258     */
2259    if (block != rs->last_sent_block) {
2260        flush_compressed_data(rs);
2261        return false;
2262    }
2263
2264    if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2265        return true;
2266    }
2267
2268    compression_counters.busy++;
2269    return false;
2270}
2271
2272/**
2273 * ram_save_target_page: save one target page
2274 *
2275 * Returns the number of pages written
2276 *
2277 * @rs: current RAM state
2278 * @pss: data about the page we want to send
2279 */
2280static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2281{
2282    RAMBlock *block = pss->block;
2283    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2284    int res;
2285
2286    if (control_save_page(rs, block, offset, &res)) {
2287        return res;
2288    }
2289
2290    if (save_compress_page(rs, block, offset)) {
2291        return 1;
2292    }
2293
2294    res = save_zero_page(rs, block, offset);
2295    if (res > 0) {
2296        /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2297         * page would be stale
2298         */
2299        if (!save_page_use_compression(rs)) {
2300            XBZRLE_cache_lock();
2301            xbzrle_cache_zero_page(rs, block->offset + offset);
2302            XBZRLE_cache_unlock();
2303        }
2304        return res;
2305    }
2306
2307    /*
2308     * Do not use multifd in postcopy as one whole host page should be
2309     * placed.  Meanwhile postcopy requires atomic update of pages, so even
2310     * if host page size == guest page size, the running dest guest may
2311     * still see partially copied pages, which is data corruption.
2312     */
2313    if (migrate_use_multifd() && !migration_in_postcopy()) {
2314        return ram_save_multifd_page(rs, block, offset);
2315    }
2316
2317    return ram_save_page(rs, pss);
2318}
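
/*
 * For clarity, the send paths above are tried in priority order:
 * control_save_page() (e.g. RDMA), save_compress_page(), save_zero_page(),
 * ram_save_multifd_page() when multifd is active outside postcopy, and
 * finally ram_save_page() for the plain/xbzrle case.
 */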
2319
2320static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss)
2321{
2322    MigrationState *ms = migrate_get_current();
2323
2324    /* If eager preempt is not enabled, never do that. */
2325    if (!migrate_postcopy_preempt()) {
2326        return false;
2327    }
2328
2329    /* If the user explicitly disabled breaking of huge pages, skip */
2330    if (!ms->postcopy_preempt_break_huge) {
2331        return false;
2332    }
2333
2334    /* If the ramblock we're sending uses small pages, never bother. */
2335    if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) {
2336        return false;
2337    }
2338
2339    /* Not in postcopy at all? */
2340    if (!migration_in_postcopy()) {
2341        return false;
2342    }
2343
2344    /*
2345     * If we're already handling a postcopy request, don't preempt as this page
2346     * already has the same high priority.
2347     */
2348    if (pss->postcopy_requested) {
2349        return false;
2350    }
2351
2352    /* If there are postcopy requests, then check them. */
2353    return postcopy_has_request(rs);
2354}
2355
2356/* Preempt precopy: cache the current PSS so that it can be restored later */
2357static void postcopy_do_preempt(RAMState *rs, PageSearchStatus *pss)
2358{
2359    PostcopyPreemptState *p_state = &rs->postcopy_preempt_state;
2360
2361    trace_postcopy_preempt_triggered(pss->block->idstr, pss->page);
2362
2363    /*
2364     * Time to preempt precopy. Cache the current PSS into the preempt state
2365     * so that after handling the postcopy pages we can recover to it.  We do
2366     * so because the dest VM will have part of the precopy huge page kept
2367     * over in its tmp huge page caches; better to move on with it when we can.
2368     */
2369    p_state->ram_block = pss->block;
2370    p_state->ram_page = pss->page;
2371    p_state->preempted = true;
2372}
2373
2374/* Whether we're preempted by a postcopy request during sending a huge page */
2375static bool postcopy_preempt_triggered(RAMState *rs)
2376{
2377    return rs->postcopy_preempt_state.preempted;
2378}
2379
2380static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
2381                                     bool postcopy_requested)
2382{
2383    PostcopyPreemptState *state = &rs->postcopy_preempt_state;
2384
2385    assert(state->preempted);
2386
2387    pss->block = state->ram_block;
2388    pss->page = state->ram_page;
2389
2390    /* Record whether this is a postcopy request */
2391    pss->postcopy_requested = postcopy_requested;
2392    /*
2393     * When restoring a preempted page, the old data resides in PRECOPY
2394     * slow channel, even if postcopy_requested is set.  So always use
2395     * PRECOPY channel here.
2396     */
2397    pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
2398
2399    trace_postcopy_preempt_restored(pss->block->idstr, pss->page);
2400
2401    /* Reset preempt state, most importantly, set preempted==false */
2402    postcopy_preempt_reset(rs);
2403}
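
/*
 * Illustrative sequence of the preempt dance: while a huge page is being
 * sent, postcopy_needs_preempt() may interrupt the send,
 * postcopy_do_preempt() caches the interrupted (block, page) in
 * rs->postcopy_preempt_state, the urgent page goes out on the postcopy
 * channel, and postcopy_preempt_restore() later resumes the half-sent
 * huge page on the precopy channel.
 */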
2404
2405static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss)
2406{
2407    MigrationState *s = migrate_get_current();
2408    unsigned int channel = pss->postcopy_target_channel;
2409    QEMUFile *next;
2410
2411    if (channel != rs->postcopy_channel) {
2412        if (channel == RAM_CHANNEL_PRECOPY) {
2413            next = s->to_dst_file;
2414        } else {
2415            next = s->postcopy_qemufile_src;
2416        }
2417        /* Update and cache the current channel */
2418        rs->f = next;
2419        rs->postcopy_channel = channel;
2420
2421        /*
2422         * If channel switched, reset last_sent_block since the old sent block
2423         * may not be on the same channel.
2424         */
2425        rs->last_sent_block = NULL;
2426
2427        trace_postcopy_preempt_switch_channel(channel);
2428    }
2429
2430    trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2431}
2432
2433/* We need to make sure rs->f always points to the default channel elsewhere */
2434static void postcopy_preempt_reset_channel(RAMState *rs)
2435{
2436    if (migrate_postcopy_preempt() && migration_in_postcopy()) {
2437        rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
2438        rs->f = migrate_get_current()->to_dst_file;
2439        trace_postcopy_preempt_reset_channel();
2440    }
2441}
2442
2443/**
2444 * ram_save_host_page: save a whole host page
2445 *
2446 * Starting at *offset send pages up to the end of the current host
2447 * page. It's valid for the initial offset to point into the middle of
2448 * a host page in which case the remainder of the hostpage is sent.
2449 * Only dirty target pages are sent. Note that the host page size may
2450 * be a huge page for this block.
2451 * The saving stops at the boundary of the used_length of the block
2452 * if the RAMBlock isn't a multiple of the host page size.
2453 *
2454 * Returns the number of pages written or negative on error
2455 *
2456 * @rs: current RAM state
2457 * @pss: data about the page we want to send
2458 */
2459static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2460{
2461    int tmppages, pages = 0;
2462    size_t pagesize_bits =
2463        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2464    unsigned long hostpage_boundary =
2465        QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2466    unsigned long start_page = pss->page;
2467    int res;
2468
2469    if (ramblock_is_ignored(pss->block)) {
2470        error_report("block %s should not be migrated !", pss->block->idstr);
2471        return 0;
2472    }
2473
2474    if (migrate_postcopy_preempt() && migration_in_postcopy()) {
2475        postcopy_preempt_choose_channel(rs, pss);
2476    }
2477
2478    do {
2479        if (postcopy_needs_preempt(rs, pss)) {
2480            postcopy_do_preempt(rs, pss);
2481            break;
2482        }
2483
2484        /* Check if the page is dirty and, if so, send it */
2485        if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2486            tmppages = ram_save_target_page(rs, pss);
2487            if (tmppages < 0) {
2488                return tmppages;
2489            }
2490
2491            pages += tmppages;
2492            /*
2493             * Allow rate limiting to happen in the middle of huge pages if
2494             * something is sent in the current iteration.
2495             */
2496            if (pagesize_bits > 1 && tmppages > 0) {
2497                migration_rate_limit();
2498            }
2499        }
2500        pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2501    } while ((pss->page < hostpage_boundary) &&
2502             offset_in_ramblock(pss->block,
2503                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2504    /* The offset we leave with is the min boundary of host page and block */
2505    pss->page = MIN(pss->page, hostpage_boundary);
2506
2507    /*
2508     * With postcopy preempt mode, flush the data as soon as possible for
2509     * postcopy requests, because we've already sent a whole huge page, so the
2510     * dst node should already have enough resources to atomically fill in
2511     * the current missing page.
2512     *
2513     * More importantly, when using the separate postcopy channel, we must do
2514     * an explicit flush or it won't flush until the buffer is full.
2515     */
2516    if (migrate_postcopy_preempt() && pss->postcopy_requested) {
2517        qemu_fflush(rs->f);
2518    }
2519
2520    res = ram_save_release_protection(rs, pss, start_page);
2521    return (res < 0 ? res : pages);
2522}
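
/*
 * Worked example (assuming 2MiB host huge pages and 4KiB target pages):
 * pagesize_bits = 2MiB / 4KiB = 512, so starting from pss->page = 700 the
 * boundary is QEMU_ALIGN_UP(701, 512) = 1024 and the loop above sends the
 * dirty target pages in the range [700, 1024).
 */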
2523
2524/**
2525 * ram_find_and_save_block: finds a dirty page and sends it to f
2526 *
2527 * Called within an RCU critical section.
2528 *
2529 * Returns the number of pages written where zero means no dirty pages,
2530 * or negative on error
2531 *
2532 * @rs: current RAM state
2533 *
2534 * On systems where host-page-size > target-page-size it will send all the
2535 * pages in a host page that are dirty.
2536 */
2537static int ram_find_and_save_block(RAMState *rs)
2538{
2539    PageSearchStatus pss;
2540    int pages = 0;
2541    bool again, found;
2542
2543    /* No dirty page as there is zero RAM */
2544    if (!ram_bytes_total()) {
2545        return pages;
2546    }
2547
2548    /*
2549     * Always keep last_seen_block/last_page valid during this procedure,
2550     * because find_dirty_block() relies on these values (e.g., we compare
2551     * last_seen_block with pss.block to see whether we searched all the
2552     * ramblocks) to detect the completion of migration.  Having a NULL value
2553     * of last_seen_block can conditionally cause the below loop to run forever.
2554     */
2555    if (!rs->last_seen_block) {
2556        rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2557        rs->last_page = 0;
2558    }
2559
2560    pss.block = rs->last_seen_block;
2561    pss.page = rs->last_page;
2562    pss.complete_round = false;
2563
2564    do {
2565        again = true;
2566        found = get_queued_page(rs, &pss);
2567
2568        if (!found) {
2569            /*
2570             * Recover previous precopy ramblock/offset if postcopy has
2571             * preempted precopy.  Otherwise find the next dirty bit.
2572             */
2573            if (postcopy_preempt_triggered(rs)) {
2574                postcopy_preempt_restore(rs, &pss, false);
2575                found = true;
2576            } else {
2577                /* priority queue empty, so just search for something dirty */
2578                found = find_dirty_block(rs, &pss, &again);
2579            }
2580        }
2581
2582        if (found) {
2583            pages = ram_save_host_page(rs, &pss);
2584        }
2585    } while (!pages && again);
2586
2587    rs->last_seen_block = pss.block;
2588    rs->last_page = pss.page;
2589
2590    return pages;
2591}
2592
2593void acct_update_position(QEMUFile *f, size_t size, bool zero)
2594{
2595    uint64_t pages = size / TARGET_PAGE_SIZE;
2596
2597    if (zero) {
2598        ram_counters.duplicate += pages;
2599    } else {
2600        ram_counters.normal += pages;
2601        ram_transferred_add(size);
2602        qemu_file_credit_transfer(f, size);
2603    }
2604}
2605
2606static uint64_t ram_bytes_total_common(bool count_ignored)
2607{
2608    RAMBlock *block;
2609    uint64_t total = 0;
2610
2611    RCU_READ_LOCK_GUARD();
2612
2613    if (count_ignored) {
2614        RAMBLOCK_FOREACH_MIGRATABLE(block) {
2615            total += block->used_length;
2616        }
2617    } else {
2618        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2619            total += block->used_length;
2620        }
2621    }
2622    return total;
2623}
2624
2625uint64_t ram_bytes_total(void)
2626{
2627    return ram_bytes_total_common(false);
2628}
2629
2630static void xbzrle_load_setup(void)
2631{
2632    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2633}
2634
2635static void xbzrle_load_cleanup(void)
2636{
2637    g_free(XBZRLE.decoded_buf);
2638    XBZRLE.decoded_buf = NULL;
2639}
2640
2641static void ram_state_cleanup(RAMState **rsp)
2642{
2643    if (*rsp) {
2644        migration_page_queue_free(*rsp);
2645        qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2646        qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2647        g_free(*rsp);
2648        *rsp = NULL;
2649    }
2650}
2651
2652static void xbzrle_cleanup(void)
2653{
2654    XBZRLE_cache_lock();
2655    if (XBZRLE.cache) {
2656        cache_fini(XBZRLE.cache);
2657        g_free(XBZRLE.encoded_buf);
2658        g_free(XBZRLE.current_buf);
2659        g_free(XBZRLE.zero_target_page);
2660        XBZRLE.cache = NULL;
2661        XBZRLE.encoded_buf = NULL;
2662        XBZRLE.current_buf = NULL;
2663        XBZRLE.zero_target_page = NULL;
2664    }
2665    XBZRLE_cache_unlock();
2666}
2667
2668static void ram_save_cleanup(void *opaque)
2669{
2670    RAMState **rsp = opaque;
2671    RAMBlock *block;
2672
2673    /* We don't use dirty log with background snapshots */
2674    if (!migrate_background_snapshot()) {
2675        /* The caller must hold the iothread lock or be in a BH, so there is
2676         * no write race against the migration bitmap
2677         */
2678        if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2679            /*
2680             * do not stop the dirty log without starting it, since
2681             * memory_global_dirty_log_stop will assert that
2682             * memory_global_dirty_log_start/stop are used in pairs
2683             */
2684            memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2685        }
2686    }
2687
2688    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2689        g_free(block->clear_bmap);
2690        block->clear_bmap = NULL;
2691        g_free(block->bmap);
2692        block->bmap = NULL;
2693    }
2694
2695    xbzrle_cleanup();
2696    compress_threads_save_cleanup();
2697    ram_state_cleanup(rsp);
2698}
2699
2700static void ram_state_reset(RAMState *rs)
2701{
2702    rs->last_seen_block = NULL;
2703    rs->last_sent_block = NULL;
2704    rs->last_page = 0;
2705    rs->last_version = ram_list.version;
2706    rs->xbzrle_enabled = false;
2707    postcopy_preempt_reset(rs);
2708    rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
2709}
2710
2711#define MAX_WAIT 50 /* ms, half buffered_file limit */
2712
2713/* **** functions for postcopy ***** */
2714
2715void ram_postcopy_migrated_memory_release(MigrationState *ms)
2716{
2717    struct RAMBlock *block;
2718
2719    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2720        unsigned long *bitmap = block->bmap;
2721        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2722        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2723
2724        while (run_start < range) {
2725            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2726            ram_discard_range(block->idstr,
2727                              ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2728                              ((ram_addr_t)(run_end - run_start))
2729                                << TARGET_PAGE_BITS);
2730            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2731        }
2732    }
2733}
2734
2735/**
2736 * postcopy_send_discard_bm_ram: discard a RAMBlock
2737 *
2738 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2739 *
2740 * @ms: current migration state
2741 * @block: RAMBlock to discard
2742 */
2743static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2744{
2745    unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2746    unsigned long current;
2747    unsigned long *bitmap = block->bmap;
2748
2749    for (current = 0; current < end; ) {
2750        unsigned long one = find_next_bit(bitmap, end, current);
2751        unsigned long zero, discard_length;
2752
2753        if (one >= end) {
2754            break;
2755        }
2756
2757        zero = find_next_zero_bit(bitmap, end, one + 1);
2758
2759        if (zero >= end) {
2760            discard_length = end - one;
2761        } else {
2762            discard_length = zero - one;
2763        }
2764        postcopy_discard_send_range(ms, one, discard_length);
2765        current = one + discard_length;
2766    }
2767}
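
/*
 * Worked example: for a dirty bitmap beginning 0,0,1,1,1,0,1,... the first
 * run has one = 2 and zero = 5, so a discard of length 3 is sent for pages
 * [2, 5) and scanning continues from page 5.
 */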
2768
2769static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2770
2771/**
2772 * postcopy_each_ram_send_discard: discard all RAMBlocks
2773 *
2774 * Utility for the outgoing postcopy code.
2775 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2776 *   passing it bitmap indexes and name.
2777 * (qemu_ram_foreach_block ends up passing unscaled lengths
2778 *  which would mean postcopy code would have to deal with target page)
2779 *
2780 * @ms: current migration state
2781 */
2782static void postcopy_each_ram_send_discard(MigrationState *ms)
2783{
2784    struct RAMBlock *block;
2785
2786    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2787        postcopy_discard_send_init(ms, block->idstr);
2788
2789        /*
2790         * Deal with TPS != HPS and huge pages.  It discards any partially sent
2791         * host-page size chunks and marks any partially dirty host-page size
2792         * chunks as all dirty.  In this case the host-page is the host-page
2793         * for the particular RAMBlock, i.e. it might be a huge page.
2794         */
2795        postcopy_chunk_hostpages_pass(ms, block);
2796
2797        /*
2798         * Postcopy sends chunks of bitmap over the wire, but it
2799         * just needs indexes at this point, which avoids it having
2800         * target-page-specific code.
2801         */
2802        postcopy_send_discard_bm_ram(ms, block);
2803        postcopy_discard_send_finish(ms);
2804    }
2805}
2806
2807/**
2808 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2809 *
2810 * Helper for postcopy_each_ram_send_discard; it is called for each
2811 * RAMBlock to canonicalize its dirty bitmap at host-page granularity
2812 * before the discard ranges are sent.
2813 *
2814 * Postcopy requires that all target pages in a hostpage are dirty or
2815 * clean, not a mix.  This function canonicalizes the bitmaps.
2816 *
2817 * @ms: current migration state
2818 * @block: block that contains the page we want to canonicalize
2819 */
2820static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2821{
2822    RAMState *rs = ram_state;
2823    unsigned long *bitmap = block->bmap;
2824    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2825    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2826    unsigned long run_start;
2827
2828    if (block->page_size == TARGET_PAGE_SIZE) {
2829        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2830        return;
2831    }
2832
2833    /* Find a dirty page */
2834    run_start = find_next_bit(bitmap, pages, 0);
2835
2836    while (run_start < pages) {
2837
2838        /*
2839         * If the start of this run of pages is in the middle of a host
2840         * page, then we need to fixup this host page.
2841         */
2842        if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2843            /* Find the end of this run */
2844            run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2845            /*
2846             * If the end isn't at the start of a host page, then the
2847             * run doesn't finish at the end of a host page
2848             * and we need to discard.
2849             */
2850        }
2851
2852        if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2853            unsigned long page;
2854            unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2855                                                             host_ratio);
2856            run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2857
2858            /* Clean up the bitmap */
2859            for (page = fixup_start_addr;
2860                 page < fixup_start_addr + host_ratio; page++) {
2861                /*
2862                 * Remark them as dirty, updating the count for any pages
2863                 * that weren't previously dirty.
2864                 */
2865                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2866            }
2867        }
2868
2869        /* Find the next dirty page for the next iteration */
2870        run_start = find_next_bit(bitmap, pages, run_start);
2871    }
2872}
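
/*
 * Worked example (host_ratio = 512, i.e. 2MiB host pages with 4KiB target
 * pages): a dirty run starting at target page 700 is not host-page
 * aligned, so pages [512, 1024) are all re-marked dirty, bumping
 * migration_dirty_pages for every bit that was previously clear.
 */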
2873
2874/**
2875 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2876 *
2877 * Transmit the set of pages to be discarded after precopy to the target;
2878 * these are pages that:
2879 *     a) Have been previously transmitted but are now dirty again
2880 *     b) Have never been transmitted; this ensures that any pages on the
2881 *        destination that have been mapped by background tasks get
2882 *        discarded (transparent huge pages are the specific concern)
2883 * Hopefully this is pretty sparse.
2884 *
2885 * @ms: current migration state
2886 */
2887void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2888{
2889    RAMState *rs = ram_state;
2890
2891    RCU_READ_LOCK_GUARD();
2892
2893    /* This should be our last sync, the src is now paused */
2894    migration_bitmap_sync(rs);
2895
2896    /* Easiest way to make sure we don't resume in the middle of a host-page */
2897    rs->last_seen_block = NULL;
2898    rs->last_sent_block = NULL;
2899    rs->last_page = 0;
2900
2901    postcopy_each_ram_send_discard(ms);
2902
2903    trace_ram_postcopy_send_discard_bitmap();
2904}
2905
2906/**
2907 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2908 *
2909 * Returns zero on success
2910 *
2911 * @rbname: name of the RAMBlock of the request. NULL means the
2912 *          same as the last one.
2913 * @start: starting address (bytes) within the RAMBlock
2914 * @length: length (bytes) of the range to discard
2915 */
2916int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2917{
2918    trace_ram_discard_range(rbname, start, length);
2919
2920    RCU_READ_LOCK_GUARD();
2921    RAMBlock *rb = qemu_ram_block_by_name(rbname);
2922
2923    if (!rb) {
2924        error_report("ram_discard_range: Failed to find block '%s'", rbname);
2925        return -1;
2926    }
2927
2928    /*
2929     * On source VM, we don't need to update the received bitmap since
2930     * we don't even have one.
2931     */
2932    if (rb->receivedmap) {
2933        bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2934                     length >> qemu_target_page_bits());
2935    }
2936
2937    return ram_block_discard_range(rb, start, length);
2938}
2939
2940/*
2941 * For every allocation, we will try not to crash the VM if the
2942 * allocation fails.
2943 */
2944static int xbzrle_init(void)
2945{
2946    Error *local_err = NULL;
2947
2948    if (!migrate_use_xbzrle()) {
2949        return 0;
2950    }
2951
2952    XBZRLE_cache_lock();
2953
2954    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2955    if (!XBZRLE.zero_target_page) {
2956        error_report("%s: Error allocating zero page", __func__);
2957        goto err_out;
2958    }
2959
2960    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2961                              TARGET_PAGE_SIZE, &local_err);
2962    if (!XBZRLE.cache) {
2963        error_report_err(local_err);
2964        goto free_zero_page;
2965    }
2966
2967    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2968    if (!XBZRLE.encoded_buf) {
2969        error_report("%s: Error allocating encoded_buf", __func__);
2970        goto free_cache;
2971    }
2972
2973    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2974    if (!XBZRLE.current_buf) {
2975        error_report("%s: Error allocating current_buf", __func__);
2976        goto free_encoded_buf;
2977    }
2978
2979    /* We are all good */
2980    XBZRLE_cache_unlock();
2981    return 0;
2982
2983free_encoded_buf:
2984    g_free(XBZRLE.encoded_buf);
2985    XBZRLE.encoded_buf = NULL;
2986free_cache:
2987    cache_fini(XBZRLE.cache);
2988    XBZRLE.cache = NULL;
2989free_zero_page:
2990    g_free(XBZRLE.zero_target_page);
2991    XBZRLE.zero_target_page = NULL;
2992err_out:
2993    XBZRLE_cache_unlock();
2994    return -ENOMEM;
2995}
2996
2997static int ram_state_init(RAMState **rsp)
2998{
2999    *rsp = g_try_new0(RAMState, 1);
3000
3001    if (!*rsp) {
3002        error_report("%s: Init ramstate fail", __func__);
3003        return -1;
3004    }
3005
3006    qemu_mutex_init(&(*rsp)->bitmap_mutex);
3007    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3008    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3009
3010    /*
3011     * Count the total number of pages used by ram blocks not including any
3012     * gaps due to alignment or unplugs.
3013     * This must match the initial values of the dirty bitmap.
3014     */
3015    (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3016    ram_state_reset(*rsp);
3017
3018    return 0;
3019}
3020
3021static void ram_list_init_bitmaps(void)
3022{
3023    MigrationState *ms = migrate_get_current();
3024    RAMBlock *block;
3025    unsigned long pages;
3026    uint8_t shift;
3027
3028    /* Skip setting bitmap if there is no RAM */
3029    if (ram_bytes_total()) {
3030        shift = ms->clear_bitmap_shift;
3031        if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3032            error_report("clear_bitmap_shift (%u) too big, using "
3033                         "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3034            shift = CLEAR_BITMAP_SHIFT_MAX;
3035        } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3036            error_report("clear_bitmap_shift (%u) too small, using "
3037                         "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3038            shift = CLEAR_BITMAP_SHIFT_MIN;
3039        }
3040
3041        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3042            pages = block->max_length >> TARGET_PAGE_BITS;
3043            /*
3044             * The initial dirty bitmap for migration must be set with all
3045             * ones to make sure we'll migrate every guest RAM page to the
3046             * destination.
3047             * Here we set RAMBlock.bmap all to 1 because when restarting a
3048             * new migration after a failed one, ram_list.
3049             * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
3050             * guest memory.
3051             */
3052            block->bmap = bitmap_new(pages);
3053            bitmap_set(block->bmap, 0, pages);
3054            block->clear_bmap_shift = shift;
3055            block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
3056        }
3057    }
3058}
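
/*
 * Sizing example (assuming 4KiB target pages): with a clear_bmap_shift of
 * 18, each clear_bmap bit covers 2^18 target pages, i.e. 1GiB of guest
 * memory, so the clear bitmap stays small even for very large RAM blocks.
 */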
3059
3060static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3061{
3062    unsigned long pages;
3063    RAMBlock *rb;
3064
3065    RCU_READ_LOCK_GUARD();
3066
3067    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3068            pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3069            rs->migration_dirty_pages -= pages;
3070    }
3071}
3072
3073static void ram_init_bitmaps(RAMState *rs)
3074{
3075    /* For memory_global_dirty_log_start below.  */
3076    qemu_mutex_lock_iothread();
3077    qemu_mutex_lock_ramlist();
3078
3079    WITH_RCU_READ_LOCK_GUARD() {
3080        ram_list_init_bitmaps();
3081        /* We don't use dirty log with background snapshots */
3082        if (!migrate_background_snapshot()) {
3083            memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3084            migration_bitmap_sync_precopy(rs);
3085        }
3086    }
3087    qemu_mutex_unlock_ramlist();
3088    qemu_mutex_unlock_iothread();
3089
3090    /*
3091     * After an eventual first bitmap sync, fixup the initial bitmap
3092     * containing all 1s to exclude any discarded pages from migration.
3093     */
3094    migration_bitmap_clear_discarded_pages(rs);
3095}
3096
3097static int ram_init_all(RAMState **rsp)
3098{
3099    if (ram_state_init(rsp)) {
3100        return -1;
3101    }
3102
3103    if (xbzrle_init()) {
3104        ram_state_cleanup(rsp);
3105        return -1;
3106    }
3107
3108    ram_init_bitmaps(*rsp);
3109
3110    return 0;
3111}
3112
3113static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3114{
3115    RAMBlock *block;
3116    uint64_t pages = 0;
3117
3118    /*
3119     * Postcopy is not using xbzrle/compression, so no need for that.
3120     * Also, since the source is already halted, we don't need to care
3121     * about dirty page logging either.
3122     */
3123
3124    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3125        pages += bitmap_count_one(block->bmap,
3126                                  block->used_length >> TARGET_PAGE_BITS);
3127    }
3128
3129    /* This may not be aligned with current bitmaps. Recalculate. */
3130    rs->migration_dirty_pages = pages;
3131
3132    ram_state_reset(rs);
3133
3134    /* Update RAMState cache of output QEMUFile */
3135    rs->f = out;
3136
3137    trace_ram_state_resume_prepare(pages);
3138}
3139
3140/*
3141 * This function clears bits of the free pages reported by the caller from the
3142 * migration dirty bitmap. @addr is the host address corresponding to the
3143 * start of the contiguous guest free pages, and @len is the total bytes of
3144 * those pages.
3145 */
3146void qemu_guest_free_page_hint(void *addr, size_t len)
3147{
3148    RAMBlock *block;
3149    ram_addr_t offset;
3150    size_t used_len, start, npages;
3151    MigrationState *s = migrate_get_current();
3152
3153    /* This function is currently expected to be used during live migration */
3154    if (!migration_is_setup_or_active(s->state)) {
3155        return;
3156    }
3157
3158    for (; len > 0; len -= used_len, addr += used_len) {
3159        block = qemu_ram_block_from_host(addr, false, &offset);
3160        if (unlikely(!block || offset >= block->used_length)) {
3161            /*
3162             * The implementation might not support RAMBlock resize during
3163             * live migration, but it could happen in theory with future
3164             * updates. So we add a check here to capture that case.
3165             */
3166            error_report_once("%s unexpected error", __func__);
3167            return;
3168        }
3169
3170        if (len <= block->used_length - offset) {
3171            used_len = len;
3172        } else {
3173            used_len = block->used_length - offset;
3174        }
3175
3176        start = offset >> TARGET_PAGE_BITS;
3177        npages = used_len >> TARGET_PAGE_BITS;
3178
3179        qemu_mutex_lock(&ram_state->bitmap_mutex);
3180        /*
3181         * The skipped free pages are equivalent to having been sent from
3182         * clear_bmap's perspective, so clear the bits from the memory region
3183         * bitmap which are initially set. Otherwise those skipped pages will
3184         * be sent in the next round after syncing from the memory region bitmap.
3185         */
3186        migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3187        ram_state->migration_dirty_pages -=
3188                      bitmap_count_one_with_offset(block->bmap, start, npages);
3189        bitmap_clear(block->bmap, start, npages);
3190        qemu_mutex_unlock(&ram_state->bitmap_mutex);
3191    }
3192}
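
/*
 * Illustrative example (made-up numbers): a hint covering 1MiB that starts
 * 64KiB before the end of a RAMBlock clamps used_len to those final 64KiB,
 * clears the corresponding 16 bits (with 4KiB target pages) from
 * block->bmap and the region's clear_bmap-backed bitmap, then looks up the
 * block containing the next host address for the remaining bytes.
 */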
3193
3194/*
3195 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
3196 * long-running RCU critical section.  When RCU reclaims in the code
3197 * start to become numerous it will be necessary to reduce the
3198 * granularity of these critical sections.
3199 */
3200
3201/**
3202 * ram_save_setup: Setup RAM for migration
3203 *
3204 * Returns zero to indicate success and negative for error
3205 *
3206 * @f: QEMUFile where to send the data
3207 * @opaque: RAMState pointer
3208 */
3209static int ram_save_setup(QEMUFile *f, void *opaque)
3210{
3211    RAMState **rsp = opaque;
3212    RAMBlock *block;
3213    int ret;
3214
3215    if (compress_threads_save_setup()) {
3216        return -1;
3217    }
3218
3219    /* migration has already setup the bitmap, reuse it. */
3220    if (!migration_in_colo_state()) {
3221        if (ram_init_all(rsp) != 0) {
3222            compress_threads_save_cleanup();
3223            return -1;
3224        }
3225    }
3226    (*rsp)->f = f;
3227
3228    WITH_RCU_READ_LOCK_GUARD() {
3229        qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3230
3231        RAMBLOCK_FOREACH_MIGRATABLE(block) {
3232            qemu_put_byte(f, strlen(block->idstr));
3233            qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3234            qemu_put_be64(f, block->used_length);
3235            if (migrate_postcopy_ram() && block->page_size !=
3236                                          qemu_host_page_size) {
3237                qemu_put_be64(f, block->page_size);
3238            }
3239            if (migrate_ignore_shared()) {
3240                qemu_put_be64(f, block->mr->addr);
3241            }
3242        }
3243    }
3244
3245    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3246    ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3247
3248    ret =  multifd_send_sync_main(f);
3249    if (ret < 0) {
3250        return ret;
3251    }
3252
3253    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3254    qemu_fflush(f);
3255
3256    return 0;
3257}
3258
3259/**
3260 * ram_save_iterate: iterative stage for migration
3261 *
3262 * Returns zero to indicate success and negative for error
3263 *
3264 * @f: QEMUFile where to send the data
3265 * @opaque: RAMState pointer
3266 */
3267static int ram_save_iterate(QEMUFile *f, void *opaque)
3268{
3269    RAMState **temp = opaque;
3270    RAMState *rs = *temp;
3271    int ret = 0;
3272    int i;
3273    int64_t t0;
3274    int done = 0;
3275
3276    if (blk_mig_bulk_active()) {
3277        /* Avoid transferring ram during bulk phase of block migration as
3278         * the bulk phase will usually take a long time and transferring
3279         * ram updates during that time is pointless. */
3280        goto out;
3281    }
3282
3283    /*
3284     * We'll hold this lock for a relatively long time, but that's okay for
3285     * two reasons.  Firstly, the only other thread that can take it is the
3286     * one that calls qemu_guest_free_page_hint(), which should be rare;
3287     * secondly, see MAX_WAIT below (and, if curious, commit 4508bd9ed8053ce),
3288     * which guarantees that we release it on a regular basis.
3289     */
3290    qemu_mutex_lock(&rs->bitmap_mutex);
3291    WITH_RCU_READ_LOCK_GUARD() {
3292        if (ram_list.version != rs->last_version) {
3293            ram_state_reset(rs);
3294        }
3295
3296        /* Read version before ram_list.blocks */
3297        smp_rmb();
3298
3299        ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3300
3301        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3302        i = 0;
3303        while ((ret = qemu_file_rate_limit(f)) == 0 ||
3304               postcopy_has_request(rs)) {
3305            int pages;
3306
3307            if (qemu_file_get_error(f)) {
3308                break;
3309            }
3310
3311            pages = ram_find_and_save_block(rs);
3312            /* no more pages to send */
3313            if (pages == 0) {
3314                done = 1;
3315                break;
3316            }
3317
3318            if (pages < 0) {
3319                qemu_file_set_error(f, pages);
3320                break;
3321            }
3322
3323            rs->target_page_count += pages;
3324
3325            /*
3326             * During postcopy, it is necessary to make sure one whole host
3327             * page is sent in one chunk.
3328             */
3329            if (migrate_postcopy_ram()) {
3330                flush_compressed_data(rs);
3331            }
3332
3333            /*
3334             * We want to check on the first iteration, just in case it was
3335             * the first time and we had to sync the dirty bitmap.
3336             * qemu_clock_get_ns() is a bit expensive, so we only check once
3337             * every few iterations.
3338             */
3339            if ((i & 63) == 0) {
3340                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3341                              1000000;
3342                if (t1 > MAX_WAIT) {
3343                    trace_ram_save_iterate_big_wait(t1, i);
3344                    break;
3345                }
3346            }
3347            i++;
3348        }
3349    }
3350    qemu_mutex_unlock(&rs->bitmap_mutex);
3351
3352    postcopy_preempt_reset_channel(rs);
3353
3354    /*
3355     * Must occur before EOS (or any QEMUFile operation)
3356     * because of RDMA protocol.
3357     */
3358    ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3359
3360out:
3361    if (ret >= 0
3362        && migration_is_setup_or_active(migrate_get_current()->state)) {
3363        ret = multifd_send_sync_main(rs->f);
3364        if (ret < 0) {
3365            return ret;
3366        }
3367
3368        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3369        qemu_fflush(f);
3370        ram_transferred_add(8);
3371
3372        ret = qemu_file_get_error(f);
3373    }
3374    if (ret < 0) {
3375        return ret;
3376    }
3377
3378    return done;
3379}
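
/*
 * A minimal sketch of the time-bounding pattern used in the loop above:
 * qemu_clock_get_ns() is comparatively expensive, so the elapsed time is
 * only sampled every 64 pages, and the loop bails out once MAX_WAIT
 * milliseconds have passed (names other than qemu_clock_get_ns() and
 * MAX_WAIT are illustrative, not QEMU APIs):
 *
 *     int64_t t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 *     for (i = 0; more_work(); i++) {
 *         send_one_page();
 *         if ((i & 63) == 0 &&
 *             (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000 >
 *                 MAX_WAIT) {
 *             break;
 *         }
 *     }
 */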
3380
3381/**
3382 * ram_save_complete: function called to send the remaining amount of ram
3383 *
3384 * Returns zero to indicate success or negative on error
3385 *
3386 * Called with iothread lock
3387 *
3388 * @f: QEMUFile where to send the data
3389 * @opaque: RAMState pointer
3390 */
3391static int ram_save_complete(QEMUFile *f, void *opaque)
3392{
3393    RAMState **temp = opaque;
3394    RAMState *rs = *temp;
3395    int ret = 0;
3396
3397    rs->last_stage = !migration_in_colo_state();
3398
3399    WITH_RCU_READ_LOCK_GUARD() {
3400        if (!migration_in_postcopy()) {
3401            migration_bitmap_sync_precopy(rs);
3402        }
3403
3404        ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3405
3406        /* try transferring iterative blocks of memory */
3407
3408        /* flush all remaining blocks regardless of rate limiting */
3409        while (true) {
3410            int pages;
3411
3412            pages = ram_find_and_save_block(rs);
3413            /* no more blocks to send */
3414            if (pages == 0) {
3415                break;
3416            }
3417            if (pages < 0) {
3418                ret = pages;
3419                break;
3420            }
3421        }
3422
3423        flush_compressed_data(rs);
3424        ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3425    }
3426
3427    if (ret < 0) {
3428        return ret;
3429    }
3430
3431    postcopy_preempt_reset_channel(rs);
3432
3433    ret = multifd_send_sync_main(rs->f);
3434    if (ret < 0) {
3435        return ret;
3436    }
3437
3438    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3439    qemu_fflush(f);
3440
3441    return 0;
3442}
3443
3444static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3445                             uint64_t *res_precopy_only,
3446                             uint64_t *res_compatible,
3447                             uint64_t *res_postcopy_only)
3448{
3449    RAMState **temp = opaque;
3450    RAMState *rs = *temp;
3451    uint64_t remaining_size;
3452
3453    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3454
3455    if (!migration_in_postcopy() &&
3456        remaining_size < max_size) {
3457        qemu_mutex_lock_iothread();
3458        WITH_RCU_READ_LOCK_GUARD() {
3459            migration_bitmap_sync_precopy(rs);
3460        }
3461        qemu_mutex_unlock_iothread();
3462        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3463    }
3464
3465    if (migrate_postcopy_ram()) {
3466        /* We can do postcopy, and all the data is postcopiable */
3467        *res_compatible += remaining_size;
3468    } else {
3469        *res_precopy_only += remaining_size;
3470    }
3471}
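
/*
 * Example, assuming a 4 KiB target page: 10240 remaining dirty pages are
 * reported as remaining_size == 40 MiB.  Only once that figure drops below
 * @max_size (a threshold the migration core typically derives from the
 * measured bandwidth and the configured downtime limit) is the relatively
 * expensive precopy bitmap sync above redone under the iothread lock.
 */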
3472
3473static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3474{
3475    unsigned int xh_len;
3476    int xh_flags;
3477    uint8_t *loaded_data;
3478
3479    /* extract RLE header */
3480    xh_flags = qemu_get_byte(f);
3481    xh_len = qemu_get_be16(f);
3482
3483    if (xh_flags != ENCODING_FLAG_XBZRLE) {
3484        error_report("Failed to load XBZRLE page - wrong compression!");
3485        return -1;
3486    }
3487
3488    if (xh_len > TARGET_PAGE_SIZE) {
3489        error_report("Failed to load XBZRLE page - len overflow!");
3490        return -1;
3491    }
3492    loaded_data = XBZRLE.decoded_buf;
3493    /* load data and decode */
3494    /* it can change loaded_data to point to an internal buffer */
3495    qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3496
3497    /* decode RLE */
3498    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3499                             TARGET_PAGE_SIZE) == -1) {
3500        error_report("Failed to load XBZRLE page - decode error!");
3501        return -1;
3502    }
3503
3504    return 0;
3505}
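
/*
 * The XBZRLE record consumed above is laid out as:
 *
 *     u8:   ENCODING_FLAG_XBZRLE
 *     be16: xh_len (encoded length, at most TARGET_PAGE_SIZE)
 *     ...   xh_len bytes of encoded data
 *
 * xbzrle_decode_buffer() applies the encoded delta on top of the current
 * contents of @host, so the destination page must already hold the old
 * version the source diffed against (the sender's page cache is what makes
 * that assumption hold).
 */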
3506
3507/**
3508 * ram_block_from_stream: read a RAMBlock id from the migration stream
3509 *
3510 * Must be called from within a rcu critical section.
3511 *
3512 * Returns a pointer from within the RCU-protected ram_list.
3513 *
3514 * @mis: the migration incoming state pointer
3515 * @f: QEMUFile where to read the data from
3516 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3517 * @channel: the channel we're using
3518 */
3519static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3520                                              QEMUFile *f, int flags,
3521                                              int channel)
3522{
3523    RAMBlock *block = mis->last_recv_block[channel];
3524    char id[256];
3525    uint8_t len;
3526
3527    if (flags & RAM_SAVE_FLAG_CONTINUE) {
3528        if (!block) {
3529            error_report("Ack, bad migration stream!");
3530            return NULL;
3531        }
3532        return block;
3533    }
3534
3535    len = qemu_get_byte(f);
3536    qemu_get_buffer(f, (uint8_t *)id, len);
3537    id[len] = 0;
3538
3539    block = qemu_ram_block_by_name(id);
3540    if (!block) {
3541        error_report("Can't find block %s", id);
3542        return NULL;
3543    }
3544
3545    if (ramblock_is_ignored(block)) {
3546        error_report("block %s should not be migrated !", id);
3547        return NULL;
3548    }
3549
3550    mis->last_recv_block[channel] = block;
3551
3552    return block;
3553}
3554
3555static inline void *host_from_ram_block_offset(RAMBlock *block,
3556                                               ram_addr_t offset)
3557{
3558    if (!offset_in_ramblock(block, offset)) {
3559        return NULL;
3560    }
3561
3562    return block->host + offset;
3563}
3564
3565static void *host_page_from_ram_block_offset(RAMBlock *block,
3566                                             ram_addr_t offset)
3567{
3568    /* Note: Explicitly no check against offset_in_ramblock(). */
3569    return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3570                                   block->page_size);
3571}
3572
3573static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3574                                                         ram_addr_t offset)
3575{
3576    return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3577}
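
/*
 * Worked example for the two helpers above, assuming a hugetlbfs-backed
 * block with block->page_size == 2 MiB, block->host == 0x7f0000000000 and
 * offset == 0x234567: host_page_from_ram_block_offset() aligns
 * 0x7f0000234567 down to the 2 MiB boundary 0x7f0000200000, while
 * host_page_offset_from_ram_block_offset() returns 0x34567, i.e. the
 * position of this target page within its host page (and thus within the
 * temporary huge page used by ram_load_postcopy()).
 */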
3578
3579static inline void *colo_cache_from_block_offset(RAMBlock *block,
3580                             ram_addr_t offset, bool record_bitmap)
3581{
3582    if (!offset_in_ramblock(block, offset)) {
3583        return NULL;
3584    }
3585    if (!block->colo_cache) {
3586        error_report("%s: colo_cache is NULL in block :%s",
3587                     __func__, block->idstr);
3588        return NULL;
3589    }
3590
3591    /*
3592     * During a COLO checkpoint we need a bitmap of these migrated pages.
3593     * It helps us decide which pages in the RAM cache should be flushed
3594     * into the VM's RAM later.
3595     */
3596    if (record_bitmap &&
3597        !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3598        ram_state->migration_dirty_pages++;
3599    }
3600    return block->colo_cache + offset;
3601}
3602
3603/**
3604 * ram_handle_compressed: handle the zero page case
3605 *
3606 * If a page (or a whole RDMA chunk) has been
3607 * determined to be zero, then zap it.
3608 *
3609 * @host: host address for the zero page
3610 * @ch: what the page is filled from.  We only support zero
3611 * @size: size of the zero page
3612 */
3613void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3614{
3615    if (ch != 0 || !buffer_is_zero(host, size)) {
3616        memset(host, ch, size);
3617    }
3618}
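
/*
 * For example, ram_handle_compressed(host, 0, TARGET_PAGE_SIZE) leaves a
 * page that is already zero untouched, which avoids dirtying (and possibly
 * allocating) destination memory needlessly, whereas any non-zero fill
 * byte always results in a memset().
 */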
3619
3620/* return the size after decompression, or negative value on error */
3621static int
3622qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3623                     const uint8_t *source, size_t source_len)
3624{
3625    int err;
3626
3627    err = inflateReset(stream);
3628    if (err != Z_OK) {
3629        return -1;
3630    }
3631
3632    stream->avail_in = source_len;
3633    stream->next_in = (uint8_t *)source;
3634    stream->avail_out = dest_len;
3635    stream->next_out = dest;
3636
3637    err = inflate(stream, Z_NO_FLUSH);
3638    if (err != Z_STREAM_END) {
3639        return -1;
3640    }
3641
3642    return stream->total_out;
3643}
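
/*
 * For reference, a minimal sketch of the opposite direction using plain
 * zlib: this is *not* the compression path used by the save side (that
 * lives with the compress threads elsewhere in this file), merely an
 * illustration of how a buffer that the helper above can inflate might be
 * produced.  The caller is assumed to have run deflateInit() on @stream
 * and to provide a @dest of at least compressBound(source_len) bytes.
 */
static int G_GNUC_UNUSED
example_compress_page(z_stream *stream, uint8_t *dest, size_t dest_len,
                      const uint8_t *source, size_t source_len)
{
    int err;

    err = deflateReset(stream);
    if (err != Z_OK) {
        return -1;
    }

    stream->avail_in = source_len;
    stream->next_in = (uint8_t *)source;
    stream->avail_out = dest_len;
    stream->next_out = dest;

    /* With enough output space, a single Z_FINISH call completes the job */
    err = deflate(stream, Z_FINISH);
    if (err != Z_STREAM_END) {
        return -1;
    }

    return stream->total_out;
}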
3644
3645static void *do_data_decompress(void *opaque)
3646{
3647    DecompressParam *param = opaque;
3648    unsigned long pagesize;
3649    uint8_t *des;
3650    int len, ret;
3651
3652    qemu_mutex_lock(&param->mutex);
3653    while (!param->quit) {
3654        if (param->des) {
3655            des = param->des;
3656            len = param->len;
3657            param->des = 0;
3658            qemu_mutex_unlock(&param->mutex);
3659
3660            pagesize = TARGET_PAGE_SIZE;
3661
3662            ret = qemu_uncompress_data(&param->stream, des, pagesize,
3663                                       param->compbuf, len);
3664            if (ret < 0 && migrate_get_current()->decompress_error_check) {
3665                error_report("decompress data failed");
3666                qemu_file_set_error(decomp_file, ret);
3667            }
3668
3669            qemu_mutex_lock(&decomp_done_lock);
3670            param->done = true;
3671            qemu_cond_signal(&decomp_done_cond);
3672            qemu_mutex_unlock(&decomp_done_lock);
3673
3674            qemu_mutex_lock(&param->mutex);
3675        } else {
3676            qemu_cond_wait(&param->cond, &param->mutex);
3677        }
3678    }
3679    qemu_mutex_unlock(&param->mutex);
3680
3681    return NULL;
3682}
3683
3684static int wait_for_decompress_done(void)
3685{
3686    int idx, thread_count;
3687
3688    if (!migrate_use_compression()) {
3689        return 0;
3690    }
3691
3692    thread_count = migrate_decompress_threads();
3693    qemu_mutex_lock(&decomp_done_lock);
3694    for (idx = 0; idx < thread_count; idx++) {
3695        while (!decomp_param[idx].done) {
3696            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3697        }
3698    }
3699    qemu_mutex_unlock(&decomp_done_lock);
3700    return qemu_file_get_error(decomp_file);
3701}
3702
3703static void compress_threads_load_cleanup(void)
3704{
3705    int i, thread_count;
3706
3707    if (!migrate_use_compression()) {
3708        return;
3709    }
3710    thread_count = migrate_decompress_threads();
3711    for (i = 0; i < thread_count; i++) {
3712        /*
3713         * we use it as an indicator of whether the thread has been
3714         * properly initialized
3715         */
3716        if (!decomp_param[i].compbuf) {
3717            break;
3718        }
3719
3720        qemu_mutex_lock(&decomp_param[i].mutex);
3721        decomp_param[i].quit = true;
3722        qemu_cond_signal(&decomp_param[i].cond);
3723        qemu_mutex_unlock(&decomp_param[i].mutex);
3724    }
3725    for (i = 0; i < thread_count; i++) {
3726        if (!decomp_param[i].compbuf) {
3727            break;
3728        }
3729
3730        qemu_thread_join(decompress_threads + i);
3731        qemu_mutex_destroy(&decomp_param[i].mutex);
3732        qemu_cond_destroy(&decomp_param[i].cond);
3733        inflateEnd(&decomp_param[i].stream);
3734        g_free(decomp_param[i].compbuf);
3735        decomp_param[i].compbuf = NULL;
3736    }
3737    g_free(decompress_threads);
3738    g_free(decomp_param);
3739    decompress_threads = NULL;
3740    decomp_param = NULL;
3741    decomp_file = NULL;
3742}
3743
3744static int compress_threads_load_setup(QEMUFile *f)
3745{
3746    int i, thread_count;
3747
3748    if (!migrate_use_compression()) {
3749        return 0;
3750    }
3751
3752    thread_count = migrate_decompress_threads();
3753    decompress_threads = g_new0(QemuThread, thread_count);
3754    decomp_param = g_new0(DecompressParam, thread_count);
3755    qemu_mutex_init(&decomp_done_lock);
3756    qemu_cond_init(&decomp_done_cond);
3757    decomp_file = f;
3758    for (i = 0; i < thread_count; i++) {
3759        if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3760            goto exit;
3761        }
3762
3763        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3764        qemu_mutex_init(&decomp_param[i].mutex);
3765        qemu_cond_init(&decomp_param[i].cond);
3766        decomp_param[i].done = true;
3767        decomp_param[i].quit = false;
3768        qemu_thread_create(decompress_threads + i, "decompress",
3769                           do_data_decompress, decomp_param + i,
3770                           QEMU_THREAD_JOINABLE);
3771    }
3772    return 0;
3773exit:
3774    compress_threads_load_cleanup();
3775    return -1;
3776}
3777
3778static void decompress_data_with_multi_threads(QEMUFile *f,
3779                                               void *host, int len)
3780{
3781    int idx, thread_count;
3782
3783    thread_count = migrate_decompress_threads();
3784    QEMU_LOCK_GUARD(&decomp_done_lock);
3785    while (true) {
3786        for (idx = 0; idx < thread_count; idx++) {
3787            if (decomp_param[idx].done) {
3788                decomp_param[idx].done = false;
3789                qemu_mutex_lock(&decomp_param[idx].mutex);
3790                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3791                decomp_param[idx].des = host;
3792                decomp_param[idx].len = len;
3793                qemu_cond_signal(&decomp_param[idx].cond);
3794                qemu_mutex_unlock(&decomp_param[idx].mutex);
3795                break;
3796            }
3797        }
3798        if (idx < thread_count) {
3799            break;
3800        } else {
3801            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3802        }
3803    }
3804}
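
/*
 * Usage note: the helper above only hands the compressed bytes over to an
 * idle worker and returns; the destination buffer is filled asynchronously.
 * Its callers (ram_load_precopy() and ram_load_postcopy()) therefore pair
 * it with wait_for_decompress_done() before relying on the page contents:
 *
 *     decompress_data_with_multi_threads(f, host, len);
 *     ...
 *     ret |= wait_for_decompress_done();
 */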
3805
3806static void colo_init_ram_state(void)
3807{
3808    ram_state_init(&ram_state);
3809}
3810
3811/*
3812 * colo cache: this is for the secondary VM.  We cache the whole
3813 * memory of the secondary VM; the global lock must be held when
3814 * calling this helper.
3815 */
3816int colo_init_ram_cache(void)
3817{
3818    RAMBlock *block;
3819
3820    WITH_RCU_READ_LOCK_GUARD() {
3821        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3822            block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3823                                                    NULL, false, false);
3824            if (!block->colo_cache) {
3825                error_report("%s: Can't alloc memory for COLO cache of block %s, "
3826                             "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3827                             block->used_length);
3828                RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3829                    if (block->colo_cache) {
3830                        qemu_anon_ram_free(block->colo_cache, block->used_length);
3831                        block->colo_cache = NULL;
3832                    }
3833                }
3834                return -errno;
3835            }
3836            if (!machine_dump_guest_core(current_machine)) {
3837                qemu_madvise(block->colo_cache, block->used_length,
3838                             QEMU_MADV_DONTDUMP);
3839            }
3840        }
3841    }
3842
3843    /*
3844     * Record the dirty pages sent by the PVM; we use this dirty bitmap to
3845     * decide which pages in the cache should be flushed into the SVM's RAM.
3846     * Here we use the same name 'ram_bitmap' as for migration.
3847     */
3848    if (ram_bytes_total()) {
3849        RAMBlock *block;
3850
3851        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3852            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3853            block->bmap = bitmap_new(pages);
3854        }
3855    }
3856
3857    colo_init_ram_state();
3858    return 0;
3859}
3860
3861/* TODO: duplicated with ram_init_bitmaps */
3862void colo_incoming_start_dirty_log(void)
3863{
3864    RAMBlock *block = NULL;
3865    /* For memory_global_dirty_log_start below. */
3866    qemu_mutex_lock_iothread();
3867    qemu_mutex_lock_ramlist();
3868
3869    memory_global_dirty_log_sync();
3870    WITH_RCU_READ_LOCK_GUARD() {
3871        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3872            ramblock_sync_dirty_bitmap(ram_state, block);
3873            /* Discard this dirty bitmap record */
3874            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3875        }
3876        memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3877    }
3878    ram_state->migration_dirty_pages = 0;
3879    qemu_mutex_unlock_ramlist();
3880    qemu_mutex_unlock_iothread();
3881}
3882
3883/* The global lock must be held to call this helper */
3884void colo_release_ram_cache(void)
3885{
3886    RAMBlock *block;
3887
3888    memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3889    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3890        g_free(block->bmap);
3891        block->bmap = NULL;
3892    }
3893
3894    WITH_RCU_READ_LOCK_GUARD() {
3895        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3896            if (block->colo_cache) {
3897                qemu_anon_ram_free(block->colo_cache, block->used_length);
3898                block->colo_cache = NULL;
3899            }
3900        }
3901    }
3902    ram_state_cleanup(&ram_state);
3903}
3904
3905/**
3906 * ram_load_setup: Setup RAM for migration incoming side
3907 *
3908 * Returns zero to indicate success and negative for error
3909 *
3910 * @f: QEMUFile where to receive the data
3911 * @opaque: RAMState pointer
3912 */
3913static int ram_load_setup(QEMUFile *f, void *opaque)
3914{
3915    if (compress_threads_load_setup(f)) {
3916        return -1;
3917    }
3918
3919    xbzrle_load_setup();
3920    ramblock_recv_map_init();
3921
3922    return 0;
3923}
3924
3925static int ram_load_cleanup(void *opaque)
3926{
3927    RAMBlock *rb;
3928
3929    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3930        qemu_ram_block_writeback(rb);
3931    }
3932
3933    xbzrle_load_cleanup();
3934    compress_threads_load_cleanup();
3935
3936    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3937        g_free(rb->receivedmap);
3938        rb->receivedmap = NULL;
3939    }
3940
3941    return 0;
3942}
3943
3944/**
3945 * ram_postcopy_incoming_init: allocate postcopy data structures
3946 *
3947 * Returns 0 for success and negative if there was an error
3948 *
3949 * @mis: current migration incoming state
3950 *
3951 * Allocate the data structures etc. needed by incoming migration with
3952 * postcopy-ram.  postcopy-ram's similarly named
3953 * postcopy_ram_incoming_init does the work.
3954 */
3955int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3956{
3957    return postcopy_ram_incoming_init(mis);
3958}
3959
3960/**
3961 * ram_load_postcopy: load a page in postcopy case
3962 *
3963 * Returns 0 for success or -errno in case of error
3964 *
3965 * Called in postcopy mode by ram_load().
3966 * rcu_read_lock is taken prior to this being called.
3967 *
3968 * @f: QEMUFile where to send the data
3969 * @channel: the channel to use for loading
3970 */
3971int ram_load_postcopy(QEMUFile *f, int channel)
3972{
3973    int flags = 0, ret = 0;
3974    bool place_needed = false;
3975    bool matches_target_page_size = false;
3976    MigrationIncomingState *mis = migration_incoming_get_current();
3977    PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
3978
3979    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3980        ram_addr_t addr;
3981        void *page_buffer = NULL;
3982        void *place_source = NULL;
3983        RAMBlock *block = NULL;
3984        uint8_t ch;
3985        int len;
3986
3987        addr = qemu_get_be64(f);
3988
3989        /*
3990         * If there was a QEMU file error we should stop here; "addr"
3991         * may be invalid.
3992         */
3993        ret = qemu_file_get_error(f);
3994        if (ret) {
3995            break;
3996        }
3997
3998        flags = addr & ~TARGET_PAGE_MASK;
3999        addr &= TARGET_PAGE_MASK;
4000
4001        trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
4002        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4003                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
4004            block = ram_block_from_stream(mis, f, flags, channel);
4005            if (!block) {
4006                ret = -EINVAL;
4007                break;
4008            }
4009
4010            /*
4011             * Relying on used_length is racy and can result in false positives.
4012             * We might place pages beyond used_length in case RAM was shrunk
4013             * while in postcopy, which is fine - trying to place via
4014             * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4015             */
4016            if (!block->host || addr >= block->postcopy_length) {
4017                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4018                ret = -EINVAL;
4019                break;
4020            }
4021            tmp_page->target_pages++;
4022            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4023            /*
4024             * Postcopy requires that we place whole host pages atomically;
4025             * these may be huge pages for RAMBlocks that are backed by
4026             * hugetlbfs.
4027             * To make it atomic, the data is read into a temporary page
4028             * that's moved into place later.
4029             * The migration protocol uses (possibly smaller) target pages;
4030             * however, the source ensures it always sends all the components
4031             * of a host page in one chunk.
4032             */
4033            page_buffer = tmp_page->tmp_huge_page +
4034                          host_page_offset_from_ram_block_offset(block, addr);
4035            /* If all TP are zero then we can optimise the place */
4036            if (tmp_page->target_pages == 1) {
4037                tmp_page->host_addr =
4038                    host_page_from_ram_block_offset(block, addr);
4039            } else if (tmp_page->host_addr !=
4040                       host_page_from_ram_block_offset(block, addr)) {
4041                /* not the 1st TP within the HP */
4042                error_report("Non-same host page detected on channel %d: "
4043                             "Target host page %p, received host page %p "
4044                             "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4045                             channel, tmp_page->host_addr,
4046                             host_page_from_ram_block_offset(block, addr),
4047                             block->idstr, addr, tmp_page->target_pages);
4048                ret = -EINVAL;
4049                break;
4050            }
4051
4052            /*
4053             * If it's the last part of a host page then we place the host
4054             * page
4055             */
4056            if (tmp_page->target_pages ==
4057                (block->page_size / TARGET_PAGE_SIZE)) {
4058                place_needed = true;
4059            }
4060            place_source = tmp_page->tmp_huge_page;
4061        }
4062
4063        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4064        case RAM_SAVE_FLAG_ZERO:
4065            ch = qemu_get_byte(f);
4066            /*
4067             * We can skip filling page_buffer when this is a zero page
4068             * and (block->page_size == TARGET_PAGE_SIZE).
4069             */
4070            if (ch || !matches_target_page_size) {
4071                memset(page_buffer, ch, TARGET_PAGE_SIZE);
4072            }
4073            if (ch) {
4074                tmp_page->all_zero = false;
4075            }
4076            break;
4077
4078        case RAM_SAVE_FLAG_PAGE:
4079            tmp_page->all_zero = false;
4080            if (!matches_target_page_size) {
4081                /* For huge pages, we always use temporary buffer */
4082                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4083            } else {
4084                /*
4085                 * For small pages that match the target page size, we
4086                 * avoid the qemu_file copy.  Instead we directly use
4087                 * the buffer of QEMUFile to place the page.  Note: we
4088                 * cannot do any QEMUFile operation before using that
4089                 * buffer to make sure the buffer is valid when
4090                 * placing the page.
4091                 */
4092                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4093                                         TARGET_PAGE_SIZE);
4094            }
4095            break;
4096        case RAM_SAVE_FLAG_COMPRESS_PAGE:
4097            tmp_page->all_zero = false;
4098            len = qemu_get_be32(f);
4099            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4100                error_report("Invalid compressed data length: %d", len);
4101                ret = -EINVAL;
4102                break;
4103            }
4104            decompress_data_with_multi_threads(f, page_buffer, len);
4105            break;
4106
4107        case RAM_SAVE_FLAG_EOS:
4108            /* normal exit */
4109            multifd_recv_sync_main();
4110            break;
4111        default:
4112            error_report("Unknown combination of migration flags: 0x%x"
4113                         " (postcopy mode)", flags);
4114            ret = -EINVAL;
4115            break;
4116        }
4117
4118        /* Got the whole host page, wait for decompress before placing. */
4119        if (place_needed) {
4120            ret |= wait_for_decompress_done();
4121        }
4122
4123        /* Check for any possible file errors */
4124        if (!ret && qemu_file_get_error(f)) {
4125            ret = qemu_file_get_error(f);
4126        }
4127
4128        if (!ret && place_needed) {
4129            if (tmp_page->all_zero) {
4130                ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4131            } else {
4132                ret = postcopy_place_page(mis, tmp_page->host_addr,
4133                                          place_source, block);
4134            }
4135            place_needed = false;
4136            postcopy_temp_page_reset(tmp_page);
4137        }
4138    }
4139
4140    return ret;
4141}
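
/*
 * Example of the host-page assembly above: with a 4 KiB target page and a
 * hugetlbfs block whose page_size is 2 MiB, block->page_size /
 * TARGET_PAGE_SIZE == 512, so 512 consecutive target pages accumulate in
 * tmp_page->tmp_huge_page (or are tracked as all-zero) before a single
 * postcopy_place_page() / postcopy_place_page_zero() call places the whole
 * host page atomically.
 */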
4142
4143static bool postcopy_is_advised(void)
4144{
4145    PostcopyState ps = postcopy_state_get();
4146    return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
4147}
4148
4149static bool postcopy_is_running(void)
4150{
4151    PostcopyState ps = postcopy_state_get();
4152    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4153}
4154
4155/*
4156 * Flush content of RAM cache into SVM's memory.
4157 * Only flush the pages that have been dirtied by the PVM or SVM or both.
4158 */
4159void colo_flush_ram_cache(void)
4160{
4161    RAMBlock *block = NULL;
4162    void *dst_host;
4163    void *src_host;
4164    unsigned long offset = 0;
4165
4166    memory_global_dirty_log_sync();
4167    WITH_RCU_READ_LOCK_GUARD() {
4168        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4169            ramblock_sync_dirty_bitmap(ram_state, block);
4170        }
4171    }
4172
4173    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4174    WITH_RCU_READ_LOCK_GUARD() {
4175        block = QLIST_FIRST_RCU(&ram_list.blocks);
4176
4177        while (block) {
4178            unsigned long num = 0;
4179
4180            offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4181            if (!offset_in_ramblock(block,
4182                                    ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4183                offset = 0;
4184                num = 0;
4185                block = QLIST_NEXT_RCU(block, next);
4186            } else {
4187                unsigned long i = 0;
4188
4189                for (i = 0; i < num; i++) {
4190                    migration_bitmap_clear_dirty(ram_state, block, offset + i);
4191                }
4192                dst_host = block->host
4193                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4194                src_host = block->colo_cache
4195                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4196                memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4197                offset += num;
4198            }
4199        }
4200    }
4201    trace_colo_flush_ram_cache_end();
4202}
4203
4204/**
4205 * ram_load_precopy: load pages in precopy case
4206 *
4207 * Returns 0 for success or -errno in case of error
4208 *
4209 * Called in precopy mode by ram_load().
4210 * rcu_read_lock is taken prior to this being called.
4211 *
4212 * @f: QEMUFile where to send the data
4213 */
4214static int ram_load_precopy(QEMUFile *f)
4215{
4216    MigrationIncomingState *mis = migration_incoming_get_current();
4217    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4218    /* ADVISE is earlier; it shows that the source has postcopy enabled */
4219    bool postcopy_advised = postcopy_is_advised();
4220    if (!migrate_use_compression()) {
4221        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4222    }
4223
4224    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4225        ram_addr_t addr, total_ram_bytes;
4226        void *host = NULL, *host_bak = NULL;
4227        uint8_t ch;
4228
4229        /*
4230         * Yield periodically to let the main loop run, but an iteration
4231         * of the main loop is expensive, so only do it every so often.
4232         */
4233        if ((i & 32767) == 0 && qemu_in_coroutine()) {
4234            aio_co_schedule(qemu_get_current_aio_context(),
4235                            qemu_coroutine_self());
4236            qemu_coroutine_yield();
4237        }
4238        i++;
4239
4240        addr = qemu_get_be64(f);
4241        flags = addr & ~TARGET_PAGE_MASK;
4242        addr &= TARGET_PAGE_MASK;
4243
4244        if (flags & invalid_flags) {
4245            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4246                error_report("Received an unexpected compressed page");
4247            }
4248
4249            ret = -EINVAL;
4250            break;
4251        }
4252
4253        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4254                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4255            RAMBlock *block = ram_block_from_stream(mis, f, flags,
4256                                                    RAM_CHANNEL_PRECOPY);
4257
4258            host = host_from_ram_block_offset(block, addr);
4259            /*
4260             * After entering the COLO stage we should not load pages into
4261             * the SVM's memory directly; we put them into colo_cache first.
4262             * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4263             * Previously we copied all of this memory in the COLO preparing
4264             * stage, during which the VM had to be stopped, which is a
4265             * time-consuming process.  Here we optimize it by backing up
4266             * every page during migration while COLO is enabled; this slows
4267             * the migration down a little, but it clearly reduces the downtime
4268             * of backing up all the SVM's memory in the COLO preparing stage.
4269             */
4270            if (migration_incoming_colo_enabled()) {
4271                if (migration_incoming_in_colo_state()) {
4272                    /* In COLO stage, put all pages into cache temporarily */
4273                    host = colo_cache_from_block_offset(block, addr, true);
4274                } else {
4275                    /*
4276                     * In the migration stage but before the COLO stage,
4277                     * put all pages into both the cache and the SVM's memory.
4278                     */
4279                    host_bak = colo_cache_from_block_offset(block, addr, false);
4280                }
4281            }
4282            if (!host) {
4283                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4284                ret = -EINVAL;
4285                break;
4286            }
4287            if (!migration_incoming_in_colo_state()) {
4288                ramblock_recv_bitmap_set(block, host);
4289            }
4290
4291            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4292        }
4293
4294        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4295        case RAM_SAVE_FLAG_MEM_SIZE:
4296            /* Synchronize RAM block list */
4297            total_ram_bytes = addr;
4298            while (!ret && total_ram_bytes) {
4299                RAMBlock *block;
4300                char id[256];
4301                ram_addr_t length;
4302
4303                len = qemu_get_byte(f);
4304                qemu_get_buffer(f, (uint8_t *)id, len);
4305                id[len] = 0;
4306                length = qemu_get_be64(f);
4307
4308                block = qemu_ram_block_by_name(id);
4309                if (block && !qemu_ram_is_migratable(block)) {
4310                    error_report("block %s should not be migrated !", id);
4311                    ret = -EINVAL;
4312                } else if (block) {
4313                    if (length != block->used_length) {
4314                        Error *local_err = NULL;
4315
4316                        ret = qemu_ram_resize(block, length,
4317                                              &local_err);
4318                        if (local_err) {
4319                            error_report_err(local_err);
4320                        }
4321                    }
4322                    /* For postcopy we need to check hugepage sizes match */
4323                    if (postcopy_advised && migrate_postcopy_ram() &&
4324                        block->page_size != qemu_host_page_size) {
4325                        uint64_t remote_page_size = qemu_get_be64(f);
4326                        if (remote_page_size != block->page_size) {
4327                            error_report("Mismatched RAM page size %s "
4328                                         "(local) %zd != %" PRId64,
4329                                         id, block->page_size,
4330                                         remote_page_size);
4331                            ret = -EINVAL;
4332                        }
4333                    }
4334                    if (migrate_ignore_shared()) {
4335                        hwaddr addr = qemu_get_be64(f);
4336                        if (ramblock_is_ignored(block) &&
4337                            block->mr->addr != addr) {
4338                            error_report("Mismatched GPAs for block %s "
4339                                         "%" PRId64 " != %" PRId64,
4340                                         id, (uint64_t)addr,
4341                                         (uint64_t)block->mr->addr);
4342                            ret = -EINVAL;
4343                        }
4344                    }
4345                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4346                                          block->idstr);
4347                } else {
4348                    error_report("Unknown ramblock \"%s\", cannot "
4349                                 "accept migration", id);
4350                    ret = -EINVAL;
4351                }
4352
4353                total_ram_bytes -= length;
4354            }
4355            break;
4356
4357        case RAM_SAVE_FLAG_ZERO:
4358            ch = qemu_get_byte(f);
4359            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4360            break;
4361
4362        case RAM_SAVE_FLAG_PAGE:
4363            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4364            break;
4365
4366        case RAM_SAVE_FLAG_COMPRESS_PAGE:
4367            len = qemu_get_be32(f);
4368            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4369                error_report("Invalid compressed data length: %d", len);
4370                ret = -EINVAL;
4371                break;
4372            }
4373            decompress_data_with_multi_threads(f, host, len);
4374            break;
4375
4376        case RAM_SAVE_FLAG_XBZRLE:
4377            if (load_xbzrle(f, addr, host) < 0) {
4378                error_report("Failed to decompress XBZRLE page at "
4379                             RAM_ADDR_FMT, addr);
4380                ret = -EINVAL;
4381                break;
4382            }
4383            break;
4384        case RAM_SAVE_FLAG_EOS:
4385            /* normal exit */
4386            multifd_recv_sync_main();
4387            break;
4388        default:
4389            if (flags & RAM_SAVE_FLAG_HOOK) {
4390                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4391            } else {
4392                error_report("Unknown combination of migration flags: 0x%x",
4393                             flags);
4394                ret = -EINVAL;
4395            }
4396        }
4397        if (!ret) {
4398            ret = qemu_file_get_error(f);
4399        }
4400        if (!ret && host_bak) {
4401            memcpy(host_bak, host, TARGET_PAGE_SIZE);
4402        }
4403    }
4404
4405    ret |= wait_for_decompress_done();
4406    return ret;
4407}
4408
4409static int ram_load(QEMUFile *f, void *opaque, int version_id)
4410{
4411    int ret = 0;
4412    static uint64_t seq_iter;
4413    /*
4414     * If the system is running in postcopy mode, page inserts into host
4415     * memory must be atomic.
4416     */
4417    bool postcopy_running = postcopy_is_running();
4418
4419    seq_iter++;
4420
4421    if (version_id != 4) {
4422        return -EINVAL;
4423    }
4424
4425    /*
4426     * This RCU critical section can be very long running.
4427     * When RCU reclaims in the code start to become numerous,
4428     * it will be necessary to reduce the granularity of this
4429     * critical section.
4430     */
4431    WITH_RCU_READ_LOCK_GUARD() {
4432        if (postcopy_running) {
4433            /*
4434             * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of
4435             * postcopy migration; we have another RAM_CHANNEL_POSTCOPY to
4436             * service fast page faults.
4437             */
4438            ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4439        } else {
4440            ret = ram_load_precopy(f);
4441        }
4442    }
4443    trace_ram_load_complete(ret, seq_iter);
4444
4445    return ret;
4446}
4447
4448static bool ram_has_postcopy(void *opaque)
4449{
4450    RAMBlock *rb;
4451    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4452        if (ramblock_is_pmem(rb)) {
4453            info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4454                        "is not supported now!", rb->idstr, rb->host);
4455            return false;
4456        }
4457    }
4458
4459    return migrate_postcopy_ram();
4460}
4461
4462/* Sync all the dirty bitmaps with the destination VM.  */
4463static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4464{
4465    RAMBlock *block;
4466    QEMUFile *file = s->to_dst_file;
4467    int ramblock_count = 0;
4468
4469    trace_ram_dirty_bitmap_sync_start();
4470
4471    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4472        qemu_savevm_send_recv_bitmap(file, block->idstr);
4473        trace_ram_dirty_bitmap_request(block->idstr);
4474        ramblock_count++;
4475    }
4476
4477    trace_ram_dirty_bitmap_sync_wait();
4478
4479    /* Wait until all the ramblocks' dirty bitmaps are synced */
4480    while (ramblock_count--) {
4481        qemu_sem_wait(&s->rp_state.rp_sem);
4482    }
4483
4484    trace_ram_dirty_bitmap_sync_complete();
4485
4486    return 0;
4487}
4488
4489static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4490{
4491    qemu_sem_post(&s->rp_state.rp_sem);
4492}
4493
4494/*
4495 * Read the received bitmap, revert it as the initial dirty bitmap.
4496 * This is only used when the postcopy migration is paused but wants
4497 * to resume from a middle point.
4498 */
4499int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4500{
4501    int ret = -EINVAL;
4502    /* from_dst_file is always valid because we're within rp_thread */
4503    QEMUFile *file = s->rp_state.from_dst_file;
4504    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4505    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4506    uint64_t size, end_mark;
4507
4508    trace_ram_dirty_bitmap_reload_begin(block->idstr);
4509
4510    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4511        error_report("%s: incorrect state %s", __func__,
4512                     MigrationStatus_str(s->state));
4513        return -EINVAL;
4514    }
4515
4516    /*
4517     * Note: see comments in ramblock_recv_bitmap_send() on why we
4518     * need the endianness conversion, and the paddings.
4519     */
4520    local_size = ROUND_UP(local_size, 8);
4521
4522    /* Add paddings */
4523    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4524
4525    size = qemu_get_be64(file);
4526
4527    /* The size of the bitmap should match our ramblock */
4528    if (size != local_size) {
4529        error_report("%s: ramblock '%s' bitmap size mismatch "
4530                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4531                     block->idstr, size, local_size);
4532        ret = -EINVAL;
4533        goto out;
4534    }
4535
4536    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4537    end_mark = qemu_get_be64(file);
4538
4539    ret = qemu_file_get_error(file);
4540    if (ret || size != local_size) {
4541        error_report("%s: read bitmap failed for ramblock '%s': %d"
4542                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4543                     __func__, block->idstr, ret, local_size, size);
4544        ret = -EIO;
4545        goto out;
4546    }
4547
4548    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4549        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4550                     __func__, block->idstr, end_mark);
4551        ret = -EINVAL;
4552        goto out;
4553    }
4554
4555    /*
4556     * Endianness conversion. We are during postcopy (though paused).
4557     * The dirty bitmap won't change. We can directly modify it.
4558     */
4559    bitmap_from_le(block->bmap, le_bitmap, nbits);
4560
4561    /*
4562     * What we received is "received bitmap". Revert it as the initial
4563     * dirty bitmap for this ramblock.
4564     */
4565    bitmap_complement(block->bmap, block->bmap, nbits);
4566
4567    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4568    ramblock_dirty_bitmap_clear_discarded_pages(block);
4569
4570    /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4571    trace_ram_dirty_bitmap_reload_complete(block->idstr);
4572
4573    /*
4574     * We succeeded in syncing the bitmap for the current ramblock. If this is
4575     * the last one to sync, we need to notify the main send thread.
4576     */
4577    ram_dirty_bitmap_reload_notify(s);
4578
4579    ret = 0;
4580out:
4581    g_free(le_bitmap);
4582    return ret;
4583}
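
/*
 * Size-check example for the function above: a 1 GiB ramblock with a 4 KiB
 * target page has nbits == 262144, so DIV_ROUND_UP(nbits, 8) == 32768 and
 * ROUND_UP(32768, 8) leaves local_size at 0x8000 bytes, which must equal
 * the be64 size field sent ahead of the bitmap.  The extra BITS_PER_LONG
 * bits allocated for le_bitmap only provide slack so that reading an
 * 8-byte-rounded local_size can never overrun the buffer.
 */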
4584
4585static int ram_resume_prepare(MigrationState *s, void *opaque)
4586{
4587    RAMState *rs = *(RAMState **)opaque;
4588    int ret;
4589
4590    ret = ram_dirty_bitmap_sync_all(s, rs);
4591    if (ret) {
4592        return ret;
4593    }
4594
4595    ram_state_resume_prepare(rs, s->to_dst_file);
4596
4597    return 0;
4598}
4599
4600void postcopy_preempt_shutdown_file(MigrationState *s)
4601{
4602    qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4603    qemu_fflush(s->postcopy_qemufile_src);
4604}
4605
4606static SaveVMHandlers savevm_ram_handlers = {
4607    .save_setup = ram_save_setup,
4608    .save_live_iterate = ram_save_iterate,
4609    .save_live_complete_postcopy = ram_save_complete,
4610    .save_live_complete_precopy = ram_save_complete,
4611    .has_postcopy = ram_has_postcopy,
4612    .save_live_pending = ram_save_pending,
4613    .load_state = ram_load,
4614    .save_cleanup = ram_save_cleanup,
4615    .load_setup = ram_load_setup,
4616    .load_cleanup = ram_load_cleanup,
4617    .resume_prepare = ram_resume_prepare,
4618};
4619
4620static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4621                                      size_t old_size, size_t new_size)
4622{
4623    PostcopyState ps = postcopy_state_get();
4624    ram_addr_t offset;
4625    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4626    Error *err = NULL;
4627
4628    if (ramblock_is_ignored(rb)) {
4629        return;
4630    }
4631
4632    if (!migration_is_idle()) {
4633        /*
4634         * Precopy code on the source cannot deal with the size of RAM blocks
4635         * changing at random points in time - especially after sending the
4636         * RAM block sizes in the migration stream, they must no longer change.
4637         * Abort and indicate a proper reason.
4638         */
4639        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4640        migration_cancel(err);
4641        error_free(err);
4642    }
4643
4644    switch (ps) {
4645    case POSTCOPY_INCOMING_ADVISE:
4646        /*
4647         * Update what ram_postcopy_incoming_init()->init_range() does at the
4648         * time postcopy was advised. Syncing RAM blocks with the source will
4649         * result in RAM resizes.
4650         */
4651        if (old_size < new_size) {
4652            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4653                error_report("RAM block '%s' discard of resized RAM failed",
4654                             rb->idstr);
4655            }
4656        }
4657        rb->postcopy_length = new_size;
4658        break;
4659    case POSTCOPY_INCOMING_NONE:
4660    case POSTCOPY_INCOMING_RUNNING:
4661    case POSTCOPY_INCOMING_END:
4662        /*
4663         * Once our guest is running, postcopy no longer cares about
4664         * resizes. When growing, the new memory was not available on the
4665         * source, so no handler is needed.
4666         */
4667        break;
4668    default:
4669        error_report("RAM block '%s' resized during postcopy state: %d",
4670                     rb->idstr, ps);
4671        exit(-1);
4672    }
4673}
4674
4675static RAMBlockNotifier ram_mig_ram_notifier = {
4676    .ram_block_resized = ram_mig_ram_block_resized,
4677};
4678
4679void ram_mig_init(void)
4680{
4681    qemu_mutex_init(&XBZRLE.lock);
4682    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4683    ram_block_notifier_add(&ram_mig_ram_notifier);
4684}
4685