qemu/migration/ram.c
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2011-2015 Red Hat Inc
   6 *
   7 * Authors:
   8 *  Juan Quintela <quintela@redhat.com>
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28
  29#include "qemu/osdep.h"
  30#include "qemu/cutils.h"
  31#include "qemu/bitops.h"
  32#include "qemu/bitmap.h"
  33#include "qemu/main-loop.h"
  34#include "xbzrle.h"
  35#include "ram.h"
  36#include "migration.h"
  37#include "migration/register.h"
  38#include "migration/misc.h"
  39#include "qemu-file.h"
  40#include "postcopy-ram.h"
  41#include "page_cache.h"
  42#include "qemu/error-report.h"
  43#include "qapi/error.h"
  44#include "qapi/qapi-types-migration.h"
  45#include "qapi/qapi-events-migration.h"
  46#include "qapi/qmp/qerror.h"
  47#include "trace.h"
  48#include "exec/ram_addr.h"
  49#include "exec/target_page.h"
  50#include "qemu/rcu_queue.h"
  51#include "migration/colo.h"
  52#include "block.h"
  53#include "sysemu/cpu-throttle.h"
  54#include "savevm.h"
  55#include "qemu/iov.h"
  56#include "multifd.h"
  57#include "sysemu/runstate.h"
  58
  59#include "hw/boards.h" /* for machine_dump_guest_core() */
  60
  61#if defined(__linux__)
  62#include "qemu/userfaultfd.h"
  63#endif /* defined(__linux__) */
  64
  65/***********************************************************/
  66/* ram save/restore */
  67
   68/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
   69 * worked for pages that were filled with the same char.  We switched
   70 * it to only search for the zero value, and renamed it to avoid
   71 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
   72 */
  73
  74#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  75#define RAM_SAVE_FLAG_ZERO     0x02
  76#define RAM_SAVE_FLAG_MEM_SIZE 0x04
  77#define RAM_SAVE_FLAG_PAGE     0x08
  78#define RAM_SAVE_FLAG_EOS      0x10
  79#define RAM_SAVE_FLAG_CONTINUE 0x20
  80#define RAM_SAVE_FLAG_XBZRLE   0x40
   81/* 0x80 is reserved in migration.h; start at 0x100 for the next flag */
  82#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  83
  84static inline bool is_zero_range(uint8_t *p, uint64_t size)
  85{
  86    return buffer_is_zero(p, size);
  87}
  88
  89XBZRLECacheStats xbzrle_counters;
  90
  91/* struct contains XBZRLE cache and a static page
  92   used by the compression */
  93static struct {
  94    /* buffer used for XBZRLE encoding */
  95    uint8_t *encoded_buf;
  96    /* buffer for storing page content */
  97    uint8_t *current_buf;
  98    /* Cache for XBZRLE, Protected by lock. */
  99    PageCache *cache;
 100    QemuMutex lock;
 101    /* it will store a page full of zeros */
 102    uint8_t *zero_target_page;
 103    /* buffer used for XBZRLE decoding */
 104    uint8_t *decoded_buf;
 105} XBZRLE;
 106
 107static void XBZRLE_cache_lock(void)
 108{
 109    if (migrate_use_xbzrle()) {
 110        qemu_mutex_lock(&XBZRLE.lock);
 111    }
 112}
 113
 114static void XBZRLE_cache_unlock(void)
 115{
 116    if (migrate_use_xbzrle()) {
 117        qemu_mutex_unlock(&XBZRLE.lock);
 118    }
 119}
 120
 121/**
 122 * xbzrle_cache_resize: resize the xbzrle cache
 123 *
  124 * This function is called from migrate_params_apply in the main
 125 * thread, possibly while a migration is in progress.  A running
 126 * migration may be using the cache and might finish during this call,
  127 * hence changes to the cache are protected by XBZRLE.lock.
 128 *
 129 * Returns 0 for success or -1 for error
 130 *
 131 * @new_size: new cache size
 132 * @errp: set *errp if the check failed, with reason
 133 */
 134int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 135{
 136    PageCache *new_cache;
 137    int64_t ret = 0;
 138
 139    /* Check for truncation */
 140    if (new_size != (size_t)new_size) {
 141        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 142                   "exceeding address space");
 143        return -1;
 144    }
 145
 146    if (new_size == migrate_xbzrle_cache_size()) {
 147        /* nothing to do */
 148        return 0;
 149    }
 150
 151    XBZRLE_cache_lock();
 152
 153    if (XBZRLE.cache != NULL) {
 154        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 155        if (!new_cache) {
 156            ret = -1;
 157            goto out;
 158        }
 159
 160        cache_fini(XBZRLE.cache);
 161        XBZRLE.cache = new_cache;
 162    }
 163out:
 164    XBZRLE_cache_unlock();
 165    return ret;
 166}
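     /*
      * Illustrative use only: a hypothetical caller resizing the cache from
      * a parameter-apply path; the 512 MiB value below is arbitrary.
      *
      *     Error *err = NULL;
      *     if (xbzrle_cache_resize(512 * 1024 * 1024, &err) < 0) {
      *         error_report_err(err);
      *     }
      */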
 167
 168bool ramblock_is_ignored(RAMBlock *block)
 169{
 170    return !qemu_ram_is_migratable(block) ||
 171           (migrate_ignore_shared() && qemu_ram_is_shared(block));
 172}
 173
 174#undef RAMBLOCK_FOREACH
 175
 176int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 177{
 178    RAMBlock *block;
 179    int ret = 0;
 180
 181    RCU_READ_LOCK_GUARD();
 182
 183    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 184        ret = func(block, opaque);
 185        if (ret) {
 186            break;
 187        }
 188    }
 189    return ret;
 190}
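     /*
      * Illustrative sketch: a hypothetical RAMBlockIterFunc that sums the
      * used length of every not-ignored block (sum_used_length is not part
      * of this file).
      *
      *     static int sum_used_length(RAMBlock *rb, void *opaque)
      *     {
      *         *(uint64_t *)opaque += rb->used_length;
      *         return 0;    (returning non-zero stops the iteration)
      *     }
      *
      *     uint64_t total = 0;
      *     foreach_not_ignored_block(sum_used_length, &total);
      */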
 191
 192static void ramblock_recv_map_init(void)
 193{
 194    RAMBlock *rb;
 195
 196    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 197        assert(!rb->receivedmap);
 198        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 199    }
 200}
 201
 202int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 203{
 204    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 205                    rb->receivedmap);
 206}
 207
 208bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 209{
 210    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 211}
 212
 213void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 214{
 215    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 216}
 217
 218void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 219                                    size_t nr)
 220{
 221    bitmap_set_atomic(rb->receivedmap,
 222                      ramblock_recv_bitmap_offset(host_addr, rb),
 223                      nr);
 224}
 225
 226#define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 227
 228/*
 229 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 230 *
 231 * Returns >0 if success with sent bytes, or <0 if error.
 232 */
 233int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 234                                  const char *block_name)
 235{
 236    RAMBlock *block = qemu_ram_block_by_name(block_name);
 237    unsigned long *le_bitmap, nbits;
 238    uint64_t size;
 239
 240    if (!block) {
 241        error_report("%s: invalid block name: %s", __func__, block_name);
 242        return -1;
 243    }
 244
 245    nbits = block->postcopy_length >> TARGET_PAGE_BITS;
 246
 247    /*
 248     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 249     * machines we may need 4 more bytes for padding (see below
  250     * comment). So extend it a bit beforehand.
 251     */
 252    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 253
 254    /*
  255     * Always use little endian when sending the bitmap. This is
  256     * required so it works when source and destination VMs do not use
  257     * the same endianness. (Note: big endian won't work.)
 258     */
 259    bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 260
 261    /* Size of the bitmap, in bytes */
 262    size = DIV_ROUND_UP(nbits, 8);
 263
 264    /*
 265     * size is always aligned to 8 bytes for 64bit machines, but it
 266     * may not be true for 32bit machines. We need this padding to
 267     * make sure the migration can survive even between 32bit and
 268     * 64bit machines.
 269     */
 270    size = ROUND_UP(size, 8);
 271
 272    qemu_put_be64(file, size);
 273    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 274    /*
  275     * Mark the end, so that a corrupted middle part of the stream can
  276     * still be detected.
 277     */
 278    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 279    qemu_fflush(file);
 280
 281    g_free(le_bitmap);
 282
 283    if (qemu_file_get_error(file)) {
 284        return qemu_file_get_error(file);
 285    }
 286
 287    return size + sizeof(size);
 288}
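     /*
      * For reference, the stream produced above is laid out as:
      *
      *     be64   size                     (bitmap bytes, rounded up to 8)
      *     u8[]   le_bitmap[size]          (receivedmap, little endian)
      *     be64   RAMBLOCK_RECV_BITMAP_ENDING
      *
      * The trailing constant lets the receiving side detect a corrupted
      * stream.
      */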
 289
 290/*
 291 * An outstanding page request, on the source, having been received
 292 * and queued
 293 */
 294struct RAMSrcPageRequest {
 295    RAMBlock *rb;
 296    hwaddr    offset;
 297    hwaddr    len;
 298
 299    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 300};
 301
 302/* State of RAM for migration */
 303struct RAMState {
 304    /* QEMUFile used for this migration */
 305    QEMUFile *f;
 306    /* UFFD file descriptor, used in 'write-tracking' migration */
 307    int uffdio_fd;
 308    /* Last block that we have visited searching for dirty pages */
 309    RAMBlock *last_seen_block;
 310    /* Last block from where we have sent data */
 311    RAMBlock *last_sent_block;
 312    /* Last dirty target page we have sent */
 313    ram_addr_t last_page;
 314    /* last ram version we have seen */
 315    uint32_t last_version;
  316    /* How many times in a row we have dirtied too many pages */
 317    int dirty_rate_high_cnt;
 318    /* these variables are used for bitmap sync */
 319    /* last time we did a full bitmap_sync */
 320    int64_t time_last_bitmap_sync;
  321    /* bytes transferred at the start of the current period */
 322    uint64_t bytes_xfer_prev;
  323    /* number of dirty pages since the start of the current period */
 324    uint64_t num_dirty_pages_period;
 325    /* xbzrle misses since the beginning of the period */
 326    uint64_t xbzrle_cache_miss_prev;
 327    /* Amount of xbzrle pages since the beginning of the period */
 328    uint64_t xbzrle_pages_prev;
 329    /* Amount of xbzrle encoded bytes since the beginning of the period */
 330    uint64_t xbzrle_bytes_prev;
 331    /* Start using XBZRLE (e.g., after the first round). */
 332    bool xbzrle_enabled;
 333
 334    /* compression statistics since the beginning of the period */
  335    /* number of times there was no free thread to compress data */
 336    uint64_t compress_thread_busy_prev;
  337    /* number of bytes produced after compression */
 338    uint64_t compressed_size_prev;
  339    /* number of compressed pages */
 340    uint64_t compress_pages_prev;
 341
 342    /* total handled target pages at the beginning of period */
 343    uint64_t target_page_count_prev;
 344    /* total handled target pages since start */
 345    uint64_t target_page_count;
 346    /* number of dirty bits in the bitmap */
 347    uint64_t migration_dirty_pages;
 348    /* Protects modification of the bitmap and migration dirty pages */
 349    QemuMutex bitmap_mutex;
 350    /* The RAMBlock used in the last src_page_requests */
 351    RAMBlock *last_req_rb;
 352    /* Queue of outstanding page requests from the destination */
 353    QemuMutex src_page_req_mutex;
 354    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 355};
 356typedef struct RAMState RAMState;
 357
 358static RAMState *ram_state;
 359
 360static NotifierWithReturnList precopy_notifier_list;
 361
 362void precopy_infrastructure_init(void)
 363{
 364    notifier_with_return_list_init(&precopy_notifier_list);
 365}
 366
 367void precopy_add_notifier(NotifierWithReturn *n)
 368{
 369    notifier_with_return_list_add(&precopy_notifier_list, n);
 370}
 371
 372void precopy_remove_notifier(NotifierWithReturn *n)
 373{
 374    notifier_with_return_remove(n);
 375}
 376
 377int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 378{
 379    PrecopyNotifyData pnd;
 380    pnd.reason = reason;
 381    pnd.errp = errp;
 382
 383    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 384}
 385
 386uint64_t ram_bytes_remaining(void)
 387{
 388    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 389                       0;
 390}
 391
 392MigrationStats ram_counters;
 393
 394/* used by the search for pages to send */
 395struct PageSearchStatus {
 396    /* Current block being searched */
 397    RAMBlock    *block;
 398    /* Current page to search from */
 399    unsigned long page;
 400    /* Set once we wrap around */
 401    bool         complete_round;
 402};
 403typedef struct PageSearchStatus PageSearchStatus;
 404
 405CompressionStats compression_counters;
 406
 407struct CompressParam {
 408    bool done;
 409    bool quit;
 410    bool zero_page;
 411    QEMUFile *file;
 412    QemuMutex mutex;
 413    QemuCond cond;
 414    RAMBlock *block;
 415    ram_addr_t offset;
 416
 417    /* internally used fields */
 418    z_stream stream;
 419    uint8_t *originbuf;
 420};
 421typedef struct CompressParam CompressParam;
 422
 423struct DecompressParam {
 424    bool done;
 425    bool quit;
 426    QemuMutex mutex;
 427    QemuCond cond;
 428    void *des;
 429    uint8_t *compbuf;
 430    int len;
 431    z_stream stream;
 432};
 433typedef struct DecompressParam DecompressParam;
 434
 435static CompressParam *comp_param;
 436static QemuThread *compress_threads;
 437/* comp_done_cond is used to wake up the migration thread when
 438 * one of the compression threads has finished the compression.
 439 * comp_done_lock is used to co-work with comp_done_cond.
 440 */
 441static QemuMutex comp_done_lock;
 442static QemuCond comp_done_cond;
  443/* The empty QEMUFileOps will be used by the file in CompressParam */
 444static const QEMUFileOps empty_ops = { };
 445
 446static QEMUFile *decomp_file;
 447static DecompressParam *decomp_param;
 448static QemuThread *decompress_threads;
 449static QemuMutex decomp_done_lock;
 450static QemuCond decomp_done_cond;
 451
 452static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 453                                 ram_addr_t offset, uint8_t *source_buf);
 454
 455static void *do_data_compress(void *opaque)
 456{
 457    CompressParam *param = opaque;
 458    RAMBlock *block;
 459    ram_addr_t offset;
 460    bool zero_page;
 461
 462    qemu_mutex_lock(&param->mutex);
 463    while (!param->quit) {
 464        if (param->block) {
 465            block = param->block;
 466            offset = param->offset;
 467            param->block = NULL;
 468            qemu_mutex_unlock(&param->mutex);
 469
 470            zero_page = do_compress_ram_page(param->file, &param->stream,
 471                                             block, offset, param->originbuf);
 472
 473            qemu_mutex_lock(&comp_done_lock);
 474            param->done = true;
 475            param->zero_page = zero_page;
 476            qemu_cond_signal(&comp_done_cond);
 477            qemu_mutex_unlock(&comp_done_lock);
 478
 479            qemu_mutex_lock(&param->mutex);
 480        } else {
 481            qemu_cond_wait(&param->cond, &param->mutex);
 482        }
 483    }
 484    qemu_mutex_unlock(&param->mutex);
 485
 486    return NULL;
 487}
 488
 489static void compress_threads_save_cleanup(void)
 490{
 491    int i, thread_count;
 492
 493    if (!migrate_use_compression() || !comp_param) {
 494        return;
 495    }
 496
 497    thread_count = migrate_compress_threads();
 498    for (i = 0; i < thread_count; i++) {
 499        /*
  500         * we use it as an indicator of whether the thread has been
  501         * properly initialized or not
 502         */
 503        if (!comp_param[i].file) {
 504            break;
 505        }
 506
 507        qemu_mutex_lock(&comp_param[i].mutex);
 508        comp_param[i].quit = true;
 509        qemu_cond_signal(&comp_param[i].cond);
 510        qemu_mutex_unlock(&comp_param[i].mutex);
 511
 512        qemu_thread_join(compress_threads + i);
 513        qemu_mutex_destroy(&comp_param[i].mutex);
 514        qemu_cond_destroy(&comp_param[i].cond);
 515        deflateEnd(&comp_param[i].stream);
 516        g_free(comp_param[i].originbuf);
 517        qemu_fclose(comp_param[i].file);
 518        comp_param[i].file = NULL;
 519    }
 520    qemu_mutex_destroy(&comp_done_lock);
 521    qemu_cond_destroy(&comp_done_cond);
 522    g_free(compress_threads);
 523    g_free(comp_param);
 524    compress_threads = NULL;
 525    comp_param = NULL;
 526}
 527
 528static int compress_threads_save_setup(void)
 529{
 530    int i, thread_count;
 531
 532    if (!migrate_use_compression()) {
 533        return 0;
 534    }
 535    thread_count = migrate_compress_threads();
 536    compress_threads = g_new0(QemuThread, thread_count);
 537    comp_param = g_new0(CompressParam, thread_count);
 538    qemu_cond_init(&comp_done_cond);
 539    qemu_mutex_init(&comp_done_lock);
 540    for (i = 0; i < thread_count; i++) {
 541        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 542        if (!comp_param[i].originbuf) {
 543            goto exit;
 544        }
 545
 546        if (deflateInit(&comp_param[i].stream,
 547                        migrate_compress_level()) != Z_OK) {
 548            g_free(comp_param[i].originbuf);
 549            goto exit;
 550        }
 551
 552        /* comp_param[i].file is just used as a dummy buffer to save data,
 553         * set its ops to empty.
 554         */
 555        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
 556        comp_param[i].done = true;
 557        comp_param[i].quit = false;
 558        qemu_mutex_init(&comp_param[i].mutex);
 559        qemu_cond_init(&comp_param[i].cond);
 560        qemu_thread_create(compress_threads + i, "compress",
 561                           do_data_compress, comp_param + i,
 562                           QEMU_THREAD_JOINABLE);
 563    }
 564    return 0;
 565
 566exit:
 567    compress_threads_save_cleanup();
 568    return -1;
 569}
 570
 571/**
 572 * save_page_header: write page header to wire
 573 *
  574 * If the block differs from the last one sent, also write its identification
 575 *
 576 * Returns the number of bytes written
 577 *
 578 * @f: QEMUFile where to send the data
 579 * @block: block that contains the page we want to send
 580 * @offset: offset inside the block for the page
 581 *          in the lower bits, it contains flags
 582 */
 583static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 584                               ram_addr_t offset)
 585{
 586    size_t size, len;
 587
 588    if (block == rs->last_sent_block) {
 589        offset |= RAM_SAVE_FLAG_CONTINUE;
 590    }
 591    qemu_put_be64(f, offset);
 592    size = 8;
 593
 594    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 595        len = strlen(block->idstr);
 596        qemu_put_byte(f, len);
 597        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 598        size += 1 + len;
 599        rs->last_sent_block = block;
 600    }
 601    return size;
 602}
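     /*
      * For reference, the header written above is:
      *
      *     be64   offset | flags
      *     u8     strlen(block->idstr)   only when RAM_SAVE_FLAG_CONTINUE is
      *     u8[]   block->idstr           not set (i.e. a newly started block)
      */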
 603
 604/**
 605 * mig_throttle_guest_down: throttle down the guest
 606 *
 607 * Reduce amount of guest cpu execution to hopefully slow down memory
 608 * writes. If guest dirty memory rate is reduced below the rate at
 609 * which we can transfer pages to the destination then we should be
 610 * able to complete migration. Some workloads dirty memory way too
 611 * fast and will not effectively converge, even with auto-converge.
 612 */
 613static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 614                                    uint64_t bytes_dirty_threshold)
 615{
 616    MigrationState *s = migrate_get_current();
 617    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 618    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 619    bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 620    int pct_max = s->parameters.max_cpu_throttle;
 621
 622    uint64_t throttle_now = cpu_throttle_get_percentage();
 623    uint64_t cpu_now, cpu_ideal, throttle_inc;
 624
 625    /* We have not started throttling yet. Let's start it. */
 626    if (!cpu_throttle_active()) {
 627        cpu_throttle_set(pct_initial);
 628    } else {
 629        /* Throttling already on, just increase the rate */
 630        if (!pct_tailslow) {
 631            throttle_inc = pct_increment;
 632        } else {
  633            /* Compute the ideal CPU percentage used by the guest, which
  634             * would make the dirty rate match the dirty rate threshold. */
 635            cpu_now = 100 - throttle_now;
 636            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 637                        bytes_dirty_period);
 638            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 639        }
 640        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 641    }
 642}
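     /*
      * Worked example for the tailslow path above (illustrative numbers):
      * with the throttle currently at 40%, cpu_now is 60; if the guest
      * dirtied twice the dirty-bytes threshold
      * (bytes_dirty_threshold / bytes_dirty_period == 0.5), cpu_ideal is 30,
      * so the throttle is raised by MIN(30, pct_increment), capped at pct_max.
      */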
 643
 644void mig_throttle_counter_reset(void)
 645{
 646    RAMState *rs = ram_state;
 647
 648    rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 649    rs->num_dirty_pages_period = 0;
 650    rs->bytes_xfer_prev = ram_counters.transferred;
 651}
 652
 653/**
 654 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 655 *
 656 * @rs: current RAM state
 657 * @current_addr: address for the zero page
 658 *
 659 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 660 * The important thing is that a stale (not-yet-0'd) page be replaced
 661 * by the new data.
 662 * As a bonus, if the page wasn't in the cache it gets added so that
 663 * when a small write is made into the 0'd page it gets XBZRLE sent.
 664 */
 665static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 666{
 667    if (!rs->xbzrle_enabled) {
 668        return;
 669    }
 670
 671    /* We don't care if this fails to allocate a new cache page
 672     * as long as it updated an old one */
 673    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 674                 ram_counters.dirty_sync_count);
 675}
 676
 677#define ENCODING_FLAG_XBZRLE 0x1
 678
 679/**
 680 * save_xbzrle_page: compress and send current page
 681 *
 682 * Returns: 1 means that we wrote the page
 683 *          0 means that page is identical to the one already sent
 684 *          -1 means that xbzrle would be longer than normal
 685 *
 686 * @rs: current RAM state
 687 * @current_data: pointer to the address of the page contents
 688 * @current_addr: addr of the page
 689 * @block: block that contains the page we want to send
 690 * @offset: offset inside the block for the page
 691 * @last_stage: if we are at the completion stage
 692 */
 693static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 694                            ram_addr_t current_addr, RAMBlock *block,
 695                            ram_addr_t offset, bool last_stage)
 696{
 697    int encoded_len = 0, bytes_xbzrle;
 698    uint8_t *prev_cached_page;
 699
 700    if (!cache_is_cached(XBZRLE.cache, current_addr,
 701                         ram_counters.dirty_sync_count)) {
 702        xbzrle_counters.cache_miss++;
 703        if (!last_stage) {
 704            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 705                             ram_counters.dirty_sync_count) == -1) {
 706                return -1;
 707            } else {
 708                /* update *current_data when the page has been
 709                   inserted into cache */
 710                *current_data = get_cached_data(XBZRLE.cache, current_addr);
 711            }
 712        }
 713        return -1;
 714    }
 715
 716    /*
 717     * Reaching here means the page has hit the xbzrle cache, no matter what
 718     * encoding result it is (normal encoding, overflow or skipping the page),
 719     * count the page as encoded. This is used to calculate the encoding rate.
 720     *
 721     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 722     * 2nd page turns out to be skipped (i.e. no new bytes written to the
 723     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 724     * skipped page included. In this way, the encoding rate can tell if the
 725     * guest page is good for xbzrle encoding.
 726     */
 727    xbzrle_counters.pages++;
 728    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 729
 730    /* save current buffer into memory */
 731    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 732
 733    /* XBZRLE encoding (if there is no overflow) */
 734    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 735                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 736                                       TARGET_PAGE_SIZE);
 737
 738    /*
 739     * Update the cache contents, so that it corresponds to the data
 740     * sent, in all cases except where we skip the page.
 741     */
 742    if (!last_stage && encoded_len != 0) {
 743        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 744        /*
 745         * In the case where we couldn't compress, ensure that the caller
 746         * sends the data from the cache, since the guest might have
 747         * changed the RAM since we copied it.
 748         */
 749        *current_data = prev_cached_page;
 750    }
 751
 752    if (encoded_len == 0) {
 753        trace_save_xbzrle_page_skipping();
 754        return 0;
 755    } else if (encoded_len == -1) {
 756        trace_save_xbzrle_page_overflow();
 757        xbzrle_counters.overflow++;
 758        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 759        return -1;
 760    }
 761
 762    /* Send XBZRLE based compressed page */
 763    bytes_xbzrle = save_page_header(rs, rs->f, block,
 764                                    offset | RAM_SAVE_FLAG_XBZRLE);
 765    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 766    qemu_put_be16(rs->f, encoded_len);
 767    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 768    bytes_xbzrle += encoded_len + 1 + 2;
 769    /*
 770     * Like compressed_size (please see update_compress_thread_counts),
 771     * the xbzrle encoded bytes don't count the 8 byte header with
 772     * RAM_SAVE_FLAG_CONTINUE.
 773     */
 774    xbzrle_counters.bytes += bytes_xbzrle - 8;
 775    ram_counters.transferred += bytes_xbzrle;
 776
 777    return 1;
 778}
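     /*
      * For reference, an XBZRLE-encoded page on the wire is:
      *
      *     save_page_header(offset | RAM_SAVE_FLAG_XBZRLE)
      *     u8     ENCODING_FLAG_XBZRLE
      *     be16   encoded_len
      *     u8[]   XBZRLE.encoded_buf[encoded_len]
      */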
 779
 780/**
 781 * migration_bitmap_find_dirty: find the next dirty page from start
 782 *
 783 * Returns the page offset within memory region of the start of a dirty page
 784 *
 785 * @rs: current RAM state
 786 * @rb: RAMBlock where to search for dirty pages
 787 * @start: page where we start the search
 788 */
 789static inline
 790unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 791                                          unsigned long start)
 792{
 793    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 794    unsigned long *bitmap = rb->bmap;
 795
 796    if (ramblock_is_ignored(rb)) {
 797        return size;
 798    }
 799
 800    return find_next_bit(bitmap, size, start);
 801}
 802
 803static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
 804                                                       unsigned long page)
 805{
 806    uint8_t shift;
 807    hwaddr size, start;
 808
 809    if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
 810        return;
 811    }
 812
 813    shift = rb->clear_bmap_shift;
 814    /*
  815     * CLEAR_BITMAP_SHIFT_MIN should always guarantee this.  It
  816     * makes things easier, since the start address of each small
  817     * chunk will then always be aligned to 64 pages, so the
  818     * bitmap will always be aligned to unsigned long.  We could
  819     * probably even remove this restriction, but it is simply
  820     * kept for now.
  821     */
 822    assert(shift >= 6);
 823
 824    size = 1ULL << (TARGET_PAGE_BITS + shift);
 825    start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
 826    trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 827    memory_region_clear_dirty_bitmap(rb->mr, start, size);
 828}
 829
 830static void
 831migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
 832                                                 unsigned long start,
 833                                                 unsigned long npages)
 834{
 835    unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
 836    unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
 837    unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
 838
 839    /*
 840     * Clear pages from start to start + npages - 1, so the end boundary is
 841     * exclusive.
 842     */
 843    for (i = chunk_start; i < chunk_end; i += chunk_pages) {
 844        migration_clear_memory_region_dirty_bitmap(rb, i);
 845    }
 846}
 847
 848/*
  849 * colo_bitmap_find_dirty: find contiguous dirty pages from start
  850 *
  851 * Returns the page offset within the memory region of the start of the
  852 * contiguous dirty pages
 853 *
 854 * @rs: current RAM state
 855 * @rb: RAMBlock where to search for dirty pages
 856 * @start: page where we start the search
 857 * @num: the number of contiguous dirty pages
 858 */
 859static inline
 860unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 861                                     unsigned long start, unsigned long *num)
 862{
 863    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 864    unsigned long *bitmap = rb->bmap;
 865    unsigned long first, next;
 866
 867    *num = 0;
 868
 869    if (ramblock_is_ignored(rb)) {
 870        return size;
 871    }
 872
 873    first = find_next_bit(bitmap, size, start);
 874    if (first >= size) {
 875        return first;
 876    }
 877    next = find_next_zero_bit(bitmap, size, first + 1);
 878    assert(next >= first);
 879    *num = next - first;
 880    return first;
 881}
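     /*
      * Example (illustrative): if pages 4..7 of @rb are dirty and @start is 0,
      * the function above returns 4 and sets *num to 4.
      */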
 882
 883static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 884                                                RAMBlock *rb,
 885                                                unsigned long page)
 886{
 887    bool ret;
 888
  889    /*
  890     * Clear the dirty bitmap if needed.  This _must_ be called before
  891     * we send any page in the chunk, because we need to make sure we
  892     * can capture further page content changes when we sync the dirty
  893     * log the next time.  So as long as we are going to send any page
  894     * in the chunk, we clear the remote dirty bitmap for all of them.
  895     * Clearing it earlier won't be a problem, but too late will.
  896     */
 897    migration_clear_memory_region_dirty_bitmap(rb, page);
 898
 899    ret = test_and_clear_bit(page, rb->bmap);
 900    if (ret) {
 901        rs->migration_dirty_pages--;
 902    }
 903
 904    return ret;
 905}
 906
 907static void dirty_bitmap_clear_section(MemoryRegionSection *section,
 908                                       void *opaque)
 909{
 910    const hwaddr offset = section->offset_within_region;
 911    const hwaddr size = int128_get64(section->size);
 912    const unsigned long start = offset >> TARGET_PAGE_BITS;
 913    const unsigned long npages = size >> TARGET_PAGE_BITS;
 914    RAMBlock *rb = section->mr->ram_block;
 915    uint64_t *cleared_bits = opaque;
 916
 917    /*
 918     * We don't grab ram_state->bitmap_mutex because we expect to run
 919     * only when starting migration or during postcopy recovery where
 920     * we don't have concurrent access.
 921     */
 922    if (!migration_in_postcopy() && !migrate_background_snapshot()) {
 923        migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
 924    }
 925    *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
 926    bitmap_clear(rb->bmap, start, npages);
 927}
 928
 929/*
 930 * Exclude all dirty pages from migration that fall into a discarded range as
 931 * managed by a RamDiscardManager responsible for the mapped memory region of
 932 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
 933 *
 934 * Discarded pages ("logically unplugged") have undefined content and must
 935 * not get migrated, because even reading these pages for migration might
 936 * result in undesired behavior.
 937 *
 938 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
 939 *
 940 * Note: The result is only stable while migrating (precopy/postcopy).
 941 */
 942static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
 943{
 944    uint64_t cleared_bits = 0;
 945
 946    if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
 947        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
 948        MemoryRegionSection section = {
 949            .mr = rb->mr,
 950            .offset_within_region = 0,
 951            .size = int128_make64(qemu_ram_get_used_length(rb)),
 952        };
 953
 954        ram_discard_manager_replay_discarded(rdm, &section,
 955                                             dirty_bitmap_clear_section,
 956                                             &cleared_bits);
 957    }
 958    return cleared_bits;
 959}
 960
 961/*
 962 * Check if a host-page aligned page falls into a discarded range as managed by
 963 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
 964 *
 965 * Note: The result is only stable while migrating (precopy/postcopy).
 966 */
 967bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
 968{
 969    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
 970        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
 971        MemoryRegionSection section = {
 972            .mr = rb->mr,
 973            .offset_within_region = start,
 974            .size = int128_make64(qemu_ram_pagesize(rb)),
 975        };
 976
 977        return !ram_discard_manager_is_populated(rdm, &section);
 978    }
 979    return false;
 980}
 981
 982/* Called with RCU critical section */
 983static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 984{
 985    uint64_t new_dirty_pages =
 986        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
 987
 988    rs->migration_dirty_pages += new_dirty_pages;
 989    rs->num_dirty_pages_period += new_dirty_pages;
 990}
 991
 992/**
 993 * ram_pagesize_summary: calculate all the pagesizes of a VM
 994 *
 995 * Returns a summary bitmap of the page sizes of all RAMBlocks
 996 *
 997 * For VMs with just normal pages this is equivalent to the host page
  998 * size. If it has some huge pages then it's the OR of all the
 999 * different page sizes.
1000 */
1001uint64_t ram_pagesize_summary(void)
1002{
1003    RAMBlock *block;
1004    uint64_t summary = 0;
1005
1006    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1007        summary |= block->page_size;
1008    }
1009
1010    return summary;
1011}
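     /*
      * Example (illustrative): a VM backed by 4 KiB pages plus some 2 MiB
      * hugepages yields 0x1000 | 0x200000 == 0x201000.
      */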
1012
1013uint64_t ram_get_total_transferred_pages(void)
1014{
1015    return  ram_counters.normal + ram_counters.duplicate +
1016                compression_counters.pages + xbzrle_counters.pages;
1017}
1018
1019static void migration_update_rates(RAMState *rs, int64_t end_time)
1020{
1021    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1022    double compressed_size;
1023
1024    /* calculate period counters */
1025    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1026                / (end_time - rs->time_last_bitmap_sync);
1027
1028    if (!page_count) {
1029        return;
1030    }
1031
1032    if (migrate_use_xbzrle()) {
1033        double encoded_size, unencoded_size;
1034
1035        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1036            rs->xbzrle_cache_miss_prev) / page_count;
1037        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1038        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1039                         TARGET_PAGE_SIZE;
1040        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1041        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1042            xbzrle_counters.encoding_rate = 0;
1043        } else {
1044            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1045        }
1046        rs->xbzrle_pages_prev = xbzrle_counters.pages;
1047        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1048    }
1049
1050    if (migrate_use_compression()) {
1051        compression_counters.busy_rate = (double)(compression_counters.busy -
1052            rs->compress_thread_busy_prev) / page_count;
1053        rs->compress_thread_busy_prev = compression_counters.busy;
1054
1055        compressed_size = compression_counters.compressed_size -
1056                          rs->compressed_size_prev;
1057        if (compressed_size) {
1058            double uncompressed_size = (compression_counters.pages -
1059                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1060
1061            /* Compression-Ratio = Uncompressed-size / Compressed-size */
1062            compression_counters.compression_rate =
1063                                        uncompressed_size / compressed_size;
1064
1065            rs->compress_pages_prev = compression_counters.pages;
1066            rs->compressed_size_prev = compression_counters.compressed_size;
1067        }
1068    }
1069}
1070
1071static void migration_trigger_throttle(RAMState *rs)
1072{
1073    MigrationState *s = migrate_get_current();
1074    uint64_t threshold = s->parameters.throttle_trigger_threshold;
1075
1076    uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1077    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1078    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1079
1080    /* During block migration the auto-converge logic incorrectly detects
1081     * that ram migration makes no progress. Avoid this by disabling the
1082     * throttling logic during the bulk phase of block migration. */
1083    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1084        /* The following detection logic can be refined later. For now:
1085           Check to see if the ratio between dirtied bytes and the approx.
1086           amount of bytes that just got transferred since the last time
1087           we were in this routine reaches the threshold. If that happens
1088           twice, start or increase throttling. */
1089
1090        if ((bytes_dirty_period > bytes_dirty_threshold) &&
1091            (++rs->dirty_rate_high_cnt >= 2)) {
1092            trace_migration_throttle();
1093            rs->dirty_rate_high_cnt = 0;
1094            mig_throttle_guest_down(bytes_dirty_period,
1095                                    bytes_dirty_threshold);
1096        }
1097    }
1098}
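     /*
      * Example (illustrative, assuming the default throttle-trigger-threshold
      * of 50): if the guest dirties more than half of the bytes transferred
      * during a period, for two sync periods in a row,
      * mig_throttle_guest_down() is invoked.
      */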
1099
1100static void migration_bitmap_sync(RAMState *rs)
1101{
1102    RAMBlock *block;
1103    int64_t end_time;
1104
1105    ram_counters.dirty_sync_count++;
1106
1107    if (!rs->time_last_bitmap_sync) {
1108        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1109    }
1110
1111    trace_migration_bitmap_sync_start();
1112    memory_global_dirty_log_sync();
1113
1114    qemu_mutex_lock(&rs->bitmap_mutex);
1115    WITH_RCU_READ_LOCK_GUARD() {
1116        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1117            ramblock_sync_dirty_bitmap(rs, block);
1118        }
1119        ram_counters.remaining = ram_bytes_remaining();
1120    }
1121    qemu_mutex_unlock(&rs->bitmap_mutex);
1122
1123    memory_global_after_dirty_log_sync();
1124    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1125
1126    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1127
 1128    /* more than 1 second = 1000 milliseconds */
1129    if (end_time > rs->time_last_bitmap_sync + 1000) {
1130        migration_trigger_throttle(rs);
1131
1132        migration_update_rates(rs, end_time);
1133
1134        rs->target_page_count_prev = rs->target_page_count;
1135
1136        /* reset period counters */
1137        rs->time_last_bitmap_sync = end_time;
1138        rs->num_dirty_pages_period = 0;
1139        rs->bytes_xfer_prev = ram_counters.transferred;
1140    }
1141    if (migrate_use_events()) {
1142        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1143    }
1144}
1145
1146static void migration_bitmap_sync_precopy(RAMState *rs)
1147{
1148    Error *local_err = NULL;
1149
1150    /*
1151     * The current notifier usage is just an optimization to migration, so we
1152     * don't stop the normal migration process in the error case.
1153     */
1154    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1155        error_report_err(local_err);
1156        local_err = NULL;
1157    }
1158
1159    migration_bitmap_sync(rs);
1160
1161    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1162        error_report_err(local_err);
1163    }
1164}
1165
1166/**
1167 * save_zero_page_to_file: send the zero page to the file
1168 *
1169 * Returns the size of data written to the file, 0 means the page is not
1170 * a zero page
1171 *
1172 * @rs: current RAM state
1173 * @file: the file where the data is saved
1174 * @block: block that contains the page we want to send
1175 * @offset: offset inside the block for the page
1176 */
1177static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1178                                  RAMBlock *block, ram_addr_t offset)
1179{
1180    uint8_t *p = block->host + offset;
1181    int len = 0;
1182
1183    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1184        len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1185        qemu_put_byte(file, 0);
1186        len += 1;
1187    }
1188    return len;
1189}
1190
1191/**
1192 * save_zero_page: send the zero page to the stream
1193 *
 1194 * Returns the number of pages written (1), or -1 if the page is not zero.
1195 *
1196 * @rs: current RAM state
1197 * @block: block that contains the page we want to send
1198 * @offset: offset inside the block for the page
1199 */
1200static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1201{
1202    int len = save_zero_page_to_file(rs, rs->f, block, offset);
1203
1204    if (len) {
1205        ram_counters.duplicate++;
1206        ram_counters.transferred += len;
1207        return 1;
1208    }
1209    return -1;
1210}
1211
1212static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1213{
1214    if (!migrate_release_ram() || !migration_in_postcopy()) {
1215        return;
1216    }
1217
1218    ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1219}
1220
1221/*
1222 * @pages: the number of pages written by the control path,
1223 *        < 0 - error
1224 *        > 0 - number of pages written
1225 *
 1226 * Return true if the page has been saved, otherwise false is returned.
1227 */
1228static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1229                              int *pages)
1230{
1231    uint64_t bytes_xmit = 0;
1232    int ret;
1233
1234    *pages = -1;
1235    ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1236                                &bytes_xmit);
1237    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1238        return false;
1239    }
1240
1241    if (bytes_xmit) {
1242        ram_counters.transferred += bytes_xmit;
1243        *pages = 1;
1244    }
1245
1246    if (ret == RAM_SAVE_CONTROL_DELAYED) {
1247        return true;
1248    }
1249
1250    if (bytes_xmit > 0) {
1251        ram_counters.normal++;
1252    } else if (bytes_xmit == 0) {
1253        ram_counters.duplicate++;
1254    }
1255
1256    return true;
1257}
1258
1259/*
1260 * directly send the page to the stream
1261 *
1262 * Returns the number of pages written.
1263 *
1264 * @rs: current RAM state
1265 * @block: block that contains the page we want to send
1266 * @offset: offset inside the block for the page
1267 * @buf: the page to be sent
 1268 * @async: send the page asynchronously
1269 */
1270static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1271                            uint8_t *buf, bool async)
1272{
1273    ram_counters.transferred += save_page_header(rs, rs->f, block,
1274                                                 offset | RAM_SAVE_FLAG_PAGE);
1275    if (async) {
1276        qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1277                              migrate_release_ram() &
1278                              migration_in_postcopy());
1279    } else {
1280        qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1281    }
1282    ram_counters.transferred += TARGET_PAGE_SIZE;
1283    ram_counters.normal++;
1284    return 1;
1285}
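     /*
      * For reference, a normal (uncompressed) page on the wire is:
      *
      *     save_page_header(offset | RAM_SAVE_FLAG_PAGE)
      *     u8[TARGET_PAGE_SIZE]   raw page contents
      */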
1286
1287/**
1288 * ram_save_page: send the given page to the stream
1289 *
1290 * Returns the number of pages written.
1291 *          < 0 - error
1292 *          >=0 - Number of pages written - this might legally be 0
1293 *                if xbzrle noticed the page was the same.
1294 *
1295 * @rs: current RAM state
1296 * @block: block that contains the page we want to send
1297 * @offset: offset inside the block for the page
1298 * @last_stage: if we are at the completion stage
1299 */
1300static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1301{
1302    int pages = -1;
1303    uint8_t *p;
1304    bool send_async = true;
1305    RAMBlock *block = pss->block;
1306    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1307    ram_addr_t current_addr = block->offset + offset;
1308
1309    p = block->host + offset;
1310    trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1311
1312    XBZRLE_cache_lock();
1313    if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1314        pages = save_xbzrle_page(rs, &p, current_addr, block,
1315                                 offset, last_stage);
1316        if (!last_stage) {
1317            /* Can't send this cached data async, since the cache page
1318             * might get updated before it gets to the wire
1319             */
1320            send_async = false;
1321        }
1322    }
1323
1324    /* XBZRLE overflow or normal page */
1325    if (pages == -1) {
1326        pages = save_normal_page(rs, block, offset, p, send_async);
1327    }
1328
1329    XBZRLE_cache_unlock();
1330
1331    return pages;
1332}
1333
1334static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1335                                 ram_addr_t offset)
1336{
1337    if (multifd_queue_page(rs->f, block, offset) < 0) {
1338        return -1;
1339    }
1340    ram_counters.normal++;
1341
1342    return 1;
1343}
1344
1345static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1346                                 ram_addr_t offset, uint8_t *source_buf)
1347{
1348    RAMState *rs = ram_state;
1349    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1350    bool zero_page = false;
1351    int ret;
1352
1353    if (save_zero_page_to_file(rs, f, block, offset)) {
1354        zero_page = true;
1355        goto exit;
1356    }
1357
1358    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1359
1360    /*
 1361     * copy it to an internal buffer to avoid it being modified by the VM
 1362     * so that we can catch any error during compression and
1363     * decompression
1364     */
1365    memcpy(source_buf, p, TARGET_PAGE_SIZE);
1366    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1367    if (ret < 0) {
1368        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1369        error_report("compressed data failed!");
1370        return false;
1371    }
1372
1373exit:
1374    ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1375    return zero_page;
1376}
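     /*
      * For reference, a compression thread emits either a zero-page record
      * (via save_zero_page_to_file above) or:
      *
      *     save_page_header(offset | RAM_SAVE_FLAG_COMPRESS_PAGE)
      *     zlib-compressed page data written by qemu_put_compression_data()
      */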
1377
1378static void
1379update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1380{
1381    ram_counters.transferred += bytes_xmit;
1382
1383    if (param->zero_page) {
1384        ram_counters.duplicate++;
1385        return;
1386    }
1387
1388    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1389    compression_counters.compressed_size += bytes_xmit - 8;
1390    compression_counters.pages++;
1391}
1392
1393static bool save_page_use_compression(RAMState *rs);
1394
1395static void flush_compressed_data(RAMState *rs)
1396{
1397    int idx, len, thread_count;
1398
1399    if (!save_page_use_compression(rs)) {
1400        return;
1401    }
1402    thread_count = migrate_compress_threads();
1403
1404    qemu_mutex_lock(&comp_done_lock);
1405    for (idx = 0; idx < thread_count; idx++) {
1406        while (!comp_param[idx].done) {
1407            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1408        }
1409    }
1410    qemu_mutex_unlock(&comp_done_lock);
1411
1412    for (idx = 0; idx < thread_count; idx++) {
1413        qemu_mutex_lock(&comp_param[idx].mutex);
1414        if (!comp_param[idx].quit) {
1415            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1416            /*
1417             * it's safe to fetch zero_page without holding comp_done_lock
1418             * as there is no further request submitted to the thread,
 1419             * i.e., the thread should be waiting for a request at this point.
1420             */
1421            update_compress_thread_counts(&comp_param[idx], len);
1422        }
1423        qemu_mutex_unlock(&comp_param[idx].mutex);
1424    }
1425}
1426
1427static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1428                                       ram_addr_t offset)
1429{
1430    param->block = block;
1431    param->offset = offset;
1432}
1433
1434static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1435                                           ram_addr_t offset)
1436{
1437    int idx, thread_count, bytes_xmit = -1, pages = -1;
1438    bool wait = migrate_compress_wait_thread();
1439
1440    thread_count = migrate_compress_threads();
1441    qemu_mutex_lock(&comp_done_lock);
1442retry:
1443    for (idx = 0; idx < thread_count; idx++) {
1444        if (comp_param[idx].done) {
1445            comp_param[idx].done = false;
1446            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1447            qemu_mutex_lock(&comp_param[idx].mutex);
1448            set_compress_params(&comp_param[idx], block, offset);
1449            qemu_cond_signal(&comp_param[idx].cond);
1450            qemu_mutex_unlock(&comp_param[idx].mutex);
1451            pages = 1;
1452            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1453            break;
1454        }
1455    }
1456
1457    /*
1458     * wait for the free thread if the user specifies 'compress-wait-thread',
 1459     * otherwise we will post the page out in the main thread as a normal page.
1460     */
1461    if (pages < 0 && wait) {
1462        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1463        goto retry;
1464    }
1465    qemu_mutex_unlock(&comp_done_lock);
1466
1467    return pages;
1468}
1469
1470/**
1471 * find_dirty_block: find the next dirty page and update any state
1472 * associated with the search process.
1473 *
1474 * Returns true if a page is found
1475 *
1476 * @rs: current RAM state
1477 * @pss: data about the state of the current dirty page scan
1478 * @again: set to false if the search has scanned the whole of RAM
1479 */
1480static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1481{
1482    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1483    if (pss->complete_round && pss->block == rs->last_seen_block &&
1484        pss->page >= rs->last_page) {
1485        /*
1486         * We've been once around the RAM and haven't found anything.
1487         * Give up.
1488         */
1489        *again = false;
1490        return false;
1491    }
1492    if (!offset_in_ramblock(pss->block,
1493                            ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1494        /* Didn't find anything in this RAM Block */
1495        pss->page = 0;
1496        pss->block = QLIST_NEXT_RCU(pss->block, next);
1497        if (!pss->block) {
1498            /*
 1499             * which may still exist in the compression threads' ring, so we
1500             * which may still exists in compression threads's ring, so we
1501             * should flush the compressed data to make sure the new page
1502             * is not overwritten by the old one in the destination.
1503             *
 1504             * Also, if xbzrle is on, stop using the data compression at this
1505             * point. In theory, xbzrle can do better than compression.
1506             */
1507            flush_compressed_data(rs);
1508
1509            /* Hit the end of the list */
1510            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1511            /* Flag that we've looped */
1512            pss->complete_round = true;
1513            /* After the first round, enable XBZRLE. */
1514            if (migrate_use_xbzrle()) {
1515                rs->xbzrle_enabled = true;
1516            }
1517        }
1518        /* Didn't find anything this time, but try again on the new block */
1519        *again = true;
1520        return false;
1521    } else {
1522        /* Can go around again, but... */
1523        *again = true;
1524        /* We've found something so probably don't need to */
1525        return true;
1526    }
1527}
1528
1529/**
 1530 * unqueue_page: gets a page off the queue
1531 *
1532 * Helper for 'get_queued_page' - gets a page off the queue
1533 *
1534 * Returns the block of the page (or NULL if none available)
1535 *
1536 * @rs: current RAM state
1537 * @offset: used to return the offset within the RAMBlock
1538 */
1539static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1540{
1541    RAMBlock *block = NULL;
1542
1543    if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1544        return NULL;
1545    }
1546
1547    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1548    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1549        struct RAMSrcPageRequest *entry =
1550                                QSIMPLEQ_FIRST(&rs->src_page_requests);
1551        block = entry->rb;
1552        *offset = entry->offset;
1553
1554        if (entry->len > TARGET_PAGE_SIZE) {
1555            entry->len -= TARGET_PAGE_SIZE;
1556            entry->offset += TARGET_PAGE_SIZE;
1557        } else {
1558            memory_region_unref(block->mr);
1559            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1560            g_free(entry);
1561            migration_consume_urgent_request();
1562        }
1563    }
1564
1565    return block;
1566}
1567
1568#if defined(__linux__)
1569/**
1570 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1571 *   is found, return RAM block pointer and page offset
1572 *
1573 * Returns pointer to the RAMBlock containing faulting page,
1574 *   NULL if no write faults are pending
1575 *
1576 * @rs: current RAM state
1577 * @offset: page offset from the beginning of the block
1578 */
1579static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1580{
1581    struct uffd_msg uffd_msg;
1582    void *page_address;
1583    RAMBlock *block;
1584    int res;
1585
1586    if (!migrate_background_snapshot()) {
1587        return NULL;
1588    }
1589
1590    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1591    if (res <= 0) {
1592        return NULL;
1593    }
1594
1595    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1596    block = qemu_ram_block_from_host(page_address, false, offset);
1597    assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1598    return block;
1599}
1600
1601/**
1602 * ram_save_release_protection: release UFFD write protection after
1603 *   a range of pages has been saved
1604 *
1605 * @rs: current RAM state
1606 * @pss: page-search-status structure
1607 * @start_page: index of the first page in the range relative to pss->block
1608 *
1609 * Returns 0 on success, negative value in case of an error
1610 */
1611static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1612        unsigned long start_page)
1613{
1614    int res = 0;
1615
1616    /* Check if page is from UFFD-managed region. */
1617    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1618        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1619        uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1620
1621        /* Flush async buffers before un-protect. */
1622        qemu_fflush(rs->f);
1623        /* Un-protect memory range. */
1624        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1625                false, false);
1626    }
1627
1628    return res;
1629}
1630
1631/* ram_write_tracking_available: check if kernel supports required UFFD features
1632 *
1633 * Returns true if supported, false otherwise
1634 */
1635bool ram_write_tracking_available(void)
1636{
1637    uint64_t uffd_features;
1638    int res;
1639
1640    res = uffd_query_features(&uffd_features);
1641    return (res == 0 &&
1642            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1643}
1644
1645/* ram_write_tracking_compatible: check if guest configuration is
1646 *   compatible with 'write-tracking'
1647 *
1648 * Returns true if compatible, false otherwise
1649 */
1650bool ram_write_tracking_compatible(void)
1651{
1652    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1653    int uffd_fd;
1654    RAMBlock *block;
1655    bool ret = false;
1656
1657    /* Open UFFD file descriptor */
1658    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1659    if (uffd_fd < 0) {
1660        return false;
1661    }
1662
1663    RCU_READ_LOCK_GUARD();
1664
1665    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1666        uint64_t uffd_ioctls;
1667
1668        /* Nothing to do with read-only and MMIO-writable regions */
1669        if (block->mr->readonly || block->mr->rom_device) {
1670            continue;
1671        }
1672        /* Try to register block memory via UFFD-IO to track writes */
1673        if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1674                UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1675            goto out;
1676        }
1677        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1678            goto out;
1679        }
1680    }
1681    ret = true;
1682
1683out:
1684    uffd_close_fd(uffd_fd);
1685    return ret;
1686}
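
/*
 * Illustrative sketch (added; not part of the original file): a caller that
 * enables the background-snapshot capability is expected to gate it on both
 * checks above, roughly like this (option plumbing omitted, 'errp' assumed):
 *
 *     if (!ram_write_tracking_available() ||
 *         !ram_write_tracking_compatible()) {
 *         error_setg(errp, "UFFD-WP based snapshotting is not supported");
 *         return false;
 *     }
 */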
1687
1688static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1689                                       ram_addr_t size)
1690{
1691    const ram_addr_t end = offset + size;
1692    /*
1693     * Read one byte of each page; this preallocates page tables if required
1694     * and populates the shared zeropage on MAP_PRIVATE anonymous memory
1695     * where no page was populated yet (may need adaptation for e.g. shmem).
1696     */
1697    for (; offset < end; offset += block->page_size) {
1698        char tmp = *((char *)block->host + offset);
1699
1700        /* Don't optimize the read out */
1701        asm volatile("" : "+r" (tmp));
1702    }
1703}
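
/*
 * Note (added for clarity): the empty asm statement above with a "+r"
 * constraint only tells the compiler that 'tmp' is consumed, so the load is
 * not optimized away.  A roughly equivalent way to force the read would be a
 * volatile access, e.g.:
 *
 *     (void)*((volatile char *)block->host + offset);
 */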
1704
1705static inline int populate_read_section(MemoryRegionSection *section,
1706                                        void *opaque)
1707{
1708    const hwaddr size = int128_get64(section->size);
1709    hwaddr offset = section->offset_within_region;
1710    RAMBlock *block = section->mr->ram_block;
1711
1712    populate_read_range(block, offset, size);
1713    return 0;
1714}
1715
1716/*
1717 * ram_block_populate_read: preallocate page tables and populate pages in the
1718 *   RAM block by reading a byte of each page.
1719 *
1720 * Since it's solely used for the userfault_fd WP feature, we simply walk
1721 *   the block at its own page size granularity.
1722 *
1723 * @rb: RAM block to populate
1724 */
1725static void ram_block_populate_read(RAMBlock *rb)
1726{
1727    /*
1728     * Skip populating all pages that fall into a discarded range as managed by
1729     * a RamDiscardManager responsible for the mapped memory region of the
1730     * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1731     * must not get populated automatically. We don't have to track
1732     * modifications via userfaultfd WP reliably, because these pages will
1733     * not be part of the migration stream either way -- see
1734     * ramblock_dirty_bitmap_exclude_discarded_pages().
1735     *
1736     * Note: The result is only stable while migrating (precopy/postcopy).
1737     */
1738    if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1739        RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1740        MemoryRegionSection section = {
1741            .mr = rb->mr,
1742            .offset_within_region = 0,
1743            .size = rb->mr->size,
1744        };
1745
1746        ram_discard_manager_replay_populated(rdm, &section,
1747                                             populate_read_section, NULL);
1748    } else {
1749        populate_read_range(rb, 0, rb->used_length);
1750    }
1751}
1752
1753/*
1754 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1755 */
1756void ram_write_tracking_prepare(void)
1757{
1758    RAMBlock *block;
1759
1760    RCU_READ_LOCK_GUARD();
1761
1762    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1763        /* Nothing to do with read-only and MMIO-writable regions */
1764        if (block->mr->readonly || block->mr->rom_device) {
1765            continue;
1766        }
1767
1768        /*
1769         * Populate pages of the RAM block before enabling userfault_fd
1770         * write protection.
1771         *
1772         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1773         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1774         * pages with pte_none() entries in page table.
1775         */
1776        ram_block_populate_read(block);
1777    }
1778}
1779
1780/*
1781 * ram_write_tracking_start: start UFFD-WP memory tracking
1782 *
1783 * Returns 0 for success or negative value in case of error
1784 */
1785int ram_write_tracking_start(void)
1786{
1787    int uffd_fd;
1788    RAMState *rs = ram_state;
1789    RAMBlock *block;
1790
1791    /* Open UFFD file descriptor */
1792    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1793    if (uffd_fd < 0) {
1794        return uffd_fd;
1795    }
1796    rs->uffdio_fd = uffd_fd;
1797
1798    RCU_READ_LOCK_GUARD();
1799
1800    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1801        /* Nothing to do with read-only and MMIO-writable regions */
1802        if (block->mr->readonly || block->mr->rom_device) {
1803            continue;
1804        }
1805
1806        /* Register block memory with UFFD to track writes */
1807        if (uffd_register_memory(rs->uffdio_fd, block->host,
1808                block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1809            goto fail;
1810        }
1811        /* Apply UFFD write protection to the block memory range */
1812        if (uffd_change_protection(rs->uffdio_fd, block->host,
1813                block->max_length, true, false)) {
1814            goto fail;
1815        }
1816        block->flags |= RAM_UF_WRITEPROTECT;
1817        memory_region_ref(block->mr);
1818
1819        trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1820                block->host, block->max_length);
1821    }
1822
1823    return 0;
1824
1825fail:
1826    error_report("ram_write_tracking_start() failed: restoring initial memory state");
1827
1828    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1829        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1830            continue;
1831        }
1832        /*
1833         * Some memory block failed to be write-protected: remove the
1834         * protection from, and unregister, all RAM blocks that succeeded.
1835         */
1836        uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1837                false, false);
1838        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1839        /* Cleanup flags and remove reference */
1840        block->flags &= ~RAM_UF_WRITEPROTECT;
1841        memory_region_unref(block->mr);
1842    }
1843
1844    uffd_close_fd(uffd_fd);
1845    rs->uffdio_fd = -1;
1846    return -1;
1847}
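
/*
 * Lifecycle sketch (illustrative only; the exact call sites live elsewhere
 * in the migration code):
 *
 *     if (ram_write_tracking_available() && ram_write_tracking_compatible()) {
 *         ram_write_tracking_prepare();   // populate pages before protecting
 *         ram_write_tracking_start();     // register blocks and enable WP
 *         ...                             // save RAM; faulting pages surface
 *                                         // through poll_fault_page()
 *         ram_write_tracking_stop();      // remove WP and unregister blocks
 *     }
 */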
1848
1849/**
1850 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1851 */
1852void ram_write_tracking_stop(void)
1853{
1854    RAMState *rs = ram_state;
1855    RAMBlock *block;
1856
1857    RCU_READ_LOCK_GUARD();
1858
1859    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1860        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1861            continue;
1862        }
1863        /* Remove protection and unregister all affected RAM blocks */
1864        uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1865                false, false);
1866        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1867
1868        trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1869                block->host, block->max_length);
1870
1871        /* Cleanup flags and remove reference */
1872        block->flags &= ~RAM_UF_WRITEPROTECT;
1873        memory_region_unref(block->mr);
1874    }
1875
1876    /* Finally close UFFD file descriptor */
1877    uffd_close_fd(rs->uffdio_fd);
1878    rs->uffdio_fd = -1;
1879}
1880
1881#else
1882/* No target OS support, stubs just fail or ignore */
1883
1884static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1885{
1886    (void) rs;
1887    (void) offset;
1888
1889    return NULL;
1890}
1891
1892static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1893        unsigned long start_page)
1894{
1895    (void) rs;
1896    (void) pss;
1897    (void) start_page;
1898
1899    return 0;
1900}
1901
1902bool ram_write_tracking_available(void)
1903{
1904    return false;
1905}
1906
1907bool ram_write_tracking_compatible(void)
1908{
1909    assert(0);
1910    return false;
1911}
1912
1913int ram_write_tracking_start(void)
1914{
1915    assert(0);
1916    return -1;
1917}
1918
1919void ram_write_tracking_stop(void)
1920{
1921    assert(0);
1922}
1923#endif /* defined(__linux__) */
1924
1925/**
1926 * get_queued_page: unqueue a page from the postcopy requests
1927 *
1928 * Skips pages that are already sent (!dirty)
1929 *
1930 * Returns true if a queued page is found
1931 *
1932 * @rs: current RAM state
1933 * @pss: data about the state of the current dirty page scan
1934 */
1935static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1936{
1937    RAMBlock  *block;
1938    ram_addr_t offset;
1939    bool dirty;
1940
1941    do {
1942        block = unqueue_page(rs, &offset);
1943        /*
1944         * We're sending this page, and since it's postcopy nothing else
1945         * will dirty it, and we must make sure it doesn't get sent again
1946         * even if this queue request was received after the background
1947         * search already sent it.
1948         */
1949        if (block) {
1950            unsigned long page;
1951
1952            page = offset >> TARGET_PAGE_BITS;
1953            dirty = test_bit(page, block->bmap);
1954            if (!dirty) {
1955                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1956                                                page);
1957            } else {
1958                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1959            }
1960        }
1961
1962    } while (block && !dirty);
1963
1964    if (!block) {
1965        /*
1966         * Poll write faults too if background snapshot is enabled; that's
1967         * when vCPUs may be blocked by write-protected pages.
1968         */
1969        block = poll_fault_page(rs, &offset);
1970    }
1971
1972    if (block) {
1973        /*
1974         * We want the background search to continue from the queued page
1975         * since the guest is likely to want other pages near to the page
1976         * it just requested.
1977         */
1978        pss->block = block;
1979        pss->page = offset >> TARGET_PAGE_BITS;
1980
1981        /*
1982         * This unqueued page would break the "one round" check, even if
1983         * it is really rare.
1984         */
1985        pss->complete_round = false;
1986    }
1987
1988    return !!block;
1989}
1990
1991/**
1992 * migration_page_queue_free: drop any remaining pages in the ram
1993 * request queue
1994 *
1995 * It should be empty at the end anyway, but in error cases there may
1996 * be some left.  In case any pages are left, we drop them.
1997 *
1998 */
1999static void migration_page_queue_free(RAMState *rs)
2000{
2001    struct RAMSrcPageRequest *mspr, *next_mspr;
2002    /* This queue should generally be empty - but in the case of a failed
2003     * migration it might have some entries left over.
2004     */
2005    RCU_READ_LOCK_GUARD();
2006    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2007        memory_region_unref(mspr->rb->mr);
2008        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2009        g_free(mspr);
2010    }
2011}
2012
2013/**
2014 * ram_save_queue_pages: queue the page for transmission
2015 *
2016 * A request from postcopy destination for example.
2017 *
2018 * Returns zero on success or negative on error
2019 *
2020 * @rbname: Name of the RAMBlock of the request. NULL means the
2021 *          same as the last one.
2022 * @start: starting address from the start of the RAMBlock
2023 * @len: length (in bytes) to send
2024 */
2025int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2026{
2027    RAMBlock *ramblock;
2028    RAMState *rs = ram_state;
2029
2030    ram_counters.postcopy_requests++;
2031    RCU_READ_LOCK_GUARD();
2032
2033    if (!rbname) {
2034        /* Reuse last RAMBlock */
2035        ramblock = rs->last_req_rb;
2036
2037        if (!ramblock) {
2038            /*
2039             * Shouldn't happen, we can't reuse the last RAMBlock if
2040             * it's the 1st request.
2041             */
2042            error_report("ram_save_queue_pages no previous block");
2043            return -1;
2044        }
2045    } else {
2046        ramblock = qemu_ram_block_by_name(rbname);
2047
2048        if (!ramblock) {
2049            /* We shouldn't be asked for a non-existent RAMBlock */
2050            error_report("ram_save_queue_pages no block '%s'", rbname);
2051            return -1;
2052        }
2053        rs->last_req_rb = ramblock;
2054    }
2055    trace_ram_save_queue_pages(ramblock->idstr, start, len);
2056    if (!offset_in_ramblock(ramblock, start + len - 1)) {
2057        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2058                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2059                     __func__, start, len, ramblock->used_length);
2060        return -1;
2061    }
2062
2063    struct RAMSrcPageRequest *new_entry =
2064        g_malloc0(sizeof(struct RAMSrcPageRequest));
2065    new_entry->rb = ramblock;
2066    new_entry->offset = start;
2067    new_entry->len = len;
2068
2069    memory_region_ref(ramblock->mr);
2070    qemu_mutex_lock(&rs->src_page_req_mutex);
2071    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2072    migration_make_urgent_request();
2073    qemu_mutex_unlock(&rs->src_page_req_mutex);
2074
2075    return 0;
2076}
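
/*
 * Usage illustration (added; block name and offset are made up): a postcopy
 * request from the destination for four target pages of a block named
 * "pc.ram" ends up here as
 *
 *     ram_save_queue_pages("pc.ram", 0x200000, 4 * TARGET_PAGE_SIZE);
 *
 * and a follow-up request for the same block may pass rbname == NULL to
 * reuse rs->last_req_rb.
 */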
2077
2078static bool save_page_use_compression(RAMState *rs)
2079{
2080    if (!migrate_use_compression()) {
2081        return false;
2082    }
2083
2084    /*
2085     * If xbzrle is enabled (e.g., after first round of migration), stop
2086     * using the data compression. In theory, xbzrle can do better than
2087     * compression.
2088     */
2089    if (rs->xbzrle_enabled) {
2090        return false;
2091    }
2092
2093    return true;
2094}
2095
2096/*
2097 * try to compress the page before posting it out, return true if the page
2098 * has been properly handled by compression, otherwise needs other
2099 * paths to handle it
2100 */
2101static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2102{
2103    if (!save_page_use_compression(rs)) {
2104        return false;
2105    }
2106
2107    /*
2108     * When starting the process of a new block, the first page of
2109     * the block should be sent out before other pages in the same
2110     * block, and all the pages in the last block should have been sent
2111     * out.  Keeping this order is important, because the 'cont' flag
2112     * is used to avoid resending the block name.
2113     *
2114     * We post the first page as a normal page since compression will
2115     * take a lot of CPU resources.
2116     */
2117    if (block != rs->last_sent_block) {
2118        flush_compressed_data(rs);
2119        return false;
2120    }
2121
2122    if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2123        return true;
2124    }
2125
2126    compression_counters.busy++;
2127    return false;
2128}
2129
2130/**
2131 * ram_save_target_page: save one target page
2132 *
2133 * Returns the number of pages written
2134 *
2135 * @rs: current RAM state
2136 * @pss: data about the page we want to send
2137 * @last_stage: if we are at the completion stage
2138 */
2139static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2140                                bool last_stage)
2141{
2142    RAMBlock *block = pss->block;
2143    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2144    int res;
2145
2146    if (control_save_page(rs, block, offset, &res)) {
2147        return res;
2148    }
2149
2150    if (save_compress_page(rs, block, offset)) {
2151        return 1;
2152    }
2153
2154    res = save_zero_page(rs, block, offset);
2155    if (res > 0) {
2156        /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2157         * page would be stale
2158         */
2159        if (!save_page_use_compression(rs)) {
2160            XBZRLE_cache_lock();
2161            xbzrle_cache_zero_page(rs, block->offset + offset);
2162            XBZRLE_cache_unlock();
2163        }
2164        ram_release_pages(block->idstr, offset, res);
2165        return res;
2166    }
2167
2168    /*
2169     * Do not use multifd for:
2170     * 1. Compression as the first page in the new block should be posted out
2171     *    before sending the compressed page
2172     * 2. In postcopy as one whole host page should be placed
2173     */
2174    if (!save_page_use_compression(rs) && migrate_use_multifd()
2175        && !migration_in_postcopy()) {
2176        return ram_save_multifd_page(rs, block, offset);
2177    }
2178
2179    return ram_save_page(rs, pss, last_stage);
2180}
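
/*
 * Summary of the dispatch order above (descriptive comment, added):
 *   1. control_save_page()     - RDMA/registered transports get first pick
 *   2. save_compress_page()    - multi-threaded compression, when enabled
 *   3. save_zero_page()        - zero pages are sent as a flag only
 *   4. ram_save_multifd_page() - multifd, unless compressing or in postcopy
 *   5. ram_save_page()         - plain (possibly xbzrle-encoded) page
 */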
2181
2182/**
2183 * ram_save_host_page: save a whole host page
2184 *
2185 * Starting at pss->page, send pages up to the end of the current host
2186 * page. It's valid for the starting page to point into the middle of
2187 * a host page, in which case the remainder of the host page is sent.
2188 * Only dirty target pages are sent. Note that the host page size may
2189 * be a huge page for this block.
2190 * The saving stops at the boundary of the used_length of the block
2191 * if the RAMBlock isn't a multiple of the host page size.
2192 *
2193 * Returns the number of pages written or negative on error
2194 *
2195 * @rs: current RAM state
2197 * @pss: data about the page we want to send
2198 * @last_stage: if we are at the completion stage
2199 */
2200static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2201                              bool last_stage)
2202{
2203    int tmppages, pages = 0;
2204    size_t pagesize_bits =
2205        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2206    unsigned long hostpage_boundary =
2207        QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2208    unsigned long start_page = pss->page;
2209    int res;
2210
2211    if (ramblock_is_ignored(pss->block)) {
2212        error_report("block %s should not be migrated !", pss->block->idstr);
2213        return 0;
2214    }
2215
2216    do {
2217        /* Check if the page is dirty and, if so, send it */
2218        if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2219            tmppages = ram_save_target_page(rs, pss, last_stage);
2220            if (tmppages < 0) {
2221                return tmppages;
2222            }
2223
2224            pages += tmppages;
2225            /*
2226             * Allow rate limiting to happen in the middle of huge pages if
2227             * something is sent in the current iteration.
2228             */
2229            if (pagesize_bits > 1 && tmppages > 0) {
2230                migration_rate_limit();
2231            }
2232        }
2233        pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2234    } while ((pss->page < hostpage_boundary) &&
2235             offset_in_ramblock(pss->block,
2236                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2237    /* The offset we leave with is the min boundary of host page and block */
2238    pss->page = MIN(pss->page, hostpage_boundary) - 1;
2239
2240    res = ram_save_release_protection(rs, pss, start_page);
2241    return (res < 0 ? res : pages);
2242}
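
/*
 * Worked example (added): with 2 MiB host pages and 4 KiB target pages,
 * pagesize_bits == 512.  Entering with pss->page == 1000 gives
 * hostpage_boundary == QEMU_ALIGN_UP(1001, 512) == 1024, so only dirty
 * target pages up to index 1023 (the end of that host page) are sent and
 * pss->page is left at 1023 at most for the caller to advance from.
 */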
2243
2244/**
2245 * ram_find_and_save_block: finds a dirty page and sends it to f
2246 *
2247 * Called within an RCU critical section.
2248 *
2249 * Returns the number of pages written where zero means no dirty pages,
2250 * or negative on error
2251 *
2252 * @rs: current RAM state
2253 * @last_stage: if we are at the completion stage
2254 *
2255 * On systems where host-page-size > target-page-size it will send all the
2256 * pages in a host page that are dirty.
2257 */
2258
2259static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2260{
2261    PageSearchStatus pss;
2262    int pages = 0;
2263    bool again, found;
2264
2265    /* No dirty page as there is zero RAM */
2266    if (!ram_bytes_total()) {
2267        return pages;
2268    }
2269
2270    pss.block = rs->last_seen_block;
2271    pss.page = rs->last_page;
2272    pss.complete_round = false;
2273
2274    if (!pss.block) {
2275        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2276    }
2277
2278    do {
2279        again = true;
2280        found = get_queued_page(rs, &pss);
2281
2282        if (!found) {
2283            /* priority queue empty, so just search for something dirty */
2284            found = find_dirty_block(rs, &pss, &again);
2285        }
2286
2287        if (found) {
2288            pages = ram_save_host_page(rs, &pss, last_stage);
2289        }
2290    } while (!pages && again);
2291
2292    rs->last_seen_block = pss.block;
2293    rs->last_page = pss.page;
2294
2295    return pages;
2296}
2297
2298void acct_update_position(QEMUFile *f, size_t size, bool zero)
2299{
2300    uint64_t pages = size / TARGET_PAGE_SIZE;
2301
2302    if (zero) {
2303        ram_counters.duplicate += pages;
2304    } else {
2305        ram_counters.normal += pages;
2306        ram_counters.transferred += size;
2307        qemu_update_position(f, size);
2308    }
2309}
2310
2311static uint64_t ram_bytes_total_common(bool count_ignored)
2312{
2313    RAMBlock *block;
2314    uint64_t total = 0;
2315
2316    RCU_READ_LOCK_GUARD();
2317
2318    if (count_ignored) {
2319        RAMBLOCK_FOREACH_MIGRATABLE(block) {
2320            total += block->used_length;
2321        }
2322    } else {
2323        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2324            total += block->used_length;
2325        }
2326    }
2327    return total;
2328}
2329
2330uint64_t ram_bytes_total(void)
2331{
2332    return ram_bytes_total_common(false);
2333}
2334
2335static void xbzrle_load_setup(void)
2336{
2337    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2338}
2339
2340static void xbzrle_load_cleanup(void)
2341{
2342    g_free(XBZRLE.decoded_buf);
2343    XBZRLE.decoded_buf = NULL;
2344}
2345
2346static void ram_state_cleanup(RAMState **rsp)
2347{
2348    if (*rsp) {
2349        migration_page_queue_free(*rsp);
2350        qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2351        qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2352        g_free(*rsp);
2353        *rsp = NULL;
2354    }
2355}
2356
2357static void xbzrle_cleanup(void)
2358{
2359    XBZRLE_cache_lock();
2360    if (XBZRLE.cache) {
2361        cache_fini(XBZRLE.cache);
2362        g_free(XBZRLE.encoded_buf);
2363        g_free(XBZRLE.current_buf);
2364        g_free(XBZRLE.zero_target_page);
2365        XBZRLE.cache = NULL;
2366        XBZRLE.encoded_buf = NULL;
2367        XBZRLE.current_buf = NULL;
2368        XBZRLE.zero_target_page = NULL;
2369    }
2370    XBZRLE_cache_unlock();
2371}
2372
2373static void ram_save_cleanup(void *opaque)
2374{
2375    RAMState **rsp = opaque;
2376    RAMBlock *block;
2377
2378    /* We don't use dirty log with background snapshots */
2379    if (!migrate_background_snapshot()) {
2380        /* The caller must hold the iothread lock or be in a BH, so there is
2381         * no write race against the migration bitmap
2382         */
2383        if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2384            /*
2385             * do not stop dirty logging without having started it, since
2386             * memory_global_dirty_log_stop will assert that
2387             * memory_global_dirty_log_start/stop are used in pairs
2388             */
2389            memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2390        }
2391    }
2392
2393    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2394        g_free(block->clear_bmap);
2395        block->clear_bmap = NULL;
2396        g_free(block->bmap);
2397        block->bmap = NULL;
2398    }
2399
2400    xbzrle_cleanup();
2401    compress_threads_save_cleanup();
2402    ram_state_cleanup(rsp);
2403}
2404
2405static void ram_state_reset(RAMState *rs)
2406{
2407    rs->last_seen_block = NULL;
2408    rs->last_sent_block = NULL;
2409    rs->last_page = 0;
2410    rs->last_version = ram_list.version;
2411    rs->xbzrle_enabled = false;
2412}
2413
2414#define MAX_WAIT 50 /* ms, half buffered_file limit */
2415
2416/*
2417 * 'expected' is the value you expect the bitmap mostly to be full
2418 * of; it won't bother printing lines that are all this value.
2419 * 'todump' is the bitmap to dump and 'pages' is its length in bits.
2420 */
2421void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2422                           unsigned long pages)
2423{
2424    int64_t cur;
2425    int64_t linelen = 128;
2426    char linebuf[129];
2427
2428    for (cur = 0; cur < pages; cur += linelen) {
2429        int64_t curb;
2430        bool found = false;
2431        /*
2432         * Last line; catch the case where the line length
2433         * is longer than remaining ram
2434         */
2435        if (cur + linelen > pages) {
2436            linelen = pages - cur;
2437        }
2438        for (curb = 0; curb < linelen; curb++) {
2439            bool thisbit = test_bit(cur + curb, todump);
2440            linebuf[curb] = thisbit ? '1' : '.';
2441            found = found || (thisbit != expected);
2442        }
2443        if (found) {
2444            linebuf[curb] = '\0';
2445            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
2446        }
2447    }
2448}
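
/*
 * Example output (added): with expected == false, a line such as
 *
 *     0x00000080 : ....11111111...............1....
 *
 * means that, starting at page index 0x80, the pages marked '1' are set in
 * the dumped bitmap; lines whose bits all equal 'expected' are not printed.
 */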
2449
2450/* **** functions for postcopy ***** */
2451
2452void ram_postcopy_migrated_memory_release(MigrationState *ms)
2453{
2454    struct RAMBlock *block;
2455
2456    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2457        unsigned long *bitmap = block->bmap;
2458        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2459        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2460
2461        while (run_start < range) {
2462            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2463            ram_discard_range(block->idstr,
2464                              ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2465                              ((ram_addr_t)(run_end - run_start))
2466                                << TARGET_PAGE_BITS);
2467            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2468        }
2469    }
2470}
2471
2472/**
2473 * postcopy_send_discard_bm_ram: discard a RAMBlock
2474 *
2475 * Returns zero on success
2476 *
2477 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2478 *
2479 * @ms: current migration state
2480 * @block: RAMBlock to discard
2481 */
2482static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2483{
2484    unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2485    unsigned long current;
2486    unsigned long *bitmap = block->bmap;
2487
2488    for (current = 0; current < end; ) {
2489        unsigned long one = find_next_bit(bitmap, end, current);
2490        unsigned long zero, discard_length;
2491
2492        if (one >= end) {
2493            break;
2494        }
2495
2496        zero = find_next_zero_bit(bitmap, end, one + 1);
2497
2498        if (zero >= end) {
2499            discard_length = end - one;
2500        } else {
2501            discard_length = zero - one;
2502        }
2503        postcopy_discard_send_range(ms, one, discard_length);
2504        current = one + discard_length;
2505    }
2506
2507    return 0;
2508}
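
/*
 * Worked example (added): for a bitmap in which pages 3..6 and 10..end are
 * dirty, the loop above emits postcopy_discard_send_range(ms, 3, 4) followed
 * by postcopy_discard_send_range(ms, 10, end - 10), i.e. one (start, length)
 * pair per run of dirty pages.
 */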
2509
2510/**
2511 * postcopy_each_ram_send_discard: discard all RAMBlocks
2512 *
2513 * Returns 0 for success or negative for error
2514 *
2515 * Utility for the outgoing postcopy code.
2516 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2517 *   passing it bitmap indexes and name.
2518 * (qemu_ram_foreach_block ends up passing unscaled lengths
2519 *  which would mean postcopy code would have to deal with target page)
2520 *
2521 * @ms: current migration state
2522 */
2523static int postcopy_each_ram_send_discard(MigrationState *ms)
2524{
2525    struct RAMBlock *block;
2526    int ret;
2527
2528    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2529        postcopy_discard_send_init(ms, block->idstr);
2530
2531        /*
2532         * Postcopy sends chunks of bitmap over the wire, but it
2533         * Postcopy sends chunks of bitmap over the wire, but it
2534         * just needs indexes at this point; this avoids it needing
2535         * target page specific code.
2536        ret = postcopy_send_discard_bm_ram(ms, block);
2537        postcopy_discard_send_finish(ms);
2538        if (ret) {
2539            return ret;
2540        }
2541    }
2542
2543    return 0;
2544}
2545
2546/**
2547 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2548 *
2549 * Helper for postcopy_chunk_hostpages; canonicalizes the dirty
2550 * bitmap of a RAMBlock so that host-page-sized chunks are either
2551 * fully dirty or fully clean.
2552 *
2553 * Postcopy requires that all target pages in a hostpage are dirty or
2554 * clean, not a mix.  This function canonicalizes the bitmaps.
2555 *
2556 * @ms: current migration state
2557 * @block: block that contains the page we want to canonicalize
2558 */
2559static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2560{
2561    RAMState *rs = ram_state;
2562    unsigned long *bitmap = block->bmap;
2563    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2564    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2565    unsigned long run_start;
2566
2567    if (block->page_size == TARGET_PAGE_SIZE) {
2568        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2569        return;
2570    }
2571
2572    /* Find a dirty page */
2573    run_start = find_next_bit(bitmap, pages, 0);
2574
2575    while (run_start < pages) {
2576
2577        /*
2578         * If the start of this run of pages is in the middle of a host
2579         * page, then we need to fixup this host page.
2580         */
2581        if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2582            /* Find the end of this run */
2583            run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2584            /*
2585             * If the end isn't at the start of a host page, then the
2586             * run doesn't finish at the end of a host page
2587             * and we need to fix it up.
2588             */
2589        }
2590
2591        if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2592            unsigned long page;
2593            unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2594                                                             host_ratio);
2595            run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2596
2597            /* Clean up the bitmap */
2598            for (page = fixup_start_addr;
2599                 page < fixup_start_addr + host_ratio; page++) {
2600                /*
2601                 * Remark them as dirty, updating the count for any pages
2602                 * that weren't previously dirty.
2603                 */
2604                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2605            }
2606        }
2607
2608        /* Find the next dirty page for the next iteration */
2609        run_start = find_next_bit(bitmap, pages, run_start);
2610    }
2611}
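
/*
 * Worked example (added): with 2 MiB host pages and 4 KiB target pages,
 * host_ratio == 512.  If only target pages 700..800 are dirty, run_start is
 * 700 (not host-page aligned), so the whole containing host page [512, 1024)
 * is re-marked dirty, migration_dirty_pages is bumped for every page that was
 * previously clean, and the search continues from page 1024.
 */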
2612
2613/**
2614 * postcopy_chunk_hostpages: discard any partially sent host page
2615 *
2616 * Utility for the outgoing postcopy code.
2617 *
2618 * Discard any partially sent host-page size chunks, mark any partially
2619 * dirty host-page size chunks as all dirty.  In this case the host-page
2620 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2621 *
2622 * Returns zero on success
2623 *
2624 * @ms: current migration state
2625 * @block: block we want to work with
2626 */
2627static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2628{
2629    postcopy_discard_send_init(ms, block->idstr);
2630
2631    /*
2632     * Ensure that all partially dirty host pages are made fully dirty.
2633     */
2634    postcopy_chunk_hostpages_pass(ms, block);
2635
2636    postcopy_discard_send_finish(ms);
2637    return 0;
2638}
2639
2640/**
2641 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2642 *
2643 * Returns zero on success
2644 *
2645 * Transmit the set of pages to be discarded after precopy to the target;
2646 * these are pages that:
2647 *     a) Have been previously transmitted but are now dirty again
2648 *     b) Have never been transmitted; this ensures that
2649 *        any pages on the destination that have been mapped by background
2650 *        tasks get discarded (transparent huge pages are the specific concern)
2651 * Hopefully this is pretty sparse
2652 *
2653 * @ms: current migration state
2654 */
2655int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2656{
2657    RAMState *rs = ram_state;
2658    RAMBlock *block;
2659    int ret;
2660
2661    RCU_READ_LOCK_GUARD();
2662
2663    /* This should be our last sync, the src is now paused */
2664    migration_bitmap_sync(rs);
2665
2666    /* Easiest way to make sure we don't resume in the middle of a host-page */
2667    rs->last_seen_block = NULL;
2668    rs->last_sent_block = NULL;
2669    rs->last_page = 0;
2670
2671    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2672        /* Deal with TPS != HPS and huge pages */
2673        ret = postcopy_chunk_hostpages(ms, block);
2674        if (ret) {
2675            return ret;
2676        }
2677
2678#ifdef DEBUG_POSTCOPY
2679        ram_debug_dump_bitmap(block->bmap, true,
2680                              block->used_length >> TARGET_PAGE_BITS);
2681#endif
2682    }
2683    trace_ram_postcopy_send_discard_bitmap();
2684
2685    return postcopy_each_ram_send_discard(ms);
2686}
2687
2688/**
2689 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2690 *
2691 * Returns zero on success
2692 *
2693 * @rbname: name of the RAMBlock of the request (must name an
2694 *          existing block)
2695 * @start: byte offset within the RAMBlock
2696 * @length: length in bytes to discard
2697 */
2698int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2699{
2700    trace_ram_discard_range(rbname, start, length);
2701
2702    RCU_READ_LOCK_GUARD();
2703    RAMBlock *rb = qemu_ram_block_by_name(rbname);
2704
2705    if (!rb) {
2706        error_report("ram_discard_range: Failed to find block '%s'", rbname);
2707        return -1;
2708    }
2709
2710    /*
2711     * On source VM, we don't need to update the received bitmap since
2712     * we don't even have one.
2713     */
2714    if (rb->receivedmap) {
2715        bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2716                     length >> qemu_target_page_bits());
2717    }
2718
2719    return ram_block_discard_range(rb, start, length);
2720}
2721
2722/*
2723 * For every allocation, we try not to crash the VM if the
2724 * allocation fails.
2725 */
2726static int xbzrle_init(void)
2727{
2728    Error *local_err = NULL;
2729
2730    if (!migrate_use_xbzrle()) {
2731        return 0;
2732    }
2733
2734    XBZRLE_cache_lock();
2735
2736    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2737    if (!XBZRLE.zero_target_page) {
2738        error_report("%s: Error allocating zero page", __func__);
2739        goto err_out;
2740    }
2741
2742    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2743                              TARGET_PAGE_SIZE, &local_err);
2744    if (!XBZRLE.cache) {
2745        error_report_err(local_err);
2746        goto free_zero_page;
2747    }
2748
2749    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2750    if (!XBZRLE.encoded_buf) {
2751        error_report("%s: Error allocating encoded_buf", __func__);
2752        goto free_cache;
2753    }
2754
2755    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2756    if (!XBZRLE.current_buf) {
2757        error_report("%s: Error allocating current_buf", __func__);
2758        goto free_encoded_buf;
2759    }
2760
2761    /* We are all good */
2762    XBZRLE_cache_unlock();
2763    return 0;
2764
2765free_encoded_buf:
2766    g_free(XBZRLE.encoded_buf);
2767    XBZRLE.encoded_buf = NULL;
2768free_cache:
2769    cache_fini(XBZRLE.cache);
2770    XBZRLE.cache = NULL;
2771free_zero_page:
2772    g_free(XBZRLE.zero_target_page);
2773    XBZRLE.zero_target_page = NULL;
2774err_out:
2775    XBZRLE_cache_unlock();
2776    return -ENOMEM;
2777}
2778
2779static int ram_state_init(RAMState **rsp)
2780{
2781    *rsp = g_try_new0(RAMState, 1);
2782
2783    if (!*rsp) {
2784        error_report("%s: Init ramstate fail", __func__);
2785        return -1;
2786    }
2787
2788    qemu_mutex_init(&(*rsp)->bitmap_mutex);
2789    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2790    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2791
2792    /*
2793     * Count the total number of pages used by ram blocks not including any
2794     * gaps due to alignment or unplugs.
2795     * This must match the initial values of the dirty bitmap.
2796     */
2797    (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2798    ram_state_reset(*rsp);
2799
2800    return 0;
2801}
2802
2803static void ram_list_init_bitmaps(void)
2804{
2805    MigrationState *ms = migrate_get_current();
2806    RAMBlock *block;
2807    unsigned long pages;
2808    uint8_t shift;
2809
2810    /* Skip setting bitmap if there is no RAM */
2811    if (ram_bytes_total()) {
2812        shift = ms->clear_bitmap_shift;
2813        if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2814            error_report("clear_bitmap_shift (%u) too big, using "
2815                         "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2816            shift = CLEAR_BITMAP_SHIFT_MAX;
2817        } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2818            error_report("clear_bitmap_shift (%u) too small, using "
2819                         "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2820            shift = CLEAR_BITMAP_SHIFT_MIN;
2821        }
2822
2823        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2824            pages = block->max_length >> TARGET_PAGE_BITS;
2825            /*
2826             * The initial dirty bitmap for migration must be set with all
2827             * ones to make sure we'll migrate every guest RAM page to the
2828             * destination.
2829             * Here we set RAMBlock.bmap all to 1 because when we restart a
2830             * migration after a failed one, ram_list.
2831             * dirty_memory[DIRTY_MEMORY_MIGRATION] may not cover the whole
2832             * guest memory.
2833             */
2834            block->bmap = bitmap_new(pages);
2835            bitmap_set(block->bmap, 0, pages);
2836            block->clear_bmap_shift = shift;
2837            block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2838        }
2839    }
2840}
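
/*
 * Note on the granularity involved (added): with the default
 * clear_bitmap_shift of 18 and 4 KiB target pages, each clear_bmap bit
 * covers 2^18 pages, i.e. 1 GiB of guest RAM, so clearing of the dirty log
 * in the memory core can be batched at that granularity.
 */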
2841
2842static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2843{
2844    unsigned long pages;
2845    RAMBlock *rb;
2846
2847    RCU_READ_LOCK_GUARD();
2848
2849    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2850        pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2851        rs->migration_dirty_pages -= pages;
2852    }
2853}
2854
2855static void ram_init_bitmaps(RAMState *rs)
2856{
2857    /* For memory_global_dirty_log_start below.  */
2858    qemu_mutex_lock_iothread();
2859    qemu_mutex_lock_ramlist();
2860
2861    WITH_RCU_READ_LOCK_GUARD() {
2862        ram_list_init_bitmaps();
2863        /* We don't use dirty log with background snapshots */
2864        if (!migrate_background_snapshot()) {
2865            memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2866            migration_bitmap_sync_precopy(rs);
2867        }
2868    }
2869    qemu_mutex_unlock_ramlist();
2870    qemu_mutex_unlock_iothread();
2871
2872    /*
2873     * After an eventual first bitmap sync, fixup the initial bitmap
2874     * containing all 1s to exclude any discarded pages from migration.
2875     */
2876    migration_bitmap_clear_discarded_pages(rs);
2877}
2878
2879static int ram_init_all(RAMState **rsp)
2880{
2881    if (ram_state_init(rsp)) {
2882        return -1;
2883    }
2884
2885    if (xbzrle_init()) {
2886        ram_state_cleanup(rsp);
2887        return -1;
2888    }
2889
2890    ram_init_bitmaps(*rsp);
2891
2892    return 0;
2893}
2894
2895static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2896{
2897    RAMBlock *block;
2898    uint64_t pages = 0;
2899
2900    /*
2901     * Postcopy is not using xbzrle/compression, so no need for that.
2902     * Also, since the source is already halted, we don't need to care
2903     * about dirty page logging either.
2904     */
2905
2906    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2907        pages += bitmap_count_one(block->bmap,
2908                                  block->used_length >> TARGET_PAGE_BITS);
2909    }
2910
2911    /* This may not be aligned with current bitmaps. Recalculate. */
2912    rs->migration_dirty_pages = pages;
2913
2914    ram_state_reset(rs);
2915
2916    /* Update RAMState cache of output QEMUFile */
2917    rs->f = out;
2918
2919    trace_ram_state_resume_prepare(pages);
2920}
2921
2922/*
2923 * This function clears bits of the free pages reported by the caller from the
2924 * migration dirty bitmap. @addr is the host address corresponding to the
2925 * start of the continuous guest free pages, and @len is the total bytes of
2926 * those pages.
2927 */
2928void qemu_guest_free_page_hint(void *addr, size_t len)
2929{
2930    RAMBlock *block;
2931    ram_addr_t offset;
2932    size_t used_len, start, npages;
2933    MigrationState *s = migrate_get_current();
2934
2935    /* This function is currently expected to be used during live migration */
2936    if (!migration_is_setup_or_active(s->state)) {
2937        return;
2938    }
2939
2940    for (; len > 0; len -= used_len, addr += used_len) {
2941        block = qemu_ram_block_from_host(addr, false, &offset);
2942        if (unlikely(!block || offset >= block->used_length)) {
2943            /*
2944             * The implementation might not support RAMBlock resize during
2945             * live migration, but it could happen in theory with future
2946             * updates. So we add a check here to capture that case.
2947             */
2948            error_report_once("%s unexpected error", __func__);
2949            return;
2950        }
2951
2952        if (len <= block->used_length - offset) {
2953            used_len = len;
2954        } else {
2955            used_len = block->used_length - offset;
2956        }
2957
2958        start = offset >> TARGET_PAGE_BITS;
2959        npages = used_len >> TARGET_PAGE_BITS;
2960
2961        qemu_mutex_lock(&ram_state->bitmap_mutex);
2962        /*
2963         * From clear_bmap's perspective the skipped free pages count as
2964         * sent, so clear the bits from the memory region bitmap which are
2965         * initially set. Otherwise those skipped pages will be sent in the
2966         * next round after syncing from the memory region bitmap.
2967         */
2968        migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2969        ram_state->migration_dirty_pages -=
2970                      bitmap_count_one_with_offset(block->bmap, start, npages);
2971        bitmap_clear(block->bmap, start, npages);
2972        qemu_mutex_unlock(&ram_state->bitmap_mutex);
2973    }
2974}
2975
2976/*
2977 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
2978 * long-running RCU critical section.  When RCU reclaims in the code
2979 * start to become numerous it will be necessary to reduce the
2980 * granularity of these critical sections.
2981 */
2982
2983/**
2984 * ram_save_setup: Setup RAM for migration
2985 *
2986 * Returns zero to indicate success and negative for error
2987 *
2988 * @f: QEMUFile where to send the data
2989 * @opaque: RAMState pointer
2990 */
2991static int ram_save_setup(QEMUFile *f, void *opaque)
2992{
2993    RAMState **rsp = opaque;
2994    RAMBlock *block;
2995
2996    if (compress_threads_save_setup()) {
2997        return -1;
2998    }
2999
3000    /* migration has already set up the bitmap, reuse it. */
3001    if (!migration_in_colo_state()) {
3002        if (ram_init_all(rsp) != 0) {
3003            compress_threads_save_cleanup();
3004            return -1;
3005        }
3006    }
3007    (*rsp)->f = f;
3008
3009    WITH_RCU_READ_LOCK_GUARD() {
3010        qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3011
3012        RAMBLOCK_FOREACH_MIGRATABLE(block) {
3013            qemu_put_byte(f, strlen(block->idstr));
3014            qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3015            qemu_put_be64(f, block->used_length);
3016            if (migrate_postcopy_ram() && block->page_size !=
3017                                          qemu_host_page_size) {
3018                qemu_put_be64(f, block->page_size);
3019            }
3020            if (migrate_ignore_shared()) {
3021                qemu_put_be64(f, block->mr->addr);
3022            }
3023        }
3024    }
3025
3026    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3027    ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3028
3029    multifd_send_sync_main(f);
3030    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3031    qemu_fflush(f);
3032
3033    return 0;
3034}
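
/*
 * Stream layout produced by ram_save_setup() above (descriptive, added):
 *
 *     be64: total RAM size or'ed with RAM_SAVE_FLAG_MEM_SIZE
 *     for each migratable block:
 *         byte : strlen(idstr)
 *         bytes: idstr (not NUL-terminated)
 *         be64 : used_length
 *         be64 : page_size  (only with postcopy-ram and non-host page size)
 *         be64 : mr->addr   (only with ignore-shared)
 *     be64: RAM_SAVE_FLAG_EOS
 */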
3035
3036/**
3037 * ram_save_iterate: iterative stage for migration
3038 *
3039 * Returns zero to indicate success and negative for error
3040 *
3041 * @f: QEMUFile where to send the data
3042 * @opaque: RAMState pointer
3043 */
3044static int ram_save_iterate(QEMUFile *f, void *opaque)
3045{
3046    RAMState **temp = opaque;
3047    RAMState *rs = *temp;
3048    int ret = 0;
3049    int i;
3050    int64_t t0;
3051    int done = 0;
3052
3053    if (blk_mig_bulk_active()) {
3054        /* Avoid transferring ram during bulk phase of block migration as
3055         * the bulk phase will usually take a long time and transferring
3056         * ram updates during that time is pointless. */
3057        goto out;
3058    }
3059
3060    /*
3061     * We'll hold this lock a little bit long, but it's okay for two reasons.
3062     * Firstly, the only other thread that may take it is the one calling
3063     * qemu_guest_free_page_hint(), which should be rare; secondly, see
3064     * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3065     * guarantees that we release it on a regular basis.
3066     */
3067    qemu_mutex_lock(&rs->bitmap_mutex);
3068    WITH_RCU_READ_LOCK_GUARD() {
3069        if (ram_list.version != rs->last_version) {
3070            ram_state_reset(rs);
3071        }
3072
3073        /* Read version before ram_list.blocks */
3074        smp_rmb();
3075
3076        ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3077
3078        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3079        i = 0;
3080        while ((ret = qemu_file_rate_limit(f)) == 0 ||
3081                !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3082            int pages;
3083
3084            if (qemu_file_get_error(f)) {
3085                break;
3086            }
3087
3088            pages = ram_find_and_save_block(rs, false);
3089            /* no more pages to send */
3090            if (pages == 0) {
3091                done = 1;
3092                break;
3093            }
3094
3095            if (pages < 0) {
3096                qemu_file_set_error(f, pages);
3097                break;
3098            }
3099
3100            rs->target_page_count += pages;
3101
3102            /*
3103             * During postcopy, it is necessary to make sure one whole host
3104             * page is sent in one chunk.
3105             */
3106            if (migrate_postcopy_ram()) {
3107                flush_compressed_data(rs);
3108            }
3109
3110            /*
3111             * We want to check in the 1st loop, just in case it was the 1st
3112             * time and we had to sync the dirty bitmap.
3113             * qemu_clock_get_ns() is a bit expensive, so we only check every
3114             * few iterations
3115             */
3116            if ((i & 63) == 0) {
3117                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3118                              1000000;
3119                if (t1 > MAX_WAIT) {
3120                    trace_ram_save_iterate_big_wait(t1, i);
3121                    break;
3122                }
3123            }
3124            i++;
3125        }
3126    }
3127    qemu_mutex_unlock(&rs->bitmap_mutex);
3128
3129    /*
3130     * Must occur before EOS (or any QEMUFile operation)
3131     * because of RDMA protocol.
3132     */
3133    ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3134
3135out:
3136    if (ret >= 0
3137        && migration_is_setup_or_active(migrate_get_current()->state)) {
3138        multifd_send_sync_main(rs->f);
3139        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3140        qemu_fflush(f);
3141        ram_counters.transferred += 8;
3142
3143        ret = qemu_file_get_error(f);
3144    }
3145    if (ret < 0) {
3146        return ret;
3147    }
3148
3149    return done;
3150}
3151
3152/**
3153 * ram_save_complete: function called to send the remaining amount of ram
3154 *
3155 * Returns zero to indicate success or negative on error
3156 *
3157 * Called with iothread lock
3158 *
3159 * @f: QEMUFile where to send the data
3160 * @opaque: RAMState pointer
3161 */
3162static int ram_save_complete(QEMUFile *f, void *opaque)
3163{
3164    RAMState **temp = opaque;
3165    RAMState *rs = *temp;
3166    int ret = 0;
3167
3168    WITH_RCU_READ_LOCK_GUARD() {
3169        if (!migration_in_postcopy()) {
3170            migration_bitmap_sync_precopy(rs);
3171        }
3172
3173        ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3174
3175        /* try transferring iterative blocks of memory */
3176
3177        /* flush all remaining blocks regardless of rate limiting */
3178        while (true) {
3179            int pages;
3180
3181            pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3182            /* no more blocks to send */
3183            if (pages == 0) {
3184                break;
3185            }
3186            if (pages < 0) {
3187                ret = pages;
3188                break;
3189            }
3190        }
3191
3192        flush_compressed_data(rs);
3193        ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3194    }
3195
3196    if (ret >= 0) {
3197        multifd_send_sync_main(rs->f);
3198        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3199        qemu_fflush(f);
3200    }
3201
3202    return ret;
3203}
3204
3205static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3206                             uint64_t *res_precopy_only,
3207                             uint64_t *res_compatible,
3208                             uint64_t *res_postcopy_only)
3209{
3210    RAMState **temp = opaque;
3211    RAMState *rs = *temp;
3212    uint64_t remaining_size;
3213
3214    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3215
3216    if (!migration_in_postcopy() &&
3217        remaining_size < max_size) {
3218        qemu_mutex_lock_iothread();
3219        WITH_RCU_READ_LOCK_GUARD() {
3220            migration_bitmap_sync_precopy(rs);
3221        }
3222        qemu_mutex_unlock_iothread();
3223        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3224    }
3225
3226    if (migrate_postcopy_ram()) {
3227        /* We can do postcopy, and all the data is postcopiable */
3228        *res_compatible += remaining_size;
3229    } else {
3230        *res_precopy_only += remaining_size;
3231    }
3232}
3233
3234static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3235{
3236    unsigned int xh_len;
3237    int xh_flags;
3238    uint8_t *loaded_data;
3239
3240    /* extract RLE header */
3241    xh_flags = qemu_get_byte(f);
3242    xh_len = qemu_get_be16(f);
3243
3244    if (xh_flags != ENCODING_FLAG_XBZRLE) {
3245        error_report("Failed to load XBZRLE page - wrong compression!");
3246        return -1;
3247    }
3248
3249    if (xh_len > TARGET_PAGE_SIZE) {
3250        error_report("Failed to load XBZRLE page - len overflow!");
3251        return -1;
3252    }
3253    loaded_data = XBZRLE.decoded_buf;
3254    /* load data and decode */
3255    /* it can change loaded_data to point to an internal buffer */
3256    qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3257
3258    /* decode RLE */
3259    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3260                             TARGET_PAGE_SIZE) == -1) {
3261        error_report("Failed to load XBZRLE page - decode error!");
3262        return -1;
3263    }
3264
3265    return 0;
3266}
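
/*
 * Wire format consumed above (descriptive, added): an XBZRLE page is encoded
 * as a one-byte header (ENCODING_FLAG_XBZRLE), a big-endian 16-bit length of
 * the encoded data, then the encoded data itself, which is applied as a
 * delta on top of the current contents of 'host'.
 */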
3267
3268/**
3269 * ram_block_from_stream: read a RAMBlock id from the migration stream
3270 *
3271 * Must be called from within a rcu critical section.
3272 *
3273 * Returns a pointer from within the RCU-protected ram_list.
3274 *
3275 * @f: QEMUFile where to read the data from
3276 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3277 */
3278static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3279{
3280    static RAMBlock *block;
3281    char id[256];
3282    uint8_t len;
3283
3284    if (flags & RAM_SAVE_FLAG_CONTINUE) {
3285        if (!block) {
3286            error_report("Ack, bad migration stream!");
3287            return NULL;
3288        }
3289        return block;
3290    }
3291
3292    len = qemu_get_byte(f);
3293    qemu_get_buffer(f, (uint8_t *)id, len);
3294    id[len] = 0;
3295
3296    block = qemu_ram_block_by_name(id);
3297    if (!block) {
3298        error_report("Can't find block %s", id);
3299        return NULL;
3300    }
3301
3302    if (ramblock_is_ignored(block)) {
3303        error_report("block %s should not be migrated !", id);
3304        return NULL;
3305    }
3306
3307    return block;
3308}
3309
3310static inline void *host_from_ram_block_offset(RAMBlock *block,
3311                                               ram_addr_t offset)
3312{
3313    if (!offset_in_ramblock(block, offset)) {
3314        return NULL;
3315    }
3316
3317    return block->host + offset;
3318}
3319
3320static void *host_page_from_ram_block_offset(RAMBlock *block,
3321                                             ram_addr_t offset)
3322{
3323    /* Note: Explicitly no check against offset_in_ramblock(). */
3324    return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3325                                   block->page_size);
3326}
3327
3328static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3329                                                         ram_addr_t offset)
3330{
3331    return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3332}
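
/*
 * Worked example for the two helpers above: with a hugetlbfs-backed block
 * whose block->page_size is 2 MiB (and block->host 2 MiB aligned), an offset
 * of 0x203000 into the block gives
 *
 *     host_page_from_ram_block_offset(block, 0x203000)        == block->host + 0x200000
 *     host_page_offset_from_ram_block_offset(block, 0x203000) == 0x3000
 *
 * i.e. the start of the containing host page and the position within it.
 */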
3333
3334static inline void *colo_cache_from_block_offset(RAMBlock *block,
3335                             ram_addr_t offset, bool record_bitmap)
3336{
3337    if (!offset_in_ramblock(block, offset)) {
3338        return NULL;
3339    }
3340    if (!block->colo_cache) {
3341        error_report("%s: colo_cache is NULL in block :%s",
3342                     __func__, block->idstr);
3343        return NULL;
3344    }
3345
3346    /*
3347     * During a COLO checkpoint we need a bitmap of these migrated pages.
3348     * It helps us decide which pages in the RAM cache should be flushed
3349     * into the VM's RAM later.
3350     */
3351    if (record_bitmap &&
3352        !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3353        ram_state->migration_dirty_pages++;
3354    }
3355    return block->colo_cache + offset;
3356}
3357
3358/**
3359 * ram_handle_compressed: handle the zero page case
3360 *
3361 * If a page (or a whole RDMA chunk) has been
3362 * determined to be zero, then zap it.
3363 *
3364 * @host: host address for the zero page
3365 * @ch: the byte the page is filled with.  We only support zero
3366 * @size: size of the zero page
3367 */
3368void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3369{
3370    if (ch != 0 || !is_zero_range(host, size)) {
3371        memset(host, ch, size);
3372    }
3373}
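
/*
 * Typical use on the destination (see the RAM_SAVE_FLAG_ZERO handling in
 * ram_load_precopy() below):
 *
 *     ch = qemu_get_byte(f);
 *     ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
 *
 * The is_zero_range() check means a page that is already zero is left
 * untouched, so we avoid needlessly writing to (and thus allocating or
 * dirtying) pages that are zero on both sides.
 */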
3374
3375/* return the size after decompression, or a negative value on error */
3376static int
3377qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3378                     const uint8_t *source, size_t source_len)
3379{
3380    int err;
3381
3382    err = inflateReset(stream);
3383    if (err != Z_OK) {
3384        return -1;
3385    }
3386
3387    stream->avail_in = source_len;
3388    stream->next_in = (uint8_t *)source;
3389    stream->avail_out = dest_len;
3390    stream->next_out = dest;
3391
3392    err = inflate(stream, Z_NO_FLUSH);
3393    if (err != Z_STREAM_END) {
3394        return -1;
3395    }
3396
3397    return stream->total_out;
3398}
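
/*
 * A minimal usage sketch, assuming the stream has already been prepared with
 * inflateInit() (as compress_threads_load_setup() below does once per
 * decompress thread); this is roughly what do_data_decompress() does for
 * every compressed page:
 *
 *     ret = qemu_uncompress_data(&param->stream, dest, TARGET_PAGE_SIZE,
 *                                param->compbuf, len);
 *     if (ret < 0) {
 *         // corrupt input; the caller reports it via qemu_file_set_error()
 *     }
 *
 * On success the return value is the number of bytes produced, which for a
 * whole page is expected to equal TARGET_PAGE_SIZE.
 */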
3399
3400static void *do_data_decompress(void *opaque)
3401{
3402    DecompressParam *param = opaque;
3403    unsigned long pagesize;
3404    uint8_t *des;
3405    int len, ret;
3406
3407    qemu_mutex_lock(&param->mutex);
3408    while (!param->quit) {
3409        if (param->des) {
3410            des = param->des;
3411            len = param->len;
3412            param->des = 0;
3413            qemu_mutex_unlock(&param->mutex);
3414
3415            pagesize = TARGET_PAGE_SIZE;
3416
3417            ret = qemu_uncompress_data(&param->stream, des, pagesize,
3418                                       param->compbuf, len);
3419            if (ret < 0 && migrate_get_current()->decompress_error_check) {
3420                error_report("decompress data failed");
3421                qemu_file_set_error(decomp_file, ret);
3422            }
3423
3424            qemu_mutex_lock(&decomp_done_lock);
3425            param->done = true;
3426            qemu_cond_signal(&decomp_done_cond);
3427            qemu_mutex_unlock(&decomp_done_lock);
3428
3429            qemu_mutex_lock(&param->mutex);
3430        } else {
3431            qemu_cond_wait(&param->cond, &param->mutex);
3432        }
3433    }
3434    qemu_mutex_unlock(&param->mutex);
3435
3436    return NULL;
3437}
3438
3439static int wait_for_decompress_done(void)
3440{
3441    int idx, thread_count;
3442
3443    if (!migrate_use_compression()) {
3444        return 0;
3445    }
3446
3447    thread_count = migrate_decompress_threads();
3448    qemu_mutex_lock(&decomp_done_lock);
3449    for (idx = 0; idx < thread_count; idx++) {
3450        while (!decomp_param[idx].done) {
3451            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3452        }
3453    }
3454    qemu_mutex_unlock(&decomp_done_lock);
3455    return qemu_file_get_error(decomp_file);
3456}
3457
3458static void compress_threads_load_cleanup(void)
3459{
3460    int i, thread_count;
3461
3462    if (!migrate_use_compression()) {
3463        return;
3464    }
3465    thread_count = migrate_decompress_threads();
3466    for (i = 0; i < thread_count; i++) {
3467        /*
3468         * we use it as an indicator of whether the thread was
3469         * properly initialized or not
3470         */
3471        if (!decomp_param[i].compbuf) {
3472            break;
3473        }
3474
3475        qemu_mutex_lock(&decomp_param[i].mutex);
3476        decomp_param[i].quit = true;
3477        qemu_cond_signal(&decomp_param[i].cond);
3478        qemu_mutex_unlock(&decomp_param[i].mutex);
3479    }
3480    for (i = 0; i < thread_count; i++) {
3481        if (!decomp_param[i].compbuf) {
3482            break;
3483        }
3484
3485        qemu_thread_join(decompress_threads + i);
3486        qemu_mutex_destroy(&decomp_param[i].mutex);
3487        qemu_cond_destroy(&decomp_param[i].cond);
3488        inflateEnd(&decomp_param[i].stream);
3489        g_free(decomp_param[i].compbuf);
3490        decomp_param[i].compbuf = NULL;
3491    }
3492    g_free(decompress_threads);
3493    g_free(decomp_param);
3494    decompress_threads = NULL;
3495    decomp_param = NULL;
3496    decomp_file = NULL;
3497}
3498
3499static int compress_threads_load_setup(QEMUFile *f)
3500{
3501    int i, thread_count;
3502
3503    if (!migrate_use_compression()) {
3504        return 0;
3505    }
3506
3507    thread_count = migrate_decompress_threads();
3508    decompress_threads = g_new0(QemuThread, thread_count);
3509    decomp_param = g_new0(DecompressParam, thread_count);
3510    qemu_mutex_init(&decomp_done_lock);
3511    qemu_cond_init(&decomp_done_cond);
3512    decomp_file = f;
3513    for (i = 0; i < thread_count; i++) {
3514        if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3515            goto exit;
3516        }
3517
3518        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3519        qemu_mutex_init(&decomp_param[i].mutex);
3520        qemu_cond_init(&decomp_param[i].cond);
3521        decomp_param[i].done = true;
3522        decomp_param[i].quit = false;
3523        qemu_thread_create(decompress_threads + i, "decompress",
3524                           do_data_decompress, decomp_param + i,
3525                           QEMU_THREAD_JOINABLE);
3526    }
3527    return 0;
3528exit:
3529    compress_threads_load_cleanup();
3530    return -1;
3531}
3532
3533static void decompress_data_with_multi_threads(QEMUFile *f,
3534                                               void *host, int len)
3535{
3536    int idx, thread_count;
3537
3538    thread_count = migrate_decompress_threads();
3539    QEMU_LOCK_GUARD(&decomp_done_lock);
3540    while (true) {
3541        for (idx = 0; idx < thread_count; idx++) {
3542            if (decomp_param[idx].done) {
3543                decomp_param[idx].done = false;
3544                qemu_mutex_lock(&decomp_param[idx].mutex);
3545                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3546                decomp_param[idx].des = host;
3547                decomp_param[idx].len = len;
3548                qemu_cond_signal(&decomp_param[idx].cond);
3549                qemu_mutex_unlock(&decomp_param[idx].mutex);
3550                break;
3551            }
3552        }
3553        if (idx < thread_count) {
3554            break;
3555        } else {
3556            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3557        }
3558    }
3559}
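
/*
 * The decompression machinery above is driven entirely from the load path;
 * a rough sketch of the lifecycle (this is what ram_load_setup(),
 * ram_load_precopy()/ram_load_postcopy() and ram_load_cleanup() below do):
 *
 *     compress_threads_load_setup(f);                    // spawn worker threads
 *     ...
 *     decompress_data_with_multi_threads(f, host, len);  // once per compressed page
 *     ...
 *     wait_for_decompress_done();    // all pages landed; returns any stream error
 *     compress_threads_load_cleanup();
 */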
3560
3561static void colo_init_ram_state(void)
3562{
3563    ram_state_init(&ram_state);
3564}
3565
3566/*
3567 * colo cache: this is for the secondary VM; we cache the whole
3568 * memory of the secondary VM.  The global lock must be held when
3569 * calling this helper.
3570 */
3571int colo_init_ram_cache(void)
3572{
3573    RAMBlock *block;
3574
3575    WITH_RCU_READ_LOCK_GUARD() {
3576        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3577            block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3578                                                    NULL, false, false);
3579            if (!block->colo_cache) {
3580                error_report("%s: Can't alloc memory for COLO cache of block %s,"
3581                             "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3582                             block->used_length);
3583                RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3584                    if (block->colo_cache) {
3585                        qemu_anon_ram_free(block->colo_cache, block->used_length);
3586                        block->colo_cache = NULL;
3587                    }
3588                }
3589                return -errno;
3590            }
3591            if (!machine_dump_guest_core(current_machine)) {
3592                qemu_madvise(block->colo_cache, block->used_length,
3593                             QEMU_MADV_DONTDUMP);
3594            }
3595        }
3596    }
3597
3598    /*
3599     * Record the dirty pages sent by the PVM; this dirty bitmap is used
3600     * to decide which pages in the cache should be flushed into the SVM's
3601     * RAM.  Here we use the same name 'ram_bitmap' as for migration.
3602     */
3603    if (ram_bytes_total()) {
3604        RAMBlock *block;
3605
3606        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3607            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3608            block->bmap = bitmap_new(pages);
3609        }
3610    }
3611
3612    colo_init_ram_state();
3613    return 0;
3614}
3615
3616/* TODO: duplicated with ram_init_bitmaps */
3617void colo_incoming_start_dirty_log(void)
3618{
3619    RAMBlock *block = NULL;
3620    /* For memory_global_dirty_log_start below. */
3621    qemu_mutex_lock_iothread();
3622    qemu_mutex_lock_ramlist();
3623
3624    memory_global_dirty_log_sync();
3625    WITH_RCU_READ_LOCK_GUARD() {
3626        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3627            ramblock_sync_dirty_bitmap(ram_state, block);
3628            /* Discard this dirty bitmap record */
3629            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3630        }
3631        memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3632    }
3633    ram_state->migration_dirty_pages = 0;
3634    qemu_mutex_unlock_ramlist();
3635    qemu_mutex_unlock_iothread();
3636}
3637
3638/* The global lock must be held when calling this helper */
3639void colo_release_ram_cache(void)
3640{
3641    RAMBlock *block;
3642
3643    memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3644    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3645        g_free(block->bmap);
3646        block->bmap = NULL;
3647    }
3648
3649    WITH_RCU_READ_LOCK_GUARD() {
3650        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3651            if (block->colo_cache) {
3652                qemu_anon_ram_free(block->colo_cache, block->used_length);
3653                block->colo_cache = NULL;
3654            }
3655        }
3656    }
3657    ram_state_cleanup(&ram_state);
3658}
3659
3660/**
3661 * ram_load_setup: Setup RAM for migration incoming side
3662 *
3663 * Returns zero to indicate success and negative for error
3664 *
3665 * @f: QEMUFile where to receive the data
3666 * @opaque: RAMState pointer
3667 */
3668static int ram_load_setup(QEMUFile *f, void *opaque)
3669{
3670    if (compress_threads_load_setup(f)) {
3671        return -1;
3672    }
3673
3674    xbzrle_load_setup();
3675    ramblock_recv_map_init();
3676
3677    return 0;
3678}
3679
3680static int ram_load_cleanup(void *opaque)
3681{
3682    RAMBlock *rb;
3683
3684    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3685        qemu_ram_block_writeback(rb);
3686    }
3687
3688    xbzrle_load_cleanup();
3689    compress_threads_load_cleanup();
3690
3691    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3692        g_free(rb->receivedmap);
3693        rb->receivedmap = NULL;
3694    }
3695
3696    return 0;
3697}
3698
3699/**
3700 * ram_postcopy_incoming_init: allocate postcopy data structures
3701 *
3702 * Returns 0 for success and negative if there was one error
3703 *
3704 * @mis: current migration incoming state
3705 *
3706 * Allocate data structures etc. needed by incoming migration with
3707 * postcopy-ram.  postcopy-ram's similarly named
3708 * postcopy_ram_incoming_init() does the work.
3709 */
3710int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3711{
3712    return postcopy_ram_incoming_init(mis);
3713}
3714
3715/**
3716 * ram_load_postcopy: load a page in postcopy case
3717 *
3718 * Returns 0 for success or -errno in case of error
3719 *
3720 * Called in postcopy mode by ram_load().
3721 * rcu_read_lock is taken prior to this being called.
3722 *
3723 * @f: QEMUFile where to receive the data
3724 */
3725static int ram_load_postcopy(QEMUFile *f)
3726{
3727    int flags = 0, ret = 0;
3728    bool place_needed = false;
3729    bool matches_target_page_size = false;
3730    MigrationIncomingState *mis = migration_incoming_get_current();
3731    /* Temporary page that is later 'placed' */
3732    void *postcopy_host_page = mis->postcopy_tmp_page;
3733    void *host_page = NULL;
3734    bool all_zero = true;
3735    int target_pages = 0;
3736
3737    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3738        ram_addr_t addr;
3739        void *page_buffer = NULL;
3740        void *place_source = NULL;
3741        RAMBlock *block = NULL;
3742        uint8_t ch;
3743        int len;
3744
3745        addr = qemu_get_be64(f);
3746
3747        /*
3748         * If there is a qemu file error, we should stop here; "addr"
3749         * may then be invalid
3750         */
3751        ret = qemu_file_get_error(f);
3752        if (ret) {
3753            break;
3754        }
3755
3756        flags = addr & ~TARGET_PAGE_MASK;
3757        addr &= TARGET_PAGE_MASK;
3758
3759        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3760        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3761                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3762            block = ram_block_from_stream(f, flags);
3763            if (!block) {
3764                ret = -EINVAL;
3765                break;
3766            }
3767
3768            /*
3769             * Relying on used_length is racy and can result in false positives.
3770             * We might place pages beyond used_length in case RAM was shrunk
3771             * while in postcopy, which is fine - trying to place via
3772             * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3773             */
3774            if (!block->host || addr >= block->postcopy_length) {
3775                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3776                ret = -EINVAL;
3777                break;
3778            }
3779            target_pages++;
3780            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3781            /*
3782             * Postcopy requires that we place whole host pages atomically;
3783             * these may be huge pages for RAMBlocks that are backed by
3784             * hugetlbfs.
3785             * To make it atomic, the data is read into a temporary page
3786             * that's moved into place later.
3787             * The migration protocol uses possibly smaller target pages;
3788             * however, the source ensures it always sends all the components
3789             * of a host page in one chunk.
3790             */
3791            page_buffer = postcopy_host_page +
3792                          host_page_offset_from_ram_block_offset(block, addr);
3793            /* If all target pages are zero then we can optimise the placement */
3794            if (target_pages == 1) {
3795                host_page = host_page_from_ram_block_offset(block, addr);
3796            } else if (host_page != host_page_from_ram_block_offset(block,
3797                                                                    addr)) {
3798                /* not the 1st TP within the HP */
3799                error_report("Non-same host page %p/%p", host_page,
3800                             host_page_from_ram_block_offset(block, addr));
3801                ret = -EINVAL;
3802                break;
3803            }
3804
3805            /*
3806             * If it's the last part of a host page then we place the host
3807             * page
3808             */
3809            if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3810                place_needed = true;
3811            }
3812            place_source = postcopy_host_page;
3813        }
3814
3815        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3816        case RAM_SAVE_FLAG_ZERO:
3817            ch = qemu_get_byte(f);
3818            /*
3819             * We can skip setting page_buffer when this is a zero page
3820             * and (block->page_size == TARGET_PAGE_SIZE).
3821             */
3822            if (ch || !matches_target_page_size) {
3823                memset(page_buffer, ch, TARGET_PAGE_SIZE);
3824            }
3825            if (ch) {
3826                all_zero = false;
3827            }
3828            break;
3829
3830        case RAM_SAVE_FLAG_PAGE:
3831            all_zero = false;
3832            if (!matches_target_page_size) {
3833                /* For huge pages, we always use the temporary buffer */
3834                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3835            } else {
3836                /*
3837                 * For small pages that match the target page size, we
3838                 * avoid the qemu_file copy.  Instead we directly use
3839                 * the buffer of QEMUFile to place the page.  Note: we
3840                 * cannot do any QEMUFile operation before using that
3841                 * buffer to make sure the buffer is valid when
3842                 * placing the page.
3843                 */
3844                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3845                                         TARGET_PAGE_SIZE);
3846            }
3847            break;
3848        case RAM_SAVE_FLAG_COMPRESS_PAGE:
3849            all_zero = false;
3850            len = qemu_get_be32(f);
3851            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3852                error_report("Invalid compressed data length: %d", len);
3853                ret = -EINVAL;
3854                break;
3855            }
3856            decompress_data_with_multi_threads(f, page_buffer, len);
3857            break;
3858
3859        case RAM_SAVE_FLAG_EOS:
3860            /* normal exit */
3861            multifd_recv_sync_main();
3862            break;
3863        default:
3864            error_report("Unknown combination of migration flags: 0x%x"
3865                         " (postcopy mode)", flags);
3866            ret = -EINVAL;
3867            break;
3868        }
3869
3870        /* Got the whole host page, wait for decompress before placing. */
3871        if (place_needed) {
3872            ret |= wait_for_decompress_done();
3873        }
3874
3875        /* Detect any possible file errors */
3876        if (!ret && qemu_file_get_error(f)) {
3877            ret = qemu_file_get_error(f);
3878        }
3879
3880        if (!ret && place_needed) {
3881            if (all_zero) {
3882                ret = postcopy_place_page_zero(mis, host_page, block);
3883            } else {
3884                ret = postcopy_place_page(mis, host_page, place_source,
3885                                          block);
3886            }
3887            place_needed = false;
3888            target_pages = 0;
3889            /* Assume we have a zero page until we detect something different */
3890            all_zero = true;
3891        }
3892    }
3893
3894    return ret;
3895}
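
/*
 * Worked example of the host-page accumulation above: for a 2 MiB hugetlbfs
 * block with 4 KiB target pages, block->page_size / TARGET_PAGE_SIZE == 512,
 * so 512 consecutive target pages are gathered into postcopy_host_page (or,
 * when the sizes match, read in place) and only the 512th sets place_needed,
 * at which point postcopy_place_page() makes the whole huge page visible to
 * the guest atomically.
 */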
3896
3897static bool postcopy_is_advised(void)
3898{
3899    PostcopyState ps = postcopy_state_get();
3900    return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3901}
3902
3903static bool postcopy_is_running(void)
3904{
3905    PostcopyState ps = postcopy_state_get();
3906    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3907}
3908
3909/*
3910 * Flush the contents of the RAM cache into the SVM's memory.
3911 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3912 */
3913void colo_flush_ram_cache(void)
3914{
3915    RAMBlock *block = NULL;
3916    void *dst_host;
3917    void *src_host;
3918    unsigned long offset = 0;
3919
3920    memory_global_dirty_log_sync();
3921    qemu_mutex_lock(&ram_state->bitmap_mutex);
3922    WITH_RCU_READ_LOCK_GUARD() {
3923        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3924            ramblock_sync_dirty_bitmap(ram_state, block);
3925        }
3926    }
3927
3928    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3929    WITH_RCU_READ_LOCK_GUARD() {
3930        block = QLIST_FIRST_RCU(&ram_list.blocks);
3931
3932        while (block) {
3933            unsigned long num = 0;
3934
3935            offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3936            if (!offset_in_ramblock(block,
3937                                    ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3938                offset = 0;
3939                num = 0;
3940                block = QLIST_NEXT_RCU(block, next);
3941            } else {
3942                unsigned long i = 0;
3943
3944                for (i = 0; i < num; i++) {
3945                    migration_bitmap_clear_dirty(ram_state, block, offset + i);
3946                }
3947                dst_host = block->host
3948                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3949                src_host = block->colo_cache
3950                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3951                memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3952                offset += num;
3953            }
3954        }
3955    }
3956    trace_colo_flush_ram_cache_end();
3957    qemu_mutex_unlock(&ram_state->bitmap_mutex);
3958}
3959
3960/**
3961 * ram_load_precopy: load pages in precopy case
3962 *
3963 * Returns 0 for success or -errno in case of error
3964 *
3965 * Called in precopy mode by ram_load().
3966 * rcu_read_lock is taken prior to this being called.
3967 *
3968 * @f: QEMUFile where to receive the data
3969 */
3970static int ram_load_precopy(QEMUFile *f)
3971{
3972    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3973    /* ADVISE comes earlier; it indicates that the source has the postcopy capability enabled */
3974    bool postcopy_advised = postcopy_is_advised();
3975    if (!migrate_use_compression()) {
3976        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3977    }
3978
3979    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3980        ram_addr_t addr, total_ram_bytes;
3981        void *host = NULL, *host_bak = NULL;
3982        uint8_t ch;
3983
3984        /*
3985         * Yield periodically to let the main loop run, but an iteration of
3986         * the main loop is expensive, so only do it every so many iterations.
3987         */
3988        if ((i & 32767) == 0 && qemu_in_coroutine()) {
3989            aio_co_schedule(qemu_get_current_aio_context(),
3990                            qemu_coroutine_self());
3991            qemu_coroutine_yield();
3992        }
3993        i++;
3994
3995        addr = qemu_get_be64(f);
3996        flags = addr & ~TARGET_PAGE_MASK;
3997        addr &= TARGET_PAGE_MASK;
3998
3999        if (flags & invalid_flags) {
4000            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4001                error_report("Received an unexpected compressed page");
4002            }
4003
4004            ret = -EINVAL;
4005            break;
4006        }
4007
4008        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4009                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4010            RAMBlock *block = ram_block_from_stream(f, flags);
4011
4012            host = host_from_ram_block_offset(block, addr);
4013            /*
4014             * After entering the COLO stage, we should not load pages into
4015             * the SVM's memory directly; we put them into colo_cache first.
4016             * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4017             * Previously, we copied all of this memory in the COLO preparing
4018             * stage, during which the VM had to be stopped, which was
4019             * time-consuming.  Here we optimize it by backing up every page
4020             * during migration while COLO is enabled.  This slows migration
4021             * down slightly, but it clearly reduces the downtime of backing
4022             * up all of the SVM's memory in the COLO preparing stage.
4023             */
4024            if (migration_incoming_colo_enabled()) {
4025                if (migration_incoming_in_colo_state()) {
4026                    /* In COLO stage, put all pages into cache temporarily */
4027                    host = colo_cache_from_block_offset(block, addr, true);
4028                } else {
4029                   /*
4030                    * In the migration stage but before the COLO stage,
4031                    * put all pages into both the cache and the SVM's memory.
4032                    */
4033                    host_bak = colo_cache_from_block_offset(block, addr, false);
4034                }
4035            }
4036            if (!host) {
4037                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4038                ret = -EINVAL;
4039                break;
4040            }
4041            if (!migration_incoming_in_colo_state()) {
4042                ramblock_recv_bitmap_set(block, host);
4043            }
4044
4045            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4046        }
4047
4048        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4049        case RAM_SAVE_FLAG_MEM_SIZE:
4050            /* Synchronize RAM block list */
4051            total_ram_bytes = addr;
4052            while (!ret && total_ram_bytes) {
4053                RAMBlock *block;
4054                char id[256];
4055                ram_addr_t length;
4056
4057                len = qemu_get_byte(f);
4058                qemu_get_buffer(f, (uint8_t *)id, len);
4059                id[len] = 0;
4060                length = qemu_get_be64(f);
4061
4062                block = qemu_ram_block_by_name(id);
4063                if (block && !qemu_ram_is_migratable(block)) {
4064                    error_report("block %s should not be migrated!", id);
4065                    ret = -EINVAL;
4066                } else if (block) {
4067                    if (length != block->used_length) {
4068                        Error *local_err = NULL;
4069
4070                        ret = qemu_ram_resize(block, length,
4071                                              &local_err);
4072                        if (local_err) {
4073                            error_report_err(local_err);
4074                        }
4075                    }
4076                    /* For postcopy we need to check hugepage sizes match */
4077                    if (postcopy_advised && migrate_postcopy_ram() &&
4078                        block->page_size != qemu_host_page_size) {
4079                        uint64_t remote_page_size = qemu_get_be64(f);
4080                        if (remote_page_size != block->page_size) {
4081                            error_report("Mismatched RAM page size %s "
4082                                         "(local) %zd != %" PRId64,
4083                                         id, block->page_size,
4084                                         remote_page_size);
4085                            ret = -EINVAL;
4086                        }
4087                    }
4088                    if (migrate_ignore_shared()) {
4089                        hwaddr addr = qemu_get_be64(f);
4090                        if (ramblock_is_ignored(block) &&
4091                            block->mr->addr != addr) {
4092                            error_report("Mismatched GPAs for block %s "
4093                                         "%" PRId64 " != %" PRId64,
4094                                         id, (uint64_t)addr,
4095                                         (uint64_t)block->mr->addr);
4096                            ret = -EINVAL;
4097                        }
4098                    }
4099                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4100                                          block->idstr);
4101                } else {
4102                    error_report("Unknown ramblock \"%s\", cannot "
4103                                 "accept migration", id);
4104                    ret = -EINVAL;
4105                }
4106
4107                total_ram_bytes -= length;
4108            }
4109            break;
4110
4111        case RAM_SAVE_FLAG_ZERO:
4112            ch = qemu_get_byte(f);
4113            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4114            break;
4115
4116        case RAM_SAVE_FLAG_PAGE:
4117            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4118            break;
4119
4120        case RAM_SAVE_FLAG_COMPRESS_PAGE:
4121            len = qemu_get_be32(f);
4122            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4123                error_report("Invalid compressed data length: %d", len);
4124                ret = -EINVAL;
4125                break;
4126            }
4127            decompress_data_with_multi_threads(f, host, len);
4128            break;
4129
4130        case RAM_SAVE_FLAG_XBZRLE:
4131            if (load_xbzrle(f, addr, host) < 0) {
4132                error_report("Failed to decompress XBZRLE page at "
4133                             RAM_ADDR_FMT, addr);
4134                ret = -EINVAL;
4135                break;
4136            }
4137            break;
4138        case RAM_SAVE_FLAG_EOS:
4139            /* normal exit */
4140            multifd_recv_sync_main();
4141            break;
4142        default:
4143            if (flags & RAM_SAVE_FLAG_HOOK) {
4144                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4145            } else {
4146                error_report("Unknown combination of migration flags: 0x%x",
4147                             flags);
4148                ret = -EINVAL;
4149            }
4150        }
4151        if (!ret) {
4152            ret = qemu_file_get_error(f);
4153        }
4154        if (!ret && host_bak) {
4155            memcpy(host_bak, host, TARGET_PAGE_SIZE);
4156        }
4157    }
4158
4159    ret |= wait_for_decompress_done();
4160    return ret;
4161}
4162
4163static int ram_load(QEMUFile *f, void *opaque, int version_id)
4164{
4165    int ret = 0;
4166    static uint64_t seq_iter;
4167    /*
4168     * If the system is running in postcopy mode, page inserts into host
4169     * memory must be atomic.
4170     */
4171    bool postcopy_running = postcopy_is_running();
4172
4173    seq_iter++;
4174
4175    if (version_id != 4) {
4176        return -EINVAL;
4177    }
4178
4179    /*
4180     * This RCU critical section can be very long running.
4181     * When RCU reclamations in the code start to become numerous,
4182     * it will be necessary to reduce the granularity of this
4183     * critical section.
4184     */
4185    WITH_RCU_READ_LOCK_GUARD() {
4186        if (postcopy_running) {
4187            ret = ram_load_postcopy(f);
4188        } else {
4189            ret = ram_load_precopy(f);
4190        }
4191    }
4192    trace_ram_load_complete(ret, seq_iter);
4193
4194    return ret;
4195}
4196
4197static bool ram_has_postcopy(void *opaque)
4198{
4199    RAMBlock *rb;
4200    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4201        if (ramblock_is_pmem(rb)) {
4202            info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4203                         "is not supported now!", rb->idstr, rb->host);
4204            return false;
4205        }
4206    }
4207
4208    return migrate_postcopy_ram();
4209}
4210
4211/* Sync all the dirty bitmap with destination VM.  */
4212static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4213{
4214    RAMBlock *block;
4215    QEMUFile *file = s->to_dst_file;
4216    int ramblock_count = 0;
4217
4218    trace_ram_dirty_bitmap_sync_start();
4219
4220    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4221        qemu_savevm_send_recv_bitmap(file, block->idstr);
4222        trace_ram_dirty_bitmap_request(block->idstr);
4223        ramblock_count++;
4224    }
4225
4226    trace_ram_dirty_bitmap_sync_wait();
4227
4228    /* Wait until all the ramblocks' dirty bitmaps have been synced */
4229    while (ramblock_count--) {
4230        qemu_sem_wait(&s->rp_state.rp_sem);
4231    }
4232
4233    trace_ram_dirty_bitmap_sync_complete();
4234
4235    return 0;
4236}
4237
4238static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4239{
4240    qemu_sem_post(&s->rp_state.rp_sem);
4241}
4242
4243/*
4244 * Read the received bitmap, revert it as the initial dirty bitmap.
4245 * This is only used when the postcopy migration is paused but wants
4246 * to resume from a middle point.
4247 */
4248int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4249{
4250    int ret = -EINVAL;
4251    /* from_dst_file is always valid because we're within rp_thread */
4252    QEMUFile *file = s->rp_state.from_dst_file;
4253    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4254    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4255    uint64_t size, end_mark;
4256
4257    trace_ram_dirty_bitmap_reload_begin(block->idstr);
4258
4259    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4260        error_report("%s: incorrect state %s", __func__,
4261                     MigrationStatus_str(s->state));
4262        return -EINVAL;
4263    }
4264
4265    /*
4266     * Note: see comments in ramblock_recv_bitmap_send() on why we
4267     * need the endianness conversion, and the paddings.
4268     */
4269    local_size = ROUND_UP(local_size, 8);
4270
4271    /* Add paddings */
4272    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4273
4274    size = qemu_get_be64(file);
4275
4276    /* The size of the bitmap should match that of our ramblock */
4277    if (size != local_size) {
4278        error_report("%s: ramblock '%s' bitmap size mismatch "
4279                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4280                     block->idstr, size, local_size);
4281        ret = -EINVAL;
4282        goto out;
4283    }
4284
4285    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4286    end_mark = qemu_get_be64(file);
4287
4288    ret = qemu_file_get_error(file);
4289    if (ret || size != local_size) {
4290        error_report("%s: read bitmap failed for ramblock '%s': %d"
4291                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4292                     __func__, block->idstr, ret, local_size, size);
4293        ret = -EIO;
4294        goto out;
4295    }
4296
4297    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4298        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4299                     __func__, block->idstr, end_mark);
4300        ret = -EINVAL;
4301        goto out;
4302    }
4303
4304    /*
4305     * Endianness conversion. We are during postcopy (though paused).
4306     * The dirty bitmap won't change. We can directly modify it.
4307     */
4308    bitmap_from_le(block->bmap, le_bitmap, nbits);
4309
4310    /*
4311     * What we received is the "received bitmap".  Invert it to form the
4312     * initial dirty bitmap for this ramblock.
4313     */
4314    bitmap_complement(block->bmap, block->bmap, nbits);
4315
4316    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4317    ramblock_dirty_bitmap_clear_discarded_pages(block);
4318
4319    /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4320    trace_ram_dirty_bitmap_reload_complete(block->idstr);
4321
4322    /*
4323     * We succeeded in syncing the bitmap for the current ramblock.  If
4324     * this is the last one to sync, we need to notify the main send thread.
4325     */
4326    ram_dirty_bitmap_reload_notify(s);
4327
4328    ret = 0;
4329out:
4330    g_free(le_bitmap);
4331    return ret;
4332}
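
/*
 * The size arithmetic above, as a worked example: for a 1 GiB RAMBlock with
 * 4 KiB target pages, nbits = 262144 and local_size = DIV_ROUND_UP(262144, 8)
 * = 32768 bytes, already a multiple of 8 so ROUND_UP() leaves it unchanged.
 * The stream is then expected to carry an 8-byte size, 32768 bitmap bytes and
 * the 8-byte RAMBLOCK_RECV_BITMAP_ENDING marker checked above.
 */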
4333
4334static int ram_resume_prepare(MigrationState *s, void *opaque)
4335{
4336    RAMState *rs = *(RAMState **)opaque;
4337    int ret;
4338
4339    ret = ram_dirty_bitmap_sync_all(s, rs);
4340    if (ret) {
4341        return ret;
4342    }
4343
4344    ram_state_resume_prepare(rs, s->to_dst_file);
4345
4346    return 0;
4347}
4348
4349static SaveVMHandlers savevm_ram_handlers = {
4350    .save_setup = ram_save_setup,
4351    .save_live_iterate = ram_save_iterate,
4352    .save_live_complete_postcopy = ram_save_complete,
4353    .save_live_complete_precopy = ram_save_complete,
4354    .has_postcopy = ram_has_postcopy,
4355    .save_live_pending = ram_save_pending,
4356    .load_state = ram_load,
4357    .save_cleanup = ram_save_cleanup,
4358    .load_setup = ram_load_setup,
4359    .load_cleanup = ram_load_cleanup,
4360    .resume_prepare = ram_resume_prepare,
4361};
4362
4363static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4364                                      size_t old_size, size_t new_size)
4365{
4366    PostcopyState ps = postcopy_state_get();
4367    ram_addr_t offset;
4368    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4369    Error *err = NULL;
4370
4371    if (ramblock_is_ignored(rb)) {
4372        return;
4373    }
4374
4375    if (!migration_is_idle()) {
4376        /*
4377         * Precopy code on the source cannot deal with the size of RAM blocks
4378         * changing at random points in time - especially after sending the
4379         * RAM block sizes in the migration stream, they must no longer change.
4380         * Abort and indicate a proper reason.
4381         */
4382        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4383        migration_cancel(err);
4384        error_free(err);
4385    }
4386
4387    switch (ps) {
4388    case POSTCOPY_INCOMING_ADVISE:
4389        /*
4390         * Update what ram_postcopy_incoming_init()->init_range() does at the
4391         * time postcopy was advised. Syncing RAM blocks with the source will
4392         * result in RAM resizes.
4393         */
4394        if (old_size < new_size) {
4395            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4396                error_report("RAM block '%s' discard of resized RAM failed",
4397                             rb->idstr);
4398            }
4399        }
4400        rb->postcopy_length = new_size;
4401        break;
4402    case POSTCOPY_INCOMING_NONE:
4403    case POSTCOPY_INCOMING_RUNNING:
4404    case POSTCOPY_INCOMING_END:
4405        /*
4406         * Once our guest is running, postcopy no longer cares about
4407         * resizes.  When growing, the new memory was not available on the
4408         * source, so no handler is needed.
4409         */
4410        break;
4411    default:
4412        error_report("RAM block '%s' resized during postcopy state: %d",
4413                     rb->idstr, ps);
4414        exit(-1);
4415    }
4416}
4417
4418static RAMBlockNotifier ram_mig_ram_notifier = {
4419    .ram_block_resized = ram_mig_ram_block_resized,
4420};
4421
4422void ram_mig_init(void)
4423{
4424    qemu_mutex_init(&XBZRLE.lock);
4425    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4426    ram_block_notifier_add(&ram_mig_ram_notifier);
4427}
4428