qemu/migration/ram.c
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2011-2015 Red Hat Inc
   6 *
   7 * Authors:
   8 *  Juan Quintela <quintela@redhat.com>
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28
  29#include "qemu/osdep.h"
  30#include "cpu.h"
  31#include "qemu/cutils.h"
  32#include "qemu/bitops.h"
  33#include "qemu/bitmap.h"
  34#include "qemu/main-loop.h"
  35#include "xbzrle.h"
  36#include "ram.h"
  37#include "migration.h"
  38#include "migration/register.h"
  39#include "migration/misc.h"
  40#include "qemu-file.h"
  41#include "postcopy-ram.h"
  42#include "page_cache.h"
  43#include "qemu/error-report.h"
  44#include "qapi/error.h"
  45#include "qapi/qapi-types-migration.h"
  46#include "qapi/qapi-events-migration.h"
  47#include "qapi/qmp/qerror.h"
  48#include "trace.h"
  49#include "exec/ram_addr.h"
  50#include "exec/target_page.h"
  51#include "qemu/rcu_queue.h"
  52#include "migration/colo.h"
  53#include "block.h"
  54#include "sysemu/sysemu.h"
  55#include "sysemu/cpu-throttle.h"
  56#include "savevm.h"
  57#include "qemu/iov.h"
  58#include "multifd.h"
  59#include "sysemu/runstate.h"
  60
  61#if defined(__linux__)
  62#include "qemu/userfaultfd.h"
  63#endif /* defined(__linux__) */
  64
  65/***********************************************************/
  66/* ram save/restore */
  67
   68/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
   69 * worked for pages that were filled with the same char.  We switched
   70 * it to only search for the zero value, and renamed it to avoid
   71 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
   72 */
  73
  74#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  75#define RAM_SAVE_FLAG_ZERO     0x02
  76#define RAM_SAVE_FLAG_MEM_SIZE 0x04
  77#define RAM_SAVE_FLAG_PAGE     0x08
  78#define RAM_SAVE_FLAG_EOS      0x10
  79#define RAM_SAVE_FLAG_CONTINUE 0x20
  80#define RAM_SAVE_FLAG_XBZRLE   0x40
   81/* 0x80 is reserved in migration.h; start at 0x100 for the next flag */
  82#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  83
  84static inline bool is_zero_range(uint8_t *p, uint64_t size)
  85{
  86    return buffer_is_zero(p, size);
  87}
  88
  89XBZRLECacheStats xbzrle_counters;
  90
   91/* struct containing the XBZRLE cache and the static buffers
   92   used for encoding and decoding */
  93static struct {
  94    /* buffer used for XBZRLE encoding */
  95    uint8_t *encoded_buf;
  96    /* buffer for storing page content */
  97    uint8_t *current_buf;
  98    /* Cache for XBZRLE, Protected by lock. */
  99    PageCache *cache;
 100    QemuMutex lock;
 101    /* it will store a page full of zeros */
 102    uint8_t *zero_target_page;
 103    /* buffer used for XBZRLE decoding */
 104    uint8_t *decoded_buf;
 105} XBZRLE;
 106
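     /*
      * XBZRLE.lock only needs to be taken when XBZRLE is actually in use,
      * so these helpers are no-ops when the capability is disabled.
      */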
 107static void XBZRLE_cache_lock(void)
 108{
 109    if (migrate_use_xbzrle()) {
 110        qemu_mutex_lock(&XBZRLE.lock);
 111    }
 112}
 113
 114static void XBZRLE_cache_unlock(void)
 115{
 116    if (migrate_use_xbzrle()) {
 117        qemu_mutex_unlock(&XBZRLE.lock);
 118    }
 119}
 120
 121/**
 122 * xbzrle_cache_resize: resize the xbzrle cache
 123 *
  124 * This function is called from migrate_params_apply in the main
  125 * thread, possibly while a migration is in progress.  A running
  126 * migration may be using the cache and might finish during this call,
  127 * hence changes to the cache are protected by XBZRLE.lock.
 128 *
 129 * Returns 0 for success or -1 for error
 130 *
 131 * @new_size: new cache size
 132 * @errp: set *errp if the check failed, with reason
 133 */
 134int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 135{
 136    PageCache *new_cache;
 137    int64_t ret = 0;
 138
 139    /* Check for truncation */
 140    if (new_size != (size_t)new_size) {
 141        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 142                   "exceeding address space");
 143        return -1;
 144    }
 145
 146    if (new_size == migrate_xbzrle_cache_size()) {
 147        /* nothing to do */
 148        return 0;
 149    }
 150
 151    XBZRLE_cache_lock();
 152
 153    if (XBZRLE.cache != NULL) {
 154        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 155        if (!new_cache) {
 156            ret = -1;
 157            goto out;
 158        }
 159
 160        cache_fini(XBZRLE.cache);
 161        XBZRLE.cache = new_cache;
 162    }
 163out:
 164    XBZRLE_cache_unlock();
 165    return ret;
 166}
 167
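     /*
      * ramblock_is_ignored: check whether a RAMBlock is skipped by migration
      *
      * A block is ignored if it is not migratable at all, or if it is a
      * shared block and the ignore-shared capability is enabled.
      */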
 168bool ramblock_is_ignored(RAMBlock *block)
 169{
 170    return !qemu_ram_is_migratable(block) ||
 171           (migrate_ignore_shared() && qemu_ram_is_shared(block));
 172}
 173
 174#undef RAMBLOCK_FOREACH
 175
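     /*
      * foreach_not_ignored_block: call func() on every migratable RAMBlock
      *
      * Iterates under the RCU read lock; stops early and returns the first
      * non-zero value returned by func(), otherwise returns 0.
      */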
 176int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 177{
 178    RAMBlock *block;
 179    int ret = 0;
 180
 181    RCU_READ_LOCK_GUARD();
 182
 183    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 184        ret = func(block, opaque);
 185        if (ret) {
 186            break;
 187        }
 188    }
 189    return ret;
 190}
 191
 192static void ramblock_recv_map_init(void)
 193{
 194    RAMBlock *rb;
 195
 196    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 197        assert(!rb->receivedmap);
 198        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 199    }
 200}
 201
 202int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 203{
 204    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 205                    rb->receivedmap);
 206}
 207
 208bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 209{
 210    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 211}
 212
 213void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 214{
 215    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 216}
 217
 218void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 219                                    size_t nr)
 220{
 221    bitmap_set_atomic(rb->receivedmap,
 222                      ramblock_recv_bitmap_offset(host_addr, rb),
 223                      nr);
 224}
 225
 226#define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 227
 228/*
 229 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 230 *
 231 * Returns >0 if success with sent bytes, or <0 if error.
 232 */
 233int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 234                                  const char *block_name)
 235{
 236    RAMBlock *block = qemu_ram_block_by_name(block_name);
 237    unsigned long *le_bitmap, nbits;
 238    uint64_t size;
 239
 240    if (!block) {
 241        error_report("%s: invalid block name: %s", __func__, block_name);
 242        return -1;
 243    }
 244
 245    nbits = block->used_length >> TARGET_PAGE_BITS;
 246
  247    /*
  248     * Make sure the tmp bitmap buffer is big enough, e.g., on 32-bit
  249     * machines we may need 4 more bytes for padding (see the comment
  250     * below).  So extend it a bit beforehand.
  251     */
 252    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 253
  254    /*
  255     * Always use little endian when sending the bitmap, so that the
  256     * stream stays valid even when the source and destination VMs do
  257     * not use the same endianness.  (Note: big endian won't work.)
  258     */
 259    bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 260
 261    /* Size of the bitmap, in bytes */
 262    size = DIV_ROUND_UP(nbits, 8);
 263
  264    /*
  265     * size is always aligned to 8 bytes on 64-bit machines, but that
  266     * may not be true on 32-bit machines.  We need this padding to
  267     * make sure the migration can survive even between 32-bit and
  268     * 64-bit machines.
  269     */
 270    size = ROUND_UP(size, 8);
 271
 272    qemu_put_be64(file, size);
 273    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 274    /*
 275     * Mark as an end, in case the middle part is screwed up due to
 276     * some "mysterious" reason.
 277     */
 278    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 279    qemu_fflush(file);
 280
 281    g_free(le_bitmap);
 282
 283    if (qemu_file_get_error(file)) {
 284        return qemu_file_get_error(file);
 285    }
 286
 287    return size + sizeof(size);
 288}
 289
 290/*
 291 * An outstanding page request, on the source, having been received
 292 * and queued
 293 */
 294struct RAMSrcPageRequest {
 295    RAMBlock *rb;
 296    hwaddr    offset;
 297    hwaddr    len;
 298
 299    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 300};
 301
 302/* State of RAM for migration */
 303struct RAMState {
 304    /* QEMUFile used for this migration */
 305    QEMUFile *f;
 306    /* UFFD file descriptor, used in 'write-tracking' migration */
 307    int uffdio_fd;
 308    /* Last block that we have visited searching for dirty pages */
 309    RAMBlock *last_seen_block;
 310    /* Last block from where we have sent data */
 311    RAMBlock *last_sent_block;
 312    /* Last dirty target page we have sent */
 313    ram_addr_t last_page;
 314    /* last ram version we have seen */
 315    uint32_t last_version;
 316    /* We are in the first round */
 317    bool ram_bulk_stage;
 318    /* The free page optimization is enabled */
 319    bool fpo_enabled;
 320    /* How many times we have dirty too many pages */
 321    int dirty_rate_high_cnt;
 322    /* these variables are used for bitmap sync */
 323    /* last time we did a full bitmap_sync */
 324    int64_t time_last_bitmap_sync;
 325    /* bytes transferred at start_time */
 326    uint64_t bytes_xfer_prev;
 327    /* number of dirty pages since start_time */
 328    uint64_t num_dirty_pages_period;
 329    /* xbzrle misses since the beginning of the period */
 330    uint64_t xbzrle_cache_miss_prev;
 331    /* Amount of xbzrle pages since the beginning of the period */
 332    uint64_t xbzrle_pages_prev;
 333    /* Amount of xbzrle encoded bytes since the beginning of the period */
 334    uint64_t xbzrle_bytes_prev;
 335
 336    /* compression statistics since the beginning of the period */
  337    /* number of times there was no free thread to compress data */
  338    uint64_t compress_thread_busy_prev;
  339    /* number of bytes after compression */
 340    uint64_t compressed_size_prev;
 341    /* amount of compressed pages */
 342    uint64_t compress_pages_prev;
 343
 344    /* total handled target pages at the beginning of period */
 345    uint64_t target_page_count_prev;
 346    /* total handled target pages since start */
 347    uint64_t target_page_count;
 348    /* number of dirty bits in the bitmap */
 349    uint64_t migration_dirty_pages;
 350    /* Protects modification of the bitmap and migration dirty pages */
 351    QemuMutex bitmap_mutex;
 352    /* The RAMBlock used in the last src_page_requests */
 353    RAMBlock *last_req_rb;
 354    /* Queue of outstanding page requests from the destination */
 355    QemuMutex src_page_req_mutex;
 356    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 357};
 358typedef struct RAMState RAMState;
 359
 360static RAMState *ram_state;
 361
 362static NotifierWithReturnList precopy_notifier_list;
 363
 364void precopy_infrastructure_init(void)
 365{
 366    notifier_with_return_list_init(&precopy_notifier_list);
 367}
 368
 369void precopy_add_notifier(NotifierWithReturn *n)
 370{
 371    notifier_with_return_list_add(&precopy_notifier_list, n);
 372}
 373
 374void precopy_remove_notifier(NotifierWithReturn *n)
 375{
 376    notifier_with_return_remove(n);
 377}
 378
 379int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 380{
 381    PrecopyNotifyData pnd;
 382    pnd.reason = reason;
 383    pnd.errp = errp;
 384
 385    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 386}
 387
 388void precopy_enable_free_page_optimization(void)
 389{
 390    if (!ram_state) {
 391        return;
 392    }
 393
 394    ram_state->fpo_enabled = true;
 395}
 396
 397uint64_t ram_bytes_remaining(void)
 398{
 399    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 400                       0;
 401}
 402
 403MigrationStats ram_counters;
 404
 405/* used by the search for pages to send */
 406struct PageSearchStatus {
 407    /* Current block being searched */
 408    RAMBlock    *block;
 409    /* Current page to search from */
 410    unsigned long page;
 411    /* Set once we wrap around */
 412    bool         complete_round;
 413};
 414typedef struct PageSearchStatus PageSearchStatus;
 415
 416CompressionStats compression_counters;
 417
 418struct CompressParam {
 419    bool done;
 420    bool quit;
 421    bool zero_page;
 422    QEMUFile *file;
 423    QemuMutex mutex;
 424    QemuCond cond;
 425    RAMBlock *block;
 426    ram_addr_t offset;
 427
 428    /* internally used fields */
 429    z_stream stream;
 430    uint8_t *originbuf;
 431};
 432typedef struct CompressParam CompressParam;
 433
 434struct DecompressParam {
 435    bool done;
 436    bool quit;
 437    QemuMutex mutex;
 438    QemuCond cond;
 439    void *des;
 440    uint8_t *compbuf;
 441    int len;
 442    z_stream stream;
 443};
 444typedef struct DecompressParam DecompressParam;
 445
 446static CompressParam *comp_param;
 447static QemuThread *compress_threads;
  448/* comp_done_cond is used to wake up the migration thread when
  449 * one of the compression threads has finished compressing a page.
  450 * comp_done_lock is the mutex paired with comp_done_cond.
  451 */
 452static QemuMutex comp_done_lock;
 453static QemuCond comp_done_cond;
 454/* The empty QEMUFileOps will be used by file in CompressParam */
 455static const QEMUFileOps empty_ops = { };
 456
 457static QEMUFile *decomp_file;
 458static DecompressParam *decomp_param;
 459static QemuThread *decompress_threads;
 460static QemuMutex decomp_done_lock;
 461static QemuCond decomp_done_cond;
 462
 463static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 464                                 ram_addr_t offset, uint8_t *source_buf);
 465
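     /*
      * do_data_compress: body of a compression worker thread
      *
      * Waits for the migration thread to hand over a (block, offset) pair,
      * compresses that page into the per-thread QEMUFile buffer, then marks
      * itself done and signals comp_done_cond so the result can be flushed
      * into the migration stream.  Exits when param->quit is set.
      */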
 466static void *do_data_compress(void *opaque)
 467{
 468    CompressParam *param = opaque;
 469    RAMBlock *block;
 470    ram_addr_t offset;
 471    bool zero_page;
 472
 473    qemu_mutex_lock(&param->mutex);
 474    while (!param->quit) {
 475        if (param->block) {
 476            block = param->block;
 477            offset = param->offset;
 478            param->block = NULL;
 479            qemu_mutex_unlock(&param->mutex);
 480
 481            zero_page = do_compress_ram_page(param->file, &param->stream,
 482                                             block, offset, param->originbuf);
 483
 484            qemu_mutex_lock(&comp_done_lock);
 485            param->done = true;
 486            param->zero_page = zero_page;
 487            qemu_cond_signal(&comp_done_cond);
 488            qemu_mutex_unlock(&comp_done_lock);
 489
 490            qemu_mutex_lock(&param->mutex);
 491        } else {
 492            qemu_cond_wait(&param->cond, &param->mutex);
 493        }
 494    }
 495    qemu_mutex_unlock(&param->mutex);
 496
 497    return NULL;
 498}
 499
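     /*
      * compress_threads_save_cleanup: tear down the compression threads
      *
      * Asks every properly initialized worker to quit, joins it, and releases
      * its zlib stream, origin buffer and dummy QEMUFile.  Also used by the
      * setup error path, so it tolerates partially initialized state.
      */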
 500static void compress_threads_save_cleanup(void)
 501{
 502    int i, thread_count;
 503
 504    if (!migrate_use_compression() || !comp_param) {
 505        return;
 506    }
 507
 508    thread_count = migrate_compress_threads();
 509    for (i = 0; i < thread_count; i++) {
  510        /*
  511         * we use it as an indicator of whether the thread has been
  512         * properly initialized
  513         */
 514        if (!comp_param[i].file) {
 515            break;
 516        }
 517
 518        qemu_mutex_lock(&comp_param[i].mutex);
 519        comp_param[i].quit = true;
 520        qemu_cond_signal(&comp_param[i].cond);
 521        qemu_mutex_unlock(&comp_param[i].mutex);
 522
 523        qemu_thread_join(compress_threads + i);
 524        qemu_mutex_destroy(&comp_param[i].mutex);
 525        qemu_cond_destroy(&comp_param[i].cond);
 526        deflateEnd(&comp_param[i].stream);
 527        g_free(comp_param[i].originbuf);
 528        qemu_fclose(comp_param[i].file);
 529        comp_param[i].file = NULL;
 530    }
 531    qemu_mutex_destroy(&comp_done_lock);
 532    qemu_cond_destroy(&comp_done_cond);
 533    g_free(compress_threads);
 534    g_free(comp_param);
 535    compress_threads = NULL;
 536    comp_param = NULL;
 537}
 538
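     /*
      * compress_threads_save_setup: start the compression worker threads
      *
      * Returns 0 on success or -1 if allocating any per-thread resource
      * fails, in which case everything set up so far is cleaned up again.
      */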
 539static int compress_threads_save_setup(void)
 540{
 541    int i, thread_count;
 542
 543    if (!migrate_use_compression()) {
 544        return 0;
 545    }
 546    thread_count = migrate_compress_threads();
 547    compress_threads = g_new0(QemuThread, thread_count);
 548    comp_param = g_new0(CompressParam, thread_count);
 549    qemu_cond_init(&comp_done_cond);
 550    qemu_mutex_init(&comp_done_lock);
 551    for (i = 0; i < thread_count; i++) {
 552        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 553        if (!comp_param[i].originbuf) {
 554            goto exit;
 555        }
 556
 557        if (deflateInit(&comp_param[i].stream,
 558                        migrate_compress_level()) != Z_OK) {
 559            g_free(comp_param[i].originbuf);
 560            goto exit;
 561        }
 562
 563        /* comp_param[i].file is just used as a dummy buffer to save data,
 564         * set its ops to empty.
 565         */
 566        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 567        comp_param[i].done = true;
 568        comp_param[i].quit = false;
 569        qemu_mutex_init(&comp_param[i].mutex);
 570        qemu_cond_init(&comp_param[i].cond);
 571        qemu_thread_create(compress_threads + i, "compress",
 572                           do_data_compress, comp_param + i,
 573                           QEMU_THREAD_JOINABLE);
 574    }
 575    return 0;
 576
 577exit:
 578    compress_threads_save_cleanup();
 579    return -1;
 580}
 581
 582/**
 583 * save_page_header: write page header to wire
 584 *
 585 * If this is the 1st block, it also writes the block identification
 586 *
 587 * Returns the number of bytes written
 588 *
 589 * @f: QEMUFile where to send the data
 590 * @block: block that contains the page we want to send
 591 * @offset: offset inside the block for the page
 592 *          in the lower bits, it contains flags
 593 */
 594static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 595                               ram_addr_t offset)
 596{
 597    size_t size, len;
 598
 599    if (block == rs->last_sent_block) {
 600        offset |= RAM_SAVE_FLAG_CONTINUE;
 601    }
 602    qemu_put_be64(f, offset);
 603    size = 8;
 604
 605    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 606        len = strlen(block->idstr);
 607        qemu_put_byte(f, len);
 608        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 609        size += 1 + len;
 610        rs->last_sent_block = block;
 611    }
 612    return size;
 613}
 614
 615/**
  616 * mig_throttle_guest_down: throttle down the guest
 617 *
 618 * Reduce amount of guest cpu execution to hopefully slow down memory
 619 * writes. If guest dirty memory rate is reduced below the rate at
 620 * which we can transfer pages to the destination then we should be
 621 * able to complete migration. Some workloads dirty memory way too
 622 * fast and will not effectively converge, even with auto-converge.
 623 */
 624static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 625                                    uint64_t bytes_dirty_threshold)
 626{
 627    MigrationState *s = migrate_get_current();
 628    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 629    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 630    bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 631    int pct_max = s->parameters.max_cpu_throttle;
 632
 633    uint64_t throttle_now = cpu_throttle_get_percentage();
 634    uint64_t cpu_now, cpu_ideal, throttle_inc;
 635
 636    /* We have not started throttling yet. Let's start it. */
 637    if (!cpu_throttle_active()) {
 638        cpu_throttle_set(pct_initial);
 639    } else {
 640        /* Throttling already on, just increase the rate */
 641        if (!pct_tailslow) {
 642            throttle_inc = pct_increment;
 643        } else {
  644            /* Compute the ideal CPU percentage used by the guest, which
  645             * would bring the dirty rate down to the dirty rate threshold. */
 646            cpu_now = 100 - throttle_now;
 647            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 648                        bytes_dirty_period);
 649            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 650        }
 651        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 652    }
 653}
 654
 655/**
 656 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 657 *
 658 * @rs: current RAM state
 659 * @current_addr: address for the zero page
 660 *
 661 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 662 * The important thing is that a stale (not-yet-0'd) page be replaced
 663 * by the new data.
 664 * As a bonus, if the page wasn't in the cache it gets added so that
 665 * when a small write is made into the 0'd page it gets XBZRLE sent.
 666 */
 667static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 668{
 669    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 670        return;
 671    }
 672
  673    /* We don't care if this fails to allocate a new cache page
  674     * as long as it updates an old one */
 675    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 676                 ram_counters.dirty_sync_count);
 677}
 678
 679#define ENCODING_FLAG_XBZRLE 0x1
 680
 681/**
 682 * save_xbzrle_page: compress and send current page
 683 *
 684 * Returns: 1 means that we wrote the page
 685 *          0 means that page is identical to the one already sent
 686 *          -1 means that xbzrle would be longer than normal
 687 *
 688 * @rs: current RAM state
 689 * @current_data: pointer to the address of the page contents
 690 * @current_addr: addr of the page
 691 * @block: block that contains the page we want to send
 692 * @offset: offset inside the block for the page
 693 * @last_stage: if we are at the completion stage
 694 */
 695static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 696                            ram_addr_t current_addr, RAMBlock *block,
 697                            ram_addr_t offset, bool last_stage)
 698{
 699    int encoded_len = 0, bytes_xbzrle;
 700    uint8_t *prev_cached_page;
 701
 702    if (!cache_is_cached(XBZRLE.cache, current_addr,
 703                         ram_counters.dirty_sync_count)) {
 704        xbzrle_counters.cache_miss++;
 705        if (!last_stage) {
 706            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 707                             ram_counters.dirty_sync_count) == -1) {
 708                return -1;
 709            } else {
  710                /* update *current_data when the page has been
  711                   inserted into the cache */
 712                *current_data = get_cached_data(XBZRLE.cache, current_addr);
 713            }
 714        }
 715        return -1;
 716    }
 717
 718    /*
 719     * Reaching here means the page has hit the xbzrle cache, no matter what
 720     * encoding result it is (normal encoding, overflow or skipping the page),
 721     * count the page as encoded. This is used to calculate the encoding rate.
 722     *
 723     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 724     * 2nd page turns out to be skipped (i.e. no new bytes written to the
 725     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 726     * skipped page included. In this way, the encoding rate can tell if the
 727     * guest page is good for xbzrle encoding.
 728     */
 729    xbzrle_counters.pages++;
 730    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 731
 732    /* save current buffer into memory */
 733    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 734
 735    /* XBZRLE encoding (if there is no overflow) */
 736    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 737                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 738                                       TARGET_PAGE_SIZE);
 739
 740    /*
 741     * Update the cache contents, so that it corresponds to the data
 742     * sent, in all cases except where we skip the page.
 743     */
 744    if (!last_stage && encoded_len != 0) {
 745        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 746        /*
 747         * In the case where we couldn't compress, ensure that the caller
 748         * sends the data from the cache, since the guest might have
 749         * changed the RAM since we copied it.
 750         */
 751        *current_data = prev_cached_page;
 752    }
 753
 754    if (encoded_len == 0) {
 755        trace_save_xbzrle_page_skipping();
 756        return 0;
 757    } else if (encoded_len == -1) {
 758        trace_save_xbzrle_page_overflow();
 759        xbzrle_counters.overflow++;
 760        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 761        return -1;
 762    }
 763
 764    /* Send XBZRLE based compressed page */
 765    bytes_xbzrle = save_page_header(rs, rs->f, block,
 766                                    offset | RAM_SAVE_FLAG_XBZRLE);
 767    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 768    qemu_put_be16(rs->f, encoded_len);
 769    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 770    bytes_xbzrle += encoded_len + 1 + 2;
 771    /*
 772     * Like compressed_size (please see update_compress_thread_counts),
 773     * the xbzrle encoded bytes don't count the 8 byte header with
 774     * RAM_SAVE_FLAG_CONTINUE.
 775     */
 776    xbzrle_counters.bytes += bytes_xbzrle - 8;
 777    ram_counters.transferred += bytes_xbzrle;
 778
 779    return 1;
 780}
 781
 782/**
 783 * migration_bitmap_find_dirty: find the next dirty page from start
 784 *
 785 * Returns the page offset within memory region of the start of a dirty page
 786 *
 787 * @rs: current RAM state
 788 * @rb: RAMBlock where to search for dirty pages
 789 * @start: page where we start the search
 790 */
 791static inline
 792unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 793                                          unsigned long start)
 794{
 795    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 796    unsigned long *bitmap = rb->bmap;
 797    unsigned long next;
 798
 799    if (ramblock_is_ignored(rb)) {
 800        return size;
 801    }
 802
 803    /*
 804     * When the free page optimization is enabled, we need to check the bitmap
 805     * to send the non-free pages rather than all the pages in the bulk stage.
 806     */
 807    if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
 808        next = start + 1;
 809    } else {
 810        next = find_next_bit(bitmap, size, start);
 811    }
 812
 813    return next;
 814}
 815
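     /*
      * migration_bitmap_clear_dirty: clear the dirty bit for a single page
      *
      * Returns whether the page was dirty before clearing.  Takes
      * rs->bitmap_mutex, lazily clears the corresponding chunk of the remote
      * dirty bitmap (see clear_bmap) and keeps rs->migration_dirty_pages in
      * sync with the bitmap.
      */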
 816static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 817                                                RAMBlock *rb,
 818                                                unsigned long page)
 819{
 820    bool ret;
 821
 822    QEMU_LOCK_GUARD(&rs->bitmap_mutex);
 823
  824    /*
  825     * Clear the dirty bitmap if needed.  This _must_ be called before we
  826     * send any of the pages in the chunk, because we need to make sure we
  827     * can capture further page content changes when we sync the dirty log
  828     * the next time.  So as long as we are going to send any of the pages
  829     * in the chunk we clear the remote dirty bitmap for all of them.
  830     * Clearing it too early is not a problem, but clearing it too late is.
  831     */
 832    if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
 833        uint8_t shift = rb->clear_bmap_shift;
 834        hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
 835        hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
 836
  837        /*
  838         * CLEAR_BITMAP_SHIFT_MIN should always guarantee this.  It also
  839         * makes things easier, since the start address of each small
  840         * chunk will then always be aligned to 64 pages, so the bitmap
  841         * chunk is always aligned to an unsigned long.  We should even
  842         * be able to remove this restriction, but we simply keep it
  843         * for now.
  844         */
 845        assert(shift >= 6);
 846        trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 847        memory_region_clear_dirty_bitmap(rb->mr, start, size);
 848    }
 849
 850    ret = test_and_clear_bit(page, rb->bmap);
 851
 852    if (ret) {
 853        rs->migration_dirty_pages--;
 854    }
 855
 856    return ret;
 857}
 858
 859/* Called with RCU critical section */
 860static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 861{
 862    uint64_t new_dirty_pages =
 863        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
 864
 865    rs->migration_dirty_pages += new_dirty_pages;
 866    rs->num_dirty_pages_period += new_dirty_pages;
 867}
 868
 869/**
 870 * ram_pagesize_summary: calculate all the pagesizes of a VM
 871 *
 872 * Returns a summary bitmap of the page sizes of all RAMBlocks
 873 *
 874 * For VMs with just normal pages this is equivalent to the host page
 875 * size. If it's got some huge pages then it's the OR of all the
 876 * different page sizes.
 877 */
 878uint64_t ram_pagesize_summary(void)
 879{
 880    RAMBlock *block;
 881    uint64_t summary = 0;
 882
 883    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 884        summary |= block->page_size;
 885    }
 886
 887    return summary;
 888}
 889
 890uint64_t ram_get_total_transferred_pages(void)
 891{
 892    return  ram_counters.normal + ram_counters.duplicate +
 893                compression_counters.pages + xbzrle_counters.pages;
 894}
 895
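     /*
      * migration_update_rates: refresh the per-period rate statistics
      *
      * Updates dirty_pages_rate and, when the respective features are in
      * use, the xbzrle cache-miss/encoding rates and the compression
      * busy/compression-ratio counters, storing the current values as the
      * baseline for the next period.
      */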
 896static void migration_update_rates(RAMState *rs, int64_t end_time)
 897{
 898    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
 899    double compressed_size;
 900
 901    /* calculate period counters */
 902    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 903                / (end_time - rs->time_last_bitmap_sync);
 904
 905    if (!page_count) {
 906        return;
 907    }
 908
 909    if (migrate_use_xbzrle()) {
 910        double encoded_size, unencoded_size;
 911
 912        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
 913            rs->xbzrle_cache_miss_prev) / page_count;
 914        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 915        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
 916                         TARGET_PAGE_SIZE;
 917        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
 918        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
 919            xbzrle_counters.encoding_rate = 0;
 920        } else {
 921            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
 922        }
 923        rs->xbzrle_pages_prev = xbzrle_counters.pages;
 924        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
 925    }
 926
 927    if (migrate_use_compression()) {
 928        compression_counters.busy_rate = (double)(compression_counters.busy -
 929            rs->compress_thread_busy_prev) / page_count;
 930        rs->compress_thread_busy_prev = compression_counters.busy;
 931
 932        compressed_size = compression_counters.compressed_size -
 933                          rs->compressed_size_prev;
 934        if (compressed_size) {
 935            double uncompressed_size = (compression_counters.pages -
 936                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;
 937
 938            /* Compression-Ratio = Uncompressed-size / Compressed-size */
 939            compression_counters.compression_rate =
 940                                        uncompressed_size / compressed_size;
 941
 942            rs->compress_pages_prev = compression_counters.pages;
 943            rs->compressed_size_prev = compression_counters.compressed_size;
 944        }
 945    }
 946}
 947
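     /*
      * migration_trigger_throttle: decide whether auto-converge should kick in
      *
      * Compares the bytes dirtied during the last period against the bytes
      * transferred; if dirtying exceeds the configured threshold for two
      * consecutive periods, the guest CPUs are throttled down.
      */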
 948static void migration_trigger_throttle(RAMState *rs)
 949{
 950    MigrationState *s = migrate_get_current();
 951    uint64_t threshold = s->parameters.throttle_trigger_threshold;
 952
 953    uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
 954    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
 955    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
 956
 957    /* During block migration the auto-converge logic incorrectly detects
 958     * that ram migration makes no progress. Avoid this by disabling the
 959     * throttling logic during the bulk phase of block migration. */
 960    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
 961        /* The following detection logic can be refined later. For now:
 962           Check to see if the ratio between dirtied bytes and the approx.
 963           amount of bytes that just got transferred since the last time
 964           we were in this routine reaches the threshold. If that happens
 965           twice, start or increase throttling. */
 966
 967        if ((bytes_dirty_period > bytes_dirty_threshold) &&
 968            (++rs->dirty_rate_high_cnt >= 2)) {
 969            trace_migration_throttle();
 970            rs->dirty_rate_high_cnt = 0;
 971            mig_throttle_guest_down(bytes_dirty_period,
 972                                    bytes_dirty_threshold);
 973        }
 974    }
 975}
 976
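     /*
      * migration_bitmap_sync: fold the global dirty log into our bitmaps
      *
      * Synchronizes the dirty bitmap of every migratable RAMBlock, updates
      * the dirty page counters and, at most once per second, re-evaluates
      * throttling and refreshes the period statistics.
      */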
 977static void migration_bitmap_sync(RAMState *rs)
 978{
 979    RAMBlock *block;
 980    int64_t end_time;
 981
 982    ram_counters.dirty_sync_count++;
 983
 984    if (!rs->time_last_bitmap_sync) {
 985        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 986    }
 987
 988    trace_migration_bitmap_sync_start();
 989    memory_global_dirty_log_sync();
 990
 991    qemu_mutex_lock(&rs->bitmap_mutex);
 992    WITH_RCU_READ_LOCK_GUARD() {
 993        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 994            ramblock_sync_dirty_bitmap(rs, block);
 995        }
 996        ram_counters.remaining = ram_bytes_remaining();
 997    }
 998    qemu_mutex_unlock(&rs->bitmap_mutex);
 999
1000    memory_global_after_dirty_log_sync();
1001    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1002
1003    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1004
 1005    /* more than 1 second = 1000 milliseconds */
1006    if (end_time > rs->time_last_bitmap_sync + 1000) {
1007        migration_trigger_throttle(rs);
1008
1009        migration_update_rates(rs, end_time);
1010
1011        rs->target_page_count_prev = rs->target_page_count;
1012
1013        /* reset period counters */
1014        rs->time_last_bitmap_sync = end_time;
1015        rs->num_dirty_pages_period = 0;
1016        rs->bytes_xfer_prev = ram_counters.transferred;
1017    }
1018    if (migrate_use_events()) {
1019        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1020    }
1021}
1022
1023static void migration_bitmap_sync_precopy(RAMState *rs)
1024{
1025    Error *local_err = NULL;
1026
1027    /*
1028     * The current notifier usage is just an optimization to migration, so we
1029     * don't stop the normal migration process in the error case.
1030     */
1031    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1032        error_report_err(local_err);
1033        local_err = NULL;
1034    }
1035
1036    migration_bitmap_sync(rs);
1037
1038    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1039        error_report_err(local_err);
1040    }
1041}
1042
1043/**
1044 * save_zero_page_to_file: send the zero page to the file
1045 *
1046 * Returns the size of data written to the file, 0 means the page is not
1047 * a zero page
1048 *
1049 * @rs: current RAM state
1050 * @file: the file where the data is saved
1051 * @block: block that contains the page we want to send
1052 * @offset: offset inside the block for the page
1053 */
1054static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1055                                  RAMBlock *block, ram_addr_t offset)
1056{
1057    uint8_t *p = block->host + offset;
1058    int len = 0;
1059
1060    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1061        len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1062        qemu_put_byte(file, 0);
1063        len += 1;
1064    }
1065    return len;
1066}
1067
1068/**
1069 * save_zero_page: send the zero page to the stream
1070 *
1071 * Returns the number of pages written.
1072 *
1073 * @rs: current RAM state
1074 * @block: block that contains the page we want to send
1075 * @offset: offset inside the block for the page
1076 */
1077static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1078{
1079    int len = save_zero_page_to_file(rs, rs->f, block, offset);
1080
1081    if (len) {
1082        ram_counters.duplicate++;
1083        ram_counters.transferred += len;
1084        return 1;
1085    }
1086    return -1;
1087}
1088
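     /*
      * ram_release_pages: discard already-sent pages on the source
      *
      * Only acts when the release-ram capability is enabled and we are in
      * the postcopy phase; the range is returned to the kernel via
      * ram_discard_range().
      */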
1089static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1090{
1091    if (!migrate_release_ram() || !migration_in_postcopy()) {
1092        return;
1093    }
1094
1095    ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1096}
1097
1098/*
1099 * @pages: the number of pages written by the control path,
1100 *        < 0 - error
1101 *        > 0 - number of pages written
1102 *
 1103 * Return true if the page has been saved, otherwise false is returned.
1104 */
1105static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1106                              int *pages)
1107{
1108    uint64_t bytes_xmit = 0;
1109    int ret;
1110
1111    *pages = -1;
1112    ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1113                                &bytes_xmit);
1114    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1115        return false;
1116    }
1117
1118    if (bytes_xmit) {
1119        ram_counters.transferred += bytes_xmit;
1120        *pages = 1;
1121    }
1122
1123    if (ret == RAM_SAVE_CONTROL_DELAYED) {
1124        return true;
1125    }
1126
1127    if (bytes_xmit > 0) {
1128        ram_counters.normal++;
1129    } else if (bytes_xmit == 0) {
1130        ram_counters.duplicate++;
1131    }
1132
1133    return true;
1134}
1135
1136/*
1137 * directly send the page to the stream
1138 *
1139 * Returns the number of pages written.
1140 *
1141 * @rs: current RAM state
1142 * @block: block that contains the page we want to send
1143 * @offset: offset inside the block for the page
1144 * @buf: the page to be sent
 1145 * @async: send the page asynchronously
1146 */
1147static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1148                            uint8_t *buf, bool async)
1149{
1150    ram_counters.transferred += save_page_header(rs, rs->f, block,
1151                                                 offset | RAM_SAVE_FLAG_PAGE);
1152    if (async) {
1153        qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1154                              migrate_release_ram() &
1155                              migration_in_postcopy());
1156    } else {
1157        qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1158    }
1159    ram_counters.transferred += TARGET_PAGE_SIZE;
1160    ram_counters.normal++;
1161    return 1;
1162}
1163
1164/**
1165 * ram_save_page: send the given page to the stream
1166 *
1167 * Returns the number of pages written.
1168 *          < 0 - error
1169 *          >=0 - Number of pages written - this might legally be 0
1170 *                if xbzrle noticed the page was the same.
1171 *
1172 * @rs: current RAM state
1173 * @block: block that contains the page we want to send
1174 * @offset: offset inside the block for the page
1175 * @last_stage: if we are at the completion stage
1176 */
1177static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1178{
1179    int pages = -1;
1180    uint8_t *p;
1181    bool send_async = true;
1182    RAMBlock *block = pss->block;
1183    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1184    ram_addr_t current_addr = block->offset + offset;
1185
1186    p = block->host + offset;
1187    trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1188
1189    XBZRLE_cache_lock();
1190    if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1191        migrate_use_xbzrle()) {
1192        pages = save_xbzrle_page(rs, &p, current_addr, block,
1193                                 offset, last_stage);
1194        if (!last_stage) {
1195            /* Can't send this cached data async, since the cache page
1196             * might get updated before it gets to the wire
1197             */
1198            send_async = false;
1199        }
1200    }
1201
1202    /* XBZRLE overflow or normal page */
1203    if (pages == -1) {
1204        pages = save_normal_page(rs, block, offset, p, send_async);
1205    }
1206
1207    XBZRLE_cache_unlock();
1208
1209    return pages;
1210}
1211
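     /*
      * ram_save_multifd_page: queue one page on the multifd channels
      *
      * Returns 1 if the page was queued, -1 on error.
      */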
1212static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1213                                 ram_addr_t offset)
1214{
1215    if (multifd_queue_page(rs->f, block, offset) < 0) {
1216        return -1;
1217    }
1218    ram_counters.normal++;
1219
1220    return 1;
1221}
1222
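     /*
      * do_compress_ram_page: compress one page into a worker's QEMUFile
      *
      * Returns true if the page turned out to be all zeroes and was sent as
      * a zero page instead, false otherwise.  A compression failure is
      * propagated as an error on the migration stream.
      */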
1223static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1224                                 ram_addr_t offset, uint8_t *source_buf)
1225{
1226    RAMState *rs = ram_state;
1227    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1228    bool zero_page = false;
1229    int ret;
1230
1231    if (save_zero_page_to_file(rs, f, block, offset)) {
1232        zero_page = true;
1233        goto exit;
1234    }
1235
1236    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1237
 1238    /*
 1239     * copy it to an internal buffer to avoid it being modified by the
 1240     * VM, so that we can catch any error during compression and
 1241     * decompression
 1242     */
1243    memcpy(source_buf, p, TARGET_PAGE_SIZE);
1244    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1245    if (ret < 0) {
1246        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1247        error_report("compressed data failed!");
1248        return false;
1249    }
1250
1251exit:
1252    ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1253    return zero_page;
1254}
1255
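     /*
      * update_compress_thread_counts: account one flushed compression result
      *
      * Zero pages only bump the duplicate counter; for compressed pages the
      * 8-byte header with RAM_SAVE_FLAG_CONTINUE is excluded from
      * compressed_size, mirroring how xbzrle bytes are counted.
      */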
1256static void
1257update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1258{
1259    ram_counters.transferred += bytes_xmit;
1260
1261    if (param->zero_page) {
1262        ram_counters.duplicate++;
1263        return;
1264    }
1265
1266    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1267    compression_counters.compressed_size += bytes_xmit - 8;
1268    compression_counters.pages++;
1269}
1270
1271static bool save_page_use_compression(RAMState *rs);
1272
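     /*
      * flush_compressed_data: drain the compression worker buffers
      *
      * Waits for every worker to finish its current page and copies each
      * worker's buffered output into the migration stream.  Called, for
      * instance, when a pass over RAM completes, so stale compressed pages
      * cannot overwrite newer data on the destination.
      */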
1273static void flush_compressed_data(RAMState *rs)
1274{
1275    int idx, len, thread_count;
1276
1277    if (!save_page_use_compression(rs)) {
1278        return;
1279    }
1280    thread_count = migrate_compress_threads();
1281
1282    qemu_mutex_lock(&comp_done_lock);
1283    for (idx = 0; idx < thread_count; idx++) {
1284        while (!comp_param[idx].done) {
1285            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1286        }
1287    }
1288    qemu_mutex_unlock(&comp_done_lock);
1289
1290    for (idx = 0; idx < thread_count; idx++) {
1291        qemu_mutex_lock(&comp_param[idx].mutex);
1292        if (!comp_param[idx].quit) {
1293            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1294            /*
1295             * it's safe to fetch zero_page without holding comp_done_lock
1296             * as there is no further request submitted to the thread,
 1297             * i.e., the thread should be waiting for a request at this point.
1298             */
1299            update_compress_thread_counts(&comp_param[idx], len);
1300        }
1301        qemu_mutex_unlock(&comp_param[idx].mutex);
1302    }
1303}
1304
1305static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1306                                       ram_addr_t offset)
1307{
1308    param->block = block;
1309    param->offset = offset;
1310}
1311
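     /*
      * compress_page_with_multi_thread: hand one page to an idle worker
      *
      * Returns 1 if a worker accepted the page, or -1 if all workers are
      * busy and compress-wait-thread is disabled, in which case the caller
      * sends the page uncompressed from the main thread.
      */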
1312static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1313                                           ram_addr_t offset)
1314{
1315    int idx, thread_count, bytes_xmit = -1, pages = -1;
1316    bool wait = migrate_compress_wait_thread();
1317
1318    thread_count = migrate_compress_threads();
1319    qemu_mutex_lock(&comp_done_lock);
1320retry:
1321    for (idx = 0; idx < thread_count; idx++) {
1322        if (comp_param[idx].done) {
1323            comp_param[idx].done = false;
1324            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1325            qemu_mutex_lock(&comp_param[idx].mutex);
1326            set_compress_params(&comp_param[idx], block, offset);
1327            qemu_cond_signal(&comp_param[idx].cond);
1328            qemu_mutex_unlock(&comp_param[idx].mutex);
1329            pages = 1;
1330            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1331            break;
1332        }
1333    }
1334
1335    /*
1336     * wait for the free thread if the user specifies 'compress-wait-thread',
 1337     * otherwise we will post the page out in the main thread as a normal page.
1338     */
1339    if (pages < 0 && wait) {
1340        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1341        goto retry;
1342    }
1343    qemu_mutex_unlock(&comp_done_lock);
1344
1345    return pages;
1346}
1347
1348/**
1349 * find_dirty_block: find the next dirty page and update any state
1350 * associated with the search process.
1351 *
1352 * Returns true if a page is found
1353 *
1354 * @rs: current RAM state
1355 * @pss: data about the state of the current dirty page scan
1356 * @again: set to false if the search has scanned the whole of RAM
1357 */
1358static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1359{
1360    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1361    if (pss->complete_round && pss->block == rs->last_seen_block &&
1362        pss->page >= rs->last_page) {
1363        /*
1364         * We've been once around the RAM and haven't found anything.
1365         * Give up.
1366         */
1367        *again = false;
1368        return false;
1369    }
1370    if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1371        >= pss->block->used_length) {
1372        /* Didn't find anything in this RAM Block */
1373        pss->page = 0;
1374        pss->block = QLIST_NEXT_RCU(pss->block, next);
1375        if (!pss->block) {
 1376            /*
 1377             * If memory migration starts over, we will meet a dirtied page
 1378             * which may still exist in the compression threads' ring, so we
 1379             * should flush the compressed data to make sure the new page
 1380             * is not overwritten by the old one on the destination.
 1381             *
 1382             * Also, if xbzrle is on, stop using data compression at this
 1383             * point.  In theory, xbzrle can do better than compression.
 1384             */
1385            flush_compressed_data(rs);
1386
1387            /* Hit the end of the list */
1388            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1389            /* Flag that we've looped */
1390            pss->complete_round = true;
1391            rs->ram_bulk_stage = false;
1392        }
1393        /* Didn't find anything this time, but try again on the new block */
1394        *again = true;
1395        return false;
1396    } else {
1397        /* Can go around again, but... */
1398        *again = true;
1399        /* We've found something so probably don't need to */
1400        return true;
1401    }
1402}
1403
1404/**
 1405 * unqueue_page: gets a page off the queue
1406 *
1407 * Helper for 'get_queued_page' - gets a page off the queue
1408 *
1409 * Returns the block of the page (or NULL if none available)
1410 *
1411 * @rs: current RAM state
1412 * @offset: used to return the offset within the RAMBlock
1413 */
1414static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1415{
1416    RAMBlock *block = NULL;
1417
1418    if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1419        return NULL;
1420    }
1421
1422    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1423    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1424        struct RAMSrcPageRequest *entry =
1425                                QSIMPLEQ_FIRST(&rs->src_page_requests);
1426        block = entry->rb;
1427        *offset = entry->offset;
1428
1429        if (entry->len > TARGET_PAGE_SIZE) {
1430            entry->len -= TARGET_PAGE_SIZE;
1431            entry->offset += TARGET_PAGE_SIZE;
1432        } else {
1433            memory_region_unref(block->mr);
1434            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1435            g_free(entry);
1436            migration_consume_urgent_request();
1437        }
1438    }
1439
1440    return block;
1441}
1442
1443#if defined(__linux__)
1444/**
1445 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1446 *   is found, return RAM block pointer and page offset
1447 *
1448 * Returns pointer to the RAMBlock containing faulting page,
1449 *   NULL if no write faults are pending
1450 *
1451 * @rs: current RAM state
1452 * @offset: page offset from the beginning of the block
1453 */
1454static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1455{
1456    struct uffd_msg uffd_msg;
1457    void *page_address;
1458    RAMBlock *block;
1459    int res;
1460
1461    if (!migrate_background_snapshot()) {
1462        return NULL;
1463    }
1464
1465    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1466    if (res <= 0) {
1467        return NULL;
1468    }
1469
1470    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1471    block = qemu_ram_block_from_host(page_address, false, offset);
1472    assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1473    return block;
1474}
1475
1476/**
1477 * ram_save_release_protection: release UFFD write protection after
1478 *   a range of pages has been saved
1479 *
1480 * @rs: current RAM state
1481 * @pss: page-search-status structure
1482 * @start_page: index of the first page in the range relative to pss->block
1483 *
1484 * Returns 0 on success, negative value in case of an error
 1485 */
1486static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1487        unsigned long start_page)
1488{
1489    int res = 0;
1490
1491    /* Check if page is from UFFD-managed region. */
1492    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1493        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1494        uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1495
1496        /* Flush async buffers before un-protect. */
1497        qemu_fflush(rs->f);
1498        /* Un-protect memory range. */
1499        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1500                false, false);
1501    }
1502
1503    return res;
1504}
1505
 1506/* ram_write_tracking_available: check if the kernel supports the required
 1507 *   UFFD features
 1508 * Returns true if it does, false otherwise
 1509 */
1510bool ram_write_tracking_available(void)
1511{
1512    uint64_t uffd_features;
1513    int res;
1514
1515    res = uffd_query_features(&uffd_features);
1516    return (res == 0 &&
1517            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1518}
1519
1520/* ram_write_tracking_compatible: check if guest configuration is
1521 *   compatible with 'write-tracking'
1522 *
1523 * Returns true if compatible, false otherwise
1524 */
1525bool ram_write_tracking_compatible(void)
1526{
1527    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1528    int uffd_fd;
1529    RAMBlock *block;
1530    bool ret = false;
1531
1532    /* Open UFFD file descriptor */
1533    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1534    if (uffd_fd < 0) {
1535        return false;
1536    }
1537
1538    RCU_READ_LOCK_GUARD();
1539
1540    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1541        uint64_t uffd_ioctls;
1542
1543        /* Nothing to do with read-only and MMIO-writable regions */
1544        if (block->mr->readonly || block->mr->rom_device) {
1545            continue;
1546        }
1547        /* Try to register block memory via UFFD-IO to track writes */
1548        if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1549                UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1550            goto out;
1551        }
1552        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1553            goto out;
1554        }
1555    }
1556    ret = true;
1557
1558out:
1559    uffd_close_fd(uffd_fd);
1560    return ret;
1561}
1562
1563/*
1564 * ram_block_populate_pages: populate memory in the RAM block by reading
1565 *   an integer from the beginning of each page.
1566 *
 1567 * Since it's solely used for the userfault_fd WP feature, we just
 1568 *   hardcode the page size to qemu_real_host_page_size.
1569 *
1570 * @block: RAM block to populate
1571 */
1572static void ram_block_populate_pages(RAMBlock *block)
1573{
1574    char *ptr = (char *) block->host;
1575
1576    for (ram_addr_t offset = 0; offset < block->used_length;
1577            offset += qemu_real_host_page_size) {
1578        char tmp = *(ptr + offset);
1579
1580        /* Don't optimize the read out */
1581        asm volatile("" : "+r" (tmp));
1582    }
1583}
1584
1585/*
1586 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1587 */
1588void ram_write_tracking_prepare(void)
1589{
1590    RAMBlock *block;
1591
1592    RCU_READ_LOCK_GUARD();
1593
1594    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1595        /* Nothing to do with read-only and MMIO-writable regions */
1596        if (block->mr->readonly || block->mr->rom_device) {
1597            continue;
1598        }
1599
1600        /*
1601         * Populate pages of the RAM block before enabling userfault_fd
1602         * write protection.
1603         *
1604         * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1605         * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1606         * pages with pte_none() entries in page table.
1607         */
1608        ram_block_populate_pages(block);
1609    }
1610}
1611
1612/*
1613 * ram_write_tracking_start: start UFFD-WP memory tracking
1614 *
1615 * Returns 0 for success or negative value in case of error
1616 */
1617int ram_write_tracking_start(void)
1618{
1619    int uffd_fd;
1620    RAMState *rs = ram_state;
1621    RAMBlock *block;
1622
1623    /* Open UFFD file descriptor */
1624    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1625    if (uffd_fd < 0) {
1626        return uffd_fd;
1627    }
1628    rs->uffdio_fd = uffd_fd;
1629
1630    RCU_READ_LOCK_GUARD();
1631
1632    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1633        /* Nothing to do with read-only and MMIO-writable regions */
1634        if (block->mr->readonly || block->mr->rom_device) {
1635            continue;
1636        }
1637
1638        /* Register block memory with UFFD to track writes */
1639        if (uffd_register_memory(rs->uffdio_fd, block->host,
1640                block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1641            goto fail;
1642        }
1643        /* Apply UFFD write protection to the block memory range */
1644        if (uffd_change_protection(rs->uffdio_fd, block->host,
1645                block->max_length, true, false)) {
1646            goto fail;
1647        }
1648        block->flags |= RAM_UF_WRITEPROTECT;
1649        memory_region_ref(block->mr);
1650
1651        trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1652                block->host, block->max_length);
1653    }
1654
1655    return 0;
1656
1657fail:
1658    error_report("ram_write_tracking_start() failed: restoring initial memory state");
1659
1660    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1661        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1662            continue;
1663        }
1664        /*
1665         * If some memory block failed to be write-protected, remove
1666         * protection from and unregister all blocks that succeeded.
1667         */
1668        uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1669                false, false);
1670        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1671        /* Cleanup flags and remove reference */
1672        block->flags &= ~RAM_UF_WRITEPROTECT;
1673        memory_region_unref(block->mr);
1674    }
1675
1676    uffd_close_fd(uffd_fd);
1677    rs->uffdio_fd = -1;
1678    return -1;
1679}
1680
1681/**
1682 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1683 */
1684void ram_write_tracking_stop(void)
1685{
1686    RAMState *rs = ram_state;
1687    RAMBlock *block;
1688
1689    RCU_READ_LOCK_GUARD();
1690
1691    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1692        if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1693            continue;
1694        }
1695        /* Remove protection and unregister all affected RAM blocks */
1696        uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1697                false, false);
1698        uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1699
1700        trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1701                block->host, block->max_length);
1702
1703        /* Cleanup flags and remove reference */
1704        block->flags &= ~RAM_UF_WRITEPROTECT;
1705        memory_region_unref(block->mr);
1706    }
1707
1708    /* Finally close UFFD file descriptor */
1709    uffd_close_fd(rs->uffdio_fd);
1710    rs->uffdio_fd = -1;
1711}
1712
1713#else
1714/* No target OS support, stubs just fail or ignore */
1715
1716static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1717{
1718    (void) rs;
1719    (void) offset;
1720
1721    return NULL;
1722}
1723
1724static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1725        unsigned long start_page)
1726{
1727    (void) rs;
1728    (void) pss;
1729    (void) start_page;
1730
1731    return 0;
1732}
1733
1734bool ram_write_tracking_available(void)
1735{
1736    return false;
1737}
1738
1739bool ram_write_tracking_compatible(void)
1740{
1741    assert(0);
1742    return false;
1743}
1744
1745int ram_write_tracking_start(void)
1746{
1747    assert(0);
1748    return -1;
1749}
1750
1751void ram_write_tracking_stop(void)
1752{
1753    assert(0);
1754}
1755#endif /* defined(__linux__) */
1756
1757/**
1758 * get_queued_page: unqueue a page from the postcopy requests
1759 *
1760 * Skips pages that are already sent (!dirty)
1761 *
1762 * Returns true if a queued page is found
1763 *
1764 * @rs: current RAM state
1765 * @pss: data about the state of the current dirty page scan
1766 */
1767static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1768{
1769    RAMBlock  *block;
1770    ram_addr_t offset;
1771    bool dirty;
1772
1773    do {
1774        block = unqueue_page(rs, &offset);
1775        /*
1776         * We're sending this page, and since it's postcopy nothing else
1777         * will dirty it, and we must make sure it doesn't get sent again
1778         * even if this queue request was received after the background
1779         * search already sent it.
1780         */
1781        if (block) {
1782            unsigned long page;
1783
1784            page = offset >> TARGET_PAGE_BITS;
1785            dirty = test_bit(page, block->bmap);
1786            if (!dirty) {
1787                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1788                                                page);
1789            } else {
1790                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1791            }
1792        }
1793
1794    } while (block && !dirty);
1795
1796    if (!block) {
1797        /*
1798         * Poll write faults too if background snapshot is enabled; that's
1799         * when vCPUs may be blocked by write-protected pages.
1800         */
1801        block = poll_fault_page(rs, &offset);
1802    }
1803
1804    if (block) {
1805        /*
1806         * As soon as we start servicing pages out of order, we have to
1807         * kill the bulk stage, since the bulk stage assumes
1808         * (in migration_bitmap_find_and_reset_dirty) that every page is
1809         * dirty, and that's no longer true.
1810         */
1811        rs->ram_bulk_stage = false;
1812
1813        /*
1814         * We want the background search to continue from the queued page
1815         * since the guest is likely to want other pages near to the page
1816         * it just requested.
1817         */
1818        pss->block = block;
1819        pss->page = offset >> TARGET_PAGE_BITS;
1820
1821        /*
1822         * This unqueued page would break the "one round" check, even if
1823         * it's really rare.
1824         */
1825        pss->complete_round = false;
1826    }
1827
1828    return !!block;
1829}
1830
1831/**
1832 * migration_page_queue_free: drop any remaining pages in the ram
1833 * request queue
1834 *
1835 * It should be empty at the end anyway, but in error cases there may
1836 * be some pages left, in which case we drop them.
1837 *
1838 */
1839static void migration_page_queue_free(RAMState *rs)
1840{
1841    struct RAMSrcPageRequest *mspr, *next_mspr;
1842    /* This queue generally should be empty - but in the case of a failed
1843     * migration it might still contain some leftover entries.
1844     */
1845    RCU_READ_LOCK_GUARD();
1846    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1847        memory_region_unref(mspr->rb->mr);
1848        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1849        g_free(mspr);
1850    }
1851}
1852
1853/**
1854 * ram_save_queue_pages: queue the page for transmission
1855 *
1856 * A request from postcopy destination for example.
1857 *
1858 * Returns zero on success or negative on error
1859 *
1860 * @rbname: Name of the RAMBlock of the request. NULL means the
1861 *          same as the last one.
1862 * @start: starting address from the start of the RAMBlock
1863 * @len: length (in bytes) to send
1864 */
1865int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1866{
1867    RAMBlock *ramblock;
1868    RAMState *rs = ram_state;
1869
1870    ram_counters.postcopy_requests++;
1871    RCU_READ_LOCK_GUARD();
1872
1873    if (!rbname) {
1874        /* Reuse last RAMBlock */
1875        ramblock = rs->last_req_rb;
1876
1877        if (!ramblock) {
1878            /*
1879             * Shouldn't happen, we can't reuse the last RAMBlock if
1880             * it's the 1st request.
1881             */
1882            error_report("ram_save_queue_pages no previous block");
1883            return -1;
1884        }
1885    } else {
1886        ramblock = qemu_ram_block_by_name(rbname);
1887
1888        if (!ramblock) {
1889            /* We shouldn't be asked for a non-existent RAMBlock */
1890            error_report("ram_save_queue_pages no block '%s'", rbname);
1891            return -1;
1892        }
1893        rs->last_req_rb = ramblock;
1894    }
1895    trace_ram_save_queue_pages(ramblock->idstr, start, len);
1896    if (start + len > ramblock->used_length) {
1897        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1898                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1899                     __func__, start, len, ramblock->used_length);
1900        return -1;
1901    }
1902
1903    struct RAMSrcPageRequest *new_entry =
1904        g_malloc0(sizeof(struct RAMSrcPageRequest));
1905    new_entry->rb = ramblock;
1906    new_entry->offset = start;
1907    new_entry->len = len;
1908
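        /*
         * Pin the memory region so the RAMBlock cannot go away while the
         * request sits in the queue; the reference is dropped when the
         * request is dequeued or the queue is freed.
         */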
1909    memory_region_ref(ramblock->mr);
1910    qemu_mutex_lock(&rs->src_page_req_mutex);
1911    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1912    migration_make_urgent_request();
1913    qemu_mutex_unlock(&rs->src_page_req_mutex);
1914
1915    return 0;
1916}
1917
1918static bool save_page_use_compression(RAMState *rs)
1919{
1920    if (!migrate_use_compression()) {
1921        return false;
1922    }
1923
1924    /*
1925     * If xbzrle is on, stop using data compression after the first
1926     * round of migration even if compression is enabled. In theory,
1927     * xbzrle can do better than compression.
1928     */
1929    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1930        return true;
1931    }
1932
1933    return false;
1934}
1935
1936/*
1937 * try to compress the page before posting it out, return true if the page
1938 * has been properly handled by compression, otherwise needs other
1939 * paths to handle it
1940 */
1941static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1942{
1943    if (!save_page_use_compression(rs)) {
1944        return false;
1945    }
1946
1947    /*
1948     * When starting a new block, the first page of the block should be
1949     * sent out before the other pages in the same block, and all the
1950     * pages of the previous block should already have been sent.  Keeping
1951     * this order is important, because the 'cont' flag is used to avoid
1952     * resending the block name.
1953     *
1954     * We post the first page as a normal page because compression would
1955     * take much CPU resource.
1956     */
1957    if (block != rs->last_sent_block) {
1958        flush_compressed_data(rs);
1959        return false;
1960    }
1961
1962    if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1963        return true;
1964    }
1965
1966    compression_counters.busy++;
1967    return false;
1968}
1969
1970/**
1971 * ram_save_target_page: save one target page
1972 *
1973 * Returns the number of pages written
1974 *
1975 * @rs: current RAM state
1976 * @pss: data about the page we want to send
1977 * @last_stage: if we are at the completion stage
1978 */
1979static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1980                                bool last_stage)
1981{
1982    RAMBlock *block = pss->block;
1983    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1984    int res;
1985
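        /*
         * Try the possible save paths in priority order: the RDMA control
         * path, multi-threaded compression, zero-page detection, multifd,
         * and finally the plain (or XBZRLE) page path.
         */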
1986    if (control_save_page(rs, block, offset, &res)) {
1987        return res;
1988    }
1989
1990    if (save_compress_page(rs, block, offset)) {
1991        return 1;
1992    }
1993
1994    res = save_zero_page(rs, block, offset);
1995    if (res > 0) {
1996        /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1997         * page would be stale
1998         */
1999        if (!save_page_use_compression(rs)) {
2000            XBZRLE_cache_lock();
2001            xbzrle_cache_zero_page(rs, block->offset + offset);
2002            XBZRLE_cache_unlock();
2003        }
2004        ram_release_pages(block->idstr, offset, res);
2005        return res;
2006    }
2007
2008    /*
2009     * Do not use multifd for:
2010     * 1. Compression as the first page in the new block should be posted out
2011     *    before sending the compressed page
2012     * 2. In postcopy as one whole host page should be placed
2013     */
2014    if (!save_page_use_compression(rs) && migrate_use_multifd()
2015        && !migration_in_postcopy()) {
2016        return ram_save_multifd_page(rs, block, offset);
2017    }
2018
2019    return ram_save_page(rs, pss, last_stage);
2020}
2021
2022/**
2023 * ram_save_host_page: save a whole host page
2024 *
2025 * Starting at pss->page, send pages up to the end of the current host
2026 * page. It's valid for the initial offset to point into the middle of
2027 * a host page, in which case the remainder of the host page is sent.
2028 * Only dirty target pages are sent. Note that the host page size may
2029 * be a huge page for this block.
2030 * The saving stops at the boundary of the used_length of the block
2031 * if the RAMBlock isn't a multiple of the host page size.
2032 *
2033 * Returns the number of pages written or negative on error
2034 *
2035 * @rs: current RAM state
2037 * @pss: data about the page we want to send
2038 * @last_stage: if we are at the completion stage
2039 */
2040static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2041                              bool last_stage)
2042{
2043    int tmppages, pages = 0;
2044    size_t pagesize_bits =
2045        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
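        /*
         * pagesize_bits is the number of target pages per host page,
         * e.g. 512 for a 2 MiB hugepage block with 4 KiB target pages.
         */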
2046    unsigned long start_page = pss->page;
2047    int res;
2048
2049    if (ramblock_is_ignored(pss->block)) {
2050        error_report("block %s should not be migrated !", pss->block->idstr);
2051        return 0;
2052    }
2053
2054    do {
2055        /* Check if the page is dirty and, if so, send it */
2056        if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2057            pss->page++;
2058            continue;
2059        }
2060
2061        tmppages = ram_save_target_page(rs, pss, last_stage);
2062        if (tmppages < 0) {
2063            return tmppages;
2064        }
2065
2066        pages += tmppages;
2067        pss->page++;
2068        /* Allow rate limiting to happen in the middle of huge pages */
2069        migration_rate_limit();
2070    } while ((pss->page & (pagesize_bits - 1)) &&
2071             offset_in_ramblock(pss->block,
2072                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2073    /* The offset we leave with is the last one we looked at */
2074    pss->page--;
2075
2076    res = ram_save_release_protection(rs, pss, start_page);
2077    return (res < 0 ? res : pages);
2078}
2079
2080/**
2081 * ram_find_and_save_block: finds a dirty page and sends it to f
2082 *
2083 * Called within an RCU critical section.
2084 *
2085 * Returns the number of pages written where zero means no dirty pages,
2086 * or negative on error
2087 *
2088 * @rs: current RAM state
2089 * @last_stage: if we are at the completion stage
2090 *
2091 * On systems where host-page-size > target-page-size it will send all the
2092 * pages in a host page that are dirty.
2093 */
2094
2095static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2096{
2097    PageSearchStatus pss;
2098    int pages = 0;
2099    bool again, found;
2100
2101    /* No dirty page as there is zero RAM */
2102    if (!ram_bytes_total()) {
2103        return pages;
2104    }
2105
2106    pss.block = rs->last_seen_block;
2107    pss.page = rs->last_page;
2108    pss.complete_round = false;
2109
2110    if (!pss.block) {
2111        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2112    }
2113
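        /* Loop until at least one page has been sent or nothing dirty is left to find in this pass */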
2114    do {
2115        again = true;
2116        found = get_queued_page(rs, &pss);
2117
2118        if (!found) {
2119            /* priority queue empty, so just search for something dirty */
2120            found = find_dirty_block(rs, &pss, &again);
2121        }
2122
2123        if (found) {
2124            pages = ram_save_host_page(rs, &pss, last_stage);
2125        }
2126    } while (!pages && again);
2127
2128    rs->last_seen_block = pss.block;
2129    rs->last_page = pss.page;
2130
2131    return pages;
2132}
2133
2134void acct_update_position(QEMUFile *f, size_t size, bool zero)
2135{
2136    uint64_t pages = size / TARGET_PAGE_SIZE;
2137
2138    if (zero) {
2139        ram_counters.duplicate += pages;
2140    } else {
2141        ram_counters.normal += pages;
2142        ram_counters.transferred += size;
2143        qemu_update_position(f, size);
2144    }
2145}
2146
2147static uint64_t ram_bytes_total_common(bool count_ignored)
2148{
2149    RAMBlock *block;
2150    uint64_t total = 0;
2151
2152    RCU_READ_LOCK_GUARD();
2153
2154    if (count_ignored) {
2155        RAMBLOCK_FOREACH_MIGRATABLE(block) {
2156            total += block->used_length;
2157        }
2158    } else {
2159        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2160            total += block->used_length;
2161        }
2162    }
2163    return total;
2164}
2165
2166uint64_t ram_bytes_total(void)
2167{
2168    return ram_bytes_total_common(false);
2169}
2170
2171static void xbzrle_load_setup(void)
2172{
2173    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2174}
2175
2176static void xbzrle_load_cleanup(void)
2177{
2178    g_free(XBZRLE.decoded_buf);
2179    XBZRLE.decoded_buf = NULL;
2180}
2181
2182static void ram_state_cleanup(RAMState **rsp)
2183{
2184    if (*rsp) {
2185        migration_page_queue_free(*rsp);
2186        qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2187        qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2188        g_free(*rsp);
2189        *rsp = NULL;
2190    }
2191}
2192
2193static void xbzrle_cleanup(void)
2194{
2195    XBZRLE_cache_lock();
2196    if (XBZRLE.cache) {
2197        cache_fini(XBZRLE.cache);
2198        g_free(XBZRLE.encoded_buf);
2199        g_free(XBZRLE.current_buf);
2200        g_free(XBZRLE.zero_target_page);
2201        XBZRLE.cache = NULL;
2202        XBZRLE.encoded_buf = NULL;
2203        XBZRLE.current_buf = NULL;
2204        XBZRLE.zero_target_page = NULL;
2205    }
2206    XBZRLE_cache_unlock();
2207}
2208
2209static void ram_save_cleanup(void *opaque)
2210{
2211    RAMState **rsp = opaque;
2212    RAMBlock *block;
2213
2214    /* We don't use dirty log with background snapshots */
2215    if (!migrate_background_snapshot()) {
2216        /* The caller holds the iothread lock or is in a BH, so there is
2217         * no race writing the migration bitmap
2218         */
2219        memory_global_dirty_log_stop();
2220    }
2221
2222    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2223        g_free(block->clear_bmap);
2224        block->clear_bmap = NULL;
2225        g_free(block->bmap);
2226        block->bmap = NULL;
2227    }
2228
2229    xbzrle_cleanup();
2230    compress_threads_save_cleanup();
2231    ram_state_cleanup(rsp);
2232}
2233
2234static void ram_state_reset(RAMState *rs)
2235{
2236    rs->last_seen_block = NULL;
2237    rs->last_sent_block = NULL;
2238    rs->last_page = 0;
2239    rs->last_version = ram_list.version;
2240    rs->ram_bulk_stage = true;
2241    rs->fpo_enabled = false;
2242}
2243
2244#define MAX_WAIT 50 /* ms, half buffered_file limit */
2245
2246/*
2247 * 'expected' is the value you expect the bitmap mostly to be full
2248 * of; it won't bother printing lines that are all this value.
2249 * 'todump' is the bitmap to dump; it must not be NULL.
2250 */
2251void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2252                           unsigned long pages)
2253{
2254    int64_t cur;
2255    int64_t linelen = 128;
2256    char linebuf[129];
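        /* One output line holds up to 128 bitmap bits plus the trailing NUL */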
2257
2258    for (cur = 0; cur < pages; cur += linelen) {
2259        int64_t curb;
2260        bool found = false;
2261        /*
2262         * Last line; catch the case where the line length
2263         * is longer than remaining ram
2264         */
2265        if (cur + linelen > pages) {
2266            linelen = pages - cur;
2267        }
2268        for (curb = 0; curb < linelen; curb++) {
2269            bool thisbit = test_bit(cur + curb, todump);
2270            linebuf[curb] = thisbit ? '1' : '.';
2271            found = found || (thisbit != expected);
2272        }
2273        if (found) {
2274            linebuf[curb] = '\0';
2275            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
2276        }
2277    }
2278}
2279
2280/* **** functions for postcopy ***** */
2281
2282void ram_postcopy_migrated_memory_release(MigrationState *ms)
2283{
2284    struct RAMBlock *block;
2285
2286    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2287        unsigned long *bitmap = block->bmap;
2288        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2289        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2290
2291        while (run_start < range) {
2292            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2293            ram_discard_range(block->idstr,
2294                              ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2295                              ((ram_addr_t)(run_end - run_start))
2296                                << TARGET_PAGE_BITS);
2297            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2298        }
2299    }
2300}
2301
2302/**
2303 * postcopy_send_discard_bm_ram: discard a RAMBlock
2304 *
2305 * Returns zero on success
2306 *
2307 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2308 *
2309 * @ms: current migration state
2310 * @block: RAMBlock to discard
2311 */
2312static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2313{
2314    unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2315    unsigned long current;
2316    unsigned long *bitmap = block->bmap;
2317
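        /* Coalesce each run of consecutive dirty pages into a single discard range */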
2318    for (current = 0; current < end; ) {
2319        unsigned long one = find_next_bit(bitmap, end, current);
2320        unsigned long zero, discard_length;
2321
2322        if (one >= end) {
2323            break;
2324        }
2325
2326        zero = find_next_zero_bit(bitmap, end, one + 1);
2327
2328        if (zero >= end) {
2329            discard_length = end - one;
2330        } else {
2331            discard_length = zero - one;
2332        }
2333        postcopy_discard_send_range(ms, one, discard_length);
2334        current = one + discard_length;
2335    }
2336
2337    return 0;
2338}
2339
2340/**
2341 * postcopy_each_ram_send_discard: discard all RAMBlocks
2342 *
2343 * Returns 0 for success or negative for error
2344 *
2345 * Utility for the outgoing postcopy code.
2346 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2347 *   passing it bitmap indexes and name.
2348 * (qemu_ram_foreach_block ends up passing unscaled lengths
2349 *  which would mean postcopy code would have to deal with target page)
2350 *
2351 * @ms: current migration state
2352 */
2353static int postcopy_each_ram_send_discard(MigrationState *ms)
2354{
2355    struct RAMBlock *block;
2356    int ret;
2357
2358    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2359        postcopy_discard_send_init(ms, block->idstr);
2360
2361        /*
2362         * Postcopy sends chunks of bitmap over the wire, but it
2363         * just needs indexes at this point, which avoids it having
2364         * target-page-specific code.
2365         */
2366        ret = postcopy_send_discard_bm_ram(ms, block);
2367        postcopy_discard_send_finish(ms);
2368        if (ret) {
2369            return ret;
2370        }
2371    }
2372
2373    return 0;
2374}
2375
2376/**
2377 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2378 *
2379 * Helper for postcopy_chunk_hostpages.
2380 *
2381 * Postcopy requires that all target pages in a host page are either all
2382 * dirty or all clean, not a mix; this function canonicalizes the block's
2383 * dirty bitmap so that host pages are never left partially dirty (any
2384 * partially dirty host page is re-marked fully dirty).
2385 *
2386 * @ms: current migration state
2387 * @block: block that contains the page we want to canonicalize
2388 */
2389static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2390{
2391    RAMState *rs = ram_state;
2392    unsigned long *bitmap = block->bmap;
2393    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2394    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2395    unsigned long run_start;
2396
2397    if (block->page_size == TARGET_PAGE_SIZE) {
2398        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2399        return;
2400    }
2401
2402    /* Find a dirty page */
2403    run_start = find_next_bit(bitmap, pages, 0);
2404
2405    while (run_start < pages) {
2406
2407        /*
2408         * If the start or the end of this run of dirty pages falls in the
2409         * middle of a host page, then we need to fix up that host page.
2410         */
2411        if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2412            /* Find the end of this run */
2413            run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2414            /*
2415             * If the end isn't at the start of a host page, then the
2416             * run doesn't finish at the end of a host page
2417             * and we need to discard.
2418             */
2419        }
2420
2421        if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2422            unsigned long page;
2423            unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2424                                                             host_ratio);
2425            run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2426
2427            /* Clean up the bitmap */
2428            for (page = fixup_start_addr;
2429                 page < fixup_start_addr + host_ratio; page++) {
2430                /*
2431                 * Remark them as dirty, updating the count for any pages
2432                 * that weren't previously dirty.
2433                 */
2434                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2435            }
2436        }
2437
2438        /* Find the next dirty page for the next iteration */
2439        run_start = find_next_bit(bitmap, pages, run_start);
2440    }
2441}
2442
2443/**
2444 * postcopy_chunk_hostpages: canonicalize the dirty bitmap in host pages
2445 *
2446 * Utility for the outgoing postcopy code.
2447 *
2448 * Mark any partially dirty host-page size chunks as all dirty.  In this
2449 * case the host-page
2450 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2451 *
2452 * Returns zero on success
2453 *
2454 * @ms: current migration state
2455 * @block: block we want to work with
2456 */
2457static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2458{
2459    postcopy_discard_send_init(ms, block->idstr);
2460
2461    /*
2462     * Ensure that all partially dirty host pages are made fully dirty.
2463     */
2464    postcopy_chunk_hostpages_pass(ms, block);
2465
2466    postcopy_discard_send_finish(ms);
2467    return 0;
2468}
2469
2470/**
2471 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2472 *
2473 * Returns zero on success
2474 *
2475 * Transmit the set of pages to be discarded after precopy to the
2476 * destination; these are pages that:
2477 *     a) Have been previously transmitted but are now dirty again
2478 *     b) Pages that have never been transmitted, this ensures that
2479 *        any pages on the destination that have been mapped by background
2480 *        tasks get discarded (transparent huge pages are the specific concern)
2481 * Hopefully this is pretty sparse
2482 *
2483 * @ms: current migration state
2484 */
2485int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2486{
2487    RAMState *rs = ram_state;
2488    RAMBlock *block;
2489    int ret;
2490
2491    RCU_READ_LOCK_GUARD();
2492
2493    /* This should be our last sync, the src is now paused */
2494    migration_bitmap_sync(rs);
2495
2496    /* Easiest way to make sure we don't resume in the middle of a host-page */
2497    rs->last_seen_block = NULL;
2498    rs->last_sent_block = NULL;
2499    rs->last_page = 0;
2500
2501    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2502        /* Deal with TPS != HPS and huge pages */
2503        ret = postcopy_chunk_hostpages(ms, block);
2504        if (ret) {
2505            return ret;
2506        }
2507
2508#ifdef DEBUG_POSTCOPY
2509        ram_debug_dump_bitmap(block->bmap, true,
2510                              block->used_length >> TARGET_PAGE_BITS);
2511#endif
2512    }
2513    trace_ram_postcopy_send_discard_bitmap();
2514
2515    return postcopy_each_ram_send_discard(ms);
2516}
2517
2518/**
2519 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2520 *
2521 * Returns zero on success
2522 *
2523 * @rbname: name of the RAMBlock of the request. NULL means the
2524 *          same as the last one.
2525 * @start: starting offset (in bytes) within the RAMBlock
2526 * @length: length (in bytes) to discard
2527 */
2528int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2529{
2530    trace_ram_discard_range(rbname, start, length);
2531
2532    RCU_READ_LOCK_GUARD();
2533    RAMBlock *rb = qemu_ram_block_by_name(rbname);
2534
2535    if (!rb) {
2536        error_report("ram_discard_range: Failed to find block '%s'", rbname);
2537        return -1;
2538    }
2539
2540    /*
2541     * On source VM, we don't need to update the received bitmap since
2542     * we don't even have one.
2543     */
2544    if (rb->receivedmap) {
2545        bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2546                     length >> qemu_target_page_bits());
2547    }
2548
2549    return ram_block_discard_range(rb, start, length);
2550}
2551
2552/*
2553 * For every allocation, we will try not to crash the VM if the
2554 * allocation fails.
2555 */
2556static int xbzrle_init(void)
2557{
2558    Error *local_err = NULL;
2559
2560    if (!migrate_use_xbzrle()) {
2561        return 0;
2562    }
2563
2564    XBZRLE_cache_lock();
2565
2566    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2567    if (!XBZRLE.zero_target_page) {
2568        error_report("%s: Error allocating zero page", __func__);
2569        goto err_out;
2570    }
2571
2572    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2573                              TARGET_PAGE_SIZE, &local_err);
2574    if (!XBZRLE.cache) {
2575        error_report_err(local_err);
2576        goto free_zero_page;
2577    }
2578
2579    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2580    if (!XBZRLE.encoded_buf) {
2581        error_report("%s: Error allocating encoded_buf", __func__);
2582        goto free_cache;
2583    }
2584
2585    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2586    if (!XBZRLE.current_buf) {
2587        error_report("%s: Error allocating current_buf", __func__);
2588        goto free_encoded_buf;
2589    }
2590
2591    /* We are all good */
2592    XBZRLE_cache_unlock();
2593    return 0;
2594
2595free_encoded_buf:
2596    g_free(XBZRLE.encoded_buf);
2597    XBZRLE.encoded_buf = NULL;
2598free_cache:
2599    cache_fini(XBZRLE.cache);
2600    XBZRLE.cache = NULL;
2601free_zero_page:
2602    g_free(XBZRLE.zero_target_page);
2603    XBZRLE.zero_target_page = NULL;
2604err_out:
2605    XBZRLE_cache_unlock();
2606    return -ENOMEM;
2607}
2608
2609static int ram_state_init(RAMState **rsp)
2610{
2611    *rsp = g_try_new0(RAMState, 1);
2612
2613    if (!*rsp) {
2614        error_report("%s: Init ramstate fail", __func__);
2615        return -1;
2616    }
2617
2618    qemu_mutex_init(&(*rsp)->bitmap_mutex);
2619    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2620    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2621
2622    /*
2623     * Count the total number of pages used by ram blocks not including any
2624     * gaps due to alignment or unplugs.
2625     * This must match the initial values of the dirty bitmap.
2626     */
2627    (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2628    ram_state_reset(*rsp);
2629
2630    return 0;
2631}
2632
2633static void ram_list_init_bitmaps(void)
2634{
2635    MigrationState *ms = migrate_get_current();
2636    RAMBlock *block;
2637    unsigned long pages;
2638    uint8_t shift;
2639
2640    /* Skip setting bitmap if there is no RAM */
2641    if (ram_bytes_total()) {
2642        shift = ms->clear_bitmap_shift;
2643        if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2644            error_report("clear_bitmap_shift (%u) too big, using "
2645                         "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2646            shift = CLEAR_BITMAP_SHIFT_MAX;
2647        } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2648            error_report("clear_bitmap_shift (%u) too small, using "
2649                         "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2650            shift = CLEAR_BITMAP_SHIFT_MIN;
2651        }
2652
2653        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2654            pages = block->max_length >> TARGET_PAGE_BITS;
2655            /*
2656             * The initial dirty bitmap for migration must be set with all
2657             * ones to make sure we'll migrate every guest RAM page to
2658             * the destination.
2659             * Here we set RAMBlock.bmap all to 1 because when restarting
2660             * migration after a failed attempt, ram_list.
2661             * dirty_memory[DIRTY_MEMORY_MIGRATION] may not cover the whole
2662             * guest memory.
2663             */
2664            block->bmap = bitmap_new(pages);
2665            bitmap_set(block->bmap, 0, pages);
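                /*
                 * clear_bmap batches the deferred clearing of the dirty log:
                 * each bit covers (1 << shift) target pages, e.g. shift == 18
                 * with 4 KiB pages means one bit per 1 GiB of guest RAM.
                 */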
2666            block->clear_bmap_shift = shift;
2667            block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2668        }
2669    }
2670}
2671
2672static void ram_init_bitmaps(RAMState *rs)
2673{
2674    /* For memory_global_dirty_log_start below.  */
2675    qemu_mutex_lock_iothread();
2676    qemu_mutex_lock_ramlist();
2677
2678    WITH_RCU_READ_LOCK_GUARD() {
2679        ram_list_init_bitmaps();
2680        /* We don't use dirty log with background snapshots */
2681        if (!migrate_background_snapshot()) {
2682            memory_global_dirty_log_start();
2683            migration_bitmap_sync_precopy(rs);
2684        }
2685    }
2686    qemu_mutex_unlock_ramlist();
2687    qemu_mutex_unlock_iothread();
2688}
2689
2690static int ram_init_all(RAMState **rsp)
2691{
2692    if (ram_state_init(rsp)) {
2693        return -1;
2694    }
2695
2696    if (xbzrle_init()) {
2697        ram_state_cleanup(rsp);
2698        return -1;
2699    }
2700
2701    ram_init_bitmaps(*rsp);
2702
2703    return 0;
2704}
2705
2706static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2707{
2708    RAMBlock *block;
2709    uint64_t pages = 0;
2710
2711    /*
2712     * Postcopy is not using xbzrle/compression, so no need for that.
2713     * Also, since the source is already halted, we don't need to care
2714     * about dirty page logging either.
2715     */
2716
2717    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2718        pages += bitmap_count_one(block->bmap,
2719                                  block->used_length >> TARGET_PAGE_BITS);
2720    }
2721
2722    /* This may not be aligned with current bitmaps. Recalculate. */
2723    rs->migration_dirty_pages = pages;
2724
2725    rs->last_seen_block = NULL;
2726    rs->last_sent_block = NULL;
2727    rs->last_page = 0;
2728    rs->last_version = ram_list.version;
2729    /*
2730     * Disable the bulk stage, otherwise we'll resend the whole RAM no
2731     * matter what we have sent.
2732     */
2733    rs->ram_bulk_stage = false;
2734
2735    /* Update RAMState cache of output QEMUFile */
2736    rs->f = out;
2737
2738    trace_ram_state_resume_prepare(pages);
2739}
2740
2741/*
2742 * This function clears bits of the free pages reported by the caller from the
2743 * migration dirty bitmap. @addr is the host address corresponding to the
2744 * start of the contiguous guest free pages, and @len is the total bytes of
2745 * those pages.
2746 */
2747void qemu_guest_free_page_hint(void *addr, size_t len)
2748{
2749    RAMBlock *block;
2750    ram_addr_t offset;
2751    size_t used_len, start, npages;
2752    MigrationState *s = migrate_get_current();
2753
2754    /* This function is currently expected to be used during live migration */
2755    if (!migration_is_setup_or_active(s->state)) {
2756        return;
2757    }
2758
2759    for (; len > 0; len -= used_len, addr += used_len) {
2760        block = qemu_ram_block_from_host(addr, false, &offset);
2761        if (unlikely(!block || offset >= block->used_length)) {
2762            /*
2763             * The implementation might not support RAMBlock resize during
2764             * live migration, but it could happen in theory with future
2765             * updates. So we add a check here to capture that case.
2766             */
2767            error_report_once("%s unexpected error", __func__);
2768            return;
2769        }
2770
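            /* Clamp the hint to the end of this RAMBlock; any remainder is handled in the next iteration */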
2771        if (len <= block->used_length - offset) {
2772            used_len = len;
2773        } else {
2774            used_len = block->used_length - offset;
2775        }
2776
2777        start = offset >> TARGET_PAGE_BITS;
2778        npages = used_len >> TARGET_PAGE_BITS;
2779
2780        qemu_mutex_lock(&ram_state->bitmap_mutex);
2781        ram_state->migration_dirty_pages -=
2782                      bitmap_count_one_with_offset(block->bmap, start, npages);
2783        bitmap_clear(block->bmap, start, npages);
2784        qemu_mutex_unlock(&ram_state->bitmap_mutex);
2785    }
2786}
2787
2788/*
2789 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2790 * a long-running RCU critical section.  When RCU reclaims in the code
2791 * start to become numerous it will be necessary to reduce the
2792 * granularity of these critical sections.
2793 */
2794
2795/**
2796 * ram_save_setup: Setup RAM for migration
2797 *
2798 * Returns zero to indicate success and negative for error
2799 *
2800 * @f: QEMUFile where to send the data
2801 * @opaque: RAMState pointer
2802 */
2803static int ram_save_setup(QEMUFile *f, void *opaque)
2804{
2805    RAMState **rsp = opaque;
2806    RAMBlock *block;
2807
2808    if (compress_threads_save_setup()) {
2809        return -1;
2810    }
2811
2812    /* migration has already set up the bitmap, reuse it. */
2813    if (!migration_in_colo_state()) {
2814        if (ram_init_all(rsp) != 0) {
2815            compress_threads_save_cleanup();
2816            return -1;
2817        }
2818    }
2819    (*rsp)->f = f;
2820
2821    WITH_RCU_READ_LOCK_GUARD() {
2822        qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2823
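            /*
             * For each migratable block, emit its length-prefixed idstr and
             * used_length, plus the page size when postcopy is enabled with a
             * non-default page size, and the region address when shared RAM
             * is being ignored.
             */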
2824        RAMBLOCK_FOREACH_MIGRATABLE(block) {
2825            qemu_put_byte(f, strlen(block->idstr));
2826            qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2827            qemu_put_be64(f, block->used_length);
2828            if (migrate_postcopy_ram() && block->page_size !=
2829                                          qemu_host_page_size) {
2830                qemu_put_be64(f, block->page_size);
2831            }
2832            if (migrate_ignore_shared()) {
2833                qemu_put_be64(f, block->mr->addr);
2834            }
2835        }
2836    }
2837
2838    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2839    ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2840
2841    multifd_send_sync_main(f);
2842    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2843    qemu_fflush(f);
2844
2845    return 0;
2846}
2847
2848/**
2849 * ram_save_iterate: iterative stage for migration
2850 *
2851 * Returns zero to indicate success and negative for error
2852 *
2853 * @f: QEMUFile where to send the data
2854 * @opaque: RAMState pointer
2855 */
2856static int ram_save_iterate(QEMUFile *f, void *opaque)
2857{
2858    RAMState **temp = opaque;
2859    RAMState *rs = *temp;
2860    int ret = 0;
2861    int i;
2862    int64_t t0;
2863    int done = 0;
2864
2865    if (blk_mig_bulk_active()) {
2866        /* Avoid transferring ram during bulk phase of block migration as
2867         * the bulk phase will usually take a long time and transferring
2868         * ram updates during that time is pointless. */
2869        goto out;
2870    }
2871
2872    WITH_RCU_READ_LOCK_GUARD() {
2873        if (ram_list.version != rs->last_version) {
2874            ram_state_reset(rs);
2875        }
2876
2877        /* Read version before ram_list.blocks */
2878        smp_rmb();
2879
2880        ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2881
2882        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2883        i = 0;
2884        while ((ret = qemu_file_rate_limit(f)) == 0 ||
2885                !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2886            int pages;
2887
2888            if (qemu_file_get_error(f)) {
2889                break;
2890            }
2891
2892            pages = ram_find_and_save_block(rs, false);
2893            /* no more pages to send */
2894            if (pages == 0) {
2895                done = 1;
2896                break;
2897            }
2898
2899            if (pages < 0) {
2900                qemu_file_set_error(f, pages);
2901                break;
2902            }
2903
2904            rs->target_page_count += pages;
2905
2906            /*
2907             * During postcopy, it is necessary to make sure one whole host
2908             * page is sent in one chunk.
2909             */
2910            if (migrate_postcopy_ram()) {
2911                flush_compressed_data(rs);
2912            }
2913
2914            /*
2915             * We want to check in the 1st loop, just in case it was the 1st
2916             * time and we had to sync the dirty bitmap.
2917             * qemu_clock_get_ns() is a bit expensive, so we only check once
2918             * every few iterations.
2919             */
2920            if ((i & 63) == 0) {
2921                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2922                              1000000;
2923                if (t1 > MAX_WAIT) {
2924                    trace_ram_save_iterate_big_wait(t1, i);
2925                    break;
2926                }
2927            }
2928            i++;
2929        }
2930    }
2931
2932    /*
2933     * Must occur before EOS (or any QEMUFile operation)
2934     * because of RDMA protocol.
2935     */
2936    ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2937
2938out:
2939    if (ret >= 0
2940        && migration_is_setup_or_active(migrate_get_current()->state)) {
2941        multifd_send_sync_main(rs->f);
2942        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2943        qemu_fflush(f);
2944        ram_counters.transferred += 8;
2945
2946        ret = qemu_file_get_error(f);
2947    }
2948    if (ret < 0) {
2949        return ret;
2950    }
2951
2952    return done;
2953}
2954
2955/**
2956 * ram_save_complete: function called to send the remaining amount of ram
2957 *
2958 * Returns zero to indicate success or negative on error
2959 *
2960 * Called with iothread lock
2961 *
2962 * @f: QEMUFile where to send the data
2963 * @opaque: RAMState pointer
2964 */
2965static int ram_save_complete(QEMUFile *f, void *opaque)
2966{
2967    RAMState **temp = opaque;
2968    RAMState *rs = *temp;
2969    int ret = 0;
2970
2971    WITH_RCU_READ_LOCK_GUARD() {
2972        if (!migration_in_postcopy()) {
2973            migration_bitmap_sync_precopy(rs);
2974        }
2975
2976        ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2977
2978        /* try transferring iterative blocks of memory */
2979
2980        /* flush all remaining blocks regardless of rate limiting */
2981        while (true) {
2982            int pages;
2983
2984            pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2985            /* no more blocks to send */
2986            if (pages == 0) {
2987                break;
2988            }
2989            if (pages < 0) {
2990                ret = pages;
2991                break;
2992            }
2993        }
2994
2995        flush_compressed_data(rs);
2996        ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2997    }
2998
2999    if (ret >= 0) {
3000        multifd_send_sync_main(rs->f);
3001        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3002        qemu_fflush(f);
3003    }
3004
3005    return ret;
3006}
3007
3008static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3009                             uint64_t *res_precopy_only,
3010                             uint64_t *res_compatible,
3011                             uint64_t *res_postcopy_only)
3012{
3013    RAMState **temp = opaque;
3014    RAMState *rs = *temp;
3015    uint64_t remaining_size;
3016
3017    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3018
3019    if (!migration_in_postcopy() &&
3020        remaining_size < max_size) {
3021        qemu_mutex_lock_iothread();
3022        WITH_RCU_READ_LOCK_GUARD() {
3023            migration_bitmap_sync_precopy(rs);
3024        }
3025        qemu_mutex_unlock_iothread();
3026        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3027    }
3028
3029    if (migrate_postcopy_ram()) {
3030        /* We can do postcopy, and all the data is postcopiable */
3031        *res_compatible += remaining_size;
3032    } else {
3033        *res_precopy_only += remaining_size;
3034    }
3035}
3036
3037static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3038{
3039    unsigned int xh_len;
3040    int xh_flags;
3041    uint8_t *loaded_data;
3042
3043    /* extract RLE header */
3044    xh_flags = qemu_get_byte(f);
3045    xh_len = qemu_get_be16(f);
3046
3047    if (xh_flags != ENCODING_FLAG_XBZRLE) {
3048        error_report("Failed to load XBZRLE page - wrong compression!");
3049        return -1;
3050    }
3051
3052    if (xh_len > TARGET_PAGE_SIZE) {
3053        error_report("Failed to load XBZRLE page - len overflow!");
3054        return -1;
3055    }
3056    loaded_data = XBZRLE.decoded_buf;
3057    /* load data and decode */
3058    /* it can change loaded_data to point to an internal buffer */
3059    qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3060
3061    /* decode RLE */
3062    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3063                             TARGET_PAGE_SIZE) == -1) {
3064        error_report("Failed to load XBZRLE page - decode error!");
3065        return -1;
3066    }
3067
3068    return 0;
3069}
3070
3071/**
3072 * ram_block_from_stream: read a RAMBlock id from the migration stream
3073 *
3074 * Must be called from within an RCU critical section.
3075 *
3076 * Returns a pointer from within the RCU-protected ram_list.
3077 *
3078 * @f: QEMUFile where to read the data from
3079 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3080 */
3081static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3082{
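        /*
         * Cache the block across calls: pages flagged RAM_SAVE_FLAG_CONTINUE
         * reuse the most recently named block instead of resending its id.
         */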
3083    static RAMBlock *block;
3084    char id[256];
3085    uint8_t len;
3086
3087    if (flags & RAM_SAVE_FLAG_CONTINUE) {
3088        if (!block) {
3089            error_report("Ack, bad migration stream!");
3090            return NULL;
3091        }
3092        return block;
3093    }
3094
3095    len = qemu_get_byte(f);
3096    qemu_get_buffer(f, (uint8_t *)id, len);
3097    id[len] = 0;
3098
3099    block = qemu_ram_block_by_name(id);
3100    if (!block) {
3101        error_report("Can't find block %s", id);
3102        return NULL;
3103    }
3104
3105    if (ramblock_is_ignored(block)) {
3106        error_report("block %s should not be migrated !", id);
3107        return NULL;
3108    }
3109
3110    return block;
3111}
3112
3113static inline void *host_from_ram_block_offset(RAMBlock *block,
3114                                               ram_addr_t offset)
3115{
3116    if (!offset_in_ramblock(block, offset)) {
3117        return NULL;
3118    }
3119
3120    return block->host + offset;
3121}
3122
3123static inline void *colo_cache_from_block_offset(RAMBlock *block,
3124                             ram_addr_t offset, bool record_bitmap)
3125{
3126    if (!offset_in_ramblock(block, offset)) {
3127        return NULL;
3128    }
3129    if (!block->colo_cache) {
3130        error_report("%s: colo_cache is NULL in block :%s",
3131                     __func__, block->idstr);
3132        return NULL;
3133    }
3134
3135    /*
3136    * During a COLO checkpoint, we need the bitmap of these migrated pages.
3137    * It helps us decide which pages in the RAM cache should be flushed
3138    * into the VM's RAM later.
3139    */
3140    if (record_bitmap &&
3141        !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3142        ram_state->migration_dirty_pages++;
3143    }
3144    return block->colo_cache + offset;
3145}
3146
3147/**
3148 * ram_handle_compressed: handle the zero page case
3149 *
3150 * If a page (or a whole RDMA chunk) has been
3151 * determined to be zero, then zap it.
3152 *
3153 * @host: host address for the zero page
3154 * @ch: what the page is filled from.  We only support zero
3155 * @size: size of the zero page
3156 */
3157void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3158{
3159    if (ch != 0 || !is_zero_range(host, size)) {
3160        memset(host, ch, size);
3161    }
3162}
3163
3164/* return the size after decompression, or negative value on error */
3165static int
3166qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3167                     const uint8_t *source, size_t source_len)
3168{
3169    int err;
3170
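        /* Reset the per-thread zlib stream so it can be reused for this page */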
3171    err = inflateReset(stream);
3172    if (err != Z_OK) {
3173        return -1;
3174    }
3175
3176    stream->avail_in = source_len;
3177    stream->next_in = (uint8_t *)source;
3178    stream->avail_out = dest_len;
3179    stream->next_out = dest;
3180
3181    err = inflate(stream, Z_NO_FLUSH);
3182    if (err != Z_STREAM_END) {
3183        return -1;
3184    }
3185
3186    return stream->total_out;
3187}
3188
3189static void *do_data_decompress(void *opaque)
3190{
3191    DecompressParam *param = opaque;
3192    unsigned long pagesize;
3193    uint8_t *des;
3194    int len, ret;
3195
3196    qemu_mutex_lock(&param->mutex);
3197    while (!param->quit) {
3198        if (param->des) {
3199            des = param->des;
3200            len = param->len;
3201            param->des = 0;
3202            qemu_mutex_unlock(&param->mutex);
3203
3204            pagesize = TARGET_PAGE_SIZE;
3205
3206            ret = qemu_uncompress_data(&param->stream, des, pagesize,
3207                                       param->compbuf, len);
3208            if (ret < 0 && migrate_get_current()->decompress_error_check) {
3209                error_report("decompress data failed");
3210                qemu_file_set_error(decomp_file, ret);
3211            }
3212
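                /* Mark this worker as idle again and wake up any waiter */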
3213            qemu_mutex_lock(&decomp_done_lock);
3214            param->done = true;
3215            qemu_cond_signal(&decomp_done_cond);
3216            qemu_mutex_unlock(&decomp_done_lock);
3217
3218            qemu_mutex_lock(&param->mutex);
3219        } else {
3220            qemu_cond_wait(&param->cond, &param->mutex);
3221        }
3222    }
3223    qemu_mutex_unlock(&param->mutex);
3224
3225    return NULL;
3226}
3227
3228static int wait_for_decompress_done(void)
3229{
3230    int idx, thread_count;
3231
3232    if (!migrate_use_compression()) {
3233        return 0;
3234    }
3235
3236    thread_count = migrate_decompress_threads();
3237    qemu_mutex_lock(&decomp_done_lock);
3238    for (idx = 0; idx < thread_count; idx++) {
3239        while (!decomp_param[idx].done) {
3240            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3241        }
3242    }
3243    qemu_mutex_unlock(&decomp_done_lock);
3244    return qemu_file_get_error(decomp_file);
3245}
3246
3247static void compress_threads_load_cleanup(void)
3248{
3249    int i, thread_count;
3250
3251    if (!migrate_use_compression()) {
3252        return;
3253    }
3254    thread_count = migrate_decompress_threads();
3255    for (i = 0; i < thread_count; i++) {
3256        /*
3257         * We use it as an indicator of whether the thread has been
3258         * properly initialized.
3259         */
3260        if (!decomp_param[i].compbuf) {
3261            break;
3262        }
3263
3264        qemu_mutex_lock(&decomp_param[i].mutex);
3265        decomp_param[i].quit = true;
3266        qemu_cond_signal(&decomp_param[i].cond);
3267        qemu_mutex_unlock(&decomp_param[i].mutex);
3268    }
3269    for (i = 0; i < thread_count; i++) {
3270        if (!decomp_param[i].compbuf) {
3271            break;
3272        }
3273
3274        qemu_thread_join(decompress_threads + i);
3275        qemu_mutex_destroy(&decomp_param[i].mutex);
3276        qemu_cond_destroy(&decomp_param[i].cond);
3277        inflateEnd(&decomp_param[i].stream);
3278        g_free(decomp_param[i].compbuf);
3279        decomp_param[i].compbuf = NULL;
3280    }
3281    g_free(decompress_threads);
3282    g_free(decomp_param);
3283    decompress_threads = NULL;
3284    decomp_param = NULL;
3285    decomp_file = NULL;
3286}
3287
3288static int compress_threads_load_setup(QEMUFile *f)
3289{
3290    int i, thread_count;
3291
3292    if (!migrate_use_compression()) {
3293        return 0;
3294    }
3295
3296    thread_count = migrate_decompress_threads();
3297    decompress_threads = g_new0(QemuThread, thread_count);
3298    decomp_param = g_new0(DecompressParam, thread_count);
3299    qemu_mutex_init(&decomp_done_lock);
3300    qemu_cond_init(&decomp_done_cond);
3301    decomp_file = f;
3302    for (i = 0; i < thread_count; i++) {
3303        if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3304            goto exit;
3305        }
3306
3307        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3308        qemu_mutex_init(&decomp_param[i].mutex);
3309        qemu_cond_init(&decomp_param[i].cond);
3310        decomp_param[i].done = true;
3311        decomp_param[i].quit = false;
3312        qemu_thread_create(decompress_threads + i, "decompress",
3313                           do_data_decompress, decomp_param + i,
3314                           QEMU_THREAD_JOINABLE);
3315    }
3316    return 0;
3317exit:
3318    compress_threads_load_cleanup();
3319    return -1;
3320}
3321
3322static void decompress_data_with_multi_threads(QEMUFile *f,
3323                                               void *host, int len)
3324{
3325    int idx, thread_count;
3326
3327    thread_count = migrate_decompress_threads();
3328    QEMU_LOCK_GUARD(&decomp_done_lock);
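        /* Find an idle decompression worker; if all are busy, wait for one to finish */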
3329    while (true) {
3330        for (idx = 0; idx < thread_count; idx++) {
3331            if (decomp_param[idx].done) {
3332                decomp_param[idx].done = false;
3333                qemu_mutex_lock(&decomp_param[idx].mutex);
3334                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3335                decomp_param[idx].des = host;
3336                decomp_param[idx].len = len;
3337                qemu_cond_signal(&decomp_param[idx].cond);
3338                qemu_mutex_unlock(&decomp_param[idx].mutex);
3339                break;
3340            }
3341        }
3342        if (idx < thread_count) {
3343            break;
3344        } else {
3345            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3346        }
3347    }
3348}
3349
3350 /*
3351  * We must set ram_bulk_stage to false, otherwise in
3352  * migration_bitmap_find_dirty the bitmap will be unused and
3353  * all the pages in the RAM cache will be flushed to the RAM of
3354  * the secondary VM.
3355  */
3356static void colo_init_ram_state(void)
3357{
3358    ram_state_init(&ram_state);
3359    ram_state->ram_bulk_stage = false;
3360}
3361
3362/*
3363 * colo cache: this is for the secondary VM, we cache the whole
3364 * memory of the secondary VM.  The global lock must be held when
3365 * calling this helper.
3366 */
3367int colo_init_ram_cache(void)
3368{
3369    RAMBlock *block;
3370
3371    WITH_RCU_READ_LOCK_GUARD() {
3372        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3373            block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3374                                                    NULL,
3375                                                    false);
3376            if (!block->colo_cache) {
3377                error_report("%s: Can't alloc memory for COLO cache of block %s, "
3378                             "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3379                             block->used_length);
3380                RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3381                    if (block->colo_cache) {
3382                        qemu_anon_ram_free(block->colo_cache, block->used_length);
3383                        block->colo_cache = NULL;
3384                    }
3385                }
3386                return -errno;
3387            }
3388        }
3389    }
3390
3391    /*
3392     * Record the dirty pages sent by the PVM; we use this dirty bitmap
3393     * to decide which pages in the cache should be flushed into the
3394     * SVM's RAM.  Here we use the same name 'ram_bitmap' as for migration.
3395     */
3396    if (ram_bytes_total()) {
3397        RAMBlock *block;
3398
3399        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3400            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3401            block->bmap = bitmap_new(pages);
3402        }
3403    }
3404
3405    colo_init_ram_state();
3406    return 0;
3407}
3408
3409/* TODO: duplicated with ram_init_bitmaps */
3410void colo_incoming_start_dirty_log(void)
3411{
3412    RAMBlock *block = NULL;
3413    /* For memory_global_dirty_log_start below. */
3414    qemu_mutex_lock_iothread();
3415    qemu_mutex_lock_ramlist();
3416
3417    memory_global_dirty_log_sync();
3418    WITH_RCU_READ_LOCK_GUARD() {
3419        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3420            ramblock_sync_dirty_bitmap(ram_state, block);
3421            /* Discard this dirty bitmap record */
3422            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3423        }
3424        memory_global_dirty_log_start();
3425    }
3426    ram_state->migration_dirty_pages = 0;
3427    qemu_mutex_unlock_ramlist();
3428    qemu_mutex_unlock_iothread();
3429}
3430
3431/* The caller needs to hold the global lock to call this helper */
3432void colo_release_ram_cache(void)
3433{
3434    RAMBlock *block;
3435
3436    memory_global_dirty_log_stop();
3437    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3438        g_free(block->bmap);
3439        block->bmap = NULL;
3440    }
3441
3442    WITH_RCU_READ_LOCK_GUARD() {
3443        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3444            if (block->colo_cache) {
3445                qemu_anon_ram_free(block->colo_cache, block->used_length);
3446                block->colo_cache = NULL;
3447            }
3448        }
3449    }
3450    ram_state_cleanup(&ram_state);
3451}
3452
3453/**
3454 * ram_load_setup: Setup RAM for migration incoming side
3455 *
3456 * Returns zero to indicate success and negative for error
3457 *
3458 * @f: QEMUFile where to receive the data
3459 * @opaque: RAMState pointer
3460 */
3461static int ram_load_setup(QEMUFile *f, void *opaque)
3462{
3463    if (compress_threads_load_setup(f)) {
3464        return -1;
3465    }
3466
3467    xbzrle_load_setup();
3468    ramblock_recv_map_init();
3469
3470    return 0;
3471}
3472
3473static int ram_load_cleanup(void *opaque)
3474{
3475    RAMBlock *rb;
3476
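    /*
     * Make sure any data written into guest RAM (e.g. pmem-backed
     * blocks) reaches its backing store before the maps are torn down.
     */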
3477    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3478        qemu_ram_block_writeback(rb);
3479    }
3480
3481    xbzrle_load_cleanup();
3482    compress_threads_load_cleanup();
3483
3484    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3485        g_free(rb->receivedmap);
3486        rb->receivedmap = NULL;
3487    }
3488
3489    return 0;
3490}
3491
3492/**
3493 * ram_postcopy_incoming_init: allocate postcopy data structures
3494 *
3495 * Returns 0 for success and negative if there was an error
3496 *
3497 * @mis: current migration incoming state
3498 *
3499 * Allocate the data structures etc. needed by incoming migration with
3500 * postcopy-ram.  postcopy-ram's similarly named
3501 * postcopy_ram_incoming_init() does the work.
3502 */
3503int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3504{
3505    return postcopy_ram_incoming_init(mis);
3506}
3507
3508/**
3509 * ram_load_postcopy: load a page in postcopy case
3510 *
3511 * Returns 0 for success or -errno in case of error
3512 *
3513 * Called in postcopy mode by ram_load().
3514 * rcu_read_lock is taken prior to this being called.
3515 *
3516 * @f: QEMUFile to receive the data from
3517 */
3518static int ram_load_postcopy(QEMUFile *f)
3519{
3520    int flags = 0, ret = 0;
3521    bool place_needed = false;
3522    bool matches_target_page_size = false;
3523    MigrationIncomingState *mis = migration_incoming_get_current();
3524    /* Temporary page that is later 'placed' */
3525    void *postcopy_host_page = mis->postcopy_tmp_page;
3526    void *this_host = NULL;
3527    bool all_zero = true;
3528    int target_pages = 0;
3529
3530    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3531        ram_addr_t addr;
3532        void *host = NULL;
3533        void *page_buffer = NULL;
3534        void *place_source = NULL;
3535        RAMBlock *block = NULL;
3536        uint8_t ch;
3537        int len;
3538
3539        addr = qemu_get_be64(f);
3540
3541        /*
3542         * If there is a QEMU file error we should stop here, as "addr"
3543         * may be invalid in that case.
3544         */
3545        ret = qemu_file_get_error(f);
3546        if (ret) {
3547            break;
3548        }
3549
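        /*
         * Each page record starts with a 64-bit word: the page-aligned
         * offset within its RAMBlock in the upper bits and the
         * RAM_SAVE_FLAG_* bits folded into the low (sub-page) bits.
         */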
3550        flags = addr & ~TARGET_PAGE_MASK;
3551        addr &= TARGET_PAGE_MASK;
3552
3553        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3554        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3555                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3556            block = ram_block_from_stream(f, flags);
3557
3558            host = host_from_ram_block_offset(block, addr);
3559            if (!host) {
3560                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3561                ret = -EINVAL;
3562                break;
3563            }
3564            target_pages++;
3565            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3566            /*
3567             * Postcopy requires that we place whole host pages atomically;
3568             * these may be huge pages for RAMBlocks that are backed by
3569             * hugetlbfs.
3570             * To make it atomic, the data is read into a temporary page
3571             * that's moved into place later.
3572             * The migration protocol uses (possibly smaller) target pages;
3573             * however, the source ensures it always sends all the components
3574             * of a host page in one chunk.
3575             */
3576            page_buffer = postcopy_host_page +
3577                          ((uintptr_t)host & (block->page_size - 1));
3578            if (target_pages == 1) {
3579                this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3580                                                    block->page_size);
3581            } else {
3582                /* not the first target page within the host page */
3583                if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3584                    (uintptr_t)this_host) {
3585                    error_report("Non-same host page %p/%p",
3586                                  host, this_host);
3587                    ret = -EINVAL;
3588                    break;
3589                }
3590            }
3591
3592            /*
3593             * If it's the last part of a host page then we place the host
3594             * page
3595             */
3596            if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3597                place_needed = true;
3598            }
3599            place_source = postcopy_host_page;
3600        }
3601
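        /*
         * RAM_SAVE_FLAG_CONTINUE only tells ram_block_from_stream() that
         * this page belongs to the same RAMBlock as the previous one, so
         * mask it off before dispatching on the page encoding.
         */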
3602        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3603        case RAM_SAVE_FLAG_ZERO:
3604            ch = qemu_get_byte(f);
3605            /*
3606             * We can skip setting page_buffer when this is a zero page
3607             * and (block->page_size == TARGET_PAGE_SIZE).
3608             */
3609            if (ch || !matches_target_page_size) {
3610                memset(page_buffer, ch, TARGET_PAGE_SIZE);
3611            }
3612            if (ch) {
3613                all_zero = false;
3614            }
3615            break;
3616
3617        case RAM_SAVE_FLAG_PAGE:
3618            all_zero = false;
3619            if (!matches_target_page_size) {
3620                /* For huge pages, we always use a temporary buffer */
3621                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3622            } else {
3623                /*
3624                 * For small pages that match the target page size, we
3625                 * avoid the qemu_file copy.  Instead we directly use
3626                 * the buffer of QEMUFile to place the page.  Note: we
3627                 * cannot do any QEMUFile operation before using that
3628                 * buffer to make sure the buffer is valid when
3629                 * placing the page.
3630                 */
3631                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3632                                         TARGET_PAGE_SIZE);
3633            }
3634            break;
3635        case RAM_SAVE_FLAG_COMPRESS_PAGE:
3636            all_zero = false;
3637            len = qemu_get_be32(f);
3638            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3639                error_report("Invalid compressed data length: %d", len);
3640                ret = -EINVAL;
3641                break;
3642            }
3643            decompress_data_with_multi_threads(f, page_buffer, len);
3644            break;
3645
3646        case RAM_SAVE_FLAG_EOS:
3647            /* normal exit */
3648            multifd_recv_sync_main();
3649            break;
3650        default:
3651            error_report("Unknown combination of migration flags: 0x%x"
3652                         " (postcopy mode)", flags);
3653            ret = -EINVAL;
3654            break;
3655        }
3656
3657        /* Got the whole host page, wait for decompression to finish before placing. */
3658        if (place_needed) {
3659            ret |= wait_for_decompress_done();
3660        }
3661
3662        /* Detect any possible file errors */
3663        if (!ret && qemu_file_get_error(f)) {
3664            ret = qemu_file_get_error(f);
3665        }
3666
3667        if (!ret && place_needed) {
3668            /* This gets called at the last target page in the host page */
3669            void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3670                                                       block->page_size);
3671
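            /*
             * The placement helpers insert the completed host page
             * atomically (on Linux typically via userfaultfd's
             * UFFDIO_COPY / UFFDIO_ZEROPAGE), waking any thread that
             * faulted on it.
             */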
3672            if (all_zero) {
3673                ret = postcopy_place_page_zero(mis, place_dest,
3674                                               block);
3675            } else {
3676                ret = postcopy_place_page(mis, place_dest,
3677                                          place_source, block);
3678            }
3679            place_needed = false;
3680            target_pages = 0;
3681            /* Assume we have a zero page until we detect something different */
3682            all_zero = true;
3683        }
3684    }
3685
3686    return ret;
3687}
3688
3689static bool postcopy_is_advised(void)
3690{
3691    PostcopyState ps = postcopy_state_get();
3692    return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3693}
3694
3695static bool postcopy_is_running(void)
3696{
3697    PostcopyState ps = postcopy_state_get();
3698    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3699}
3700
3701/*
3702 * Flush the content of the RAM cache into the SVM's memory.
3703 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3704 */
3705void colo_flush_ram_cache(void)
3706{
3707    RAMBlock *block = NULL;
3708    void *dst_host;
3709    void *src_host;
3710    unsigned long offset = 0;
3711
3712    memory_global_dirty_log_sync();
3713    WITH_RCU_READ_LOCK_GUARD() {
3714        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3715            ramblock_sync_dirty_bitmap(ram_state, block);
3716        }
3717    }
3718
3719    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3720    WITH_RCU_READ_LOCK_GUARD() {
3721        block = QLIST_FIRST_RCU(&ram_list.blocks);
3722
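        /*
         * Walk every RAMBlock and copy each dirty page from the COLO
         * cache back into the SVM's memory; when find_dirty runs past
         * the end of a block, reset the offset and move to the next one.
         */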
3723        while (block) {
3724            offset = migration_bitmap_find_dirty(ram_state, block, offset);
3725
3726            if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3727                >= block->used_length) {
3728                offset = 0;
3729                block = QLIST_NEXT_RCU(block, next);
3730            } else {
3731                migration_bitmap_clear_dirty(ram_state, block, offset);
3732                dst_host = block->host
3733                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3734                src_host = block->colo_cache
3735                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3736                memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3737            }
3738        }
3739    }
3740    trace_colo_flush_ram_cache_end();
3741}
3742
3743/**
3744 * ram_load_precopy: load pages in precopy case
3745 *
3746 * Returns 0 for success or -errno in case of error
3747 *
3748 * Called in precopy mode by ram_load().
3749 * rcu_read_lock is taken prior to this being called.
3750 *
3751 * @f: QEMUFile to receive the data from
3752 */
3753static int ram_load_precopy(QEMUFile *f)
3754{
3755    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3756    /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
3757    bool postcopy_advised = postcopy_is_advised();
3758    if (!migrate_use_compression()) {
3759        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3760    }
3761
3762    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3763        ram_addr_t addr, total_ram_bytes;
3764        void *host = NULL, *host_bak = NULL;
3765        uint8_t ch;
3766
3767        /*
3768         * Yield periodically to let the main loop run, but an iteration of
3769         * the main loop is expensive, so only do it every so many iterations.
3770         */
3771        if ((i & 32767) == 0 && qemu_in_coroutine()) {
3772            aio_co_schedule(qemu_get_current_aio_context(),
3773                            qemu_coroutine_self());
3774            qemu_coroutine_yield();
3775        }
3776        i++;
3777
3778        addr = qemu_get_be64(f);
3779        flags = addr & ~TARGET_PAGE_MASK;
3780        addr &= TARGET_PAGE_MASK;
3781
3782        if (flags & invalid_flags) {
3783            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3784                error_report("Received an unexpected compressed page");
3785            }
3786
3787            ret = -EINVAL;
3788            break;
3789        }
3790
3791        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3792                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3793            RAMBlock *block = ram_block_from_stream(f, flags);
3794
3795            host = host_from_ram_block_offset(block, addr);
3796            /*
3797             * After entering the COLO stage, we should not load pages into
3798             * the SVM's memory directly; we put them into colo_cache first.
3799             * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3800             * Previously, we copied all of this memory in the COLO preparation
3801             * stage, which required stopping the VM and was time-consuming.
3802             * Here we optimize it by backing up every page during the
3803             * migration process while COLO is enabled.  Although this slows
3804             * the migration down a little, it clearly reduces the downtime of
3805             * backing up all of the SVM's memory in the COLO preparation stage.
3806             */
3807            if (migration_incoming_colo_enabled()) {
3808                if (migration_incoming_in_colo_state()) {
3809                    /* In COLO stage, put all pages into cache temporarily */
3810                    host = colo_cache_from_block_offset(block, addr, true);
3811                } else {
3812                    /*
3813                     * In the migration stage but before the COLO stage,
3814                     * put all pages into both the cache and the SVM's memory.
3815                     */
3816                    host_bak = colo_cache_from_block_offset(block, addr, false);
3817                }
3818            }
3819            if (!host) {
3820                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3821                ret = -EINVAL;
3822                break;
3823            }
3824            if (!migration_incoming_in_colo_state()) {
3825                ramblock_recv_bitmap_set(block, host);
3826            }
3827
3828            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3829        }
3830
3831        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3832        case RAM_SAVE_FLAG_MEM_SIZE:
3833            /* Synchronize RAM block list */
3834            total_ram_bytes = addr;
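            /*
             * The stream then carries one entry per RAMBlock: a
             * length-prefixed idstr and a be64 used_length, optionally
             * followed by a be64 page size and a be64 GPA depending on
             * the negotiated capabilities; each is checked against the
             * local RAMBlock below.
             */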
3835            while (!ret && total_ram_bytes) {
3836                RAMBlock *block;
3837                char id[256];
3838                ram_addr_t length;
3839
3840                len = qemu_get_byte(f);
3841                qemu_get_buffer(f, (uint8_t *)id, len);
3842                id[len] = 0;
3843                length = qemu_get_be64(f);
3844
3845                block = qemu_ram_block_by_name(id);
3846                if (block && !qemu_ram_is_migratable(block)) {
3847                    error_report("block %s should not be migrated!", id);
3848                    ret = -EINVAL;
3849                } else if (block) {
3850                    if (length != block->used_length) {
3851                        Error *local_err = NULL;
3852
3853                        ret = qemu_ram_resize(block, length,
3854                                              &local_err);
3855                        if (local_err) {
3856                            error_report_err(local_err);
3857                        }
3858                    }
3859                    /* For postcopy we need to check that hugepage sizes match */
3860                    if (postcopy_advised && migrate_postcopy_ram() &&
3861                        block->page_size != qemu_host_page_size) {
3862                        uint64_t remote_page_size = qemu_get_be64(f);
3863                        if (remote_page_size != block->page_size) {
3864                            error_report("Mismatched RAM page size %s "
3865                                         "(local) %zd != %" PRId64,
3866                                         id, block->page_size,
3867                                         remote_page_size);
3868                            ret = -EINVAL;
3869                        }
3870                    }
3871                    if (migrate_ignore_shared()) {
3872                        hwaddr addr = qemu_get_be64(f);
3873                        if (ramblock_is_ignored(block) &&
3874                            block->mr->addr != addr) {
3875                            error_report("Mismatched GPAs for block %s "
3876                                         "%" PRId64 " != %" PRId64,
3877                                         id, (uint64_t)addr,
3878                                         (uint64_t)block->mr->addr);
3879                            ret = -EINVAL;
3880                        }
3881                    }
3882                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3883                                          block->idstr);
3884                } else {
3885                    error_report("Unknown ramblock \"%s\", cannot "
3886                                 "accept migration", id);
3887                    ret = -EINVAL;
3888                }
3889
3890                total_ram_bytes -= length;
3891            }
3892            break;
3893
3894        case RAM_SAVE_FLAG_ZERO:
3895            ch = qemu_get_byte(f);
3896            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3897            break;
3898
3899        case RAM_SAVE_FLAG_PAGE:
3900            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3901            break;
3902
3903        case RAM_SAVE_FLAG_COMPRESS_PAGE:
3904            len = qemu_get_be32(f);
3905            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3906                error_report("Invalid compressed data length: %d", len);
3907                ret = -EINVAL;
3908                break;
3909            }
3910            decompress_data_with_multi_threads(f, host, len);
3911            break;
3912
3913        case RAM_SAVE_FLAG_XBZRLE:
3914            if (load_xbzrle(f, addr, host) < 0) {
3915                error_report("Failed to decompress XBZRLE page at "
3916                             RAM_ADDR_FMT, addr);
3917                ret = -EINVAL;
3918                break;
3919            }
3920            break;
3921        case RAM_SAVE_FLAG_EOS:
3922            /* normal exit */
3923            multifd_recv_sync_main();
3924            break;
3925        default:
3926            if (flags & RAM_SAVE_FLAG_HOOK) {
3927                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3928            } else {
3929                error_report("Unknown combination of migration flags: 0x%x",
3930                             flags);
3931                ret = -EINVAL;
3932            }
3933        }
3934        if (!ret) {
3935            ret = qemu_file_get_error(f);
3936        }
3937        if (!ret && host_bak) {
3938            memcpy(host_bak, host, TARGET_PAGE_SIZE);
3939        }
3940    }
3941
3942    ret |= wait_for_decompress_done();
3943    return ret;
3944}
3945
3946static int ram_load(QEMUFile *f, void *opaque, int version_id)
3947{
3948    int ret = 0;
3949    static uint64_t seq_iter;
3950    /*
3951     * If the system is running in postcopy mode, page inserts into host
3952     * memory must be atomic.
3953     */
3954    bool postcopy_running = postcopy_is_running();
3955
3956    seq_iter++;
3957
3958    if (version_id != 4) {
3959        return -EINVAL;
3960    }
3961
3962    /*
3963     * This RCU critical section can be very long running.
3964     * Once RCU reclaims in this code become numerous, it will be
3965     * necessary to reduce the granularity of this critical
3966     * section.
3967     */
3968    WITH_RCU_READ_LOCK_GUARD() {
3969        if (postcopy_running) {
3970            ret = ram_load_postcopy(f);
3971        } else {
3972            ret = ram_load_precopy(f);
3973        }
3974    }
3975    trace_ram_load_complete(ret, seq_iter);
3976
3977    return ret;
3978}
3979
3980static bool ram_has_postcopy(void *opaque)
3981{
3982    RAMBlock *rb;
3983    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3984        if (ramblock_is_pmem(rb)) {
3985            info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
3986                         "is not supported now!", rb->idstr, rb->host);
3987            return false;
3988        }
3989    }
3990
3991    return migrate_postcopy_ram();
3992}
3993
3994/* Sync all the dirty bitmaps with the destination VM. */
3995static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3996{
3997    RAMBlock *block;
3998    QEMUFile *file = s->to_dst_file;
3999    int ramblock_count = 0;
4000
4001    trace_ram_dirty_bitmap_sync_start();
4002
4003    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4004        qemu_savevm_send_recv_bitmap(file, block->idstr);
4005        trace_ram_dirty_bitmap_request(block->idstr);
4006        ramblock_count++;
4007    }
4008
4009    trace_ram_dirty_bitmap_sync_wait();
4010
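    /*
     * Each request above is answered asynchronously on the return path:
     * ram_dirty_bitmap_reload() posts rp_sem once per ramblock, which is
     * what the loop below counts down.
     */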
4011    /* Wait until all the ramblocks' dirty bitmaps have been synced */
4012    while (ramblock_count--) {
4013        qemu_sem_wait(&s->rp_state.rp_sem);
4014    }
4015
4016    trace_ram_dirty_bitmap_sync_complete();
4017
4018    return 0;
4019}
4020
4021static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4022{
4023    qemu_sem_post(&s->rp_state.rp_sem);
4024}
4025
4026/*
4027 * Read the received bitmap and invert it to use as the initial dirty bitmap.
4028 * This is only used when the postcopy migration is paused but needs
4029 * to resume from a middle point.
4030 */
4031int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4032{
4033    int ret = -EINVAL;
4034    QEMUFile *file = s->rp_state.from_dst_file;
4035    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4036    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4037    uint64_t size, end_mark;
4038
4039    trace_ram_dirty_bitmap_reload_begin(block->idstr);
4040
4041    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4042        error_report("%s: incorrect state %s", __func__,
4043                     MigrationStatus_str(s->state));
4044        return -EINVAL;
4045    }
4046
4047    /*
4048     * Note: see comments in ramblock_recv_bitmap_send() on why we
4049     * need the endianness conversion and the padding.
4050     */
4051    local_size = ROUND_UP(local_size, 8);
4052
4053    /* Add padding */
4054    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4055
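    /*
     * The bitmap record on the wire is: a be64 size, the raw
     * little-endian bitmap (padded to a multiple of 8 bytes), and a
     * be64 end mark that must equal RAMBLOCK_RECV_BITMAP_ENDING.
     */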
4056    size = qemu_get_be64(file);
4057
4058    /* The size of the bitmap should match that of our ramblock */
4059    if (size != local_size) {
4060        error_report("%s: ramblock '%s' bitmap size mismatch "
4061                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4062                     block->idstr, size, local_size);
4063        ret = -EINVAL;
4064        goto out;
4065    }
4066
4067    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4068    end_mark = qemu_get_be64(file);
4069
4070    ret = qemu_file_get_error(file);
4071    if (ret || size != local_size) {
4072        error_report("%s: read bitmap failed for ramblock '%s': %d"
4073                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4074                     __func__, block->idstr, ret, local_size, size);
4075        ret = -EIO;
4076        goto out;
4077    }
4078
4079    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4080        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4081                     __func__, block->idstr, end_mark);
4082        ret = -EINVAL;
4083        goto out;
4084    }
4085
4086    /*
4087     * Endianness conversion.  We are in postcopy (though paused), so
4088     * the dirty bitmap won't change and we can modify it directly.
4089     */
4090    bitmap_from_le(block->bmap, le_bitmap, nbits);
4091
4092    /*
4093     * What we received is the "received bitmap".  Invert it to get the
4094     * initial dirty bitmap for this ramblock.
4095     */
4096    bitmap_complement(block->bmap, block->bmap, nbits);
4097
4098    trace_ram_dirty_bitmap_reload_complete(block->idstr);
4099
4100    /*
4101     * We succeeded in syncing the bitmap for the current ramblock.  If
4102     * this is the last one to sync, we need to notify the main send thread.
4103     */
4104    ram_dirty_bitmap_reload_notify(s);
4105
4106    ret = 0;
4107out:
4108    g_free(le_bitmap);
4109    return ret;
4110}
4111
4112static int ram_resume_prepare(MigrationState *s, void *opaque)
4113{
4114    RAMState *rs = *(RAMState **)opaque;
4115    int ret;
4116
4117    ret = ram_dirty_bitmap_sync_all(s, rs);
4118    if (ret) {
4119        return ret;
4120    }
4121
4122    ram_state_resume_prepare(rs, s->to_dst_file);
4123
4124    return 0;
4125}
4126
4127static SaveVMHandlers savevm_ram_handlers = {
4128    .save_setup = ram_save_setup,
4129    .save_live_iterate = ram_save_iterate,
4130    .save_live_complete_postcopy = ram_save_complete,
4131    .save_live_complete_precopy = ram_save_complete,
4132    .has_postcopy = ram_has_postcopy,
4133    .save_live_pending = ram_save_pending,
4134    .load_state = ram_load,
4135    .save_cleanup = ram_save_cleanup,
4136    .load_setup = ram_load_setup,
4137    .load_cleanup = ram_load_cleanup,
4138    .resume_prepare = ram_resume_prepare,
4139};
4140
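/*
 * Register the "ram" live migration section (stream version 4) with the
 * generic savevm layer; &ram_state is the opaque pointer handed back to
 * the handlers above.
 */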
4141void ram_mig_init(void)
4142{
4143    qemu_mutex_init(&XBZRLE.lock);
4144    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4145}
4146