qemu/migration/ram.c
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2011-2015 Red Hat Inc
   6 *
   7 * Authors:
   8 *  Juan Quintela <quintela@redhat.com>
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28#include "qemu/osdep.h"
  29#include "qemu-common.h"
  30#include "cpu.h"
  31#include <zlib.h>
  32#include "qapi-event.h"
  33#include "qemu/cutils.h"
  34#include "qemu/bitops.h"
  35#include "qemu/bitmap.h"
  36#include "qemu/timer.h"
  37#include "qemu/main-loop.h"
  38#include "migration/migration.h"
  39#include "migration/postcopy-ram.h"
  40#include "exec/address-spaces.h"
  41#include "migration/page_cache.h"
  42#include "qemu/error-report.h"
  43#include "trace.h"
  44#include "exec/ram_addr.h"
  45#include "qemu/rcu_queue.h"
  46
  47#ifdef DEBUG_MIGRATION_RAM
  48#define DPRINTF(fmt, ...) \
  49    do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
  50#else
  51#define DPRINTF(fmt, ...) \
  52    do { } while (0)
  53#endif
  54
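/* Counts how many consecutive bitmap syncs have seen the guest dirtying
 * memory faster than it is being transferred; used by migration_bitmap_sync()
 * to decide when to start or increase auto-converge throttling.
 */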
  55static int dirty_rate_high_cnt;
  56
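/* How many times the dirty bitmap has been synchronised with the guest;
 * also serves as the generation number for XBZRLE cache entries.
 */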
  57static uint64_t bitmap_sync_count;
  58
  59/***********************************************************/
  60/* ram save/restore */
  61
  62#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  63#define RAM_SAVE_FLAG_COMPRESS 0x02
  64#define RAM_SAVE_FLAG_MEM_SIZE 0x04
  65#define RAM_SAVE_FLAG_PAGE     0x08
  66#define RAM_SAVE_FLAG_EOS      0x10
  67#define RAM_SAVE_FLAG_CONTINUE 0x20
  68#define RAM_SAVE_FLAG_XBZRLE   0x40
   69/* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
  70#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
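/*
 * Each page sent on the wire starts with a be64 header: the page's offset
 * within its RAMBlock with the RAM_SAVE_FLAG_* bits OR'ed into the low bits.
 * Unless RAM_SAVE_FLAG_CONTINUE is set, the header is followed by a one-byte
 * length and the RAMBlock idstr (see save_page_header()).
 */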
  71
  72static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
  73
  74static inline bool is_zero_range(uint8_t *p, uint64_t size)
  75{
  76    return buffer_find_nonzero_offset(p, size) == size;
  77}
  78
   79/* struct containing the XBZRLE cache and the static buffers
   80   used by the compression */
  81static struct {
  82    /* buffer used for XBZRLE encoding */
  83    uint8_t *encoded_buf;
  84    /* buffer for storing page content */
  85    uint8_t *current_buf;
  86    /* Cache for XBZRLE, Protected by lock. */
  87    PageCache *cache;
  88    QemuMutex lock;
  89} XBZRLE;
  90
  91/* buffer used for XBZRLE decoding */
  92static uint8_t *xbzrle_decoded_buf;
  93
  94static void XBZRLE_cache_lock(void)
  95{
  96    if (migrate_use_xbzrle())
  97        qemu_mutex_lock(&XBZRLE.lock);
  98}
  99
 100static void XBZRLE_cache_unlock(void)
 101{
 102    if (migrate_use_xbzrle())
 103        qemu_mutex_unlock(&XBZRLE.lock);
 104}
 105
 106/*
 107 * called from qmp_migrate_set_cache_size in main thread, possibly while
 108 * a migration is in progress.
  109 * A running migration may be using the cache and might finish during this
  110 * call, hence changes to the cache are protected by XBZRLE.lock.
 111 */
 112int64_t xbzrle_cache_resize(int64_t new_size)
 113{
 114    PageCache *new_cache;
 115    int64_t ret;
 116
 117    if (new_size < TARGET_PAGE_SIZE) {
 118        return -1;
 119    }
 120
 121    XBZRLE_cache_lock();
 122
 123    if (XBZRLE.cache != NULL) {
 124        if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
 125            goto out_new_size;
 126        }
 127        new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
 128                                        TARGET_PAGE_SIZE);
 129        if (!new_cache) {
 130            error_report("Error creating cache");
 131            ret = -1;
 132            goto out;
 133        }
 134
 135        cache_fini(XBZRLE.cache);
 136        XBZRLE.cache = new_cache;
 137    }
 138
 139out_new_size:
 140    ret = pow2floor(new_size);
 141out:
 142    XBZRLE_cache_unlock();
 143    return ret;
 144}
 145
 146/* accounting for migration statistics */
 147typedef struct AccountingInfo {
 148    uint64_t dup_pages;
 149    uint64_t skipped_pages;
 150    uint64_t norm_pages;
 151    uint64_t iterations;
 152    uint64_t xbzrle_bytes;
 153    uint64_t xbzrle_pages;
 154    uint64_t xbzrle_cache_miss;
 155    double xbzrle_cache_miss_rate;
 156    uint64_t xbzrle_overflows;
 157} AccountingInfo;
 158
 159static AccountingInfo acct_info;
 160
 161static void acct_clear(void)
 162{
 163    memset(&acct_info, 0, sizeof(acct_info));
 164}
 165
 166uint64_t dup_mig_bytes_transferred(void)
 167{
 168    return acct_info.dup_pages * TARGET_PAGE_SIZE;
 169}
 170
 171uint64_t dup_mig_pages_transferred(void)
 172{
 173    return acct_info.dup_pages;
 174}
 175
 176uint64_t skipped_mig_bytes_transferred(void)
 177{
 178    return acct_info.skipped_pages * TARGET_PAGE_SIZE;
 179}
 180
 181uint64_t skipped_mig_pages_transferred(void)
 182{
 183    return acct_info.skipped_pages;
 184}
 185
 186uint64_t norm_mig_bytes_transferred(void)
 187{
 188    return acct_info.norm_pages * TARGET_PAGE_SIZE;
 189}
 190
 191uint64_t norm_mig_pages_transferred(void)
 192{
 193    return acct_info.norm_pages;
 194}
 195
 196uint64_t xbzrle_mig_bytes_transferred(void)
 197{
 198    return acct_info.xbzrle_bytes;
 199}
 200
 201uint64_t xbzrle_mig_pages_transferred(void)
 202{
 203    return acct_info.xbzrle_pages;
 204}
 205
 206uint64_t xbzrle_mig_pages_cache_miss(void)
 207{
 208    return acct_info.xbzrle_cache_miss;
 209}
 210
 211double xbzrle_mig_cache_miss_rate(void)
 212{
 213    return acct_info.xbzrle_cache_miss_rate;
 214}
 215
 216uint64_t xbzrle_mig_pages_overflow(void)
 217{
 218    return acct_info.xbzrle_overflows;
 219}
 220
  221/* This is the last block that we have visited searching for dirty pages
 222 */
 223static RAMBlock *last_seen_block;
 224/* This is the last block from where we have sent data */
 225static RAMBlock *last_sent_block;
 226static ram_addr_t last_offset;
 227static QemuMutex migration_bitmap_mutex;
 228static uint64_t migration_dirty_pages;
 229static uint32_t last_version;
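/* True while we are in the initial 'bulk' pass over RAM, during which every
 * page is assumed dirty and the bitmap search can be skipped; cleared after
 * the first complete round, or once pages are sent out of order for postcopy.
 */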
 230static bool ram_bulk_stage;
 231
 232/* used by the search for pages to send */
 233struct PageSearchStatus {
 234    /* Current block being searched */
 235    RAMBlock    *block;
 236    /* Current offset to search from */
 237    ram_addr_t   offset;
 238    /* Set once we wrap around */
 239    bool         complete_round;
 240};
 241typedef struct PageSearchStatus PageSearchStatus;
 242
 243static struct BitmapRcu {
 244    struct rcu_head rcu;
 245    /* Main migration bitmap */
 246    unsigned long *bmap;
 247    /* bitmap of pages that haven't been sent even once
 248     * only maintained and used in postcopy at the moment
 249     * where it's used to send the dirtymap at the start
 250     * of the postcopy phase
 251     */
 252    unsigned long *unsentmap;
 253} *migration_bitmap_rcu;
 254
 255struct CompressParam {
 256    bool done;
 257    bool quit;
 258    QEMUFile *file;
 259    QemuMutex mutex;
 260    QemuCond cond;
 261    RAMBlock *block;
 262    ram_addr_t offset;
 263};
 264typedef struct CompressParam CompressParam;
 265
 266struct DecompressParam {
 267    bool done;
 268    bool quit;
 269    QemuMutex mutex;
 270    QemuCond cond;
 271    void *des;
 272    uint8_t *compbuf;
 273    int len;
 274};
 275typedef struct DecompressParam DecompressParam;
 276
 277static CompressParam *comp_param;
 278static QemuThread *compress_threads;
 279/* comp_done_cond is used to wake up the migration thread when
 280 * one of the compression threads has finished the compression.
  281 * comp_done_lock is used together with comp_done_cond.
 282 */
 283static QemuMutex comp_done_lock;
 284static QemuCond comp_done_cond;
  285/* The empty QEMUFileOps will be used by the file member of CompressParam */
 286static const QEMUFileOps empty_ops = { };
 287
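/* While true, pages go through the multi-threaded compression path; cleared
 * in find_dirty_block() once the first pass completes with XBZRLE enabled,
 * at which point XBZRLE takes over.
 */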
 288static bool compression_switch;
 289static DecompressParam *decomp_param;
 290static QemuThread *decompress_threads;
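/* These appear to mirror comp_done_cond/comp_done_lock for the decompression
 * threads on the destination side.
 */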
 291static QemuMutex decomp_done_lock;
 292static QemuCond decomp_done_cond;
 293
 294static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 295                                ram_addr_t offset);
 296
 297static void *do_data_compress(void *opaque)
 298{
 299    CompressParam *param = opaque;
 300    RAMBlock *block;
 301    ram_addr_t offset;
 302
 303    qemu_mutex_lock(&param->mutex);
 304    while (!param->quit) {
 305        if (param->block) {
 306            block = param->block;
 307            offset = param->offset;
 308            param->block = NULL;
 309            qemu_mutex_unlock(&param->mutex);
 310
 311            do_compress_ram_page(param->file, block, offset);
 312
 313            qemu_mutex_lock(&comp_done_lock);
 314            param->done = true;
 315            qemu_cond_signal(&comp_done_cond);
 316            qemu_mutex_unlock(&comp_done_lock);
 317
 318            qemu_mutex_lock(&param->mutex);
 319        } else {
 320            qemu_cond_wait(&param->cond, &param->mutex);
 321        }
 322    }
 323    qemu_mutex_unlock(&param->mutex);
 324
 325    return NULL;
 326}
 327
 328static inline void terminate_compression_threads(void)
 329{
 330    int idx, thread_count;
 331
 332    thread_count = migrate_compress_threads();
 333    for (idx = 0; idx < thread_count; idx++) {
 334        qemu_mutex_lock(&comp_param[idx].mutex);
 335        comp_param[idx].quit = true;
 336        qemu_cond_signal(&comp_param[idx].cond);
 337        qemu_mutex_unlock(&comp_param[idx].mutex);
 338    }
 339}
 340
 341void migrate_compress_threads_join(void)
 342{
 343    int i, thread_count;
 344
 345    if (!migrate_use_compression()) {
 346        return;
 347    }
 348    terminate_compression_threads();
 349    thread_count = migrate_compress_threads();
 350    for (i = 0; i < thread_count; i++) {
 351        qemu_thread_join(compress_threads + i);
 352        qemu_fclose(comp_param[i].file);
 353        qemu_mutex_destroy(&comp_param[i].mutex);
 354        qemu_cond_destroy(&comp_param[i].cond);
 355    }
 356    qemu_mutex_destroy(&comp_done_lock);
 357    qemu_cond_destroy(&comp_done_cond);
 358    g_free(compress_threads);
 359    g_free(comp_param);
 360    compress_threads = NULL;
 361    comp_param = NULL;
 362}
 363
 364void migrate_compress_threads_create(void)
 365{
 366    int i, thread_count;
 367
 368    if (!migrate_use_compression()) {
 369        return;
 370    }
 371    compression_switch = true;
 372    thread_count = migrate_compress_threads();
 373    compress_threads = g_new0(QemuThread, thread_count);
 374    comp_param = g_new0(CompressParam, thread_count);
 375    qemu_cond_init(&comp_done_cond);
 376    qemu_mutex_init(&comp_done_lock);
 377    for (i = 0; i < thread_count; i++) {
 378        /* comp_param[i].file is just used as a dummy buffer to save data,
 379         * set its ops to empty.
 380         */
 381        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 382        comp_param[i].done = true;
 383        comp_param[i].quit = false;
 384        qemu_mutex_init(&comp_param[i].mutex);
 385        qemu_cond_init(&comp_param[i].cond);
 386        qemu_thread_create(compress_threads + i, "compress",
 387                           do_data_compress, comp_param + i,
 388                           QEMU_THREAD_JOINABLE);
 389    }
 390}
 391
 392/**
 393 * save_page_header: Write page header to wire
 394 *
 395 * If this is the 1st block, it also writes the block identification
 396 *
 397 * Returns: Number of bytes written
 398 *
 399 * @f: QEMUFile where to send the data
 400 * @block: block that contains the page we want to send
 401 * @offset: offset inside the block for the page
 402 *          in the lower bits, it contains flags
 403 */
 404static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
 405{
 406    size_t size, len;
 407
 408    qemu_put_be64(f, offset);
 409    size = 8;
 410
 411    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 412        len = strlen(block->idstr);
 413        qemu_put_byte(f, len);
 414        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 415        size += 1 + len;
 416    }
 417    return size;
 418}
 419
  420/* Reduce the amount of guest CPU execution to hopefully slow down memory writes.
 421 * If guest dirty memory rate is reduced below the rate at which we can
 422 * transfer pages to the destination then we should be able to complete
 423 * migration. Some workloads dirty memory way too fast and will not effectively
 424 * converge, even with auto-converge.
 425 */
 426static void mig_throttle_guest_down(void)
 427{
 428    MigrationState *s = migrate_get_current();
 429    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 430    uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 431
 432    /* We have not started throttling yet. Let's start it. */
 433    if (!cpu_throttle_active()) {
 434        cpu_throttle_set(pct_initial);
 435    } else {
 436        /* Throttling already on, just increase the rate */
 437        cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
 438    }
 439}
 440
 441/* Update the xbzrle cache to reflect a page that's been sent as all 0.
 442 * The important thing is that a stale (not-yet-0'd) page be replaced
 443 * by the new data.
 444 * As a bonus, if the page wasn't in the cache it gets added so that
  445 * when a small write is made into the 0'd page it can be sent via XBZRLE.
 446 */
 447static void xbzrle_cache_zero_page(ram_addr_t current_addr)
 448{
 449    if (ram_bulk_stage || !migrate_use_xbzrle()) {
 450        return;
 451    }
 452
  453    /* We don't care if this fails to allocate a new cache page
  454     * as long as it updates an old one */
 455    cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
 456                 bitmap_sync_count);
 457}
 458
 459#define ENCODING_FLAG_XBZRLE 0x1
 460
 461/**
 462 * save_xbzrle_page: compress and send current page
 463 *
 464 * Returns: 1 means that we wrote the page
 465 *          0 means that page is identical to the one already sent
 466 *          -1 means that xbzrle would be longer than normal
 467 *
 468 * @f: QEMUFile where to send the data
  469 * @current_data: pointer to the page data; may be updated to the cached copy
  470 * @current_addr: address of the page, used as the XBZRLE cache key
 471 * @block: block that contains the page we want to send
 472 * @offset: offset inside the block for the page
 473 * @last_stage: if we are at the completion stage
 474 * @bytes_transferred: increase it with the number of transferred bytes
 475 */
 476static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
 477                            ram_addr_t current_addr, RAMBlock *block,
 478                            ram_addr_t offset, bool last_stage,
 479                            uint64_t *bytes_transferred)
 480{
 481    int encoded_len = 0, bytes_xbzrle;
 482    uint8_t *prev_cached_page;
 483
 484    if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
 485        acct_info.xbzrle_cache_miss++;
 486        if (!last_stage) {
 487            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 488                             bitmap_sync_count) == -1) {
 489                return -1;
 490            } else {
 491                /* update *current_data when the page has been
 492                   inserted into cache */
 493                *current_data = get_cached_data(XBZRLE.cache, current_addr);
 494            }
 495        }
 496        return -1;
 497    }
 498
 499    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 500
 501    /* save current buffer into memory */
 502    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 503
 504    /* XBZRLE encoding (if there is no overflow) */
 505    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 506                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 507                                       TARGET_PAGE_SIZE);
 508    if (encoded_len == 0) {
 509        DPRINTF("Skipping unmodified page\n");
 510        return 0;
 511    } else if (encoded_len == -1) {
 512        DPRINTF("Overflow\n");
 513        acct_info.xbzrle_overflows++;
 514        /* update data in the cache */
 515        if (!last_stage) {
 516            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
 517            *current_data = prev_cached_page;
 518        }
 519        return -1;
 520    }
 521
  522    /* Update the cached page so that it matches the data we have just sent */
 523    if (!last_stage) {
 524        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 525    }
 526
 527    /* Send XBZRLE based compressed page */
 528    bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
 529    qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
 530    qemu_put_be16(f, encoded_len);
 531    qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
 532    bytes_xbzrle += encoded_len + 1 + 2;
 533    acct_info.xbzrle_pages++;
 534    acct_info.xbzrle_bytes += bytes_xbzrle;
 535    *bytes_transferred += bytes_xbzrle;
 536
 537    return 1;
 538}
 539
 540/* Called with rcu_read_lock() to protect migration_bitmap
  541 * rb: The RAMBlock to search for dirty pages in
 542 * start: Start address (typically so we can continue from previous page)
 543 * ram_addr_abs: Pointer into which to store the address of the dirty page
 544 *               within the global ram_addr space
 545 *
 546 * Returns: byte offset within memory region of the start of a dirty page
 547 */
 548static inline
 549ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
 550                                       ram_addr_t start,
 551                                       ram_addr_t *ram_addr_abs)
 552{
 553    unsigned long base = rb->offset >> TARGET_PAGE_BITS;
 554    unsigned long nr = base + (start >> TARGET_PAGE_BITS);
 555    uint64_t rb_size = rb->used_length;
 556    unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
 557    unsigned long *bitmap;
 558
 559    unsigned long next;
 560
 561    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 562    if (ram_bulk_stage && nr > base) {
 563        next = nr + 1;
 564    } else {
 565        next = find_next_bit(bitmap, size, nr);
 566    }
 567
 568    *ram_addr_abs = next << TARGET_PAGE_BITS;
 569    return (next - base) << TARGET_PAGE_BITS;
 570}
 571
 572static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
 573{
 574    bool ret;
 575    int nr = addr >> TARGET_PAGE_BITS;
 576    unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 577
 578    ret = test_and_clear_bit(nr, bitmap);
 579
 580    if (ret) {
 581        migration_dirty_pages--;
 582    }
 583    return ret;
 584}
 585
 586static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
 587{
 588    unsigned long *bitmap;
 589    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 590    migration_dirty_pages +=
 591        cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
 592}
 593
  594/* Fix me: there are too many global variables used in the migration process. */
 595static int64_t start_time;
 596static int64_t bytes_xfer_prev;
 597static int64_t num_dirty_pages_period;
 598static uint64_t xbzrle_cache_miss_prev;
 599static uint64_t iterations_prev;
 600
 601static void migration_bitmap_sync_init(void)
 602{
 603    start_time = 0;
 604    bytes_xfer_prev = 0;
 605    num_dirty_pages_period = 0;
 606    xbzrle_cache_miss_prev = 0;
 607    iterations_prev = 0;
 608}
 609
 610static void migration_bitmap_sync(void)
 611{
 612    RAMBlock *block;
 613    uint64_t num_dirty_pages_init = migration_dirty_pages;
 614    MigrationState *s = migrate_get_current();
 615    int64_t end_time;
 616    int64_t bytes_xfer_now;
 617
 618    bitmap_sync_count++;
 619
 620    if (!bytes_xfer_prev) {
 621        bytes_xfer_prev = ram_bytes_transferred();
 622    }
 623
 624    if (!start_time) {
 625        start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 626    }
 627
 628    trace_migration_bitmap_sync_start();
 629    address_space_sync_dirty_bitmap(&address_space_memory);
 630
 631    qemu_mutex_lock(&migration_bitmap_mutex);
 632    rcu_read_lock();
 633    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
 634        migration_bitmap_sync_range(block->offset, block->used_length);
 635    }
 636    rcu_read_unlock();
 637    qemu_mutex_unlock(&migration_bitmap_mutex);
 638
 639    trace_migration_bitmap_sync_end(migration_dirty_pages
 640                                    - num_dirty_pages_init);
 641    num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
 642    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 643
  644    /* more than 1 second = 1000 milliseconds */
 645    if (end_time > start_time + 1000) {
 646        if (migrate_auto_converge()) {
  647            /* The following detection logic can be refined later. For now:
  648               Check to see if the bytes dirtied are more than half of the
  649               approx. amount of bytes that just got transferred since the
  650               last time we were in this routine. If that happens twice,
  651               start or increase throttling */
 652            bytes_xfer_now = ram_bytes_transferred();
 653
 654            if (s->dirty_pages_rate &&
 655               (num_dirty_pages_period * TARGET_PAGE_SIZE >
 656                   (bytes_xfer_now - bytes_xfer_prev)/2) &&
 657               (dirty_rate_high_cnt++ >= 2)) {
 658                    trace_migration_throttle();
 659                    dirty_rate_high_cnt = 0;
 660                    mig_throttle_guest_down();
 661             }
 662             bytes_xfer_prev = bytes_xfer_now;
 663        }
 664
 665        if (migrate_use_xbzrle()) {
 666            if (iterations_prev != acct_info.iterations) {
 667                acct_info.xbzrle_cache_miss_rate =
 668                   (double)(acct_info.xbzrle_cache_miss -
 669                            xbzrle_cache_miss_prev) /
 670                   (acct_info.iterations - iterations_prev);
 671            }
 672            iterations_prev = acct_info.iterations;
 673            xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
 674        }
 675        s->dirty_pages_rate = num_dirty_pages_period * 1000
 676            / (end_time - start_time);
 677        s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
 678        start_time = end_time;
 679        num_dirty_pages_period = 0;
 680    }
 681    s->dirty_sync_count = bitmap_sync_count;
 682    if (migrate_use_events()) {
 683        qapi_event_send_migration_pass(bitmap_sync_count, NULL);
 684    }
 685}
 686
 687/**
 688 * save_zero_page: Send the zero page to the stream
 689 *
 690 * Returns: Number of pages written.
 691 *
 692 * @f: QEMUFile where to send the data
 693 * @block: block that contains the page we want to send
 694 * @offset: offset inside the block for the page
 695 * @p: pointer to the page
 696 * @bytes_transferred: increase it with the number of transferred bytes
 697 */
 698static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
 699                          uint8_t *p, uint64_t *bytes_transferred)
 700{
 701    int pages = -1;
 702
 703    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 704        acct_info.dup_pages++;
 705        *bytes_transferred += save_page_header(f, block,
 706                                               offset | RAM_SAVE_FLAG_COMPRESS);
 707        qemu_put_byte(f, 0);
 708        *bytes_transferred += 1;
 709        pages = 1;
 710    }
 711
 712    return pages;
 713}
 714
 715/**
 716 * ram_save_page: Send the given page to the stream
 717 *
 718 * Returns: Number of pages written.
 719 *          < 0 - error
 720 *          >=0 - Number of pages written - this might legally be 0
 721 *                if xbzrle noticed the page was the same.
 722 *
 723 * @f: QEMUFile where to send the data
 724 * @block: block that contains the page we want to send
 725 * @offset: offset inside the block for the page
 726 * @last_stage: if we are at the completion stage
 727 * @bytes_transferred: increase it with the number of transferred bytes
 728 */
 729static int ram_save_page(QEMUFile *f, PageSearchStatus *pss,
 730                         bool last_stage, uint64_t *bytes_transferred)
 731{
 732    int pages = -1;
 733    uint64_t bytes_xmit;
 734    ram_addr_t current_addr;
 735    uint8_t *p;
 736    int ret;
 737    bool send_async = true;
 738    RAMBlock *block = pss->block;
 739    ram_addr_t offset = pss->offset;
 740
 741    p = block->host + offset;
 742
  743    /* When in doubt, send the page as a normal page */
 744    bytes_xmit = 0;
 745    ret = ram_control_save_page(f, block->offset,
 746                           offset, TARGET_PAGE_SIZE, &bytes_xmit);
 747    if (bytes_xmit) {
 748        *bytes_transferred += bytes_xmit;
 749        pages = 1;
 750    }
 751
 752    XBZRLE_cache_lock();
 753
 754    current_addr = block->offset + offset;
 755
 756    if (block == last_sent_block) {
 757        offset |= RAM_SAVE_FLAG_CONTINUE;
 758    }
 759    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 760        if (ret != RAM_SAVE_CONTROL_DELAYED) {
 761            if (bytes_xmit > 0) {
 762                acct_info.norm_pages++;
 763            } else if (bytes_xmit == 0) {
 764                acct_info.dup_pages++;
 765            }
 766        }
 767    } else {
 768        pages = save_zero_page(f, block, offset, p, bytes_transferred);
 769        if (pages > 0) {
 770            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 771             * page would be stale
 772             */
 773            xbzrle_cache_zero_page(current_addr);
 774        } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
 775            pages = save_xbzrle_page(f, &p, current_addr, block,
 776                                     offset, last_stage, bytes_transferred);
 777            if (!last_stage) {
 778                /* Can't send this cached data async, since the cache page
 779                 * might get updated before it gets to the wire
 780                 */
 781                send_async = false;
 782            }
 783        }
 784    }
 785
 786    /* XBZRLE overflow or normal page */
 787    if (pages == -1) {
 788        *bytes_transferred += save_page_header(f, block,
 789                                               offset | RAM_SAVE_FLAG_PAGE);
 790        if (send_async) {
 791            qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
 792        } else {
 793            qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
 794        }
 795        *bytes_transferred += TARGET_PAGE_SIZE;
 796        pages = 1;
 797        acct_info.norm_pages++;
 798    }
 799
 800    XBZRLE_cache_unlock();
 801
 802    return pages;
 803}
 804
 805static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 806                                ram_addr_t offset)
 807{
 808    int bytes_sent, blen;
 809    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
 810
 811    bytes_sent = save_page_header(f, block, offset |
 812                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
 813    blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
 814                                     migrate_compress_level());
 815    if (blen < 0) {
 816        bytes_sent = 0;
 817        qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
 818        error_report("compressed data failed!");
 819    } else {
 820        bytes_sent += blen;
 821    }
 822
 823    return bytes_sent;
 824}
 825
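/* Total number of bytes written to the migration stream for RAM pages so
 * far; reported through ram_bytes_transferred().
 */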
 826static uint64_t bytes_transferred;
 827
 828static void flush_compressed_data(QEMUFile *f)
 829{
 830    int idx, len, thread_count;
 831
 832    if (!migrate_use_compression()) {
 833        return;
 834    }
 835    thread_count = migrate_compress_threads();
 836
 837    qemu_mutex_lock(&comp_done_lock);
 838    for (idx = 0; idx < thread_count; idx++) {
 839        while (!comp_param[idx].done) {
 840            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 841        }
 842    }
 843    qemu_mutex_unlock(&comp_done_lock);
 844
 845    for (idx = 0; idx < thread_count; idx++) {
 846        qemu_mutex_lock(&comp_param[idx].mutex);
 847        if (!comp_param[idx].quit) {
 848            len = qemu_put_qemu_file(f, comp_param[idx].file);
 849            bytes_transferred += len;
 850        }
 851        qemu_mutex_unlock(&comp_param[idx].mutex);
 852    }
 853}
 854
 855static inline void set_compress_params(CompressParam *param, RAMBlock *block,
 856                                       ram_addr_t offset)
 857{
 858    param->block = block;
 859    param->offset = offset;
 860}
 861
 862static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
 863                                           ram_addr_t offset,
 864                                           uint64_t *bytes_transferred)
 865{
 866    int idx, thread_count, bytes_xmit = -1, pages = -1;
 867
 868    thread_count = migrate_compress_threads();
 869    qemu_mutex_lock(&comp_done_lock);
 870    while (true) {
 871        for (idx = 0; idx < thread_count; idx++) {
 872            if (comp_param[idx].done) {
 873                comp_param[idx].done = false;
 874                bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
 875                qemu_mutex_lock(&comp_param[idx].mutex);
 876                set_compress_params(&comp_param[idx], block, offset);
 877                qemu_cond_signal(&comp_param[idx].cond);
 878                qemu_mutex_unlock(&comp_param[idx].mutex);
 879                pages = 1;
 880                acct_info.norm_pages++;
 881                *bytes_transferred += bytes_xmit;
 882                break;
 883            }
 884        }
 885        if (pages > 0) {
 886            break;
 887        } else {
 888            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 889        }
 890    }
 891    qemu_mutex_unlock(&comp_done_lock);
 892
 893    return pages;
 894}
 895
 896/**
 897 * ram_save_compressed_page: compress the given page and send it to the stream
 898 *
 899 * Returns: Number of pages written.
 900 *
 901 * @f: QEMUFile where to send the data
 902 * @block: block that contains the page we want to send
 903 * @offset: offset inside the block for the page
 904 * @last_stage: if we are at the completion stage
 905 * @bytes_transferred: increase it with the number of transferred bytes
 906 */
 907static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss,
 908                                    bool last_stage,
 909                                    uint64_t *bytes_transferred)
 910{
 911    int pages = -1;
 912    uint64_t bytes_xmit = 0;
 913    uint8_t *p;
 914    int ret, blen;
 915    RAMBlock *block = pss->block;
 916    ram_addr_t offset = pss->offset;
 917
 918    p = block->host + offset;
 919
 920    ret = ram_control_save_page(f, block->offset,
 921                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
 922    if (bytes_xmit) {
 923        *bytes_transferred += bytes_xmit;
 924        pages = 1;
 925    }
 926    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 927        if (ret != RAM_SAVE_CONTROL_DELAYED) {
 928            if (bytes_xmit > 0) {
 929                acct_info.norm_pages++;
 930            } else if (bytes_xmit == 0) {
 931                acct_info.dup_pages++;
 932            }
 933        }
 934    } else {
  935        /* When starting a new block, the first page of the block should
  936         * be sent out before the other pages in the same block, and all
  937         * the pages in the previous block should have been sent out
  938         * already.  Keeping this order is important, because the 'cont'
  939         * flag is used to avoid resending the block name.
 940         */
 941        if (block != last_sent_block) {
 942            flush_compressed_data(f);
 943            pages = save_zero_page(f, block, offset, p, bytes_transferred);
 944            if (pages == -1) {
 945                /* Make sure the first page is sent out before other pages */
 946                bytes_xmit = save_page_header(f, block, offset |
 947                                              RAM_SAVE_FLAG_COMPRESS_PAGE);
 948                blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
 949                                                 migrate_compress_level());
 950                if (blen > 0) {
 951                    *bytes_transferred += bytes_xmit + blen;
 952                    acct_info.norm_pages++;
 953                    pages = 1;
 954                } else {
 955                    qemu_file_set_error(f, blen);
 956                    error_report("compressed data failed!");
 957                }
 958            }
 959        } else {
 960            offset |= RAM_SAVE_FLAG_CONTINUE;
 961            pages = save_zero_page(f, block, offset, p, bytes_transferred);
 962            if (pages == -1) {
 963                pages = compress_page_with_multi_thread(f, block, offset,
 964                                                        bytes_transferred);
 965            }
 966        }
 967    }
 968
 969    return pages;
 970}
 971
 972/*
 973 * Find the next dirty page and update any state associated with
 974 * the search process.
 975 *
 976 * Returns: True if a page is found
 977 *
 978 * @f: Current migration stream.
 979 * @pss: Data about the state of the current dirty page scan.
  980 * @again: Set to false if the search has scanned the whole of RAM
  981 * @ram_addr_abs: Pointer into which to store the address of the dirty page
 982 *               within the global ram_addr space
 983 */
 984static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
 985                             bool *again, ram_addr_t *ram_addr_abs)
 986{
 987    pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
 988                                              ram_addr_abs);
 989    if (pss->complete_round && pss->block == last_seen_block &&
 990        pss->offset >= last_offset) {
 991        /*
 992         * We've been once around the RAM and haven't found anything.
 993         * Give up.
 994         */
 995        *again = false;
 996        return false;
 997    }
 998    if (pss->offset >= pss->block->used_length) {
 999        /* Didn't find anything in this RAM Block */
1000        pss->offset = 0;
1001        pss->block = QLIST_NEXT_RCU(pss->block, next);
1002        if (!pss->block) {
1003            /* Hit the end of the list */
1004            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1005            /* Flag that we've looped */
1006            pss->complete_round = true;
1007            ram_bulk_stage = false;
1008            if (migrate_use_xbzrle()) {
1009                /* If xbzrle is on, stop using the data compression at this
1010                 * point. In theory, xbzrle can do better than compression.
1011                 */
1012                flush_compressed_data(f);
1013                compression_switch = false;
1014            }
1015        }
1016        /* Didn't find anything this time, but try again on the new block */
1017        *again = true;
1018        return false;
1019    } else {
1020        /* Can go around again, but... */
1021        *again = true;
1022        /* We've found something so probably don't need to */
1023        return true;
1024    }
1025}
1026
1027/*
1028 * Helper for 'get_queued_page' - gets a page off the queue
1029 *      ms:      MigrationState in
1030 * *offset:      Used to return the offset within the RAMBlock
1031 * ram_addr_abs: global offset in the dirty/sent bitmaps
1032 *
1033 * Returns:      block (or NULL if none available)
1034 */
1035static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1036                              ram_addr_t *ram_addr_abs)
1037{
1038    RAMBlock *block = NULL;
1039
1040    qemu_mutex_lock(&ms->src_page_req_mutex);
1041    if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1042        struct MigrationSrcPageRequest *entry =
1043                                QSIMPLEQ_FIRST(&ms->src_page_requests);
1044        block = entry->rb;
1045        *offset = entry->offset;
1046        *ram_addr_abs = (entry->offset + entry->rb->offset) &
1047                        TARGET_PAGE_MASK;
1048
1049        if (entry->len > TARGET_PAGE_SIZE) {
1050            entry->len -= TARGET_PAGE_SIZE;
1051            entry->offset += TARGET_PAGE_SIZE;
1052        } else {
1053            memory_region_unref(block->mr);
1054            QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1055            g_free(entry);
1056        }
1057    }
1058    qemu_mutex_unlock(&ms->src_page_req_mutex);
1059
1060    return block;
1061}
1062
1063/*
1064 * Unqueue a page from the queue fed by postcopy page requests; skips pages
1065 * that are already sent (!dirty)
1066 *
1067 *      ms:      MigrationState in
1068 *     pss:      PageSearchStatus structure updated with found block/offset
1069 * ram_addr_abs: global offset in the dirty/sent bitmaps
1070 *
1071 * Returns:      true if a queued page is found
1072 */
1073static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
1074                            ram_addr_t *ram_addr_abs)
1075{
1076    RAMBlock  *block;
1077    ram_addr_t offset;
1078    bool dirty;
1079
1080    do {
1081        block = unqueue_page(ms, &offset, ram_addr_abs);
1082        /*
1083         * We're sending this page, and since it's postcopy nothing else
1084         * will dirty it, and we must make sure it doesn't get sent again
1085         * even if this queue request was received after the background
1086         * search already sent it.
1087         */
1088        if (block) {
1089            unsigned long *bitmap;
1090            bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1091            dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1092            if (!dirty) {
1093                trace_get_queued_page_not_dirty(
1094                    block->idstr, (uint64_t)offset,
1095                    (uint64_t)*ram_addr_abs,
1096                    test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1097                         atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1098            } else {
1099                trace_get_queued_page(block->idstr,
1100                                      (uint64_t)offset,
1101                                      (uint64_t)*ram_addr_abs);
1102            }
1103        }
1104
1105    } while (block && !dirty);
1106
1107    if (block) {
1108        /*
 1109         * As soon as we start servicing pages out of order, we have to
 1110         * kill the bulk stage, since the bulk stage assumes
 1111         * (in migration_bitmap_find_dirty) that every page is dirty,
 1112         * which is no longer true.
1113         */
1114        ram_bulk_stage = false;
1115
1116        /*
1117         * We want the background search to continue from the queued page
1118         * since the guest is likely to want other pages near to the page
1119         * it just requested.
1120         */
1121        pss->block = block;
1122        pss->offset = offset;
1123    }
1124
1125    return !!block;
1126}
1127
1128/**
1129 * flush_page_queue: Flush any remaining pages in the ram request queue
1130 *    it should be empty at the end anyway, but in error cases there may be
1131 *    some left.
1132 *
1133 * ms: MigrationState
1134 */
1135void flush_page_queue(MigrationState *ms)
1136{
1137    struct MigrationSrcPageRequest *mspr, *next_mspr;
 1138    /* This queue should generally be empty - but in the case of a failed
 1139     * migration it may still contain some leftover entries.
1140     */
1141    rcu_read_lock();
1142    QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1143        memory_region_unref(mspr->rb->mr);
1144        QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1145        g_free(mspr);
1146    }
1147    rcu_read_unlock();
1148}
1149
1150/**
1151 * Queue the pages for transmission, e.g. a request from postcopy destination
 1152 *   ms: MigrationState in which the queue is held
1153 *   rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
1154 *   start: Offset from the start of the RAMBlock
1155 *   len: Length (in bytes) to send
1156 *   Return: 0 on success
1157 */
1158int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1159                         ram_addr_t start, ram_addr_t len)
1160{
1161    RAMBlock *ramblock;
1162
1163    ms->postcopy_requests++;
1164    rcu_read_lock();
1165    if (!rbname) {
1166        /* Reuse last RAMBlock */
1167        ramblock = ms->last_req_rb;
1168
1169        if (!ramblock) {
1170            /*
1171             * Shouldn't happen, we can't reuse the last RAMBlock if
1172             * it's the 1st request.
1173             */
1174            error_report("ram_save_queue_pages no previous block");
1175            goto err;
1176        }
1177    } else {
1178        ramblock = qemu_ram_block_by_name(rbname);
1179
1180        if (!ramblock) {
1181            /* We shouldn't be asked for a non-existent RAMBlock */
1182            error_report("ram_save_queue_pages no block '%s'", rbname);
1183            goto err;
1184        }
1185        ms->last_req_rb = ramblock;
1186    }
1187    trace_ram_save_queue_pages(ramblock->idstr, start, len);
 1188    if (start + len > ramblock->used_length) {
1189        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1190                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1191                     __func__, start, len, ramblock->used_length);
1192        goto err;
1193    }
1194
1195    struct MigrationSrcPageRequest *new_entry =
1196        g_malloc0(sizeof(struct MigrationSrcPageRequest));
1197    new_entry->rb = ramblock;
1198    new_entry->offset = start;
1199    new_entry->len = len;
1200
1201    memory_region_ref(ramblock->mr);
1202    qemu_mutex_lock(&ms->src_page_req_mutex);
1203    QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1204    qemu_mutex_unlock(&ms->src_page_req_mutex);
1205    rcu_read_unlock();
1206
1207    return 0;
1208
1209err:
1210    rcu_read_unlock();
1211    return -1;
1212}
1213
1214/**
1215 * ram_save_target_page: Save one target page
1216 *
1217 *
 1218 * @ms: The current migration state
 1219 * @f: QEMUFile where to send the data
 1220 * @pss: Data about the page we want to send (block and offset within it)
1221 * @last_stage: if we are at the completion stage
1222 * @bytes_transferred: increase it with the number of transferred bytes
1223 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1224 *
1225 * Returns: Number of pages written.
1226 */
1227static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
1228                                PageSearchStatus *pss,
1229                                bool last_stage,
1230                                uint64_t *bytes_transferred,
1231                                ram_addr_t dirty_ram_abs)
1232{
1233    int res = 0;
1234
 1235    /* Check if the page is dirty and, if it is, send it */
1236    if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1237        unsigned long *unsentmap;
1238        if (compression_switch && migrate_use_compression()) {
1239            res = ram_save_compressed_page(f, pss,
1240                                           last_stage,
1241                                           bytes_transferred);
1242        } else {
1243            res = ram_save_page(f, pss, last_stage,
1244                                bytes_transferred);
1245        }
1246
1247        if (res < 0) {
1248            return res;
1249        }
1250        unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1251        if (unsentmap) {
1252            clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1253        }
1254        /* Only update last_sent_block if a block was actually sent; xbzrle
1255         * might have decided the page was identical so didn't bother writing
1256         * to the stream.
1257         */
1258        if (res > 0) {
1259            last_sent_block = pss->block;
1260        }
1261    }
1262
1263    return res;
1264}
1265
1266/**
1267 * ram_save_host_page: Starting at *offset send pages up to the end
1268 *                     of the current host page.  It's valid for the initial
1269 *                     offset to point into the middle of a host page
 1270 *                     in which case the remainder of the host page is sent.
1271 *                     Only dirty target pages are sent.
1272 *
1273 * Returns: Number of pages written.
1274 *
 1275 * @ms: The current migration state
 1276 * @f: QEMUFile where to send the data
 1277 * @pss: Data about the page we want to send; pss->offset is updated to the
 1278 *       last target page sent
1279 * @last_stage: if we are at the completion stage
1280 * @bytes_transferred: increase it with the number of transferred bytes
1281 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1282 */
1283static int ram_save_host_page(MigrationState *ms, QEMUFile *f,
1284                              PageSearchStatus *pss,
1285                              bool last_stage,
1286                              uint64_t *bytes_transferred,
1287                              ram_addr_t dirty_ram_abs)
1288{
1289    int tmppages, pages = 0;
1290    do {
1291        tmppages = ram_save_target_page(ms, f, pss, last_stage,
1292                                        bytes_transferred, dirty_ram_abs);
1293        if (tmppages < 0) {
1294            return tmppages;
1295        }
1296
1297        pages += tmppages;
1298        pss->offset += TARGET_PAGE_SIZE;
1299        dirty_ram_abs += TARGET_PAGE_SIZE;
1300    } while (pss->offset & (qemu_host_page_size - 1));
1301
1302    /* The offset we leave with is the last one we looked at */
1303    pss->offset -= TARGET_PAGE_SIZE;
1304    return pages;
1305}
1306
1307/**
1308 * ram_find_and_save_block: Finds a dirty page and sends it to f
1309 *
1310 * Called within an RCU critical section.
1311 *
1312 * Returns:  The number of pages written
1313 *           0 means no dirty pages
1314 *
1315 * @f: QEMUFile where to send the data
1316 * @last_stage: if we are at the completion stage
1317 * @bytes_transferred: increase it with the number of transferred bytes
1318 *
1319 * On systems where host-page-size > target-page-size it will send all the
1320 * pages in a host page that are dirty.
1321 */
1322
1323static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1324                                   uint64_t *bytes_transferred)
1325{
1326    PageSearchStatus pss;
1327    MigrationState *ms = migrate_get_current();
1328    int pages = 0;
1329    bool again, found;
1330    ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1331                                 ram_addr_t space */
1332
1333    pss.block = last_seen_block;
1334    pss.offset = last_offset;
1335    pss.complete_round = false;
1336
1337    if (!pss.block) {
1338        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1339    }
1340
1341    do {
1342        again = true;
1343        found = get_queued_page(ms, &pss, &dirty_ram_abs);
1344
1345        if (!found) {
1346            /* priority queue empty, so just search for something dirty */
1347            found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
1348        }
1349
1350        if (found) {
1351            pages = ram_save_host_page(ms, f, &pss,
1352                                       last_stage, bytes_transferred,
1353                                       dirty_ram_abs);
1354        }
1355    } while (!pages && again);
1356
1357    last_seen_block = pss.block;
1358    last_offset = pss.offset;
1359
1360    return pages;
1361}
1362
1363void acct_update_position(QEMUFile *f, size_t size, bool zero)
1364{
1365    uint64_t pages = size / TARGET_PAGE_SIZE;
1366    if (zero) {
1367        acct_info.dup_pages += pages;
1368    } else {
1369        acct_info.norm_pages += pages;
1370        bytes_transferred += size;
1371        qemu_update_position(f, size);
1372    }
1373}
1374
1375static ram_addr_t ram_save_remaining(void)
1376{
1377    return migration_dirty_pages;
1378}
1379
1380uint64_t ram_bytes_remaining(void)
1381{
1382    return ram_save_remaining() * TARGET_PAGE_SIZE;
1383}
1384
1385uint64_t ram_bytes_transferred(void)
1386{
1387    return bytes_transferred;
1388}
1389
1390uint64_t ram_bytes_total(void)
1391{
1392    RAMBlock *block;
1393    uint64_t total = 0;
1394
1395    rcu_read_lock();
1396    QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1397        total += block->used_length;
1398    rcu_read_unlock();
1399    return total;
1400}
1401
1402void free_xbzrle_decoded_buf(void)
1403{
1404    g_free(xbzrle_decoded_buf);
1405    xbzrle_decoded_buf = NULL;
1406}
1407
1408static void migration_bitmap_free(struct BitmapRcu *bmap)
1409{
1410    g_free(bmap->bmap);
1411    g_free(bmap->unsentmap);
1412    g_free(bmap);
1413}
1414
1415static void ram_migration_cleanup(void *opaque)
1416{
 1417    /* The caller holds the iothread lock or is in a bottom half, so there
 1418     * is no race writing to this migration_bitmap
1419     */
1420    struct BitmapRcu *bitmap = migration_bitmap_rcu;
1421    atomic_rcu_set(&migration_bitmap_rcu, NULL);
1422    if (bitmap) {
1423        memory_global_dirty_log_stop();
1424        call_rcu(bitmap, migration_bitmap_free, rcu);
1425    }
1426
1427    XBZRLE_cache_lock();
1428    if (XBZRLE.cache) {
1429        cache_fini(XBZRLE.cache);
1430        g_free(XBZRLE.encoded_buf);
1431        g_free(XBZRLE.current_buf);
1432        XBZRLE.cache = NULL;
1433        XBZRLE.encoded_buf = NULL;
1434        XBZRLE.current_buf = NULL;
1435    }
1436    XBZRLE_cache_unlock();
1437}
1438
1439static void reset_ram_globals(void)
1440{
1441    last_seen_block = NULL;
1442    last_sent_block = NULL;
1443    last_offset = 0;
1444    last_version = ram_list.version;
1445    ram_bulk_stage = true;
1446}
1447
1448#define MAX_WAIT 50 /* ms, half buffered_file limit */
1449
1450void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1451{
 1452    /* called in the qemu main thread, so there is
 1453     * no race writing to this migration_bitmap
1454     */
1455    if (migration_bitmap_rcu) {
1456        struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1457        bitmap = g_new(struct BitmapRcu, 1);
1458        bitmap->bmap = bitmap_new(new);
1459
 1460        /* prevent bits in migration_bitmap from being set
 1461         * by migration_bitmap_sync_range() at the same time.
 1462         * It is safe for migration if migration_bitmap bits are cleared
 1463         * at the same time.
1464         */
1465        qemu_mutex_lock(&migration_bitmap_mutex);
1466        bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1467        bitmap_set(bitmap->bmap, old, new - old);
1468
 1469        /* We don't have a way to safely extend the unsentmap
 1470         * with RCU; so mark it as missing, and entry to postcopy
1471         * will fail.
1472         */
1473        bitmap->unsentmap = NULL;
1474
1475        atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1476        qemu_mutex_unlock(&migration_bitmap_mutex);
1477        migration_dirty_pages += new - old;
1478        call_rcu(old_bitmap, migration_bitmap_free, rcu);
1479    }
1480}
1481
1482/*
1483 * 'expected' is the value you expect the bitmap mostly to be full
1484 * of; it won't bother printing lines that are all this value.
1485 * If 'todump' is null the migration bitmap is dumped.
1486 */
1487void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1488{
1489    int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1490
1491    int64_t cur;
1492    int64_t linelen = 128;
1493    char linebuf[129];
1494
1495    if (!todump) {
1496        todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1497    }
1498
1499    for (cur = 0; cur < ram_pages; cur += linelen) {
1500        int64_t curb;
1501        bool found = false;
1502        /*
1503         * Last line; catch the case where the line length
1504         * is longer than remaining ram
1505         */
1506        if (cur + linelen > ram_pages) {
1507            linelen = ram_pages - cur;
1508        }
1509        for (curb = 0; curb < linelen; curb++) {
1510            bool thisbit = test_bit(cur + curb, todump);
1511            linebuf[curb] = thisbit ? '1' : '.';
1512            found = found || (thisbit != expected);
1513        }
1514        if (found) {
1515            linebuf[curb] = '\0';
1516            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1517        }
1518    }
1519}
1520
1521/* **** functions for postcopy ***** */
1522
1523/*
1524 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1525 * Note: At this point the 'unsentmap' is the processed bitmap combined
1526 *       with the dirtymap; so a '1' means it's either dirty or unsent.
1527 * start,length: Indexes into the bitmap for the first bit
1528 *            representing the named block and length in target-pages
1529 */
1530static int postcopy_send_discard_bm_ram(MigrationState *ms,
1531                                        PostcopyDiscardState *pds,
1532                                        unsigned long start,
1533                                        unsigned long length)
1534{
1535    unsigned long end = start + length; /* one after the end */
1536    unsigned long current;
1537    unsigned long *unsentmap;
1538
1539    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1540    for (current = start; current < end; ) {
1541        unsigned long one = find_next_bit(unsentmap, end, current);
1542
1543        if (one <= end) {
1544            unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1545            unsigned long discard_length;
1546
1547            if (zero >= end) {
1548                discard_length = end - one;
1549            } else {
1550                discard_length = zero - one;
1551            }
1552            if (discard_length) {
1553                postcopy_discard_send_range(ms, pds, one, discard_length);
1554            }
1555            current = one + discard_length;
1556        } else {
1557            current = one;
1558        }
1559    }
1560
1561    return 0;
1562}
1563
1564/*
1565 * Utility for the outgoing postcopy code.
1566 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1567 *   passing it bitmap indexes and name.
1568 * Returns: 0 on success
1569 * (qemu_ram_foreach_block ends up passing unscaled lengths
1570 *  which would mean postcopy code would have to deal with target page)
1571 */
1572static int postcopy_each_ram_send_discard(MigrationState *ms)
1573{
1574    struct RAMBlock *block;
1575    int ret;
1576
1577    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1578        unsigned long first = block->offset >> TARGET_PAGE_BITS;
1579        PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1580                                                               first,
1581                                                               block->idstr);
1582
1583        /*
1584         * Postcopy sends chunks of bitmap over the wire, but it
1585         * just needs indexes at this point, avoids it having
1586         * target page specific code.
1587         */
1588        ret = postcopy_send_discard_bm_ram(ms, pds, first,
1589                                    block->used_length >> TARGET_PAGE_BITS);
1590        postcopy_discard_send_finish(ms, pds);
1591        if (ret) {
1592            return ret;
1593        }
1594    }
1595
1596    return 0;
1597}
1598
1599/*
 1600 * Helper for postcopy_chunk_hostpages; it's called twice to clean up
 1601 *   the two bitmaps, which are similar, but one is inverted.
1602 *
1603 * We search for runs of target-pages that don't start or end on a
1604 * host page boundary;
1605 * unsent_pass=true: Cleans up partially unsent host pages by searching
1606 *                 the unsentmap
1607 * unsent_pass=false: Cleans up partially dirty host pages by searching
1608 *                 the main migration bitmap
1609 *
1610 */
1611static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1612                                          RAMBlock *block,
1613                                          PostcopyDiscardState *pds)
1614{
1615    unsigned long *bitmap;
1616    unsigned long *unsentmap;
1617    unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
1618    unsigned long first = block->offset >> TARGET_PAGE_BITS;
1619    unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1620    unsigned long last = first + (len - 1);
1621    unsigned long run_start;
1622
1623    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1624    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1625
1626    if (unsent_pass) {
1627        /* Find a sent page */
1628        run_start = find_next_zero_bit(unsentmap, last + 1, first);
1629    } else {
1630        /* Find a dirty page */
1631        run_start = find_next_bit(bitmap, last + 1, first);
1632    }
1633
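    /*
     * Worked example, assuming host_ratio == 4 (e.g. 16KiB host pages made
     * of 4KiB target pages): a run starting at target-page index 6 begins in
     * the middle of host page [4..7], so the fixup below covers that whole
     * host page, re-marking all four of its target pages as dirty and unsent
     * and (where required) sending a discard for it.
     */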
1634    while (run_start <= last) {
1635        bool do_fixup = false;
1636        unsigned long fixup_start_addr;
1637        unsigned long host_offset;
1638
1639        /*
1640         * If the start of this run of pages is in the middle of a host
1641         * page, then we need to fixup this host page.
1642         */
1643        host_offset = run_start % host_ratio;
1644        if (host_offset) {
1645            do_fixup = true;
1646            run_start -= host_offset;
1647            fixup_start_addr = run_start;
1648            /* For the next pass */
1649            run_start = run_start + host_ratio;
1650        } else {
1651            /* Find the end of this run */
1652            unsigned long run_end;
1653            if (unsent_pass) {
1654                run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1655            } else {
1656                run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1657            }
1658            /*
1659             * If the end isn't at the start of a host page, then the
1660             * run doesn't finish at the end of a host page
1661             * and we need to discard.
1662             */
1663            host_offset = run_end % host_ratio;
1664            if (host_offset) {
1665                do_fixup = true;
1666                fixup_start_addr = run_end - host_offset;
1667                /*
1668                 * This host page has gone, the next loop iteration starts
1669                 * from after the fixup
1670                 */
1671                run_start = fixup_start_addr + host_ratio;
1672            } else {
1673                /*
1674                 * No discards on this iteration, next loop starts from
1675                 * next sent/dirty page
1676                 */
1677                run_start = run_end + 1;
1678            }
1679        }
1680
1681        if (do_fixup) {
1682            unsigned long page;
1683
1684            /* Tell the destination to discard this page */
1685            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1686                /* For the unsent_pass we:
1687                 *     discard partially sent pages
1688                 * For the !unsent_pass (dirty) we:
1689                 *     discard partially dirty pages that were sent
1690                 *     (any partially sent pages were already discarded
1691                 *     by the previous unsent_pass)
1692                 */
1693                postcopy_discard_send_range(ms, pds, fixup_start_addr,
1694                                            host_ratio);
1695            }
1696
1697            /* Clean up the bitmap */
1698            for (page = fixup_start_addr;
1699                 page < fixup_start_addr + host_ratio; page++) {
1700                /* All pages in this host page are now not sent */
1701                set_bit(page, unsentmap);
1702
1703                /*
1704                 * Remark them as dirty, updating the count for any pages
1705                 * that weren't previously dirty.
1706                 */
1707                migration_dirty_pages += !test_and_set_bit(page, bitmap);
1708            }
1709        }
1710
1711        if (unsent_pass) {
1712            /* Find the next sent page for the next iteration */
1713            run_start = find_next_zero_bit(unsentmap, last + 1,
1714                                           run_start);
1715        } else {
1716            /* Find the next dirty page for the next iteration */
1717            run_start = find_next_bit(bitmap, last + 1, run_start);
1718        }
1719    }
1720}
1721
1722/*
1723 * Utility for the outgoing postcopy code.
1724 *
1725 * Discard any partially sent host-page size chunks, mark any partially
1726 * dirty host-page size chunks as all dirty.
1727 *
1728 * Returns: 0 on success
1729 */
1730static int postcopy_chunk_hostpages(MigrationState *ms)
1731{
1732    struct RAMBlock *block;
1733
1734    if (qemu_host_page_size == TARGET_PAGE_SIZE) {
1735        /* Easy case - TPS==HPS - nothing to be done */
1736        return 0;
1737    }
1738
1739    /* Easiest way to make sure we don't resume in the middle of a host-page */
1740    last_seen_block = NULL;
1741    last_sent_block = NULL;
1742    last_offset     = 0;
1743
1744    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1745        unsigned long first = block->offset >> TARGET_PAGE_BITS;
1746
1747        PostcopyDiscardState *pds =
1748                         postcopy_discard_send_init(ms, first, block->idstr);
1749
1750        /* First pass: Discard all partially sent host pages */
1751        postcopy_chunk_hostpages_pass(ms, true, block, pds);
1752        /*
1753         * Second pass: Ensure that all partially dirty host pages are made
1754         * fully dirty.
1755         */
1756        postcopy_chunk_hostpages_pass(ms, false, block, pds);
1757
1758        postcopy_discard_send_finish(ms, pds);
1759    } /* ram_list loop */
1760
1761    return 0;
1762}
1763
1764/*
1765 * Transmit the set of pages to be discarded after precopy to the target;
1766 * these are pages that:
1767 *     a) have been previously transmitted but are now dirty again
1768 *     b) have never been transmitted; this ensures that any pages on the
1769 *        destination that have been mapped by background tasks get
1770 *        discarded (transparent huge pages are the specific concern)
1771 * Hopefully this set is pretty sparse.
1772 */
1773int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1774{
1775    int ret;
1776    unsigned long *bitmap, *unsentmap;
1777
1778    rcu_read_lock();
1779
1780    /* This should be our last sync, the src is now paused */
1781    migration_bitmap_sync();
1782
1783    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1784    if (!unsentmap) {
1785        /* We don't have a safe way to resize the unsentmap, so
1786         * if the bitmap was resized it will be NULL at this
1787         * point.
1788         */
1789        error_report("migration ram resized during precopy phase");
1790        rcu_read_unlock();
1791        return -EINVAL;
1792    }
1793
1794    /* Deal with TPS != HPS */
1795    ret = postcopy_chunk_hostpages(ms);
1796    if (ret) {
1797        rcu_read_unlock();
1798        return ret;
1799    }
1800
1801    /*
1802     * Update the unsentmap to be unsentmap = unsentmap | dirty
1803     */
1804    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1805    bitmap_or(unsentmap, unsentmap, bitmap,
1806               last_ram_offset() >> TARGET_PAGE_BITS);
1807
1808
1809    trace_ram_postcopy_send_discard_bitmap();
1810#ifdef DEBUG_POSTCOPY
1811    ram_debug_dump_bitmap(unsentmap, true);
1812#endif
1813
1814    ret = postcopy_each_ram_send_discard(ms);
1815    rcu_read_unlock();
1816
1817    return ret;
1818}
1819
1820/*
1821 * At the start of the postcopy phase of migration, any now-dirty
1822 * precopied pages are discarded.
1823 *
1824 * start, length describe a byte address range within the RAMBlock
1825 *
1826 * Returns 0 on success.
1827 */
1828int ram_discard_range(MigrationIncomingState *mis,
1829                      const char *block_name,
1830                      uint64_t start, size_t length)
1831{
1832    int ret = -1;
1833
1834    rcu_read_lock();
1835    RAMBlock *rb = qemu_ram_block_by_name(block_name);
1836
1837    if (!rb) {
1838        error_report("ram_discard_range: Failed to find block '%s'",
1839                     block_name);
1840        goto err;
1841    }
1842
1843    uint8_t *host_startaddr = rb->host + start;
1844
1845    if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
1846        error_report("ram_discard_range: Unaligned start address: %p",
1847                     host_startaddr);
1848        goto err;
1849    }
1850
1851    if ((start + length) <= rb->used_length) {
1852        uint8_t *host_endaddr = host_startaddr + length;
1853        if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
1854            error_report("ram_discard_range: Unaligned end address: %p",
1855                         host_endaddr);
1856            goto err;
1857        }
1858        ret = postcopy_ram_discard_range(mis, host_startaddr, length);
1859    } else {
1860        error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
1861                     "/%zx/" RAM_ADDR_FMT")",
1862                     block_name, start, length, rb->used_length);
1863    }
1864
1865err:
1866    rcu_read_unlock();
1867
1868    return ret;
1869}
1870
1871
1872/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
1873 * a long-running RCU critical section.  When RCU reclaims in the code
1874 * start to become numerous it will be necessary to reduce the
1875 * granularity of these critical sections.
1876 */
1877
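/*
 * Set up RAM migration: initialise the XBZRLE cache and buffers when XBZRLE
 * is in use, allocate the dirty bitmap (plus the unsentmap for postcopy),
 * start dirty logging and write the RAM block list to the stream.
 */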
1878static int ram_save_setup(QEMUFile *f, void *opaque)
1879{
1880    RAMBlock *block;
1881    int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1882
1883    dirty_rate_high_cnt = 0;
1884    bitmap_sync_count = 0;
1885    migration_bitmap_sync_init();
1886    qemu_mutex_init(&migration_bitmap_mutex);
1887
1888    if (migrate_use_xbzrle()) {
1889        XBZRLE_cache_lock();
1890        XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1891                                  TARGET_PAGE_SIZE,
1892                                  TARGET_PAGE_SIZE);
1893        if (!XBZRLE.cache) {
1894            XBZRLE_cache_unlock();
1895            error_report("Error creating cache");
1896            return -1;
1897        }
1898        XBZRLE_cache_unlock();
1899
1900        /* We prefer not to abort if there is no memory */
1901        XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1902        if (!XBZRLE.encoded_buf) {
1903            error_report("Error allocating encoded_buf");
1904            return -1;
1905        }
1906
1907        XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1908        if (!XBZRLE.current_buf) {
1909            error_report("Error allocating current_buf");
1910            g_free(XBZRLE.encoded_buf);
1911            XBZRLE.encoded_buf = NULL;
1912            return -1;
1913        }
1914
1915        acct_clear();
1916    }
1917
1918    /* For memory_global_dirty_log_start below.  */
1919    qemu_mutex_lock_iothread();
1920
1921    qemu_mutex_lock_ramlist();
1922    rcu_read_lock();
1923    bytes_transferred = 0;
1924    reset_ram_globals();
1925
1926    ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1927    migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
1928    migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
1929    bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
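    /* Every page starts out marked dirty, so the first pass sends all of RAM */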
1930
1931    if (migrate_postcopy_ram()) {
1932        migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
1933        bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
1934    }
1935
1936    /*
1937     * Count the total number of pages used by ram blocks not including any
1938     * gaps due to alignment or unplugs.
1939     */
1940    migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1941
1942    memory_global_dirty_log_start();
1943    migration_bitmap_sync();
1944    qemu_mutex_unlock_ramlist();
1945    qemu_mutex_unlock_iothread();
1946
1947    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1948
1949    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1950        qemu_put_byte(f, strlen(block->idstr));
1951        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1952        qemu_put_be64(f, block->used_length);
1953    }
1954
1955    rcu_read_unlock();
1956
1957    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1958    ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1959
1960    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1961
1962    return 0;
1963}
1964
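/*
 * Send dirty pages until the bandwidth limit is hit, MAX_WAIT milliseconds
 * have elapsed, or there are no dirty pages left this round.  Returns the
 * number of pages sent, or a negative error from the stream.
 */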
1965static int ram_save_iterate(QEMUFile *f, void *opaque)
1966{
1967    int ret;
1968    int i;
1969    int64_t t0;
1970    int pages_sent = 0;
1971
1972    rcu_read_lock();
1973    if (ram_list.version != last_version) {
1974        reset_ram_globals();
1975    }
1976
1977    /* Read version before ram_list.blocks */
1978    smp_rmb();
1979
1980    ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1981
1982    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1983    i = 0;
1984    while ((ret = qemu_file_rate_limit(f)) == 0) {
1985        int pages;
1986
1987        pages = ram_find_and_save_block(f, false, &bytes_transferred);
1988        /* no more pages to send */
1989        if (pages == 0) {
1990            break;
1991        }
1992        pages_sent += pages;
1993        acct_info.iterations++;
1994
1995        /* We want to check on the first iteration, just in case it was
1996           the first time and we had to sync the dirty bitmap.
1997           qemu_clock_get_ns() is a bit expensive, so we only check once
1998           every 64 iterations.
1999        */
2000        if ((i & 63) == 0) {
2001            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2002            if (t1 > MAX_WAIT) {
2003                DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
2004                        t1, i);
2005                break;
2006            }
2007        }
2008        i++;
2009    }
2010    flush_compressed_data(f);
2011    rcu_read_unlock();
2012
2013    /*
2014     * Must occur before EOS (or any QEMUFile operation)
2015     * because of RDMA protocol.
2016     */
2017    ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2018
2019    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2020    bytes_transferred += 8;
2021
2022    ret = qemu_file_get_error(f);
2023    if (ret < 0) {
2024        return ret;
2025    }
2026
2027    return pages_sent;
2028}
2029
2030/* Called with iothread lock */
2031static int ram_save_complete(QEMUFile *f, void *opaque)
2032{
2033    rcu_read_lock();
2034
2035    if (!migration_in_postcopy(migrate_get_current())) {
2036        migration_bitmap_sync();
2037    }
2038
2039    ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2040
2041    /* try transferring iterative blocks of memory */
2042
2043    /* flush all remaining blocks regardless of rate limiting */
2044    while (true) {
2045        int pages;
2046
2047        pages = ram_find_and_save_block(f, true, &bytes_transferred);
2048        /* no more blocks to send */
2049        if (pages == 0) {
2050            break;
2051        }
2052    }
2053
2054    flush_compressed_data(f);
2055    ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2056
2057    rcu_read_unlock();
2058
2059    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2060
2061    return 0;
2062}
2063
2064static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2065                             uint64_t *non_postcopiable_pending,
2066                             uint64_t *postcopiable_pending)
2067{
2068    uint64_t remaining_size;
2069
2070    remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2071
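    /*
     * If we're not yet in postcopy and what's left is already below the
     * threshold the migration core gave us, do one more bitmap sync (under
     * the iothread lock) so the estimate we report back is accurate.
     */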
2072    if (!migration_in_postcopy(migrate_get_current()) &&
2073        remaining_size < max_size) {
2074        qemu_mutex_lock_iothread();
2075        rcu_read_lock();
2076        migration_bitmap_sync();
2077        rcu_read_unlock();
2078        qemu_mutex_unlock_iothread();
2079        remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2080    }
2081
2082    /* We can do postcopy, and all the data is postcopiable */
2083    *postcopiable_pending += remaining_size;
2084}
2085
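/*
 * Load and apply one XBZRLE-encoded page.  The wire format is a flag byte
 * (which must be ENCODING_FLAG_XBZRLE), a big-endian 16-bit encoded length,
 * then the encoded data, which is decoded on top of the current contents
 * of 'host'.  Returns 0 on success, -1 on error.
 */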
2086static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2087{
2088    unsigned int xh_len;
2089    int xh_flags;
2090    uint8_t *loaded_data;
2091
2092    if (!xbzrle_decoded_buf) {
2093        xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2094    }
2095    loaded_data = xbzrle_decoded_buf;
2096
2097    /* extract RLE header */
2098    xh_flags = qemu_get_byte(f);
2099    xh_len = qemu_get_be16(f);
2100
2101    if (xh_flags != ENCODING_FLAG_XBZRLE) {
2102        error_report("Failed to load XBZRLE page - wrong compression!");
2103        return -1;
2104    }
2105
2106    if (xh_len > TARGET_PAGE_SIZE) {
2107        error_report("Failed to load XBZRLE page - len overflow!");
2108        return -1;
2109    }
2110    /* load data and decode */
2111    qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2112
2113    /* decode RLE */
2114    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2115                             TARGET_PAGE_SIZE) == -1) {
2116        error_report("Failed to load XBZRLE page - decode error!");
2117        return -1;
2118    }
2119
2120    return 0;
2121}
2122
2123/*
2124 * Read a RAMBlock ID from the stream f and return the matching RAMBlock.
2125 *
2126 * Must be called from within an RCU critical section; returns a pointer
2127 * from within the RCU-protected ram_list.
2128 *
2129 * f: Stream to read from
2130 * flags: Page flags (mostly to see if it's a continuation of previous block)
2131 */
2132static inline RAMBlock *ram_block_from_stream(QEMUFile *f,
2133                                              int flags)
2134{
2135    static RAMBlock *block = NULL;
2136    char id[256];
2137    uint8_t len;
2138
2139    if (flags & RAM_SAVE_FLAG_CONTINUE) {
2140        if (!block) {
2141            error_report("Ack, bad migration stream!");
2142            return NULL;
2143        }
2144        return block;
2145    }
2146
2147    len = qemu_get_byte(f);
2148    qemu_get_buffer(f, (uint8_t *)id, len);
2149    id[len] = 0;
2150
2151    block = qemu_ram_block_by_name(id);
2152    if (!block) {
2153        error_report("Can't find block %s", id);
2154        return NULL;
2155    }
2156
2157    return block;
2158}
2159
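/*
 * Return the host pointer for the given offset within the block, or NULL
 * if the offset lies outside the block's used length.
 */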
2160static inline void *host_from_ram_block_offset(RAMBlock *block,
2161                                               ram_addr_t offset)
2162{
2163    if (!offset_in_ramblock(block, offset)) {
2164        return NULL;
2165    }
2166
2167    return block->host + offset;
2168}
2169
2170/*
2171 * If a page (or a whole RDMA chunk) has been
2172 * determined to be zero, then zap it.
2173 */
2174void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2175{
2176    if (ch != 0 || !is_zero_range(host, size)) {
2177        memset(host, ch, size);
2178    }
2179}
2180
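/*
 * Worker loop for a decompression thread: wait until a page is handed over
 * via param->des/param->len, inflate it directly into guest memory, then
 * signal completion on decomp_done_cond and wait for more work.
 */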
2181static void *do_data_decompress(void *opaque)
2182{
2183    DecompressParam *param = opaque;
2184    unsigned long pagesize;
2185    uint8_t *des;
2186    int len;
2187
2188    qemu_mutex_lock(&param->mutex);
2189    while (!param->quit) {
2190        if (param->des) {
2191            des = param->des;
2192            len = param->len;
2193            param->des = 0;
2194            qemu_mutex_unlock(&param->mutex);
2195
2196            pagesize = TARGET_PAGE_SIZE;
2197            /* uncompress() can fail in some cases, especially when the
2198             * page was dirtied while it was being compressed.  That's not
2199             * a problem because the dirty page will be retransmitted and
2200             * uncompress() won't corrupt the data in other pages.
2201             */
2202            uncompress((Bytef *)des, &pagesize,
2203                       (const Bytef *)param->compbuf, len);
2204
2205            qemu_mutex_lock(&decomp_done_lock);
2206            param->done = true;
2207            qemu_cond_signal(&decomp_done_cond);
2208            qemu_mutex_unlock(&decomp_done_lock);
2209
2210            qemu_mutex_lock(&param->mutex);
2211        } else {
2212            qemu_cond_wait(&param->cond, &param->mutex);
2213        }
2214    }
2215    qemu_mutex_unlock(&param->mutex);
2216
2217    return NULL;
2218}
2219
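/* Block until every decompression thread has finished its outstanding page */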
2220static void wait_for_decompress_done(void)
2221{
2222    int idx, thread_count;
2223
2224    if (!migrate_use_compression()) {
2225        return;
2226    }
2227
2228    thread_count = migrate_decompress_threads();
2229    qemu_mutex_lock(&decomp_done_lock);
2230    for (idx = 0; idx < thread_count; idx++) {
2231        while (!decomp_param[idx].done) {
2232            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2233        }
2234    }
2235    qemu_mutex_unlock(&decomp_done_lock);
2236}
2237
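/* Create the decompression worker threads and their synchronisation state */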
2238void migrate_decompress_threads_create(void)
2239{
2240    int i, thread_count;
2241
2242    thread_count = migrate_decompress_threads();
2243    decompress_threads = g_new0(QemuThread, thread_count);
2244    decomp_param = g_new0(DecompressParam, thread_count);
2245    qemu_mutex_init(&decomp_done_lock);
2246    qemu_cond_init(&decomp_done_cond);
2247    for (i = 0; i < thread_count; i++) {
2248        qemu_mutex_init(&decomp_param[i].mutex);
2249        qemu_cond_init(&decomp_param[i].cond);
2250        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2251        decomp_param[i].done = true;
2252        decomp_param[i].quit = false;
2253        qemu_thread_create(decompress_threads + i, "decompress",
2254                           do_data_decompress, decomp_param + i,
2255                           QEMU_THREAD_JOINABLE);
2256    }
2257}
2258
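/* Ask each decompression thread to quit, then join it and free its state */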
2259void migrate_decompress_threads_join(void)
2260{
2261    int i, thread_count;
2262
2263    thread_count = migrate_decompress_threads();
2264    for (i = 0; i < thread_count; i++) {
2265        qemu_mutex_lock(&decomp_param[i].mutex);
2266        decomp_param[i].quit = true;
2267        qemu_cond_signal(&decomp_param[i].cond);
2268        qemu_mutex_unlock(&decomp_param[i].mutex);
2269    }
2270    for (i = 0; i < thread_count; i++) {
2271        qemu_thread_join(decompress_threads + i);
2272        qemu_mutex_destroy(&decomp_param[i].mutex);
2273        qemu_cond_destroy(&decomp_param[i].cond);
2274        g_free(decomp_param[i].compbuf);
2275    }
2276    g_free(decompress_threads);
2277    g_free(decomp_param);
2278    decompress_threads = NULL;
2279    decomp_param = NULL;
2280}
2281
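/*
 * Hand a compressed page off to an idle decompression thread, waiting on
 * decomp_done_cond until one becomes free.
 */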
2282static void decompress_data_with_multi_threads(QEMUFile *f,
2283                                               void *host, int len)
2284{
2285    int idx, thread_count;
2286
2287    thread_count = migrate_decompress_threads();
2288    qemu_mutex_lock(&decomp_done_lock);
2289    while (true) {
2290        for (idx = 0; idx < thread_count; idx++) {
2291            if (decomp_param[idx].done) {
2292                decomp_param[idx].done = false;
2293                qemu_mutex_lock(&decomp_param[idx].mutex);
2294                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2295                decomp_param[idx].des = host;
2296                decomp_param[idx].len = len;
2297                qemu_cond_signal(&decomp_param[idx].cond);
2298                qemu_mutex_unlock(&decomp_param[idx].mutex);
2299                break;
2300            }
2301        }
2302        if (idx < thread_count) {
2303            break;
2304        } else {
2305            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2306        }
2307    }
2308    qemu_mutex_unlock(&decomp_done_lock);
2309}
2310
2311/*
2312 * Allocate data structures etc. needed by incoming migration with postcopy-ram;
2313 * postcopy-ram's similarly named postcopy_ram_incoming_init() does the work.
2314 */
2315int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2316{
2317    size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2318
2319    return postcopy_ram_incoming_init(mis, ram_pages);
2320}
2321
2322/*
2323 * Called in postcopy mode by ram_load().
2324 * rcu_read_lock is taken prior to this being called.
2325 */
2326static int ram_load_postcopy(QEMUFile *f)
2327{
2328    int flags = 0, ret = 0;
2329    bool place_needed = false;
2330    bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
2331    MigrationIncomingState *mis = migration_incoming_get_current();
2332    /* Temporary page that is later 'placed' */
2333    void *postcopy_host_page = postcopy_get_tmp_page(mis);
2334    void *last_host = NULL;
2335    bool all_zero = false;
2336
2337    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2338        ram_addr_t addr;
2339        void *host = NULL;
2340        void *page_buffer = NULL;
2341        void *place_source = NULL;
2342        uint8_t ch;
2343
2344        addr = qemu_get_be64(f);
2345        flags = addr & ~TARGET_PAGE_MASK;
2346        addr &= TARGET_PAGE_MASK;
2347
2348        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2349        place_needed = false;
2350        if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2351            RAMBlock *block = ram_block_from_stream(f, flags);
2352
2353            host = host_from_ram_block_offset(block, addr);
2354            if (!host) {
2355                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2356                ret = -EINVAL;
2357                break;
2358            }
2359            /*
2360             * Postcopy requires that we place whole host pages atomically;
2361             * to make it atomic, the data is read into a temporary page
2362             * that's moved into place later.
2363             * The migration protocol uses (possibly smaller) target pages;
2364             * however, the source ensures it always sends all the components
2365             * of a host page in order.
2366             */
2367            page_buffer = postcopy_host_page +
2368                          ((uintptr_t)host & ~qemu_host_page_mask);
2369            /* If all target pages turn out zero we can optimise the placement */
2370            if (!((uintptr_t)host & ~qemu_host_page_mask)) {
2371                all_zero = true;
2372            } else {
2373                /* not the 1st TP within the HP */
2374                if (host != (last_host + TARGET_PAGE_SIZE)) {
2375                    error_report("Non-sequential target page %p/%p",
2376                                  host, last_host);
2377                    ret = -EINVAL;
2378                    break;
2379                }
2380            }
2381
2382
2383            /*
2384             * If it's the last part of a host page then we place the host
2385             * page
2386             */
2387            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2388                                     ~qemu_host_page_mask) == 0;
2389            place_source = postcopy_host_page;
2390        }
2391        last_host = host;
2392
2393        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2394        case RAM_SAVE_FLAG_COMPRESS:
2395            ch = qemu_get_byte(f);
2396            memset(page_buffer, ch, TARGET_PAGE_SIZE);
2397            if (ch) {
2398                all_zero = false;
2399            }
2400            break;
2401
2402        case RAM_SAVE_FLAG_PAGE:
2403            all_zero = false;
2404            if (!place_needed || !matching_page_sizes) {
2405                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2406            } else {
2407                /* Avoid the qemu_file copy, since postcopy is going to
2408                 * copy the page into place later anyway; we can only do
2409                 * this when the read is done in one go (matching page sizes).
2410                 */
2411                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2412                                         TARGET_PAGE_SIZE);
2413            }
2414            break;
2415        case RAM_SAVE_FLAG_EOS:
2416            /* normal exit */
2417            break;
2418        default:
2419            error_report("Unknown combination of migration flags: %#x"
2420                         " (postcopy mode)", flags);
2421            ret = -EINVAL;
2422        }
2423
2424        if (place_needed) {
2425            /* This gets called at the last target page in the host page */
2426            if (all_zero) {
2427                ret = postcopy_place_page_zero(mis,
2428                                               host + TARGET_PAGE_SIZE -
2429                                               qemu_host_page_size);
2430            } else {
2431                ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
2432                                               qemu_host_page_size,
2433                                               place_source);
2434            }
2435        }
2436        if (!ret) {
2437            ret = qemu_file_get_error(f);
2438        }
2439    }
2440
2441    return ret;
2442}
2443
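/*
 * Load guest RAM from the migration stream.  Once the destination has
 * entered the postcopy listening state, page loading is delegated to
 * ram_load_postcopy() so that whole host pages can be placed atomically.
 */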
2444static int ram_load(QEMUFile *f, void *opaque, int version_id)
2445{
2446    int flags = 0, ret = 0;
2447    static uint64_t seq_iter;
2448    int len = 0;
2449    /*
2450     * If system is running in postcopy mode, page inserts to host memory must
2451     * be atomic
2452     */
2453    bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2454
2455    seq_iter++;
2456
2457    if (version_id != 4) {
2458        ret = -EINVAL;
2459    }
2460
2461    /* This RCU critical section can be very long running.
2462     * When RCU reclaims in the code start to become numerous,
2463     * it will be necessary to reduce the granularity of this
2464     * critical section.
2465     */
2466    rcu_read_lock();
2467
2468    if (postcopy_running) {
2469        ret = ram_load_postcopy(f);
2470    }
2471
2472    while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2473        ram_addr_t addr, total_ram_bytes;
2474        void *host = NULL;
2475        uint8_t ch;
2476
2477        addr = qemu_get_be64(f);
2478        flags = addr & ~TARGET_PAGE_MASK;
2479        addr &= TARGET_PAGE_MASK;
2480
2481        if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2482                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2483            RAMBlock *block = ram_block_from_stream(f, flags);
2484
2485            host = host_from_ram_block_offset(block, addr);
2486            if (!host) {
2487                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2488                ret = -EINVAL;
2489                break;
2490            }
2491        }
2492
2493        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2494        case RAM_SAVE_FLAG_MEM_SIZE:
2495            /* Synchronize RAM block list */
2496            total_ram_bytes = addr;
2497            while (!ret && total_ram_bytes) {
2498                RAMBlock *block;
2499                char id[256];
2500                ram_addr_t length;
2501
2502                len = qemu_get_byte(f);
2503                qemu_get_buffer(f, (uint8_t *)id, len);
2504                id[len] = 0;
2505                length = qemu_get_be64(f);
2506
2507                block = qemu_ram_block_by_name(id);
2508                if (block) {
2509                    if (length != block->used_length) {
2510                        Error *local_err = NULL;
2511
2512                        ret = qemu_ram_resize(block, length,
2513                                              &local_err);
2514                        if (local_err) {
2515                            error_report_err(local_err);
2516                        }
2517                    }
2518                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2519                                          block->idstr);
2520                } else {
2521                    error_report("Unknown ramblock \"%s\", cannot "
2522                                 "accept migration", id);
2523                    ret = -EINVAL;
2524                }
2525
2526                total_ram_bytes -= length;
2527            }
2528            break;
2529
2530        case RAM_SAVE_FLAG_COMPRESS:
2531            ch = qemu_get_byte(f);
2532            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2533            break;
2534
2535        case RAM_SAVE_FLAG_PAGE:
2536            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2537            break;
2538
2539        case RAM_SAVE_FLAG_COMPRESS_PAGE:
2540            len = qemu_get_be32(f);
2541            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2542                error_report("Invalid compressed data length: %d", len);
2543                ret = -EINVAL;
2544                break;
2545            }
2546            decompress_data_with_multi_threads(f, host, len);
2547            break;
2548
2549        case RAM_SAVE_FLAG_XBZRLE:
2550            if (load_xbzrle(f, addr, host) < 0) {
2551                error_report("Failed to decompress XBZRLE page at "
2552                             RAM_ADDR_FMT, addr);
2553                ret = -EINVAL;
2554                break;
2555            }
2556            break;
2557        case RAM_SAVE_FLAG_EOS:
2558            /* normal exit */
2559            break;
2560        default:
2561            if (flags & RAM_SAVE_FLAG_HOOK) {
2562                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2563            } else {
2564                error_report("Unknown combination of migration flags: %#x",
2565                             flags);
2566                ret = -EINVAL;
2567            }
2568        }
2569        if (!ret) {
2570            ret = qemu_file_get_error(f);
2571        }
2572    }
2573
2574    wait_for_decompress_done();
2575    rcu_read_unlock();
2576    DPRINTF("Completed load of VM with exit code %d seq iteration "
2577            "%" PRIu64 "\n", ret, seq_iter);
2578    return ret;
2579}
2580
2581static SaveVMHandlers savevm_ram_handlers = {
2582    .save_live_setup = ram_save_setup,
2583    .save_live_iterate = ram_save_iterate,
2584    .save_live_complete_postcopy = ram_save_complete,
2585    .save_live_complete_precopy = ram_save_complete,
2586    .save_live_pending = ram_save_pending,
2587    .load_state = ram_load,
2588    .cleanup = ram_migration_cleanup,
2589};
2590
2591void ram_mig_init(void)
2592{
2593    qemu_mutex_init(&XBZRLE.lock);
2594    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
2595}
2596