qemu/migration/ram.c
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2011-2015 Red Hat Inc
   6 *
   7 * Authors:
   8 *  Juan Quintela <quintela@redhat.com>
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28#include "qemu/osdep.h"
  29#include "qemu-common.h"
  30#include "cpu.h"
  31#include <zlib.h>
  32#include "qapi-event.h"
  33#include "qemu/cutils.h"
  34#include "qemu/bitops.h"
  35#include "qemu/bitmap.h"
  36#include "qemu/timer.h"
  37#include "qemu/main-loop.h"
  38#include "migration/migration.h"
  39#include "migration/postcopy-ram.h"
  40#include "exec/address-spaces.h"
  41#include "migration/page_cache.h"
  42#include "qemu/error-report.h"
  43#include "trace.h"
  44#include "exec/ram_addr.h"
  45#include "qemu/rcu_queue.h"
  46#include "migration/colo.h"
  47
  48#ifdef DEBUG_MIGRATION_RAM
  49#define DPRINTF(fmt, ...) \
  50    do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
  51#else
  52#define DPRINTF(fmt, ...) \
  53    do { } while (0)
  54#endif
  55
  56static int dirty_rate_high_cnt;
  57
  58static uint64_t bitmap_sync_count;
  59
  60/***********************************************************/
  61/* ram save/restore */
  62
  63#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  64#define RAM_SAVE_FLAG_COMPRESS 0x02
  65#define RAM_SAVE_FLAG_MEM_SIZE 0x04
  66#define RAM_SAVE_FLAG_PAGE     0x08
  67#define RAM_SAVE_FLAG_EOS      0x10
  68#define RAM_SAVE_FLAG_CONTINUE 0x20
  69#define RAM_SAVE_FLAG_XBZRLE   0x40
   70/* 0x80 is reserved in migration.h; start with 0x100 next */
  71#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  72
  73static uint8_t *ZERO_TARGET_PAGE;
  74
  75static inline bool is_zero_range(uint8_t *p, uint64_t size)
  76{
  77    return buffer_is_zero(p, size);
  78}
  79
  80/* struct contains XBZRLE cache and a static page
  81   used by the compression */
  82static struct {
  83    /* buffer used for XBZRLE encoding */
  84    uint8_t *encoded_buf;
  85    /* buffer for storing page content */
  86    uint8_t *current_buf;
  87    /* Cache for XBZRLE, Protected by lock. */
  88    PageCache *cache;
  89    QemuMutex lock;
  90} XBZRLE;
  91
  92/* buffer used for XBZRLE decoding */
  93static uint8_t *xbzrle_decoded_buf;
  94
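     /* Take/release the XBZRLE cache lock; a no-op when XBZRLE is not in use */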
  95static void XBZRLE_cache_lock(void)
  96{
  97    if (migrate_use_xbzrle())
  98        qemu_mutex_lock(&XBZRLE.lock);
  99}
 100
 101static void XBZRLE_cache_unlock(void)
 102{
 103    if (migrate_use_xbzrle())
 104        qemu_mutex_unlock(&XBZRLE.lock);
 105}
 106
 107/*
  108 * Called from qmp_migrate_set_cache_size in the main thread, possibly while
  109 * a migration is in progress.
  110 * A running migration may be using the cache and might finish during this
  111 * call, hence changes to the cache are protected by XBZRLE.lock.
 112 */
 113int64_t xbzrle_cache_resize(int64_t new_size)
 114{
 115    PageCache *new_cache;
 116    int64_t ret;
 117
 118    if (new_size < TARGET_PAGE_SIZE) {
 119        return -1;
 120    }
 121
 122    XBZRLE_cache_lock();
 123
 124    if (XBZRLE.cache != NULL) {
 125        if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
 126            goto out_new_size;
 127        }
 128        new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
 129                                        TARGET_PAGE_SIZE);
 130        if (!new_cache) {
 131            error_report("Error creating cache");
 132            ret = -1;
 133            goto out;
 134        }
 135
 136        cache_fini(XBZRLE.cache);
 137        XBZRLE.cache = new_cache;
 138    }
 139
 140out_new_size:
 141    ret = pow2floor(new_size);
 142out:
 143    XBZRLE_cache_unlock();
 144    return ret;
 145}
 146
 147/* accounting for migration statistics */
 148typedef struct AccountingInfo {
 149    uint64_t dup_pages;
 150    uint64_t skipped_pages;
 151    uint64_t norm_pages;
 152    uint64_t iterations;
 153    uint64_t xbzrle_bytes;
 154    uint64_t xbzrle_pages;
 155    uint64_t xbzrle_cache_miss;
 156    double xbzrle_cache_miss_rate;
 157    uint64_t xbzrle_overflows;
 158} AccountingInfo;
 159
 160static AccountingInfo acct_info;
 161
 162static void acct_clear(void)
 163{
 164    memset(&acct_info, 0, sizeof(acct_info));
 165}
 166
 167uint64_t dup_mig_bytes_transferred(void)
 168{
 169    return acct_info.dup_pages * TARGET_PAGE_SIZE;
 170}
 171
 172uint64_t dup_mig_pages_transferred(void)
 173{
 174    return acct_info.dup_pages;
 175}
 176
 177uint64_t skipped_mig_bytes_transferred(void)
 178{
 179    return acct_info.skipped_pages * TARGET_PAGE_SIZE;
 180}
 181
 182uint64_t skipped_mig_pages_transferred(void)
 183{
 184    return acct_info.skipped_pages;
 185}
 186
 187uint64_t norm_mig_bytes_transferred(void)
 188{
 189    return acct_info.norm_pages * TARGET_PAGE_SIZE;
 190}
 191
 192uint64_t norm_mig_pages_transferred(void)
 193{
 194    return acct_info.norm_pages;
 195}
 196
 197uint64_t xbzrle_mig_bytes_transferred(void)
 198{
 199    return acct_info.xbzrle_bytes;
 200}
 201
 202uint64_t xbzrle_mig_pages_transferred(void)
 203{
 204    return acct_info.xbzrle_pages;
 205}
 206
 207uint64_t xbzrle_mig_pages_cache_miss(void)
 208{
 209    return acct_info.xbzrle_cache_miss;
 210}
 211
 212double xbzrle_mig_cache_miss_rate(void)
 213{
 214    return acct_info.xbzrle_cache_miss_rate;
 215}
 216
 217uint64_t xbzrle_mig_pages_overflow(void)
 218{
 219    return acct_info.xbzrle_overflows;
 220}
 221
  222/* This is the last block that we have visited searching for dirty pages
 223 */
 224static RAMBlock *last_seen_block;
 225/* This is the last block from where we have sent data */
 226static RAMBlock *last_sent_block;
 227static ram_addr_t last_offset;
 228static QemuMutex migration_bitmap_mutex;
 229static uint64_t migration_dirty_pages;
 230static uint32_t last_version;
 231static bool ram_bulk_stage;
 232
 233/* used by the search for pages to send */
 234struct PageSearchStatus {
 235    /* Current block being searched */
 236    RAMBlock    *block;
 237    /* Current offset to search from */
 238    ram_addr_t   offset;
 239    /* Set once we wrap around */
 240    bool         complete_round;
 241};
 242typedef struct PageSearchStatus PageSearchStatus;
 243
 244static struct BitmapRcu {
 245    struct rcu_head rcu;
 246    /* Main migration bitmap */
 247    unsigned long *bmap;
 248    /* bitmap of pages that haven't been sent even once
 249     * only maintained and used in postcopy at the moment
 250     * where it's used to send the dirtymap at the start
 251     * of the postcopy phase
 252     */
 253    unsigned long *unsentmap;
 254} *migration_bitmap_rcu;
 255
 256struct CompressParam {
 257    bool done;
 258    bool quit;
 259    QEMUFile *file;
 260    QemuMutex mutex;
 261    QemuCond cond;
 262    RAMBlock *block;
 263    ram_addr_t offset;
 264};
 265typedef struct CompressParam CompressParam;
 266
 267struct DecompressParam {
 268    bool done;
 269    bool quit;
 270    QemuMutex mutex;
 271    QemuCond cond;
 272    void *des;
 273    uint8_t *compbuf;
 274    int len;
 275};
 276typedef struct DecompressParam DecompressParam;
 277
 278static CompressParam *comp_param;
 279static QemuThread *compress_threads;
 280/* comp_done_cond is used to wake up the migration thread when
 281 * one of the compression threads has finished the compression.
  282 * comp_done_lock is used together with comp_done_cond.
 283 */
 284static QemuMutex comp_done_lock;
 285static QemuCond comp_done_cond;
 286/* The empty QEMUFileOps will be used by file in CompressParam */
 287static const QEMUFileOps empty_ops = { };
 288
 289static bool compression_switch;
 290static DecompressParam *decomp_param;
 291static QemuThread *decompress_threads;
 292static QemuMutex decomp_done_lock;
 293static QemuCond decomp_done_cond;
 294
 295static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 296                                ram_addr_t offset);
 297
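     /*
      * do_data_compress: compression worker thread.  Waits for a page to be
      * posted in its CompressParam, compresses it into the per-thread
      * QEMUFile buffer and signals comp_done_cond when the page is done.
      */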
 298static void *do_data_compress(void *opaque)
 299{
 300    CompressParam *param = opaque;
 301    RAMBlock *block;
 302    ram_addr_t offset;
 303
 304    qemu_mutex_lock(&param->mutex);
 305    while (!param->quit) {
 306        if (param->block) {
 307            block = param->block;
 308            offset = param->offset;
 309            param->block = NULL;
 310            qemu_mutex_unlock(&param->mutex);
 311
 312            do_compress_ram_page(param->file, block, offset);
 313
 314            qemu_mutex_lock(&comp_done_lock);
 315            param->done = true;
 316            qemu_cond_signal(&comp_done_cond);
 317            qemu_mutex_unlock(&comp_done_lock);
 318
 319            qemu_mutex_lock(&param->mutex);
 320        } else {
 321            qemu_cond_wait(&param->cond, &param->mutex);
 322        }
 323    }
 324    qemu_mutex_unlock(&param->mutex);
 325
 326    return NULL;
 327}
 328
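     /* Ask every compression worker thread to quit and wake it up */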
 329static inline void terminate_compression_threads(void)
 330{
 331    int idx, thread_count;
 332
 333    thread_count = migrate_compress_threads();
 334    for (idx = 0; idx < thread_count; idx++) {
 335        qemu_mutex_lock(&comp_param[idx].mutex);
 336        comp_param[idx].quit = true;
 337        qemu_cond_signal(&comp_param[idx].cond);
 338        qemu_mutex_unlock(&comp_param[idx].mutex);
 339    }
 340}
 341
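     /*
      * migrate_compress_threads_join: stop the compression threads, join them
      * and free their resources; a no-op when compression is not in use.
      */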
 342void migrate_compress_threads_join(void)
 343{
 344    int i, thread_count;
 345
 346    if (!migrate_use_compression()) {
 347        return;
 348    }
 349    terminate_compression_threads();
 350    thread_count = migrate_compress_threads();
 351    for (i = 0; i < thread_count; i++) {
 352        qemu_thread_join(compress_threads + i);
 353        qemu_fclose(comp_param[i].file);
 354        qemu_mutex_destroy(&comp_param[i].mutex);
 355        qemu_cond_destroy(&comp_param[i].cond);
 356    }
 357    qemu_mutex_destroy(&comp_done_lock);
 358    qemu_cond_destroy(&comp_done_cond);
 359    g_free(compress_threads);
 360    g_free(comp_param);
 361    compress_threads = NULL;
 362    comp_param = NULL;
 363}
 364
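     /*
      * migrate_compress_threads_create: allocate the per-thread CompressParam
      * state and spawn the compression worker threads; a no-op when
      * compression is not in use.
      */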
 365void migrate_compress_threads_create(void)
 366{
 367    int i, thread_count;
 368
 369    if (!migrate_use_compression()) {
 370        return;
 371    }
 372    compression_switch = true;
 373    thread_count = migrate_compress_threads();
 374    compress_threads = g_new0(QemuThread, thread_count);
 375    comp_param = g_new0(CompressParam, thread_count);
 376    qemu_cond_init(&comp_done_cond);
 377    qemu_mutex_init(&comp_done_lock);
 378    for (i = 0; i < thread_count; i++) {
 379        /* comp_param[i].file is just used as a dummy buffer to save data,
 380         * set its ops to empty.
 381         */
 382        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 383        comp_param[i].done = true;
 384        comp_param[i].quit = false;
 385        qemu_mutex_init(&comp_param[i].mutex);
 386        qemu_cond_init(&comp_param[i].cond);
 387        qemu_thread_create(compress_threads + i, "compress",
 388                           do_data_compress, comp_param + i,
 389                           QEMU_THREAD_JOINABLE);
 390    }
 391}
 392
 393/**
 394 * save_page_header: Write page header to wire
 395 *
 396 * If this is the 1st block, it also writes the block identification
 397 *
 398 * Returns: Number of bytes written
 399 *
 400 * @f: QEMUFile where to send the data
 401 * @block: block that contains the page we want to send
 402 * @offset: offset inside the block for the page
 403 *          in the lower bits, it contains flags
 404 */
 405static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
 406{
 407    size_t size, len;
 408
 409    qemu_put_be64(f, offset);
 410    size = 8;
 411
 412    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 413        len = strlen(block->idstr);
 414        qemu_put_byte(f, len);
 415        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 416        size += 1 + len;
 417    }
 418    return size;
 419}
 420
 421/* Reduce amount of guest cpu execution to hopefully slow down memory writes.
 422 * If guest dirty memory rate is reduced below the rate at which we can
 423 * transfer pages to the destination then we should be able to complete
 424 * migration. Some workloads dirty memory way too fast and will not effectively
 425 * converge, even with auto-converge.
 426 */
 427static void mig_throttle_guest_down(void)
 428{
 429    MigrationState *s = migrate_get_current();
 430    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
  431    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 432
 433    /* We have not started throttling yet. Let's start it. */
 434    if (!cpu_throttle_active()) {
 435        cpu_throttle_set(pct_initial);
 436    } else {
 437        /* Throttling already on, just increase the rate */
  438        cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
 439    }
 440}
 441
 442/* Update the xbzrle cache to reflect a page that's been sent as all 0.
 443 * The important thing is that a stale (not-yet-0'd) page be replaced
 444 * by the new data.
 445 * As a bonus, if the page wasn't in the cache it gets added so that
  446 * when a small write later hits the 0'd page it can be sent with XBZRLE
 447 */
 448static void xbzrle_cache_zero_page(ram_addr_t current_addr)
 449{
 450    if (ram_bulk_stage || !migrate_use_xbzrle()) {
 451        return;
 452    }
 453
 454    /* We don't care if this fails to allocate a new cache page
 455     * as long as it updated an old one */
 456    cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
 457                 bitmap_sync_count);
 458}
 459
 460#define ENCODING_FLAG_XBZRLE 0x1
 461
 462/**
 463 * save_xbzrle_page: compress and send current page
 464 *
 465 * Returns: 1 means that we wrote the page
 466 *          0 means that page is identical to the one already sent
 467 *          -1 means that xbzrle would be longer than normal
 468 *
 469 * @f: QEMUFile where to send the data
  470 * @current_data: pointer to the page data; may be updated to the cached copy
  471 * @current_addr: RAM address of the page, used as the cache key
 472 * @block: block that contains the page we want to send
 473 * @offset: offset inside the block for the page
 474 * @last_stage: if we are at the completion stage
 475 * @bytes_transferred: increase it with the number of transferred bytes
 476 */
 477static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
 478                            ram_addr_t current_addr, RAMBlock *block,
 479                            ram_addr_t offset, bool last_stage,
 480                            uint64_t *bytes_transferred)
 481{
 482    int encoded_len = 0, bytes_xbzrle;
 483    uint8_t *prev_cached_page;
 484
 485    if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
 486        acct_info.xbzrle_cache_miss++;
 487        if (!last_stage) {
 488            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 489                             bitmap_sync_count) == -1) {
 490                return -1;
 491            } else {
 492                /* update *current_data when the page has been
 493                   inserted into cache */
 494                *current_data = get_cached_data(XBZRLE.cache, current_addr);
 495            }
 496        }
 497        return -1;
 498    }
 499
 500    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 501
 502    /* save current buffer into memory */
 503    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 504
 505    /* XBZRLE encoding (if there is no overflow) */
 506    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 507                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 508                                       TARGET_PAGE_SIZE);
 509    if (encoded_len == 0) {
 510        DPRINTF("Skipping unmodified page\n");
 511        return 0;
 512    } else if (encoded_len == -1) {
 513        DPRINTF("Overflow\n");
 514        acct_info.xbzrle_overflows++;
 515        /* update data in the cache */
 516        if (!last_stage) {
 517            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
 518            *current_data = prev_cached_page;
 519        }
 520        return -1;
 521    }
 522
  523    /* Update the cache so later diffs are against what the destination has */
 524    if (!last_stage) {
 525        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 526    }
 527
 528    /* Send XBZRLE based compressed page */
 529    bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
 530    qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
 531    qemu_put_be16(f, encoded_len);
 532    qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
 533    bytes_xbzrle += encoded_len + 1 + 2;
 534    acct_info.xbzrle_pages++;
 535    acct_info.xbzrle_bytes += bytes_xbzrle;
 536    *bytes_transferred += bytes_xbzrle;
 537
 538    return 1;
 539}
 540
 541/* Called with rcu_read_lock() to protect migration_bitmap
 542 * rb: The RAMBlock  to search for dirty pages in
 543 * start: Start address (typically so we can continue from previous page)
 544 * ram_addr_abs: Pointer into which to store the address of the dirty page
 545 *               within the global ram_addr space
 546 *
 547 * Returns: byte offset within memory region of the start of a dirty page
 548 */
 549static inline
 550ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
 551                                       ram_addr_t start,
 552                                       ram_addr_t *ram_addr_abs)
 553{
 554    unsigned long base = rb->offset >> TARGET_PAGE_BITS;
 555    unsigned long nr = base + (start >> TARGET_PAGE_BITS);
 556    uint64_t rb_size = rb->used_length;
 557    unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
 558    unsigned long *bitmap;
 559
 560    unsigned long next;
 561
 562    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 563    if (ram_bulk_stage && nr > base) {
 564        next = nr + 1;
 565    } else {
 566        next = find_next_bit(bitmap, size, nr);
 567    }
 568
 569    *ram_addr_abs = next << TARGET_PAGE_BITS;
 570    return (next - base) << TARGET_PAGE_BITS;
 571}
 572
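     /*
      * migration_bitmap_clear_dirty: clear the dirty bit for @addr in the
      * migration bitmap, updating migration_dirty_pages.
      *
      * Returns: true if the page was dirty (i.e. the bit was set)
      */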
 573static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
 574{
 575    bool ret;
 576    int nr = addr >> TARGET_PAGE_BITS;
 577    unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 578
 579    ret = test_and_clear_bit(nr, bitmap);
 580
 581    if (ret) {
 582        migration_dirty_pages--;
 583    }
 584    return ret;
 585}
 586
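     /*
      * migration_bitmap_sync_range: pull dirty information for the given range
      * from the memory API into the migration bitmap and update
      * migration_dirty_pages with the number of newly dirtied pages.
      */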
 587static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
 588{
 589    unsigned long *bitmap;
 590    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 591    migration_dirty_pages +=
 592        cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
 593}
 594
  595/* FIXME: there are too many global variables used in the migration process. */
 596static int64_t start_time;
 597static int64_t bytes_xfer_prev;
 598static int64_t num_dirty_pages_period;
 599static uint64_t xbzrle_cache_miss_prev;
 600static uint64_t iterations_prev;
 601
 602static void migration_bitmap_sync_init(void)
 603{
 604    start_time = 0;
 605    bytes_xfer_prev = 0;
 606    num_dirty_pages_period = 0;
 607    xbzrle_cache_miss_prev = 0;
 608    iterations_prev = 0;
 609}
 610
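     /*
      * migration_bitmap_sync: synchronise the dirty bitmap for all RAM blocks,
      * update the dirty page rate statistics and, if auto-converge is enabled,
      * decide whether to start or increase guest throttling.
      */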
 611static void migration_bitmap_sync(void)
 612{
 613    RAMBlock *block;
 614    uint64_t num_dirty_pages_init = migration_dirty_pages;
 615    MigrationState *s = migrate_get_current();
 616    int64_t end_time;
 617    int64_t bytes_xfer_now;
 618
 619    bitmap_sync_count++;
 620
 621    if (!bytes_xfer_prev) {
 622        bytes_xfer_prev = ram_bytes_transferred();
 623    }
 624
 625    if (!start_time) {
 626        start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 627    }
 628
 629    trace_migration_bitmap_sync_start();
 630    memory_global_dirty_log_sync();
 631
 632    qemu_mutex_lock(&migration_bitmap_mutex);
 633    rcu_read_lock();
 634    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
 635        migration_bitmap_sync_range(block->offset, block->used_length);
 636    }
 637    rcu_read_unlock();
 638    qemu_mutex_unlock(&migration_bitmap_mutex);
 639
 640    trace_migration_bitmap_sync_end(migration_dirty_pages
 641                                    - num_dirty_pages_init);
 642    num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
 643    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 644
  645    /* more than 1 second = 1000 milliseconds */
 646    if (end_time > start_time + 1000) {
 647        if (migrate_auto_converge()) {
  648            /* The following detection logic can be refined later. For now:
  649               check whether the number of bytes dirtied exceeds half of
  650               the bytes that were transferred since the last time we were
  651               in this routine. If that happens twice, start or increase
  652               throttling. */
 653            bytes_xfer_now = ram_bytes_transferred();
 654
  655            if (s->dirty_pages_rate &&
  656                (num_dirty_pages_period * TARGET_PAGE_SIZE >
  657                 (bytes_xfer_now - bytes_xfer_prev) / 2) &&
  658                (dirty_rate_high_cnt++ >= 2)) {
  659                trace_migration_throttle();
  660                dirty_rate_high_cnt = 0;
  661                mig_throttle_guest_down();
  662            }
  663            bytes_xfer_prev = bytes_xfer_now;
 664        }
 665
 666        if (migrate_use_xbzrle()) {
 667            if (iterations_prev != acct_info.iterations) {
 668                acct_info.xbzrle_cache_miss_rate =
 669                   (double)(acct_info.xbzrle_cache_miss -
 670                            xbzrle_cache_miss_prev) /
 671                   (acct_info.iterations - iterations_prev);
 672            }
 673            iterations_prev = acct_info.iterations;
 674            xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
 675        }
 676        s->dirty_pages_rate = num_dirty_pages_period * 1000
 677            / (end_time - start_time);
 678        s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
 679        start_time = end_time;
 680        num_dirty_pages_period = 0;
 681    }
 682    s->dirty_sync_count = bitmap_sync_count;
 683    if (migrate_use_events()) {
 684        qapi_event_send_migration_pass(bitmap_sync_count, NULL);
 685    }
 686}
 687
 688/**
 689 * save_zero_page: Send the zero page to the stream
 690 *
 691 * Returns: Number of pages written.
 692 *
 693 * @f: QEMUFile where to send the data
 694 * @block: block that contains the page we want to send
 695 * @offset: offset inside the block for the page
 696 * @p: pointer to the page
 697 * @bytes_transferred: increase it with the number of transferred bytes
 698 */
 699static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
 700                          uint8_t *p, uint64_t *bytes_transferred)
 701{
 702    int pages = -1;
 703
 704    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 705        acct_info.dup_pages++;
 706        *bytes_transferred += save_page_header(f, block,
 707                                               offset | RAM_SAVE_FLAG_COMPRESS);
 708        qemu_put_byte(f, 0);
 709        *bytes_transferred += 1;
 710        pages = 1;
 711    }
 712
 713    return pages;
 714}
 715
 716/**
 717 * ram_save_page: Send the given page to the stream
 718 *
 719 * Returns: Number of pages written.
 720 *          < 0 - error
 721 *          >=0 - Number of pages written - this might legally be 0
 722 *                if xbzrle noticed the page was the same.
 723 *
 724 * @f: QEMUFile where to send the data
  725 * @pss: page search status describing the block and offset of the page
  726 *       we want to send
 727 * @last_stage: if we are at the completion stage
 728 * @bytes_transferred: increase it with the number of transferred bytes
 729 */
 730static int ram_save_page(QEMUFile *f, PageSearchStatus *pss,
 731                         bool last_stage, uint64_t *bytes_transferred)
 732{
 733    int pages = -1;
 734    uint64_t bytes_xmit;
 735    ram_addr_t current_addr;
 736    uint8_t *p;
 737    int ret;
 738    bool send_async = true;
 739    RAMBlock *block = pss->block;
 740    ram_addr_t offset = pss->offset;
 741
 742    p = block->host + offset;
 743
  744    /* When in doubt, send the page as a normal page */
  745    bytes_xmit = 0;
  746    ret = ram_control_save_page(f, block->offset,
  747                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
 748    if (bytes_xmit) {
 749        *bytes_transferred += bytes_xmit;
 750        pages = 1;
 751    }
 752
 753    XBZRLE_cache_lock();
 754
 755    current_addr = block->offset + offset;
 756
 757    if (block == last_sent_block) {
 758        offset |= RAM_SAVE_FLAG_CONTINUE;
 759    }
 760    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 761        if (ret != RAM_SAVE_CONTROL_DELAYED) {
 762            if (bytes_xmit > 0) {
 763                acct_info.norm_pages++;
 764            } else if (bytes_xmit == 0) {
 765                acct_info.dup_pages++;
 766            }
 767        }
 768    } else {
 769        pages = save_zero_page(f, block, offset, p, bytes_transferred);
 770        if (pages > 0) {
 771            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 772             * page would be stale
 773             */
 774            xbzrle_cache_zero_page(current_addr);
 775        } else if (!ram_bulk_stage &&
 776                   !migration_in_postcopy(migrate_get_current()) &&
 777                   migrate_use_xbzrle()) {
 778            pages = save_xbzrle_page(f, &p, current_addr, block,
 779                                     offset, last_stage, bytes_transferred);
 780            if (!last_stage) {
 781                /* Can't send this cached data async, since the cache page
 782                 * might get updated before it gets to the wire
 783                 */
 784                send_async = false;
 785            }
 786        }
 787    }
 788
 789    /* XBZRLE overflow or normal page */
 790    if (pages == -1) {
 791        *bytes_transferred += save_page_header(f, block,
 792                                               offset | RAM_SAVE_FLAG_PAGE);
 793        if (send_async) {
 794            qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
 795        } else {
 796            qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
 797        }
 798        *bytes_transferred += TARGET_PAGE_SIZE;
 799        pages = 1;
 800        acct_info.norm_pages++;
 801    }
 802
 803    XBZRLE_cache_unlock();
 804
 805    return pages;
 806}
 807
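     /*
      * do_compress_ram_page: write the page header for @offset and the
      * compressed page data to @f.
      *
      * Returns: number of bytes written, or 0 if compression failed
      */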
 808static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 809                                ram_addr_t offset)
 810{
 811    int bytes_sent, blen;
 812    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
 813
 814    bytes_sent = save_page_header(f, block, offset |
 815                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
 816    blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
 817                                     migrate_compress_level());
 818    if (blen < 0) {
 819        bytes_sent = 0;
 820        qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
 821        error_report("compressed data failed!");
 822    } else {
 823        bytes_sent += blen;
 824    }
 825
 826    return bytes_sent;
 827}
 828
 829static uint64_t bytes_transferred;
 830
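     /*
      * flush_compressed_data: wait for all compression threads to finish their
      * current page and flush their buffered output into the migration stream.
      */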
 831static void flush_compressed_data(QEMUFile *f)
 832{
 833    int idx, len, thread_count;
 834
 835    if (!migrate_use_compression()) {
 836        return;
 837    }
 838    thread_count = migrate_compress_threads();
 839
 840    qemu_mutex_lock(&comp_done_lock);
 841    for (idx = 0; idx < thread_count; idx++) {
 842        while (!comp_param[idx].done) {
 843            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 844        }
 845    }
 846    qemu_mutex_unlock(&comp_done_lock);
 847
 848    for (idx = 0; idx < thread_count; idx++) {
 849        qemu_mutex_lock(&comp_param[idx].mutex);
 850        if (!comp_param[idx].quit) {
 851            len = qemu_put_qemu_file(f, comp_param[idx].file);
 852            bytes_transferred += len;
 853        }
 854        qemu_mutex_unlock(&comp_param[idx].mutex);
 855    }
 856}
 857
 858static inline void set_compress_params(CompressParam *param, RAMBlock *block,
 859                                       ram_addr_t offset)
 860{
 861    param->block = block;
 862    param->offset = offset;
 863}
 864
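     /*
      * compress_page_with_multi_thread: hand the page at @offset to an idle
      * compression thread, first flushing that thread's previously compressed
      * data to @f; blocks until a thread becomes available.
      *
      * Returns: number of pages written (1)
      */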
 865static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
 866                                           ram_addr_t offset,
 867                                           uint64_t *bytes_transferred)
 868{
 869    int idx, thread_count, bytes_xmit = -1, pages = -1;
 870
 871    thread_count = migrate_compress_threads();
 872    qemu_mutex_lock(&comp_done_lock);
 873    while (true) {
 874        for (idx = 0; idx < thread_count; idx++) {
 875            if (comp_param[idx].done) {
 876                comp_param[idx].done = false;
 877                bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
 878                qemu_mutex_lock(&comp_param[idx].mutex);
 879                set_compress_params(&comp_param[idx], block, offset);
 880                qemu_cond_signal(&comp_param[idx].cond);
 881                qemu_mutex_unlock(&comp_param[idx].mutex);
 882                pages = 1;
 883                acct_info.norm_pages++;
 884                *bytes_transferred += bytes_xmit;
 885                break;
 886            }
 887        }
 888        if (pages > 0) {
 889            break;
 890        } else {
 891            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 892        }
 893    }
 894    qemu_mutex_unlock(&comp_done_lock);
 895
 896    return pages;
 897}
 898
 899/**
 900 * ram_save_compressed_page: compress the given page and send it to the stream
 901 *
 902 * Returns: Number of pages written.
 903 *
 904 * @f: QEMUFile where to send the data
  905 * @pss: page search status describing the block and offset of the page
  906 *       we want to send
 907 * @last_stage: if we are at the completion stage
 908 * @bytes_transferred: increase it with the number of transferred bytes
 909 */
 910static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss,
 911                                    bool last_stage,
 912                                    uint64_t *bytes_transferred)
 913{
 914    int pages = -1;
 915    uint64_t bytes_xmit = 0;
 916    uint8_t *p;
 917    int ret, blen;
 918    RAMBlock *block = pss->block;
 919    ram_addr_t offset = pss->offset;
 920
 921    p = block->host + offset;
 922
 923    ret = ram_control_save_page(f, block->offset,
 924                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
 925    if (bytes_xmit) {
 926        *bytes_transferred += bytes_xmit;
 927        pages = 1;
 928    }
 929    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 930        if (ret != RAM_SAVE_CONTROL_DELAYED) {
 931            if (bytes_xmit > 0) {
 932                acct_info.norm_pages++;
 933            } else if (bytes_xmit == 0) {
 934                acct_info.dup_pages++;
 935            }
 936        }
 937    } else {
  938        /* When starting a new block, the first page of the block must be
  939         * sent out before any other page in the same block, and all the
  940         * pages in the previous block must have been sent out already.
  941         * Keeping this order is important, because the 'cont' flag
  942         * is used to avoid resending the block name.
  943         */
 944        if (block != last_sent_block) {
 945            flush_compressed_data(f);
 946            pages = save_zero_page(f, block, offset, p, bytes_transferred);
 947            if (pages == -1) {
 948                /* Make sure the first page is sent out before other pages */
 949                bytes_xmit = save_page_header(f, block, offset |
 950                                              RAM_SAVE_FLAG_COMPRESS_PAGE);
 951                blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
 952                                                 migrate_compress_level());
 953                if (blen > 0) {
 954                    *bytes_transferred += bytes_xmit + blen;
 955                    acct_info.norm_pages++;
 956                    pages = 1;
 957                } else {
 958                    qemu_file_set_error(f, blen);
 959                    error_report("compressed data failed!");
 960                }
 961            }
 962        } else {
 963            offset |= RAM_SAVE_FLAG_CONTINUE;
 964            pages = save_zero_page(f, block, offset, p, bytes_transferred);
 965            if (pages == -1) {
 966                pages = compress_page_with_multi_thread(f, block, offset,
 967                                                        bytes_transferred);
 968            }
 969        }
 970    }
 971
 972    return pages;
 973}
 974
 975/*
 976 * Find the next dirty page and update any state associated with
 977 * the search process.
 978 *
 979 * Returns: True if a page is found
 980 *
 981 * @f: Current migration stream.
 982 * @pss: Data about the state of the current dirty page scan.
  983 * @again: Set to false if the search has scanned the whole of RAM
  984 * @ram_addr_abs: Pointer into which to store the address of the dirty page
  985 *                within the global ram_addr space
 986 */
 987static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
 988                             bool *again, ram_addr_t *ram_addr_abs)
 989{
 990    pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
 991                                              ram_addr_abs);
 992    if (pss->complete_round && pss->block == last_seen_block &&
 993        pss->offset >= last_offset) {
 994        /*
 995         * We've been once around the RAM and haven't found anything.
 996         * Give up.
 997         */
 998        *again = false;
 999        return false;
1000    }
1001    if (pss->offset >= pss->block->used_length) {
1002        /* Didn't find anything in this RAM Block */
1003        pss->offset = 0;
1004        pss->block = QLIST_NEXT_RCU(pss->block, next);
1005        if (!pss->block) {
1006            /* Hit the end of the list */
1007            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1008            /* Flag that we've looped */
1009            pss->complete_round = true;
1010            ram_bulk_stage = false;
1011            if (migrate_use_xbzrle()) {
1012                /* If xbzrle is on, stop using the data compression at this
1013                 * point. In theory, xbzrle can do better than compression.
1014                 */
1015                flush_compressed_data(f);
1016                compression_switch = false;
1017            }
1018        }
1019        /* Didn't find anything this time, but try again on the new block */
1020        *again = true;
1021        return false;
1022    } else {
1023        /* Can go around again, but... */
1024        *again = true;
 1025        /* We've found something, so we probably don't need to look again */
1026        return true;
1027    }
1028}
1029
1030/*
1031 * Helper for 'get_queued_page' - gets a page off the queue
1032 *      ms:      MigrationState in
1033 * *offset:      Used to return the offset within the RAMBlock
1034 * ram_addr_abs: global offset in the dirty/sent bitmaps
1035 *
1036 * Returns:      block (or NULL if none available)
1037 */
1038static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1039                              ram_addr_t *ram_addr_abs)
1040{
1041    RAMBlock *block = NULL;
1042
1043    qemu_mutex_lock(&ms->src_page_req_mutex);
1044    if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1045        struct MigrationSrcPageRequest *entry =
1046                                QSIMPLEQ_FIRST(&ms->src_page_requests);
1047        block = entry->rb;
1048        *offset = entry->offset;
1049        *ram_addr_abs = (entry->offset + entry->rb->offset) &
1050                        TARGET_PAGE_MASK;
1051
1052        if (entry->len > TARGET_PAGE_SIZE) {
1053            entry->len -= TARGET_PAGE_SIZE;
1054            entry->offset += TARGET_PAGE_SIZE;
1055        } else {
1056            memory_region_unref(block->mr);
1057            QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1058            g_free(entry);
1059        }
1060    }
1061    qemu_mutex_unlock(&ms->src_page_req_mutex);
1062
1063    return block;
1064}
1065
1066/*
1067 * Unqueue a page from the queue fed by postcopy page requests; skips pages
1068 * that are already sent (!dirty)
1069 *
1070 *      ms:      MigrationState in
1071 *     pss:      PageSearchStatus structure updated with found block/offset
1072 * ram_addr_abs: global offset in the dirty/sent bitmaps
1073 *
1074 * Returns:      true if a queued page is found
1075 */
1076static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
1077                            ram_addr_t *ram_addr_abs)
1078{
1079    RAMBlock  *block;
1080    ram_addr_t offset;
1081    bool dirty;
1082
1083    do {
1084        block = unqueue_page(ms, &offset, ram_addr_abs);
1085        /*
1086         * We're sending this page, and since it's postcopy nothing else
1087         * will dirty it, and we must make sure it doesn't get sent again
1088         * even if this queue request was received after the background
1089         * search already sent it.
1090         */
1091        if (block) {
1092            unsigned long *bitmap;
1093            bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1094            dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1095            if (!dirty) {
1096                trace_get_queued_page_not_dirty(
1097                    block->idstr, (uint64_t)offset,
1098                    (uint64_t)*ram_addr_abs,
1099                    test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1100                         atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1101            } else {
1102                trace_get_queued_page(block->idstr,
1103                                      (uint64_t)offset,
1104                                      (uint64_t)*ram_addr_abs);
1105            }
1106        }
1107
1108    } while (block && !dirty);
1109
1110    if (block) {
1111        /*
1112         * As soon as we start servicing pages out of order, then we have
1113         * to kill the bulk stage, since the bulk stage assumes
 1114         * in (migration_bitmap_find_dirty) that every page is
1115         * dirty, that's no longer true.
1116         */
1117        ram_bulk_stage = false;
1118
1119        /*
1120         * We want the background search to continue from the queued page
1121         * since the guest is likely to want other pages near to the page
1122         * it just requested.
1123         */
1124        pss->block = block;
1125        pss->offset = offset;
1126    }
1127
1128    return !!block;
1129}
1130
1131/**
1132 * flush_page_queue: Flush any remaining pages in the ram request queue
1133 *    it should be empty at the end anyway, but in error cases there may be
1134 *    some left.
1135 *
1136 * ms: MigrationState
1137 */
1138void flush_page_queue(MigrationState *ms)
1139{
1140    struct MigrationSrcPageRequest *mspr, *next_mspr;
1141    /* This queue generally should be empty - but in the case of a failed
 1142     * migration it might still contain some leftover entries.
1143     */
1144    rcu_read_lock();
1145    QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1146        memory_region_unref(mspr->rb->mr);
1147        QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1148        g_free(mspr);
1149    }
1150    rcu_read_unlock();
1151}
1152
1153/**
1154 * Queue the pages for transmission, e.g. a request from postcopy destination
 1155 *   ms: MigrationState in which the queue is held
1156 *   rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
1157 *   start: Offset from the start of the RAMBlock
1158 *   len: Length (in bytes) to send
1159 *   Return: 0 on success
1160 */
1161int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1162                         ram_addr_t start, ram_addr_t len)
1163{
1164    RAMBlock *ramblock;
1165
1166    ms->postcopy_requests++;
1167    rcu_read_lock();
1168    if (!rbname) {
1169        /* Reuse last RAMBlock */
1170        ramblock = ms->last_req_rb;
1171
1172        if (!ramblock) {
1173            /*
1174             * Shouldn't happen, we can't reuse the last RAMBlock if
1175             * it's the 1st request.
1176             */
1177            error_report("ram_save_queue_pages no previous block");
1178            goto err;
1179        }
1180    } else {
1181        ramblock = qemu_ram_block_by_name(rbname);
1182
1183        if (!ramblock) {
1184            /* We shouldn't be asked for a non-existent RAMBlock */
1185            error_report("ram_save_queue_pages no block '%s'", rbname);
1186            goto err;
1187        }
1188        ms->last_req_rb = ramblock;
1189    }
1190    trace_ram_save_queue_pages(ramblock->idstr, start, len);
 1191    if (start + len > ramblock->used_length) {
1192        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1193                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1194                     __func__, start, len, ramblock->used_length);
1195        goto err;
1196    }
1197
1198    struct MigrationSrcPageRequest *new_entry =
1199        g_malloc0(sizeof(struct MigrationSrcPageRequest));
1200    new_entry->rb = ramblock;
1201    new_entry->offset = start;
1202    new_entry->len = len;
1203
1204    memory_region_ref(ramblock->mr);
1205    qemu_mutex_lock(&ms->src_page_req_mutex);
1206    QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1207    qemu_mutex_unlock(&ms->src_page_req_mutex);
1208    rcu_read_unlock();
1209
1210    return 0;
1211
1212err:
1213    rcu_read_unlock();
1214    return -1;
1215}
1216
1217/**
1218 * ram_save_target_page: Save one target page
 1219 *
 1220 * @ms: current migration state
 1221 * @f: QEMUFile where to send the data
 1222 * @pss: page search status describing the block and offset of the page
 1223 *       we want to send
1224 * @last_stage: if we are at the completion stage
1225 * @bytes_transferred: increase it with the number of transferred bytes
1226 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1227 *
1228 * Returns: Number of pages written.
1229 */
1230static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
1231                                PageSearchStatus *pss,
1232                                bool last_stage,
1233                                uint64_t *bytes_transferred,
1234                                ram_addr_t dirty_ram_abs)
1235{
1236    int res = 0;
1237
 1238    /* Check if the page is dirty and, if it is, send it */
1239    if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1240        unsigned long *unsentmap;
1241        if (compression_switch && migrate_use_compression()) {
1242            res = ram_save_compressed_page(f, pss,
1243                                           last_stage,
1244                                           bytes_transferred);
1245        } else {
1246            res = ram_save_page(f, pss, last_stage,
1247                                bytes_transferred);
1248        }
1249
1250        if (res < 0) {
1251            return res;
1252        }
1253        unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1254        if (unsentmap) {
1255            clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1256        }
1257        /* Only update last_sent_block if a block was actually sent; xbzrle
1258         * might have decided the page was identical so didn't bother writing
1259         * to the stream.
1260         */
1261        if (res > 0) {
1262            last_sent_block = pss->block;
1263        }
1264    }
1265
1266    return res;
1267}
1268
1269/**
 1270 * ram_save_host_page: Starting at pss->offset, send pages up to the end
 1271 *                     of the current host page.  It's valid for the initial
 1272 *                     offset to point into the middle of a host page
 1273 *                     in which case the remainder of the host page is sent.
1274 *                     Only dirty target pages are sent.
1275 *
1276 * Returns: Number of pages written.
1277 *
 1278 * @ms: current migration state
 1279 * @f: QEMUFile where to send the data
 1280 * @pss: page search status describing the block that contains the page we
 1281 *       want to send; pss->offset is updated to the last target page sent
1282 * @last_stage: if we are at the completion stage
1283 * @bytes_transferred: increase it with the number of transferred bytes
1284 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1285 */
1286static int ram_save_host_page(MigrationState *ms, QEMUFile *f,
1287                              PageSearchStatus *pss,
1288                              bool last_stage,
1289                              uint64_t *bytes_transferred,
1290                              ram_addr_t dirty_ram_abs)
1291{
1292    int tmppages, pages = 0;
1293    do {
1294        tmppages = ram_save_target_page(ms, f, pss, last_stage,
1295                                        bytes_transferred, dirty_ram_abs);
1296        if (tmppages < 0) {
1297            return tmppages;
1298        }
1299
1300        pages += tmppages;
1301        pss->offset += TARGET_PAGE_SIZE;
1302        dirty_ram_abs += TARGET_PAGE_SIZE;
1303    } while (pss->offset & (qemu_host_page_size - 1));
1304
1305    /* The offset we leave with is the last one we looked at */
1306    pss->offset -= TARGET_PAGE_SIZE;
1307    return pages;
1308}
1309
1310/**
1311 * ram_find_and_save_block: Finds a dirty page and sends it to f
1312 *
1313 * Called within an RCU critical section.
1314 *
1315 * Returns:  The number of pages written
1316 *           0 means no dirty pages
1317 *
1318 * @f: QEMUFile where to send the data
1319 * @last_stage: if we are at the completion stage
1320 * @bytes_transferred: increase it with the number of transferred bytes
1321 *
1322 * On systems where host-page-size > target-page-size it will send all the
1323 * pages in a host page that are dirty.
1324 */
1325
1326static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1327                                   uint64_t *bytes_transferred)
1328{
1329    PageSearchStatus pss;
1330    MigrationState *ms = migrate_get_current();
1331    int pages = 0;
1332    bool again, found;
1333    ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1334                                 ram_addr_t space */
1335
1336    pss.block = last_seen_block;
1337    pss.offset = last_offset;
1338    pss.complete_round = false;
1339
1340    if (!pss.block) {
1341        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1342    }
1343
1344    do {
1345        again = true;
1346        found = get_queued_page(ms, &pss, &dirty_ram_abs);
1347
1348        if (!found) {
1349            /* priority queue empty, so just search for something dirty */
1350            found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
1351        }
1352
1353        if (found) {
1354            pages = ram_save_host_page(ms, f, &pss,
1355                                       last_stage, bytes_transferred,
1356                                       dirty_ram_abs);
1357        }
1358    } while (!pages && again);
1359
1360    last_seen_block = pss.block;
1361    last_offset = pss.offset;
1362
1363    return pages;
1364}
1365
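     /*
      * acct_update_position: account for @size bytes of guest RAM placed in
      * the stream; zero pages only bump the duplicate-page counter, other
      * pages update the normal-page counter, byte count and file position.
      */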
1366void acct_update_position(QEMUFile *f, size_t size, bool zero)
1367{
1368    uint64_t pages = size / TARGET_PAGE_SIZE;
1369    if (zero) {
1370        acct_info.dup_pages += pages;
1371    } else {
1372        acct_info.norm_pages += pages;
1373        bytes_transferred += size;
1374        qemu_update_position(f, size);
1375    }
1376}
1377
1378static ram_addr_t ram_save_remaining(void)
1379{
1380    return migration_dirty_pages;
1381}
1382
1383uint64_t ram_bytes_remaining(void)
1384{
1385    return ram_save_remaining() * TARGET_PAGE_SIZE;
1386}
1387
1388uint64_t ram_bytes_transferred(void)
1389{
1390    return bytes_transferred;
1391}
1392
1393uint64_t ram_bytes_total(void)
1394{
1395    RAMBlock *block;
1396    uint64_t total = 0;
1397
1398    rcu_read_lock();
1399    QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1400        total += block->used_length;
1401    rcu_read_unlock();
1402    return total;
1403}
1404
1405void free_xbzrle_decoded_buf(void)
1406{
1407    g_free(xbzrle_decoded_buf);
1408    xbzrle_decoded_buf = NULL;
1409}
1410
1411static void migration_bitmap_free(struct BitmapRcu *bmap)
1412{
1413    g_free(bmap->bmap);
1414    g_free(bmap->unsentmap);
1415    g_free(bmap);
1416}
1417
1418static void ram_migration_cleanup(void *opaque)
1419{
 1420    /* The caller holds the iothread lock or is in a bottom half, so there
 1421     * is no write race against this migration_bitmap
1422     */
1423    struct BitmapRcu *bitmap = migration_bitmap_rcu;
1424    atomic_rcu_set(&migration_bitmap_rcu, NULL);
1425    if (bitmap) {
1426        memory_global_dirty_log_stop();
1427        call_rcu(bitmap, migration_bitmap_free, rcu);
1428    }
1429
1430    XBZRLE_cache_lock();
1431    if (XBZRLE.cache) {
1432        cache_fini(XBZRLE.cache);
1433        g_free(XBZRLE.encoded_buf);
1434        g_free(XBZRLE.current_buf);
1435        g_free(ZERO_TARGET_PAGE);
1436        XBZRLE.cache = NULL;
1437        XBZRLE.encoded_buf = NULL;
1438        XBZRLE.current_buf = NULL;
1439    }
1440    XBZRLE_cache_unlock();
1441}
1442
1443static void reset_ram_globals(void)
1444{
1445    last_seen_block = NULL;
1446    last_sent_block = NULL;
1447    last_offset = 0;
1448    last_version = ram_list.version;
1449    ram_bulk_stage = true;
1450}
1451
1452#define MAX_WAIT 50 /* ms, half buffered_file limit */
1453
1454void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1455{
 1456    /* Called in the QEMU main thread, so there is
 1457     * no write race against this migration_bitmap
1458     */
1459    if (migration_bitmap_rcu) {
1460        struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1461        bitmap = g_new(struct BitmapRcu, 1);
1462        bitmap->bmap = bitmap_new(new);
1463
 1464        /* Prevent bits in migration_bitmap from being set concurrently
 1465         * by migration_bitmap_sync_range().
 1466         * It is safe for migration if migration_bitmap bits are only
 1467         * cleared at the same time.
1468         */
1469        qemu_mutex_lock(&migration_bitmap_mutex);
1470        bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1471        bitmap_set(bitmap->bmap, old, new - old);
1472
 1473        /* We don't have a way to safely extend the unsentmap
 1474         * with RCU; so mark it as missing, and entry to postcopy
 1475         * will fail.
1476         */
1477        bitmap->unsentmap = NULL;
1478
1479        atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1480        qemu_mutex_unlock(&migration_bitmap_mutex);
1481        migration_dirty_pages += new - old;
1482        call_rcu(old_bitmap, migration_bitmap_free, rcu);
1483    }
1484}
1485
1486/*
1487 * 'expected' is the value you expect the bitmap mostly to be full
1488 * of; it won't bother printing lines that are all this value.
1489 * If 'todump' is null the migration bitmap is dumped.
1490 */
1491void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1492{
1493    int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1494
1495    int64_t cur;
1496    int64_t linelen = 128;
1497    char linebuf[129];
1498
1499    if (!todump) {
1500        todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1501    }
1502
1503    for (cur = 0; cur < ram_pages; cur += linelen) {
1504        int64_t curb;
1505        bool found = false;
1506        /*
1507         * Last line; catch the case where the line length
1508         * is longer than remaining ram
1509         */
1510        if (cur + linelen > ram_pages) {
1511            linelen = ram_pages - cur;
1512        }
1513        for (curb = 0; curb < linelen; curb++) {
1514            bool thisbit = test_bit(cur + curb, todump);
1515            linebuf[curb] = thisbit ? '1' : '.';
1516            found = found || (thisbit != expected);
1517        }
1518        if (found) {
1519            linebuf[curb] = '\0';
1520            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1521        }
1522    }
1523}
1524
 1525/* **** functions for postcopy **** */
1526
1527/*
1528 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1529 * Note: At this point the 'unsentmap' is the processed bitmap combined
1530 *       with the dirtymap; so a '1' means it's either dirty or unsent.
1531 * start,length: Indexes into the bitmap for the first bit
1532 *            representing the named block and length in target-pages
1533 */
1534static int postcopy_send_discard_bm_ram(MigrationState *ms,
1535                                        PostcopyDiscardState *pds,
1536                                        unsigned long start,
1537                                        unsigned long length)
1538{
1539    unsigned long end = start + length; /* one after the end */
1540    unsigned long current;
1541    unsigned long *unsentmap;
1542
1543    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1544    for (current = start; current < end; ) {
1545        unsigned long one = find_next_bit(unsentmap, end, current);
1546
1547        if (one <= end) {
1548            unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1549            unsigned long discard_length;
1550
1551            if (zero >= end) {
1552                discard_length = end - one;
1553            } else {
1554                discard_length = zero - one;
1555            }
1556            if (discard_length) {
1557                postcopy_discard_send_range(ms, pds, one, discard_length);
1558            }
1559            current = one + discard_length;
1560        } else {
1561            current = one;
1562        }
1563    }
1564
1565    return 0;
1566}
1567
1568/*
1569 * Utility for the outgoing postcopy code.
1570 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1571 *   passing it bitmap indexes and name.
1572 * Returns: 0 on success
1573 * (qemu_ram_foreach_block ends up passing unscaled lengths
1574 *  which would mean postcopy code would have to deal with target page)
1575 */
1576static int postcopy_each_ram_send_discard(MigrationState *ms)
1577{
1578    struct RAMBlock *block;
1579    int ret;
1580
1581    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1582        unsigned long first = block->offset >> TARGET_PAGE_BITS;
1583        PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1584                                                               first,
1585                                                               block->idstr);
1586
1587        /*
1588         * Postcopy sends chunks of bitmap over the wire, but it
 1589         * just needs indexes at this point, which avoids it having
1590         * target page specific code.
1591         */
1592        ret = postcopy_send_discard_bm_ram(ms, pds, first,
1593                                    block->used_length >> TARGET_PAGE_BITS);
1594        postcopy_discard_send_finish(ms, pds);
1595        if (ret) {
1596            return ret;
1597        }
1598    }
1599
1600    return 0;
1601}
1602
1603/*
 1604 * Helper for postcopy_chunk_hostpages; it's called twice to clean up
 1605 *   the two bitmaps, which are similar but one is inverted.
1606 *
1607 * We search for runs of target-pages that don't start or end on a
1608 * host page boundary;
1609 * unsent_pass=true: Cleans up partially unsent host pages by searching
1610 *                 the unsentmap
1611 * unsent_pass=false: Cleans up partially dirty host pages by searching
1612 *                 the main migration bitmap
1613 *
1614 */
1615static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1616                                          RAMBlock *block,
1617                                          PostcopyDiscardState *pds)
1618{
1619    unsigned long *bitmap;
1620    unsigned long *unsentmap;
1621    unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
1622    unsigned long first = block->offset >> TARGET_PAGE_BITS;
1623    unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1624    unsigned long last = first + (len - 1);
1625    unsigned long run_start;
1626
1627    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1628    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1629
1630    if (unsent_pass) {
1631        /* Find a sent page */
1632        run_start = find_next_zero_bit(unsentmap, last + 1, first);
1633    } else {
1634        /* Find a dirty page */
1635        run_start = find_next_bit(bitmap, last + 1, first);
1636    }
1637
1638    while (run_start <= last) {
1639        bool do_fixup = false;
1640        unsigned long fixup_start_addr;
1641        unsigned long host_offset;
1642
1643        /*
1644         * If the start of this run of pages is in the middle of a host
1645         * page, then we need to fix up this host page.
1646         */
1647        host_offset = run_start % host_ratio;
1648        if (host_offset) {
1649            do_fixup = true;
1650            run_start -= host_offset;
1651            fixup_start_addr = run_start;
1652            /* For the next pass */
1653            run_start = run_start + host_ratio;
1654        } else {
1655            /* Find the end of this run */
1656            unsigned long run_end;
1657            if (unsent_pass) {
1658                run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1659            } else {
1660                run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1661            }
1662            /*
1663             * If the end isn't at the start of a host page, then the
1664             * run doesn't finish at the end of a host page
1665             * and we need to discard.
1666             */
1667            host_offset = run_end % host_ratio;
1668            if (host_offset) {
1669                do_fixup = true;
1670                fixup_start_addr = run_end - host_offset;
1671                /*
1672                 * This host page has gone, the next loop iteration starts
1673                 * from after the fixup
1674                 */
1675                run_start = fixup_start_addr + host_ratio;
1676            } else {
1677                /*
1678                 * No discards on this iteration, next loop starts from
1679                 * next sent/dirty page
1680                 */
1681                run_start = run_end + 1;
1682            }
1683        }
1684
1685        if (do_fixup) {
1686            unsigned long page;
1687
1688            /* Tell the destination to discard this page */
1689            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1690                /* For the unsent_pass we:
1691                 *     discard partially sent pages
1692                 * For the !unsent_pass (dirty) we:
1693                 *     discard partially dirty pages that were sent
1694                 *     (any partially sent pages were already discarded
1695                 *     by the previous unsent_pass)
1696                 */
1697                postcopy_discard_send_range(ms, pds, fixup_start_addr,
1698                                            host_ratio);
1699            }
1700
1701            /* Clean up the bitmap */
1702            for (page = fixup_start_addr;
1703                 page < fixup_start_addr + host_ratio; page++) {
1704                /* All pages in this host page are now not sent */
1705                set_bit(page, unsentmap);
1706
1707                /*
1708                 * Remark them as dirty, updating the count for any pages
1709                 * that weren't previously dirty.
1710                 */
1711                migration_dirty_pages += !test_and_set_bit(page, bitmap);
1712            }
1713        }
1714
1715        if (unsent_pass) {
1716            /* Find the next sent page for the next iteration */
1717            run_start = find_next_zero_bit(unsentmap, last + 1,
1718                                           run_start);
1719        } else {
1720            /* Find the next dirty page for the next iteration */
1721            run_start = find_next_bit(bitmap, last + 1, run_start);
1722        }
1723    }
1724}
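
/*
 * Editor's illustrative sketch (not part of the original file): the core of
 * the fixup above is "round the offending target page down to a host page
 * boundary and treat that whole host page as one unit".  With a hypothetical
 * 16KiB host page and 4KiB target page (host_ratio == 4), a run starting at
 * target page 9 is rounded down to 8, and the fixup then covers target
 * pages 8..11.
 */
static unsigned long example_fixup_start(unsigned long run_start,
                                         unsigned int host_ratio)
{
    /* e.g. example_fixup_start(9, 4) == 8 */
    return run_start - (run_start % host_ratio);
}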
1725
1726/*
1727 * Utility for the outgoing postcopy code.
1728 *
1729 * Discard any partially sent host-page size chunks, mark any partially
1730 * dirty host-page size chunks as all dirty.
1731 *
1732 * Returns: 0 on success
1733 */
1734static int postcopy_chunk_hostpages(MigrationState *ms)
1735{
1736    struct RAMBlock *block;
1737
1738    if (qemu_host_page_size == TARGET_PAGE_SIZE) {
1739        /* Easy case - TPS==HPS - nothing to be done */
1740        return 0;
1741    }
1742
1743    /* Easiest way to make sure we don't resume in the middle of a host-page */
1744    last_seen_block = NULL;
1745    last_sent_block = NULL;
1746    last_offset     = 0;
1747
1748    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1749        unsigned long first = block->offset >> TARGET_PAGE_BITS;
1750
1751        PostcopyDiscardState *pds =
1752                         postcopy_discard_send_init(ms, first, block->idstr);
1753
1754        /* First pass: Discard all partially sent host pages */
1755        postcopy_chunk_hostpages_pass(ms, true, block, pds);
1756        /*
1757         * Second pass: Ensure that all partially dirty host pages are made
1758         * fully dirty.
1759         */
1760        postcopy_chunk_hostpages_pass(ms, false, block, pds);
1761
1762        postcopy_discard_send_finish(ms, pds);
1763    } /* ram_list loop */
1764
1765    return 0;
1766}
1767
1768/*
1769 * Transmit the set of pages to be discarded after precopy to the target;
1770 * these are pages that:
1771 *     a) have been previously transmitted but are now dirty again
1772 *     b) have never been transmitted; this ensures that any pages on the
1773 *        destination that have been mapped by background tasks get
1774 *        discarded (transparent huge pages are the specific concern)
1775 * Hopefully this set is fairly sparse.
1776 */
1777int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1778{
1779    int ret;
1780    unsigned long *bitmap, *unsentmap;
1781
1782    rcu_read_lock();
1783
1784    /* This should be our last sync, the src is now paused */
1785    migration_bitmap_sync();
1786
1787    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1788    if (!unsentmap) {
1789        /* We don't have a safe way to resize the unsentmap, so
1790         * if the bitmap was resized it will be NULL at this
1791         * point.
1792         */
1793        error_report("migration ram resized during precopy phase");
1794        rcu_read_unlock();
1795        return -EINVAL;
1796    }
1797
1798    /* Deal with TPS != HPS */
1799    ret = postcopy_chunk_hostpages(ms);
1800    if (ret) {
1801        rcu_read_unlock();
1802        return ret;
1803    }
1804
1805    /*
1806     * Update the unsentmap to be unsentmap = unsentmap | dirty
1807     */
1808    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1809    bitmap_or(unsentmap, unsentmap, bitmap,
1810               last_ram_offset() >> TARGET_PAGE_BITS);
1811
1812
1813    trace_ram_postcopy_send_discard_bitmap();
1814#ifdef DEBUG_POSTCOPY
1815    ram_debug_dump_bitmap(unsentmap, true);
1816#endif
1817
1818    ret = postcopy_each_ram_send_discard(ms);
1819    rcu_read_unlock();
1820
1821    return ret;
1822}
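
/*
 * Editor's illustrative sketch (not part of the original file): the
 * bitmap_or() above is just a word-wise OR, so afterwards a bit is set in
 * the unsentmap if the page was never sent *or* has been re-dirtied; either
 * way the destination must discard its copy.  A minimal stand-in (taking a
 * word count rather than a bit count, unlike the real helper):
 */
static void example_bitmap_or(unsigned long *dst, const unsigned long *src,
                              size_t nwords)
{
    size_t i;

    for (i = 0; i < nwords; i++) {
        dst[i] |= src[i];          /* unsent | dirty, one word at a time */
    }
}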
1823
1824/*
1825 * At the start of the postcopy phase of migration, any now-dirty
1826 * precopied pages are discarded.
1827 *
1828 * start, length describe a byte address range within the RAMBlock
1829 *
1830 * Returns 0 on success.
1831 */
1832int ram_discard_range(MigrationIncomingState *mis,
1833                      const char *block_name,
1834                      uint64_t start, size_t length)
1835{
1836    int ret = -1;
1837
1838    rcu_read_lock();
1839    RAMBlock *rb = qemu_ram_block_by_name(block_name);
1840
1841    if (!rb) {
1842        error_report("ram_discard_range: Failed to find block '%s'",
1843                     block_name);
1844        goto err;
1845    }
1846
1847    uint8_t *host_startaddr = rb->host + start;
1848
1849    if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
1850        error_report("ram_discard_range: Unaligned start address: %p",
1851                     host_startaddr);
1852        goto err;
1853    }
1854
1855    if ((start + length) <= rb->used_length) {
1856        uint8_t *host_endaddr = host_startaddr + length;
1857        if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
1858            error_report("ram_discard_range: Unaligned end address: %p",
1859                         host_endaddr);
1860            goto err;
1861        }
1862        ret = postcopy_ram_discard_range(mis, host_startaddr, length);
1863    } else {
1864        error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
1865                     "/%zx/" RAM_ADDR_FMT")",
1866                     block_name, start, length, rb->used_length);
1867    }
1868
1869err:
1870    rcu_read_unlock();
1871
1872    return ret;
1873}
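
/*
 * Editor's illustrative sketch (not part of the original file): the alignment
 * checks above rely on the host page size being a power of two, so
 * "addr & (page_size - 1)" is the offset within the page and must be zero
 * for a page-aligned address.
 */
static bool example_is_page_aligned(uintptr_t addr, size_t page_size)
{
    /* e.g. example_is_page_aligned(0x7f0000401000, 0x1000) is true */
    return (addr & (page_size - 1)) == 0;
}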
1874
1875static int ram_save_init_globals(void)
1876{
1877    int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1878
1879    dirty_rate_high_cnt = 0;
1880    bitmap_sync_count = 0;
1881    migration_bitmap_sync_init();
1882    qemu_mutex_init(&migration_bitmap_mutex);
1883
1884    if (migrate_use_xbzrle()) {
1885        XBZRLE_cache_lock();
1886        ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1887        XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1888                                  TARGET_PAGE_SIZE,
1889                                  TARGET_PAGE_SIZE);
1890        if (!XBZRLE.cache) {
1891            XBZRLE_cache_unlock();
1892            error_report("Error creating cache");
1893            return -1;
1894        }
1895        XBZRLE_cache_unlock();
1896
1897        /* We prefer not to abort if there is no memory */
1898        XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1899        if (!XBZRLE.encoded_buf) {
1900            error_report("Error allocating encoded_buf");
1901            return -1;
1902        }
1903
1904        XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1905        if (!XBZRLE.current_buf) {
1906            error_report("Error allocating current_buf");
1907            g_free(XBZRLE.encoded_buf);
1908            XBZRLE.encoded_buf = NULL;
1909            return -1;
1910        }
1911
1912        acct_clear();
1913    }
1914
1915    /* For memory_global_dirty_log_start below.  */
1916    qemu_mutex_lock_iothread();
1917
1918    qemu_mutex_lock_ramlist();
1919    rcu_read_lock();
1920    bytes_transferred = 0;
1921    reset_ram_globals();
1922
1923    ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1924    migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
1925    migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
1926    bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
1927
1928    if (migrate_postcopy_ram()) {
1929        migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
1930        bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
1931    }
1932
1933    /*
1934     * Count the total number of pages used by ram blocks not including any
1935     * gaps due to alignment or unplugs.
1936     */
1937    migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1938
1939    memory_global_dirty_log_start();
1940    migration_bitmap_sync();
1941    qemu_mutex_unlock_ramlist();
1942    qemu_mutex_unlock_iothread();
1943    rcu_read_unlock();
1944
1945    return 0;
1946}
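
/*
 * Editor's illustrative sketch (not part of the original file): the bitmap
 * allocated above has one bit per target page up to last_ram_offset(), and
 * bitmap_new() backs it with an array of unsigned longs.  The sizing works
 * out as follows:
 */
static size_t example_bitmap_words(uint64_t ram_bytes, unsigned int page_bits)
{
    uint64_t nbits = ram_bytes >> page_bits;           /* one bit per page */
    unsigned int bits_per_word = sizeof(unsigned long) * 8;

    /* e.g. 4GiB of RAM, 4KiB pages: 1048576 bits, 16384 words on 64-bit */
    return (nbits + bits_per_word - 1) / bits_per_word;  /* round up */
}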
1947
1948/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
1949 * long-running RCU critical section.  When RCU reclaims in the code
1950 * start to become numerous, it will be necessary to reduce the
1951 * granularity of these critical sections.
1952 */
1953
1954static int ram_save_setup(QEMUFile *f, void *opaque)
1955{
1956    RAMBlock *block;
1957
1958    /* migration has already set up the bitmap, reuse it. */
1959    if (!migration_in_colo_state()) {
1960        if (ram_save_init_globals() < 0) {
1961            return -1;
1962        }
1963    }
1964
1965    rcu_read_lock();
1966
1967    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1968
1969    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1970        qemu_put_byte(f, strlen(block->idstr));
1971        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1972        qemu_put_be64(f, block->used_length);
1973    }
1974
1975    rcu_read_unlock();
1976
1977    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1978    ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1979
1980    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1981
1982    return 0;
1983}
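
/*
 * Editor's note (illustrative, not part of the original file): the setup
 * section written above has this layout on the wire, all integers big-endian
 * as emitted by qemu_put_be64()/qemu_put_byte():
 *
 *   be64: total RAM size in bytes, with RAM_SAVE_FLAG_MEM_SIZE or'd into
 *         the low bits (the size is page aligned, so they are free)
 *   for every RAMBlock:
 *       u8:    length of the block's idstr
 *       bytes: idstr, not NUL-terminated on the wire
 *       be64:  used_length of the block in bytes
 *   be64: RAM_SAVE_FLAG_EOS
 *
 * The RAM_SAVE_FLAG_MEM_SIZE case in ram_load() below consumes this layout.
 */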
1984
1985static int ram_save_iterate(QEMUFile *f, void *opaque)
1986{
1987    int ret;
1988    int i;
1989    int64_t t0;
1990    int done = 0;
1991
1992    rcu_read_lock();
1993    if (ram_list.version != last_version) {
1994        reset_ram_globals();
1995    }
1996
1997    /* Read version before ram_list.blocks */
1998    smp_rmb();
1999
2000    ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2001
2002    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2003    i = 0;
2004    while ((ret = qemu_file_rate_limit(f)) == 0) {
2005        int pages;
2006
2007        pages = ram_find_and_save_block(f, false, &bytes_transferred);
2008        /* no more pages to send */
2009        if (pages == 0) {
2010            done = 1;
2011            break;
2012        }
2013        acct_info.iterations++;
2014
2015        /* We want to check on the first iteration, just in case it was
2016           the first time and we had to sync the dirty bitmap.
2017           qemu_clock_get_ns() is a bit expensive, so we only check every
2018           64 iterations.
2019        */
2020        if ((i & 63) == 0) {
2021            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2022            if (t1 > MAX_WAIT) {
2023                DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
2024                        t1, i);
2025                break;
2026            }
2027        }
2028        i++;
2029    }
2030    flush_compressed_data(f);
2031    rcu_read_unlock();
2032
2033    /*
2034     * Must occur before EOS (or any QEMUFile operation)
2035     * because of RDMA protocol.
2036     */
2037    ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2038
2039    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2040    bytes_transferred += 8;
2041
2042    ret = qemu_file_get_error(f);
2043    if (ret < 0) {
2044        return ret;
2045    }
2046
2047    return done;
2048}
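
/*
 * Editor's illustrative sketch (not part of the original file): the
 * "(i & 63) == 0" test above is a cheap way of polling the clock only once
 * every 64 pages instead of after every page.  The same pattern in
 * isolation, with a hypothetical now_ms() clock source and a 50ms budget:
 */
static void example_bounded_work_loop(uint64_t (*now_ms)(void),
                                      bool (*do_one_unit)(void))
{
    uint64_t t0 = now_ms();
    unsigned int i = 0;

    while (do_one_unit()) {
        if ((i & 63) == 0 && now_ms() - t0 > 50) {
            break;             /* spent too long in this iteration round */
        }
        i++;
    }
}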
2049
2050/* Called with iothread lock */
2051static int ram_save_complete(QEMUFile *f, void *opaque)
2052{
2053    rcu_read_lock();
2054
2055    if (!migration_in_postcopy(migrate_get_current())) {
2056        migration_bitmap_sync();
2057    }
2058
2059    ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2060
2061    /* try transferring iterative blocks of memory */
2062
2063    /* flush all remaining blocks regardless of rate limiting */
2064    while (true) {
2065        int pages;
2066
2067        pages = ram_find_and_save_block(f, !migration_in_colo_state(),
2068                                        &bytes_transferred);
2069        /* no more blocks to send */
2070        if (pages == 0) {
2071            break;
2072        }
2073    }
2074
2075    flush_compressed_data(f);
2076    ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2077
2078    rcu_read_unlock();
2079
2080    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2081
2082    return 0;
2083}
2084
2085static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2086                             uint64_t *non_postcopiable_pending,
2087                             uint64_t *postcopiable_pending)
2088{
2089    uint64_t remaining_size;
2090
2091    remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2092
2093    if (!migration_in_postcopy(migrate_get_current()) &&
2094        remaining_size < max_size) {
2095        qemu_mutex_lock_iothread();
2096        rcu_read_lock();
2097        migration_bitmap_sync();
2098        rcu_read_unlock();
2099        qemu_mutex_unlock_iothread();
2100        remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2101    }
2102
2103    /* We can do postcopy, and all the data is postcopiable */
2104    *postcopiable_pending += remaining_size;
2105}
2106
2107static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2108{
2109    unsigned int xh_len;
2110    int xh_flags;
2111    uint8_t *loaded_data;
2112
2113    if (!xbzrle_decoded_buf) {
2114        xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2115    }
2116    loaded_data = xbzrle_decoded_buf;
2117
2118    /* extract RLE header */
2119    xh_flags = qemu_get_byte(f);
2120    xh_len = qemu_get_be16(f);
2121
2122    if (xh_flags != ENCODING_FLAG_XBZRLE) {
2123        error_report("Failed to load XBZRLE page - wrong compression!");
2124        return -1;
2125    }
2126
2127    if (xh_len > TARGET_PAGE_SIZE) {
2128        error_report("Failed to load XBZRLE page - len overflow!");
2129        return -1;
2130    }
2131    /* load data and decode */
2132    qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2133
2134    /* decode RLE */
2135    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2136                             TARGET_PAGE_SIZE) == -1) {
2137        error_report("Failed to load XBZRLE page - decode error!");
2138        return -1;
2139    }
2140
2141    return 0;
2142}
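
/*
 * Editor's note (illustrative, not part of the original file): an XBZRLE
 * page body, as parsed above, looks like this on the wire (after the usual
 * be64 addr|flags header handled by the caller):
 *
 *   u8:    xh_flags, must be ENCODING_FLAG_XBZRLE
 *   be16:  xh_len, length of the encoded data, at most TARGET_PAGE_SIZE
 *   bytes: xh_len bytes of XBZRLE delta, which xbzrle_decode_buffer()
 *          applies on top of the destination's current copy of the page
 */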
2143
2144/*
2145 * Read a RAMBlock ID from the stream f.
2146 *
2147 * Must be called from within an RCU critical section; returns a pointer
2148 * into the RCU-protected ram_list.
2149 *
2150 * f: Stream to read from
2151 * flags: Page flags (mostly to see if it's a continuation of a previous block)
2152 */
2153static inline RAMBlock *ram_block_from_stream(QEMUFile *f,
2154                                              int flags)
2155{
2156    static RAMBlock *block = NULL;
2157    char id[256];
2158    uint8_t len;
2159
2160    if (flags & RAM_SAVE_FLAG_CONTINUE) {
2161        if (!block) {
2162            error_report("Ack, bad migration stream!");
2163            return NULL;
2164        }
2165        return block;
2166    }
2167
2168    len = qemu_get_byte(f);
2169    qemu_get_buffer(f, (uint8_t *)id, len);
2170    id[len] = 0;
2171
2172    block = qemu_ram_block_by_name(id);
2173    if (!block) {
2174        error_report("Can't find block %s", id);
2175        return NULL;
2176    }
2177
2178    return block;
2179}
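
/*
 * Editor's illustrative sketch (not part of the original file): the static
 * 'block' above caches the last block seen, so the sender only needs to name
 * a block when it changes and can set RAM_SAVE_FLAG_CONTINUE otherwise.  A
 * hypothetical sender-side helper for choosing that flag might look like:
 */
static int example_continue_flag(const char *this_block_id,
                                 const char **last_block_id)
{
    if (*last_block_id && strcmp(this_block_id, *last_block_id) == 0) {
        return RAM_SAVE_FLAG_CONTINUE;  /* same block: send offset only */
    }
    *last_block_id = this_block_id;     /* block changed: send idstr too */
    return 0;
}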
2180
2181static inline void *host_from_ram_block_offset(RAMBlock *block,
2182                                               ram_addr_t offset)
2183{
2184    if (!offset_in_ramblock(block, offset)) {
2185        return NULL;
2186    }
2187
2188    return block->host + offset;
2189}
2190
2191/*
2192 * Fill the page with byte ch; skip the write if ch is zero and the page
2193 * (or whole RDMA chunk) is already zero.
2194 */
2195void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2196{
2197    if (ch != 0 || !is_zero_range(host, size)) {
2198        memset(host, ch, size);
2199    }
2200}
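
/*
 * Editor's illustrative sketch (not part of the original file): the check in
 * ram_handle_compressed() skips the memset() when a zero page is already
 * zero, which avoids faulting in (and allocating) untouched destination
 * pages.  A naive version would always write:
 */
static void example_handle_compressed_naive(void *host, uint8_t ch,
                                            uint64_t size)
{
    memset(host, ch, size);    /* touches every page, even all-zero ones */
}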
2201
2202static void *do_data_decompress(void *opaque)
2203{
2204    DecompressParam *param = opaque;
2205    unsigned long pagesize;
2206    uint8_t *des;
2207    int len;
2208
2209    qemu_mutex_lock(&param->mutex);
2210    while (!param->quit) {
2211        if (param->des) {
2212            des = param->des;
2213            len = param->len;
2214            param->des = 0;
2215            qemu_mutex_unlock(&param->mutex);
2216
2217            pagesize = TARGET_PAGE_SIZE;
2218            /* uncompress() can fail in some cases, especially when the
2219             * page was being dirtied while it was compressed.  That's not
2220             * a problem, because the dirty page will be retransmitted and
2221             * uncompress() won't corrupt the data in other pages.
2222             */
2223            uncompress((Bytef *)des, &pagesize,
2224                       (const Bytef *)param->compbuf, len);
2225
2226            qemu_mutex_lock(&decomp_done_lock);
2227            param->done = true;
2228            qemu_cond_signal(&decomp_done_cond);
2229            qemu_mutex_unlock(&decomp_done_lock);
2230
2231            qemu_mutex_lock(&param->mutex);
2232        } else {
2233            qemu_cond_wait(&param->cond, &param->mutex);
2234        }
2235    }
2236    qemu_mutex_unlock(&param->mutex);
2237
2238    return NULL;
2239}
2240
2241static void wait_for_decompress_done(void)
2242{
2243    int idx, thread_count;
2244
2245    if (!migrate_use_compression()) {
2246        return;
2247    }
2248
2249    thread_count = migrate_decompress_threads();
2250    qemu_mutex_lock(&decomp_done_lock);
2251    for (idx = 0; idx < thread_count; idx++) {
2252        while (!decomp_param[idx].done) {
2253            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2254        }
2255    }
2256    qemu_mutex_unlock(&decomp_done_lock);
2257}
2258
2259void migrate_decompress_threads_create(void)
2260{
2261    int i, thread_count;
2262
2263    thread_count = migrate_decompress_threads();
2264    decompress_threads = g_new0(QemuThread, thread_count);
2265    decomp_param = g_new0(DecompressParam, thread_count);
2266    qemu_mutex_init(&decomp_done_lock);
2267    qemu_cond_init(&decomp_done_cond);
2268    for (i = 0; i < thread_count; i++) {
2269        qemu_mutex_init(&decomp_param[i].mutex);
2270        qemu_cond_init(&decomp_param[i].cond);
2271        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2272        decomp_param[i].done = true;
2273        decomp_param[i].quit = false;
2274        qemu_thread_create(decompress_threads + i, "decompress",
2275                           do_data_decompress, decomp_param + i,
2276                           QEMU_THREAD_JOINABLE);
2277    }
2278}
2279
2280void migrate_decompress_threads_join(void)
2281{
2282    int i, thread_count;
2283
2284    thread_count = migrate_decompress_threads();
2285    for (i = 0; i < thread_count; i++) {
2286        qemu_mutex_lock(&decomp_param[i].mutex);
2287        decomp_param[i].quit = true;
2288        qemu_cond_signal(&decomp_param[i].cond);
2289        qemu_mutex_unlock(&decomp_param[i].mutex);
2290    }
2291    for (i = 0; i < thread_count; i++) {
2292        qemu_thread_join(decompress_threads + i);
2293        qemu_mutex_destroy(&decomp_param[i].mutex);
2294        qemu_cond_destroy(&decomp_param[i].cond);
2295        g_free(decomp_param[i].compbuf);
2296    }
2297    g_free(decompress_threads);
2298    g_free(decomp_param);
2299    decompress_threads = NULL;
2300    decomp_param = NULL;
2301}
2302
2303static void decompress_data_with_multi_threads(QEMUFile *f,
2304                                               void *host, int len)
2305{
2306    int idx, thread_count;
2307
2308    thread_count = migrate_decompress_threads();
2309    qemu_mutex_lock(&decomp_done_lock);
2310    while (true) {
2311        for (idx = 0; idx < thread_count; idx++) {
2312            if (decomp_param[idx].done) {
2313                decomp_param[idx].done = false;
2314                qemu_mutex_lock(&decomp_param[idx].mutex);
2315                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2316                decomp_param[idx].des = host;
2317                decomp_param[idx].len = len;
2318                qemu_cond_signal(&decomp_param[idx].cond);
2319                qemu_mutex_unlock(&decomp_param[idx].mutex);
2320                break;
2321            }
2322        }
2323        if (idx < thread_count) {
2324            break;
2325        } else {
2326            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2327        }
2328    }
2329    qemu_mutex_unlock(&decomp_done_lock);
2330}
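
/*
 * Editor's illustrative sketch (not part of the original file): the handoff
 * between decompress_data_with_multi_threads() (producer) and
 * do_data_decompress() (consumer) is a per-thread mailbox: 'done' under
 * decomp_done_lock says the slot is free, 'des' under the per-thread mutex
 * says work is queued.  The same idea in miniature, with a hypothetical
 * mailbox type (qemu_mutex_init()/qemu_cond_init() calls omitted):
 */
typedef struct ExampleMailbox {
    QemuMutex lock;
    QemuCond cond;
    void *work;                         /* NULL means the slot is empty */
} ExampleMailbox;

static void example_post_work(ExampleMailbox *mb, void *work)
{
    qemu_mutex_lock(&mb->lock);
    mb->work = work;                    /* hand the buffer to the worker */
    qemu_cond_signal(&mb->cond);        /* wake it up */
    qemu_mutex_unlock(&mb->lock);
}

static void *example_take_work(ExampleMailbox *mb)
{
    void *work;

    qemu_mutex_lock(&mb->lock);
    while (!mb->work) {
        qemu_cond_wait(&mb->cond, &mb->lock);   /* sleep until posted */
    }
    work = mb->work;
    mb->work = NULL;
    qemu_mutex_unlock(&mb->lock);
    return work;
}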
2331
2332/*
2333 * Allocate data structures etc needed by incoming migration with postcopy-ram;
2334 * postcopy-ram's similarly named postcopy_ram_incoming_init does the work.
2335 */
2336int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2337{
2338    size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2339
2340    return postcopy_ram_incoming_init(mis, ram_pages);
2341}
2342
2343/*
2344 * Called in postcopy mode by ram_load().
2345 * rcu_read_lock is taken prior to this being called.
2346 */
2347static int ram_load_postcopy(QEMUFile *f)
2348{
2349    int flags = 0, ret = 0;
2350    bool place_needed = false;
2351    bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
2352    MigrationIncomingState *mis = migration_incoming_get_current();
2353    /* Temporary page that is later 'placed' */
2354    void *postcopy_host_page = postcopy_get_tmp_page(mis);
2355    void *last_host = NULL;
2356    bool all_zero = false;
2357
2358    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2359        ram_addr_t addr;
2360        void *host = NULL;
2361        void *page_buffer = NULL;
2362        void *place_source = NULL;
2363        uint8_t ch;
2364
2365        addr = qemu_get_be64(f);
2366        flags = addr & ~TARGET_PAGE_MASK;
2367        addr &= TARGET_PAGE_MASK;
2368
2369        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2370        place_needed = false;
2371        if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2372            RAMBlock *block = ram_block_from_stream(f, flags);
2373
2374            host = host_from_ram_block_offset(block, addr);
2375            if (!host) {
2376                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2377                ret = -EINVAL;
2378                break;
2379            }
2380            /*
2381             * Postcopy requires that we place whole host pages atomically.
2382             * To make it atomic, the data is read into a temporary page
2383             * that's moved into place later.
2384             * The migration protocol uses (possibly smaller) target pages;
2385             * however, the source ensures it always sends all the components
2386             * of a host page in order.
2387             */
2388            page_buffer = postcopy_host_page +
2389                          ((uintptr_t)host & ~qemu_host_page_mask);
2390            /* If all TP are zero then we can optimise the place */
2391            if (!((uintptr_t)host & ~qemu_host_page_mask)) {
2392                all_zero = true;
2393            } else {
2394                /* not the 1st TP within the HP */
2395                if (host != (last_host + TARGET_PAGE_SIZE)) {
2396                    error_report("Non-sequential target page %p/%p",
2397                                  host, last_host);
2398                    ret = -EINVAL;
2399                    break;
2400                }
2401            }
2402
2403
2404            /*
2405             * If it's the last part of a host page then we place the host
2406             * page
2407             */
2408            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2409                                     ~qemu_host_page_mask) == 0;
2410            place_source = postcopy_host_page;
2411        }
2412        last_host = host;
2413
2414        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2415        case RAM_SAVE_FLAG_COMPRESS:
2416            ch = qemu_get_byte(f);
2417            memset(page_buffer, ch, TARGET_PAGE_SIZE);
2418            if (ch) {
2419                all_zero = false;
2420            }
2421            break;
2422
2423        case RAM_SAVE_FLAG_PAGE:
2424            all_zero = false;
2425            if (!place_needed || !matching_page_sizes) {
2426                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2427            } else {
2428                /* Avoid the copy inside qemu_file, since we are going to
2429                 * copy the page into place later anyway; this only works
2430                 * when the read is done in one go (matching page sizes).
2431                 */
2432                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2433                                         TARGET_PAGE_SIZE);
2434            }
2435            break;
2436        case RAM_SAVE_FLAG_EOS:
2437            /* normal exit */
2438            break;
2439        default:
2440            error_report("Unknown combination of migration flags: %#x"
2441                         " (postcopy mode)", flags);
2442            ret = -EINVAL;
2443        }
2444
2445        if (place_needed) {
2446            /* This gets called at the last target page in the host page */
2447            if (all_zero) {
2448                ret = postcopy_place_page_zero(mis,
2449                                               host + TARGET_PAGE_SIZE -
2450                                               qemu_host_page_size);
2451            } else {
2452                ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
2453                                               qemu_host_page_size,
2454                                               place_source);
2455            }
2456        }
2457        if (!ret) {
2458            ret = qemu_file_get_error(f);
2459        }
2460    }
2461
2462    return ret;
2463}
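
/*
 * Editor's illustrative sketch (not part of the original file): the host-page
 * assembly above boils down to two mask operations.  With a hypothetical
 * 16KiB host page and 4KiB target page, target pages at 0x4000..0x7fff land
 * at offsets 0x0000..0x3000 inside the temporary host page, and only the
 * last one triggers the atomic 'place'.
 */
static bool example_is_last_target_page(uintptr_t host_addr,
                                        uintptr_t host_page_mask,
                                        size_t target_page_size)
{
    /* true when this target page ends exactly on a host page boundary */
    return ((host_addr + target_page_size) & ~host_page_mask) == 0;
}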
2464
2465static int ram_load(QEMUFile *f, void *opaque, int version_id)
2466{
2467    int flags = 0, ret = 0;
2468    static uint64_t seq_iter;
2469    int len = 0;
2470    /*
2471     * If the system is running in postcopy mode, page inserts into host
2472     * memory must be atomic.
2473     */
2474    bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2475
2476    seq_iter++;
2477
2478    if (version_id != 4) {
2479        ret = -EINVAL;
2480    }
2481
2482    /* This RCU critical section can be very long running.
2483     * When RCU reclaims in the code start to become numerous,
2484     * it will be necessary to reduce the granularity of this
2485     * critical section.
2486     */
2487    rcu_read_lock();
2488
2489    if (postcopy_running) {
2490        ret = ram_load_postcopy(f);
2491    }
2492
2493    while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2494        ram_addr_t addr, total_ram_bytes;
2495        void *host = NULL;
2496        uint8_t ch;
2497
2498        addr = qemu_get_be64(f);
2499        flags = addr & ~TARGET_PAGE_MASK;
2500        addr &= TARGET_PAGE_MASK;
2501
2502        if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2503                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2504            RAMBlock *block = ram_block_from_stream(f, flags);
2505
2506            host = host_from_ram_block_offset(block, addr);
2507            if (!host) {
2508                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2509                ret = -EINVAL;
2510                break;
2511            }
2512        }
2513
2514        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2515        case RAM_SAVE_FLAG_MEM_SIZE:
2516            /* Synchronize RAM block list */
2517            total_ram_bytes = addr;
2518            while (!ret && total_ram_bytes) {
2519                RAMBlock *block;
2520                char id[256];
2521                ram_addr_t length;
2522
2523                len = qemu_get_byte(f);
2524                qemu_get_buffer(f, (uint8_t *)id, len);
2525                id[len] = 0;
2526                length = qemu_get_be64(f);
2527
2528                block = qemu_ram_block_by_name(id);
2529                if (block) {
2530                    if (length != block->used_length) {
2531                        Error *local_err = NULL;
2532
2533                        ret = qemu_ram_resize(block, length,
2534                                              &local_err);
2535                        if (local_err) {
2536                            error_report_err(local_err);
2537                        }
2538                    }
2539                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2540                                          block->idstr);
2541                } else {
2542                    error_report("Unknown ramblock \"%s\", cannot "
2543                                 "accept migration", id);
2544                    ret = -EINVAL;
2545                }
2546
2547                total_ram_bytes -= length;
2548            }
2549            break;
2550
2551        case RAM_SAVE_FLAG_COMPRESS:
2552            ch = qemu_get_byte(f);
2553            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2554            break;
2555
2556        case RAM_SAVE_FLAG_PAGE:
2557            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2558            break;
2559
2560        case RAM_SAVE_FLAG_COMPRESS_PAGE:
2561            len = qemu_get_be32(f);
2562            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2563                error_report("Invalid compressed data length: %d", len);
2564                ret = -EINVAL;
2565                break;
2566            }
2567            decompress_data_with_multi_threads(f, host, len);
2568            break;
2569
2570        case RAM_SAVE_FLAG_XBZRLE:
2571            if (load_xbzrle(f, addr, host) < 0) {
2572                error_report("Failed to decompress XBZRLE page at "
2573                             RAM_ADDR_FMT, addr);
2574                ret = -EINVAL;
2575                break;
2576            }
2577            break;
2578        case RAM_SAVE_FLAG_EOS:
2579            /* normal exit */
2580            break;
2581        default:
2582            if (flags & RAM_SAVE_FLAG_HOOK) {
2583                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2584            } else {
2585                error_report("Unknown combination of migration flags: %#x",
2586                             flags);
2587                ret = -EINVAL;
2588            }
2589        }
2590        if (!ret) {
2591            ret = qemu_file_get_error(f);
2592        }
2593    }
2594
2595    wait_for_decompress_done();
2596    rcu_read_unlock();
2597    DPRINTF("Completed load of VM with exit code %d seq iteration "
2598            "%" PRIu64 "\n", ret, seq_iter);
2599    return ret;
2600}
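
/*
 * Editor's illustrative sketch (not part of the original file): every record
 * in the RAM stream starts with one be64 that packs the page address and the
 * RAM_SAVE_FLAG_* bits together; addresses are target-page aligned, so the
 * low bits masked off by TARGET_PAGE_MASK are free to carry flags, and the
 * loops above unpack them with two masks:
 */
static void example_unpack_header(uint64_t header, uint64_t page_mask,
                                  uint64_t *addr, int *flags)
{
    *flags = header & ~page_mask;   /* e.g. RAM_SAVE_FLAG_PAGE | _CONTINUE */
    *addr = header & page_mask;     /* target-page aligned offset */
}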
2601
2602static SaveVMHandlers savevm_ram_handlers = {
2603    .save_live_setup = ram_save_setup,
2604    .save_live_iterate = ram_save_iterate,
2605    .save_live_complete_postcopy = ram_save_complete,
2606    .save_live_complete_precopy = ram_save_complete,
2607    .save_live_pending = ram_save_pending,
2608    .load_state = ram_load,
2609    .cleanup = ram_migration_cleanup,
2610};
2611
2612void ram_mig_init(void)
2613{
2614    qemu_mutex_init(&XBZRLE.lock);
2615    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
2616}
2617