qemu/migration/ram.c
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2011-2015 Red Hat Inc
   6 *
   7 * Authors:
   8 *  Juan Quintela <quintela@redhat.com>
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28#include "qemu/osdep.h"
  29#include <zlib.h>
  30#include "qapi-event.h"
  31#include "qemu/cutils.h"
  32#include "qemu/bitops.h"
  33#include "qemu/bitmap.h"
  34#include "qemu/timer.h"
  35#include "qemu/main-loop.h"
  36#include "migration/migration.h"
  37#include "migration/postcopy-ram.h"
  38#include "exec/address-spaces.h"
  39#include "migration/page_cache.h"
  40#include "qemu/error-report.h"
  41#include "trace.h"
  42#include "exec/ram_addr.h"
  43#include "qemu/rcu_queue.h"
  44
  45#ifdef DEBUG_MIGRATION_RAM
  46#define DPRINTF(fmt, ...) \
  47    do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
  48#else
  49#define DPRINTF(fmt, ...) \
  50    do { } while (0)
  51#endif
  52
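    /* Counts bitmap-sync rounds in which the guest dirtied memory faster
     * than we transferred it; migration_bitmap_sync() uses it to decide when
     * to start or increase auto-converge throttling.
     */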
  53static int dirty_rate_high_cnt;
  54
  55static uint64_t bitmap_sync_count;
  56
  57/***********************************************************/
  58/* ram save/restore */
  59
  60#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  61#define RAM_SAVE_FLAG_COMPRESS 0x02
  62#define RAM_SAVE_FLAG_MEM_SIZE 0x04
  63#define RAM_SAVE_FLAG_PAGE     0x08
  64#define RAM_SAVE_FLAG_EOS      0x10
  65#define RAM_SAVE_FLAG_CONTINUE 0x20
  66#define RAM_SAVE_FLAG_XBZRLE   0x40
  67/* 0x80 is reserved in migration.h; start with 0x100 next */
  68#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  69
  70static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
  71
  72static inline bool is_zero_range(uint8_t *p, uint64_t size)
  73{
  74    return buffer_find_nonzero_offset(p, size) == size;
  75}
  76
  77/* struct contains XBZRLE cache and a static page
  78   used by the compression */
  79static struct {
  80    /* buffer used for XBZRLE encoding */
  81    uint8_t *encoded_buf;
  82    /* buffer for storing page content */
  83    uint8_t *current_buf;
  84    /* Cache for XBZRLE, Protected by lock. */
  85    PageCache *cache;
  86    QemuMutex lock;
  87} XBZRLE;
  88
  89/* buffer used for XBZRLE decoding */
  90static uint8_t *xbzrle_decoded_buf;
  91
  92static void XBZRLE_cache_lock(void)
  93{
  94    if (migrate_use_xbzrle())
  95        qemu_mutex_lock(&XBZRLE.lock);
  96}
  97
  98static void XBZRLE_cache_unlock(void)
  99{
 100    if (migrate_use_xbzrle())
 101        qemu_mutex_unlock(&XBZRLE.lock);
 102}
 103
 104/*
 105 * called from qmp_migrate_set_cache_size in main thread, possibly while
 106 * a migration is in progress.
 107 * A running migration may be using the cache and might finish during this
 108 * call, hence changes to the cache are protected by the XBZRLE.lock mutex.
 109 */
 110int64_t xbzrle_cache_resize(int64_t new_size)
 111{
 112    PageCache *new_cache;
 113    int64_t ret;
 114
 115    if (new_size < TARGET_PAGE_SIZE) {
 116        return -1;
 117    }
 118
 119    XBZRLE_cache_lock();
 120
 121    if (XBZRLE.cache != NULL) {
 122        if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
 123            goto out_new_size;
 124        }
 125        new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
 126                                        TARGET_PAGE_SIZE);
 127        if (!new_cache) {
 128            error_report("Error creating cache");
 129            ret = -1;
 130            goto out;
 131        }
 132
 133        cache_fini(XBZRLE.cache);
 134        XBZRLE.cache = new_cache;
 135    }
 136
 137out_new_size:
 138    ret = pow2floor(new_size);
 139out:
 140    XBZRLE_cache_unlock();
 141    return ret;
 142}
 143
 144/* accounting for migration statistics */
 145typedef struct AccountingInfo {
 146    uint64_t dup_pages;
 147    uint64_t skipped_pages;
 148    uint64_t norm_pages;
 149    uint64_t iterations;
 150    uint64_t xbzrle_bytes;
 151    uint64_t xbzrle_pages;
 152    uint64_t xbzrle_cache_miss;
 153    double xbzrle_cache_miss_rate;
 154    uint64_t xbzrle_overflows;
 155} AccountingInfo;
 156
 157static AccountingInfo acct_info;
 158
 159static void acct_clear(void)
 160{
 161    memset(&acct_info, 0, sizeof(acct_info));
 162}
 163
 164uint64_t dup_mig_bytes_transferred(void)
 165{
 166    return acct_info.dup_pages * TARGET_PAGE_SIZE;
 167}
 168
 169uint64_t dup_mig_pages_transferred(void)
 170{
 171    return acct_info.dup_pages;
 172}
 173
 174uint64_t skipped_mig_bytes_transferred(void)
 175{
 176    return acct_info.skipped_pages * TARGET_PAGE_SIZE;
 177}
 178
 179uint64_t skipped_mig_pages_transferred(void)
 180{
 181    return acct_info.skipped_pages;
 182}
 183
 184uint64_t norm_mig_bytes_transferred(void)
 185{
 186    return acct_info.norm_pages * TARGET_PAGE_SIZE;
 187}
 188
 189uint64_t norm_mig_pages_transferred(void)
 190{
 191    return acct_info.norm_pages;
 192}
 193
 194uint64_t xbzrle_mig_bytes_transferred(void)
 195{
 196    return acct_info.xbzrle_bytes;
 197}
 198
 199uint64_t xbzrle_mig_pages_transferred(void)
 200{
 201    return acct_info.xbzrle_pages;
 202}
 203
 204uint64_t xbzrle_mig_pages_cache_miss(void)
 205{
 206    return acct_info.xbzrle_cache_miss;
 207}
 208
 209double xbzrle_mig_cache_miss_rate(void)
 210{
 211    return acct_info.xbzrle_cache_miss_rate;
 212}
 213
 214uint64_t xbzrle_mig_pages_overflow(void)
 215{
 216    return acct_info.xbzrle_overflows;
 217}
 218
 219/* This is the last block that we have visited searching for dirty pages
 220 */
 221static RAMBlock *last_seen_block;
 222/* This is the last block from where we have sent data */
 223static RAMBlock *last_sent_block;
 224static ram_addr_t last_offset;
 225static QemuMutex migration_bitmap_mutex;
 226static uint64_t migration_dirty_pages;
 227static uint32_t last_version;
 228static bool ram_bulk_stage;
 229
 230/* used by the search for pages to send */
 231struct PageSearchStatus {
 232    /* Current block being searched */
 233    RAMBlock    *block;
 234    /* Current offset to search from */
 235    ram_addr_t   offset;
 236    /* Set once we wrap around */
 237    bool         complete_round;
 238};
 239typedef struct PageSearchStatus PageSearchStatus;
 240
 241static struct BitmapRcu {
 242    struct rcu_head rcu;
 243    /* Main migration bitmap */
 244    unsigned long *bmap;
 245    /* bitmap of pages that haven't been sent even once
 246     * only maintained and used in postcopy at the moment
 247     * where it's used to send the dirtymap at the start
 248     * of the postcopy phase
 249     */
 250    unsigned long *unsentmap;
 251} *migration_bitmap_rcu;
 252
 253struct CompressParam {
 254    bool start;
 255    bool done;
 256    QEMUFile *file;
 257    QemuMutex mutex;
 258    QemuCond cond;
 259    RAMBlock *block;
 260    ram_addr_t offset;
 261};
 262typedef struct CompressParam CompressParam;
 263
 264struct DecompressParam {
 265    bool start;
 266    QemuMutex mutex;
 267    QemuCond cond;
 268    void *des;
 269    uint8_t *compbuf;
 270    int len;
 271};
 272typedef struct DecompressParam DecompressParam;
 273
 274static CompressParam *comp_param;
 275static QemuThread *compress_threads;
 276/* comp_done_cond is used to wake up the migration thread when
 277 * one of the compression threads has finished the compression.
 278 * comp_done_lock is used together with comp_done_cond.
 279 */
 280static QemuMutex *comp_done_lock;
 281static QemuCond *comp_done_cond;
 282/* The empty QEMUFileOps will be used by file in CompressParam */
 283static const QEMUFileOps empty_ops = { };
 284
 285static bool compression_switch;
 286static bool quit_comp_thread;
 287static bool quit_decomp_thread;
 288static DecompressParam *decomp_param;
 289static QemuThread *decompress_threads;
 290
 291static int do_compress_ram_page(CompressParam *param);
 292
 293static void *do_data_compress(void *opaque)
 294{
 295    CompressParam *param = opaque;
 296
 297    while (!quit_comp_thread) {
 298        qemu_mutex_lock(&param->mutex);
 299        /* Re-check quit_comp_thread in case
 300         * terminate_compression_threads was called just before
 301         * qemu_mutex_lock(&param->mutex) and after
 302         * while (!quit_comp_thread); re-checking it here makes
 303         * sure the compression thread terminates as expected.
 304         */
 305        while (!param->start && !quit_comp_thread) {
 306            qemu_cond_wait(&param->cond, &param->mutex);
 307        }
 308        if (!quit_comp_thread) {
 309            do_compress_ram_page(param);
 310        }
 311        param->start = false;
 312        qemu_mutex_unlock(&param->mutex);
 313
 314        qemu_mutex_lock(comp_done_lock);
 315        param->done = true;
 316        qemu_cond_signal(comp_done_cond);
 317        qemu_mutex_unlock(comp_done_lock);
 318    }
 319
 320    return NULL;
 321}
 322
 323static inline void terminate_compression_threads(void)
 324{
 325    int idx, thread_count;
 326
 327    thread_count = migrate_compress_threads();
 328    quit_comp_thread = true;
 329    for (idx = 0; idx < thread_count; idx++) {
 330        qemu_mutex_lock(&comp_param[idx].mutex);
 331        qemu_cond_signal(&comp_param[idx].cond);
 332        qemu_mutex_unlock(&comp_param[idx].mutex);
 333    }
 334}
 335
 336void migrate_compress_threads_join(void)
 337{
 338    int i, thread_count;
 339
 340    if (!migrate_use_compression()) {
 341        return;
 342    }
 343    terminate_compression_threads();
 344    thread_count = migrate_compress_threads();
 345    for (i = 0; i < thread_count; i++) {
 346        qemu_thread_join(compress_threads + i);
 347        qemu_fclose(comp_param[i].file);
 348        qemu_mutex_destroy(&comp_param[i].mutex);
 349        qemu_cond_destroy(&comp_param[i].cond);
 350    }
 351    qemu_mutex_destroy(comp_done_lock);
 352    qemu_cond_destroy(comp_done_cond);
 353    g_free(compress_threads);
 354    g_free(comp_param);
 355    g_free(comp_done_cond);
 356    g_free(comp_done_lock);
 357    compress_threads = NULL;
 358    comp_param = NULL;
 359    comp_done_cond = NULL;
 360    comp_done_lock = NULL;
 361}
 362
 363void migrate_compress_threads_create(void)
 364{
 365    int i, thread_count;
 366
 367    if (!migrate_use_compression()) {
 368        return;
 369    }
 370    quit_comp_thread = false;
 371    compression_switch = true;
 372    thread_count = migrate_compress_threads();
 373    compress_threads = g_new0(QemuThread, thread_count);
 374    comp_param = g_new0(CompressParam, thread_count);
 375    comp_done_cond = g_new0(QemuCond, 1);
 376    comp_done_lock = g_new0(QemuMutex, 1);
 377    qemu_cond_init(comp_done_cond);
 378    qemu_mutex_init(comp_done_lock);
 379    for (i = 0; i < thread_count; i++) {
 380        /* comp_param[i].file is just used as a dummy buffer to save data; set
 381         * its ops to empty.
 382         */
 383        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 384        comp_param[i].done = true;
 385        qemu_mutex_init(&comp_param[i].mutex);
 386        qemu_cond_init(&comp_param[i].cond);
 387        qemu_thread_create(compress_threads + i, "compress",
 388                           do_data_compress, comp_param + i,
 389                           QEMU_THREAD_JOINABLE);
 390    }
 391}
 392
 393/**
 394 * save_page_header: Write page header to wire
 395 *
 396 * If this is the 1st block, it also writes the block identification
 397 *
 398 * Returns: Number of bytes written
 399 *
 400 * @f: QEMUFile where to send the data
 401 * @block: block that contains the page we want to send
 402 * @offset: offset inside the block for the page
 403 *          in the lower bits, it contains flags
 404 */
 405static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
 406{
 407    size_t size, len;
 408
 409    qemu_put_be64(f, offset);
 410    size = 8;
 411
 412    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 413        len = strlen(block->idstr);
 414        qemu_put_byte(f, len);
 415        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 416        size += 1 + len;
 417    }
 418    return size;
 419}
 420
 421/* Reduce amount of guest cpu execution to hopefully slow down memory writes.
 422 * If guest dirty memory rate is reduced below the rate at which we can
 423 * transfer pages to the destination then we should be able to complete
 424 * migration. Some workloads dirty memory way too fast and will not effectively
 425 * converge, even with auto-converge.
 426 */
 427static void mig_throttle_guest_down(void)
 428{
 429    MigrationState *s = migrate_get_current();
 430    uint64_t pct_initial =
 431            s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL];
 432    uint64_t pct_increment =
 433            s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT];
 434
 435    /* We have not started throttling yet. Let's start it. */
 436    if (!cpu_throttle_active()) {
 437        cpu_throttle_set(pct_initial);
 438    } else {
 439        /* Throttling already on, just increase the rate */
 440        cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
 441    }
 442}
 443
 444/* Update the xbzrle cache to reflect a page that's been sent as all 0.
 445 * The important thing is that a stale (not-yet-0'd) page be replaced
 446 * by the new data.
 447 * As a bonus, if the page wasn't in the cache it gets added so that
 448 * when a small write is made into the 0'd page it gets XBZRLE sent
 449 */
 450static void xbzrle_cache_zero_page(ram_addr_t current_addr)
 451{
 452    if (ram_bulk_stage || !migrate_use_xbzrle()) {
 453        return;
 454    }
 455
 456    /* We don't care if this fails to allocate a new cache page
 457     * as long as it updated an old one */
 458    cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
 459                 bitmap_sync_count);
 460}
 461
 462#define ENCODING_FLAG_XBZRLE 0x1
 463
 464/**
 465 * save_xbzrle_page: compress and send current page
 466 *
 467 * Returns: 1 means that we wrote the page
 468 *          0 means that page is identical to the one already sent
 469 *          -1 means that xbzrle would be longer than normal
 470 *
 471 * @f: QEMUFile where to send the data
 472 * @current_data: pointer to the page data; may be updated to the cached copy
 473 * @current_addr: address of the page in the ram_addr_t space
 474 * @block: block that contains the page we want to send
 475 * @offset: offset inside the block for the page
 476 * @last_stage: if we are at the completion stage
 477 * @bytes_transferred: increase it with the number of transferred bytes
 478 */
 479static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
 480                            ram_addr_t current_addr, RAMBlock *block,
 481                            ram_addr_t offset, bool last_stage,
 482                            uint64_t *bytes_transferred)
 483{
 484    int encoded_len = 0, bytes_xbzrle;
 485    uint8_t *prev_cached_page;
 486
 487    if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
 488        acct_info.xbzrle_cache_miss++;
 489        if (!last_stage) {
 490            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 491                             bitmap_sync_count) == -1) {
 492                return -1;
 493            } else {
 494                /* update *current_data when the page has been
 495                   inserted into cache */
 496                *current_data = get_cached_data(XBZRLE.cache, current_addr);
 497            }
 498        }
 499        return -1;
 500    }
 501
 502    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 503
 504    /* save current buffer into memory */
 505    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 506
 507    /* XBZRLE encoding (if there is no overflow) */
 508    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 509                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 510                                       TARGET_PAGE_SIZE);
 511    if (encoded_len == 0) {
 512        DPRINTF("Skipping unmodified page\n");
 513        return 0;
 514    } else if (encoded_len == -1) {
 515        DPRINTF("Overflow\n");
 516        acct_info.xbzrle_overflows++;
 517        /* update data in the cache */
 518        if (!last_stage) {
 519            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
 520            *current_data = prev_cached_page;
 521        }
 522        return -1;
 523    }
 524
 525    /* we need to update the data in the cache, in order to get the same data */
 526    if (!last_stage) {
 527        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 528    }
 529
 530    /* Send XBZRLE based compressed page */
 531    bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
 532    qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
 533    qemu_put_be16(f, encoded_len);
 534    qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
 535    bytes_xbzrle += encoded_len + 1 + 2;
 536    acct_info.xbzrle_pages++;
 537    acct_info.xbzrle_bytes += bytes_xbzrle;
 538    *bytes_transferred += bytes_xbzrle;
 539
 540    return 1;
 541}
 542
 543/* Called with rcu_read_lock() to protect migration_bitmap
 544 * rb: The RAMBlock  to search for dirty pages in
 545 * start: Start address (typically so we can continue from previous page)
 546 * ram_addr_abs: Pointer into which to store the address of the dirty page
 547 *               within the global ram_addr space
 548 *
 549 * Returns: byte offset within memory region of the start of a dirty page
 550 */
 551static inline
 552ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
 553                                       ram_addr_t start,
 554                                       ram_addr_t *ram_addr_abs)
 555{
 556    unsigned long base = rb->offset >> TARGET_PAGE_BITS;
 557    unsigned long nr = base + (start >> TARGET_PAGE_BITS);
 558    uint64_t rb_size = rb->used_length;
 559    unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
 560    unsigned long *bitmap;
 561
 562    unsigned long next;
 563
 564    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 565    if (ram_bulk_stage && nr > base) {
 566        next = nr + 1;
 567    } else {
 568        next = find_next_bit(bitmap, size, nr);
 569    }
 570
 571    *ram_addr_abs = next << TARGET_PAGE_BITS;
 572    return (next - base) << TARGET_PAGE_BITS;
 573}
 574
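    /* Clear the dirty bit for the page at @addr (ram_addr_t space) in the
     * migration bitmap.  Returns true if the bit was set, and decrements
     * migration_dirty_pages accordingly.
     */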
 575static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
 576{
 577    bool ret;
 578    int nr = addr >> TARGET_PAGE_BITS;
 579    unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 580
 581    ret = test_and_clear_bit(nr, bitmap);
 582
 583    if (ret) {
 584        migration_dirty_pages--;
 585    }
 586    return ret;
 587}
 588
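    /* Pull the dirty information for [start, start + length) from the memory
     * core's dirty bitmap into the migration bitmap, accounting newly dirtied
     * pages in migration_dirty_pages.
     */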
 589static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
 590{
 591    unsigned long *bitmap;
 592    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
 593    migration_dirty_pages +=
 594        cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
 595}
 596
 597/* Fix me: there are too many global variables used in migration process. */
 598static int64_t start_time;
 599static int64_t bytes_xfer_prev;
 600static int64_t num_dirty_pages_period;
 601static uint64_t xbzrle_cache_miss_prev;
 602static uint64_t iterations_prev;
 603
 604static void migration_bitmap_sync_init(void)
 605{
 606    start_time = 0;
 607    bytes_xfer_prev = 0;
 608    num_dirty_pages_period = 0;
 609    xbzrle_cache_miss_prev = 0;
 610    iterations_prev = 0;
 611}
 612
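    /* Synchronize the migration bitmap with the dirty log of all RAM blocks,
     * update the dirty-page-rate statistics and, if auto-converge is enabled,
     * decide whether the guest needs throttling.
     */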
 613static void migration_bitmap_sync(void)
 614{
 615    RAMBlock *block;
 616    uint64_t num_dirty_pages_init = migration_dirty_pages;
 617    MigrationState *s = migrate_get_current();
 618    int64_t end_time;
 619    int64_t bytes_xfer_now;
 620
 621    bitmap_sync_count++;
 622
 623    if (!bytes_xfer_prev) {
 624        bytes_xfer_prev = ram_bytes_transferred();
 625    }
 626
 627    if (!start_time) {
 628        start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 629    }
 630
 631    trace_migration_bitmap_sync_start();
 632    address_space_sync_dirty_bitmap(&address_space_memory);
 633
 634    qemu_mutex_lock(&migration_bitmap_mutex);
 635    rcu_read_lock();
 636    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
 637        migration_bitmap_sync_range(block->offset, block->used_length);
 638    }
 639    rcu_read_unlock();
 640    qemu_mutex_unlock(&migration_bitmap_mutex);
 641
 642    trace_migration_bitmap_sync_end(migration_dirty_pages
 643                                    - num_dirty_pages_init);
 644    num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
 645    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 646
 647    /* more than 1 second = 1000 milliseconds */
 648    if (end_time > start_time + 1000) {
 649        if (migrate_auto_converge()) {
 650            /* The following detection logic can be refined later. For now:
 651               Check to see if the dirtied bytes are more than 50% of the approx.
 652               amount of bytes that just got transferred since the last time we
 653               were in this routine. If that happens twice, start or increase
 654               throttling */
 655            bytes_xfer_now = ram_bytes_transferred();
 656
 657            if (s->dirty_pages_rate &&
 658               (num_dirty_pages_period * TARGET_PAGE_SIZE >
 659                   (bytes_xfer_now - bytes_xfer_prev)/2) &&
 660               (dirty_rate_high_cnt++ >= 2)) {
 661                    trace_migration_throttle();
 662                    dirty_rate_high_cnt = 0;
 663                    mig_throttle_guest_down();
 664             }
 665             bytes_xfer_prev = bytes_xfer_now;
 666        }
 667
 668        if (migrate_use_xbzrle()) {
 669            if (iterations_prev != acct_info.iterations) {
 670                acct_info.xbzrle_cache_miss_rate =
 671                   (double)(acct_info.xbzrle_cache_miss -
 672                            xbzrle_cache_miss_prev) /
 673                   (acct_info.iterations - iterations_prev);
 674            }
 675            iterations_prev = acct_info.iterations;
 676            xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
 677        }
 678        s->dirty_pages_rate = num_dirty_pages_period * 1000
 679            / (end_time - start_time);
 680        s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
 681        start_time = end_time;
 682        num_dirty_pages_period = 0;
 683    }
 684    s->dirty_sync_count = bitmap_sync_count;
 685    if (migrate_use_events()) {
 686        qapi_event_send_migration_pass(bitmap_sync_count, NULL);
 687    }
 688}
 689
 690/**
 691 * save_zero_page: Send the zero page to the stream
 692 *
 693 * Returns: Number of pages written.
 694 *
 695 * @f: QEMUFile where to send the data
 696 * @block: block that contains the page we want to send
 697 * @offset: offset inside the block for the page
 698 * @p: pointer to the page
 699 * @bytes_transferred: increase it with the number of transferred bytes
 700 */
 701static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
 702                          uint8_t *p, uint64_t *bytes_transferred)
 703{
 704    int pages = -1;
 705
 706    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 707        acct_info.dup_pages++;
 708        *bytes_transferred += save_page_header(f, block,
 709                                               offset | RAM_SAVE_FLAG_COMPRESS);
 710        qemu_put_byte(f, 0);
 711        *bytes_transferred += 1;
 712        pages = 1;
 713    }
 714
 715    return pages;
 716}
 717
 718/**
 719 * ram_save_page: Send the given page to the stream
 720 *
 721 * Returns: Number of pages written.
 722 *          < 0 - error
 723 *          >=0 - Number of pages written - this might legally be 0
 724 *                if xbzrle noticed the page was the same.
 725 *
 726 * @f: QEMUFile where to send the data
 727 * @pss: data about the page we want to send; the block that contains it
 728 *       and the offset inside the block
 729 * @last_stage: if we are at the completion stage
 730 * @bytes_transferred: increase it with the number of transferred bytes
 731 */
 732static int ram_save_page(QEMUFile *f, PageSearchStatus *pss,
 733                         bool last_stage, uint64_t *bytes_transferred)
 734{
 735    int pages = -1;
 736    uint64_t bytes_xmit;
 737    ram_addr_t current_addr;
 738    uint8_t *p;
 739    int ret;
 740    bool send_async = true;
 741    RAMBlock *block = pss->block;
 742    ram_addr_t offset = pss->offset;
 743
 744    p = block->host + offset;
 745
 746    /* If in doubt, send the page as normal */
 747    bytes_xmit = 0;
 748    ret = ram_control_save_page(f, block->offset,
 749                           offset, TARGET_PAGE_SIZE, &bytes_xmit);
 750    if (bytes_xmit) {
 751        *bytes_transferred += bytes_xmit;
 752        pages = 1;
 753    }
 754
 755    XBZRLE_cache_lock();
 756
 757    current_addr = block->offset + offset;
 758
 759    if (block == last_sent_block) {
 760        offset |= RAM_SAVE_FLAG_CONTINUE;
 761    }
 762    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 763        if (ret != RAM_SAVE_CONTROL_DELAYED) {
 764            if (bytes_xmit > 0) {
 765                acct_info.norm_pages++;
 766            } else if (bytes_xmit == 0) {
 767                acct_info.dup_pages++;
 768            }
 769        }
 770    } else {
 771        pages = save_zero_page(f, block, offset, p, bytes_transferred);
 772        if (pages > 0) {
 773            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 774             * page would be stale
 775             */
 776            xbzrle_cache_zero_page(current_addr);
 777        } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
 778            pages = save_xbzrle_page(f, &p, current_addr, block,
 779                                     offset, last_stage, bytes_transferred);
 780            if (!last_stage) {
 781                /* Can't send this cached data async, since the cache page
 782                 * might get updated before it gets to the wire
 783                 */
 784                send_async = false;
 785            }
 786        }
 787    }
 788
 789    /* XBZRLE overflow or normal page */
 790    if (pages == -1) {
 791        *bytes_transferred += save_page_header(f, block,
 792                                               offset | RAM_SAVE_FLAG_PAGE);
 793        if (send_async) {
 794            qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
 795        } else {
 796            qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
 797        }
 798        *bytes_transferred += TARGET_PAGE_SIZE;
 799        pages = 1;
 800        acct_info.norm_pages++;
 801    }
 802
 803    XBZRLE_cache_unlock();
 804
 805    return pages;
 806}
 807
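    /* Compress the page described by @param (block/offset) into param->file.
     * Returns the number of bytes written: page header plus compressed data.
     */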
 808static int do_compress_ram_page(CompressParam *param)
 809{
 810    int bytes_sent, blen;
 811    uint8_t *p;
 812    RAMBlock *block = param->block;
 813    ram_addr_t offset = param->offset;
 814
 815    p = block->host + (offset & TARGET_PAGE_MASK);
 816
 817    bytes_sent = save_page_header(param->file, block, offset |
 818                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
 819    blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE,
 820                                     migrate_compress_level());
 821    bytes_sent += blen;
 822
 823    return bytes_sent;
 824}
 825
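    /* Mark the compression thread described by @param as busy and signal it
     * to start work on the block/offset already stored in @param.
     */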
 826static inline void start_compression(CompressParam *param)
 827{
 828    param->done = false;
 829    qemu_mutex_lock(&param->mutex);
 830    param->start = true;
 831    qemu_cond_signal(&param->cond);
 832    qemu_mutex_unlock(&param->mutex);
 833}
 834
 835static inline void start_decompression(DecompressParam *param)
 836{
 837    qemu_mutex_lock(&param->mutex);
 838    param->start = true;
 839    qemu_cond_signal(&param->cond);
 840    qemu_mutex_unlock(&param->mutex);
 841}
 842
 843static uint64_t bytes_transferred;
 844
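    /* Wait for any busy compression threads to finish their current page and
     * move whatever they have buffered into the migration stream @f.
     */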
 845static void flush_compressed_data(QEMUFile *f)
 846{
 847    int idx, len, thread_count;
 848
 849    if (!migrate_use_compression()) {
 850        return;
 851    }
 852    thread_count = migrate_compress_threads();
 853    for (idx = 0; idx < thread_count; idx++) {
 854        if (!comp_param[idx].done) {
 855            qemu_mutex_lock(comp_done_lock);
 856            while (!comp_param[idx].done && !quit_comp_thread) {
 857                qemu_cond_wait(comp_done_cond, comp_done_lock);
 858            }
 859            qemu_mutex_unlock(comp_done_lock);
 860        }
 861        if (!quit_comp_thread) {
 862            len = qemu_put_qemu_file(f, comp_param[idx].file);
 863            bytes_transferred += len;
 864        }
 865    }
 866}
 867
 868static inline void set_compress_params(CompressParam *param, RAMBlock *block,
 869                                       ram_addr_t offset)
 870{
 871    param->block = block;
 872    param->offset = offset;
 873}
 874
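    /* Find an idle compression thread (waiting on comp_done_cond if none is
     * free), flush the data it produced last time into @f and hand it the new
     * block/offset to compress.  Returns the number of pages queued (1).
     */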
 875static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
 876                                           ram_addr_t offset,
 877                                           uint64_t *bytes_transferred)
 878{
 879    int idx, thread_count, bytes_xmit = -1, pages = -1;
 880
 881    thread_count = migrate_compress_threads();
 882    qemu_mutex_lock(comp_done_lock);
 883    while (true) {
 884        for (idx = 0; idx < thread_count; idx++) {
 885            if (comp_param[idx].done) {
 886                bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
 887                set_compress_params(&comp_param[idx], block, offset);
 888                start_compression(&comp_param[idx]);
 889                pages = 1;
 890                acct_info.norm_pages++;
 891                *bytes_transferred += bytes_xmit;
 892                break;
 893            }
 894        }
 895        if (pages > 0) {
 896            break;
 897        } else {
 898            qemu_cond_wait(comp_done_cond, comp_done_lock);
 899        }
 900    }
 901    qemu_mutex_unlock(comp_done_lock);
 902
 903    return pages;
 904}
 905
 906/**
 907 * ram_save_compressed_page: compress the given page and send it to the stream
 908 *
 909 * Returns: Number of pages written.
 910 *
 911 * @f: QEMUFile where to send the data
 912 * @pss: data about the page we want to send; the block that contains it
 913 *       and the offset inside the block
 914 * @last_stage: if we are at the completion stage
 915 * @bytes_transferred: increase it with the number of transferred bytes
 916 */
 917static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss,
 918                                    bool last_stage,
 919                                    uint64_t *bytes_transferred)
 920{
 921    int pages = -1;
 922    uint64_t bytes_xmit;
 923    uint8_t *p;
 924    int ret;
 925    RAMBlock *block = pss->block;
 926    ram_addr_t offset = pss->offset;
 927
 928    p = block->host + offset;
 929
 930    bytes_xmit = 0;
 931    ret = ram_control_save_page(f, block->offset,
 932                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
 933    if (bytes_xmit) {
 934        *bytes_transferred += bytes_xmit;
 935        pages = 1;
 936    }
 937    if (block == last_sent_block) {
 938        offset |= RAM_SAVE_FLAG_CONTINUE;
 939    }
 940    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 941        if (ret != RAM_SAVE_CONTROL_DELAYED) {
 942            if (bytes_xmit > 0) {
 943                acct_info.norm_pages++;
 944            } else if (bytes_xmit == 0) {
 945                acct_info.dup_pages++;
 946            }
 947        }
 948    } else {
 949        /* When starting to process a new block, the first page of
 950         * the block should be sent out before other pages in the same
 951         * block, and all the pages in the previous block should have been
 952         * sent out already.  Keeping this order is important, because the
 953         * 'cont' flag is used to avoid resending the block name.
 954         */
 955        if (block != last_sent_block) {
 956            flush_compressed_data(f);
 957            pages = save_zero_page(f, block, offset, p, bytes_transferred);
 958            if (pages == -1) {
 959                set_compress_params(&comp_param[0], block, offset);
 960                /* Compress the data in the migration thread itself to make
 961                 * sure the first page is sent out before other pages
 962                 */
 963                bytes_xmit = do_compress_ram_page(&comp_param[0]);
 964                acct_info.norm_pages++;
 965                qemu_put_qemu_file(f, comp_param[0].file);
 966                *bytes_transferred += bytes_xmit;
 967                pages = 1;
 968            }
 969        } else {
 970            pages = save_zero_page(f, block, offset, p, bytes_transferred);
 971            if (pages == -1) {
 972                pages = compress_page_with_multi_thread(f, block, offset,
 973                                                        bytes_transferred);
 974            }
 975        }
 976    }
 977
 978    return pages;
 979}
 980
 981/*
 982 * Find the next dirty page and update any state associated with
 983 * the search process.
 984 *
 985 * Returns: True if a page is found
 986 *
 987 * @f: Current migration stream.
 988 * @pss: Data about the state of the current dirty page scan.
 989 * @again: Set to false if the search has scanned the whole of RAM
 990 * @ram_addr_abs: Pointer into which to store the address of the dirty page
 991 *               within the global ram_addr space
 992 */
 993static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
 994                             bool *again, ram_addr_t *ram_addr_abs)
 995{
 996    pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
 997                                              ram_addr_abs);
 998    if (pss->complete_round && pss->block == last_seen_block &&
 999        pss->offset >= last_offset) {
1000        /*
1001         * We've been once around the RAM and haven't found anything.
1002         * Give up.
1003         */
1004        *again = false;
1005        return false;
1006    }
1007    if (pss->offset >= pss->block->used_length) {
1008        /* Didn't find anything in this RAM Block */
1009        pss->offset = 0;
1010        pss->block = QLIST_NEXT_RCU(pss->block, next);
1011        if (!pss->block) {
1012            /* Hit the end of the list */
1013            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1014            /* Flag that we've looped */
1015            pss->complete_round = true;
1016            ram_bulk_stage = false;
1017            if (migrate_use_xbzrle()) {
1018                /* If xbzrle is on, stop using the data compression at this
1019                 * point. In theory, xbzrle can do better than compression.
1020                 */
1021                flush_compressed_data(f);
1022                compression_switch = false;
1023            }
1024        }
1025        /* Didn't find anything this time, but try again on the new block */
1026        *again = true;
1027        return false;
1028    } else {
1029        /* Can go around again, but... */
1030        *again = true;
1031        /* We've found something so probably don't need to */
1032        return true;
1033    }
1034}
1035
1036/*
1037 * Helper for 'get_queued_page' - gets a page off the queue
1038 *      ms:      MigrationState in
1039 * *offset:      Used to return the offset within the RAMBlock
1040 * ram_addr_abs: global offset in the dirty/sent bitmaps
1041 *
1042 * Returns:      block (or NULL if none available)
1043 */
1044static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
1045                              ram_addr_t *ram_addr_abs)
1046{
1047    RAMBlock *block = NULL;
1048
1049    qemu_mutex_lock(&ms->src_page_req_mutex);
1050    if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
1051        struct MigrationSrcPageRequest *entry =
1052                                QSIMPLEQ_FIRST(&ms->src_page_requests);
1053        block = entry->rb;
1054        *offset = entry->offset;
1055        *ram_addr_abs = (entry->offset + entry->rb->offset) &
1056                        TARGET_PAGE_MASK;
1057
1058        if (entry->len > TARGET_PAGE_SIZE) {
1059            entry->len -= TARGET_PAGE_SIZE;
1060            entry->offset += TARGET_PAGE_SIZE;
1061        } else {
1062            memory_region_unref(block->mr);
1063            QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1064            g_free(entry);
1065        }
1066    }
1067    qemu_mutex_unlock(&ms->src_page_req_mutex);
1068
1069    return block;
1070}
1071
1072/*
1073 * Unqueue a page from the queue fed by postcopy page requests; skips pages
1074 * that are already sent (!dirty)
1075 *
1076 *      ms:      MigrationState in
1077 *     pss:      PageSearchStatus structure updated with found block/offset
1078 * ram_addr_abs: global offset in the dirty/sent bitmaps
1079 *
1080 * Returns:      true if a queued page is found
1081 */
1082static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
1083                            ram_addr_t *ram_addr_abs)
1084{
1085    RAMBlock  *block;
1086    ram_addr_t offset;
1087    bool dirty;
1088
1089    do {
1090        block = unqueue_page(ms, &offset, ram_addr_abs);
1091        /*
1092         * We're sending this page, and since it's postcopy nothing else
1093         * will dirty it, and we must make sure it doesn't get sent again
1094         * even if this queue request was received after the background
1095         * search already sent it.
1096         */
1097        if (block) {
1098            unsigned long *bitmap;
1099            bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1100            dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
1101            if (!dirty) {
1102                trace_get_queued_page_not_dirty(
1103                    block->idstr, (uint64_t)offset,
1104                    (uint64_t)*ram_addr_abs,
1105                    test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
1106                         atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
1107            } else {
1108                trace_get_queued_page(block->idstr,
1109                                      (uint64_t)offset,
1110                                      (uint64_t)*ram_addr_abs);
1111            }
1112        }
1113
1114    } while (block && !dirty);
1115
1116    if (block) {
1117        /*
1118         * As soon as we start servicing pages out of order, then we have
1119         * to kill the bulk stage, since the bulk stage assumes
1120         * (in migration_bitmap_find_dirty) that every page is
1121         * dirty; that's no longer true.
1122         */
1123        ram_bulk_stage = false;
1124
1125        /*
1126         * We want the background search to continue from the queued page
1127         * since the guest is likely to want other pages near to the page
1128         * it just requested.
1129         */
1130        pss->block = block;
1131        pss->offset = offset;
1132    }
1133
1134    return !!block;
1135}
1136
1137/**
1138 * flush_page_queue: Flush any remaining pages in the ram request queue;
1139 *    it should be empty at the end anyway, but in error cases there may be
1140 *    some left.
1141 *
1142 * ms: MigrationState
1143 */
1144void flush_page_queue(MigrationState *ms)
1145{
1146    struct MigrationSrcPageRequest *mspr, *next_mspr;
1147    /* This queue generally should be empty - but in the case of a failed
1148     * migration might have some droppings in.
1149     */
1150    rcu_read_lock();
1151    QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
1152        memory_region_unref(mspr->rb->mr);
1153        QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
1154        g_free(mspr);
1155    }
1156    rcu_read_unlock();
1157}
1158
1159/**
1160 * Queue the pages for transmission, e.g. a request from postcopy destination
1161 *   ms: MigrationState in which the queue is held
1162 *   rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
1163 *   start: Offset from the start of the RAMBlock
1164 *   len: Length (in bytes) to send
1165 *   Return: 0 on success
1166 */
1167int ram_save_queue_pages(MigrationState *ms, const char *rbname,
1168                         ram_addr_t start, ram_addr_t len)
1169{
1170    RAMBlock *ramblock;
1171
1172    rcu_read_lock();
1173    if (!rbname) {
1174        /* Reuse last RAMBlock */
1175        ramblock = ms->last_req_rb;
1176
1177        if (!ramblock) {
1178            /*
1179             * Shouldn't happen, we can't reuse the last RAMBlock if
1180             * it's the 1st request.
1181             */
1182            error_report("ram_save_queue_pages no previous block");
1183            goto err;
1184        }
1185    } else {
1186        ramblock = qemu_ram_block_by_name(rbname);
1187
1188        if (!ramblock) {
1189            /* We shouldn't be asked for a non-existent RAMBlock */
1190            error_report("ram_save_queue_pages no block '%s'", rbname);
1191            goto err;
1192        }
1193        ms->last_req_rb = ramblock;
1194    }
1195    trace_ram_save_queue_pages(ramblock->idstr, start, len);
1196    if (start+len > ramblock->used_length) {
1197        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1198                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1199                     __func__, start, len, ramblock->used_length);
1200        goto err;
1201    }
1202
1203    struct MigrationSrcPageRequest *new_entry =
1204        g_malloc0(sizeof(struct MigrationSrcPageRequest));
1205    new_entry->rb = ramblock;
1206    new_entry->offset = start;
1207    new_entry->len = len;
1208
1209    memory_region_ref(ramblock->mr);
1210    qemu_mutex_lock(&ms->src_page_req_mutex);
1211    QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
1212    qemu_mutex_unlock(&ms->src_page_req_mutex);
1213    rcu_read_unlock();
1214
1215    return 0;
1216
1217err:
1218    rcu_read_unlock();
1219    return -1;
1220}
1221
1222/**
1223 * ram_save_target_page: Save one target page
1224 *
1225 * @ms: The current migration state
1226 * @f: QEMUFile where to send the data
1227 * @pss: data about the page we want to send; the block that contains it
1228 *       and the offset inside the block
1229 * @last_stage: if we are at the completion stage
1230 * @bytes_transferred: increase it with the number of transferred bytes
1231 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1232 *
1233 * Returns: Number of pages written.
1234 */
1235static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
1236                                PageSearchStatus *pss,
1237                                bool last_stage,
1238                                uint64_t *bytes_transferred,
1239                                ram_addr_t dirty_ram_abs)
1240{
1241    int res = 0;
1242
1243    /* Check if the page is dirty and, if it is, send it */
1244    if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
1245        unsigned long *unsentmap;
1246        if (compression_switch && migrate_use_compression()) {
1247            res = ram_save_compressed_page(f, pss,
1248                                           last_stage,
1249                                           bytes_transferred);
1250        } else {
1251            res = ram_save_page(f, pss, last_stage,
1252                                bytes_transferred);
1253        }
1254
1255        if (res < 0) {
1256            return res;
1257        }
1258        unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1259        if (unsentmap) {
1260            clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
1261        }
1262        /* Only update last_sent_block if a block was actually sent; xbzrle
1263         * might have decided the page was identical so didn't bother writing
1264         * to the stream.
1265         */
1266        if (res > 0) {
1267            last_sent_block = pss->block;
1268        }
1269    }
1270
1271    return res;
1272}
1273
1274/**
1275 * ram_save_host_page: Starting at pss->offset, send pages up to the end
1276 *                     of the current host page.  It's valid for the initial
1277 *                     offset to point into the middle of a host page
1278 *                     in which case the remainder of the hostpage is sent.
1279 *                     Only dirty target pages are sent.
1280 *
1281 * Returns: Number of pages written.
1282 *
1283 * @f: QEMUFile where to send the data
1284 * @pss: data about the page we want to send; the block that contains it
1285 *       and the offset inside the block, which is updated to the last
1286 *       target page sent
1287 * @last_stage: if we are at the completion stage
1288 * @bytes_transferred: increase it with the number of transferred bytes
1289 * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
1290 */
1291static int ram_save_host_page(MigrationState *ms, QEMUFile *f,
1292                              PageSearchStatus *pss,
1293                              bool last_stage,
1294                              uint64_t *bytes_transferred,
1295                              ram_addr_t dirty_ram_abs)
1296{
1297    int tmppages, pages = 0;
1298    do {
1299        tmppages = ram_save_target_page(ms, f, pss, last_stage,
1300                                        bytes_transferred, dirty_ram_abs);
1301        if (tmppages < 0) {
1302            return tmppages;
1303        }
1304
1305        pages += tmppages;
1306        pss->offset += TARGET_PAGE_SIZE;
1307        dirty_ram_abs += TARGET_PAGE_SIZE;
1308    } while (pss->offset & (qemu_host_page_size - 1));
1309
1310    /* The offset we leave with is the last one we looked at */
1311    pss->offset -= TARGET_PAGE_SIZE;
1312    return pages;
1313}
1314
1315/**
1316 * ram_find_and_save_block: Finds a dirty page and sends it to f
1317 *
1318 * Called within an RCU critical section.
1319 *
1320 * Returns:  The number of pages written
1321 *           0 means no dirty pages
1322 *
1323 * @f: QEMUFile where to send the data
1324 * @last_stage: if we are at the completion stage
1325 * @bytes_transferred: increase it with the number of transferred bytes
1326 *
1327 * On systems where host-page-size > target-page-size it will send all the
1328 * pages in a host page that are dirty.
1329 */
1330
1331static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
1332                                   uint64_t *bytes_transferred)
1333{
1334    PageSearchStatus pss;
1335    MigrationState *ms = migrate_get_current();
1336    int pages = 0;
1337    bool again, found;
1338    ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
1339                                 ram_addr_t space */
1340
1341    pss.block = last_seen_block;
1342    pss.offset = last_offset;
1343    pss.complete_round = false;
1344
1345    if (!pss.block) {
1346        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1347    }
1348
1349    do {
1350        again = true;
1351        found = get_queued_page(ms, &pss, &dirty_ram_abs);
1352
1353        if (!found) {
1354            /* priority queue empty, so just search for something dirty */
1355            found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
1356        }
1357
1358        if (found) {
1359            pages = ram_save_host_page(ms, f, &pss,
1360                                       last_stage, bytes_transferred,
1361                                       dirty_ram_abs);
1362        }
1363    } while (!pages && again);
1364
1365    last_seen_block = pss.block;
1366    last_offset = pss.offset;
1367
1368    return pages;
1369}
1370
1371void acct_update_position(QEMUFile *f, size_t size, bool zero)
1372{
1373    uint64_t pages = size / TARGET_PAGE_SIZE;
1374    if (zero) {
1375        acct_info.dup_pages += pages;
1376    } else {
1377        acct_info.norm_pages += pages;
1378        bytes_transferred += size;
1379        qemu_update_position(f, size);
1380    }
1381}
1382
1383static ram_addr_t ram_save_remaining(void)
1384{
1385    return migration_dirty_pages;
1386}
1387
1388uint64_t ram_bytes_remaining(void)
1389{
1390    return ram_save_remaining() * TARGET_PAGE_SIZE;
1391}
1392
1393uint64_t ram_bytes_transferred(void)
1394{
1395    return bytes_transferred;
1396}
1397
1398uint64_t ram_bytes_total(void)
1399{
1400    RAMBlock *block;
1401    uint64_t total = 0;
1402
1403    rcu_read_lock();
1404    QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
1405        total += block->used_length;
1406    rcu_read_unlock();
1407    return total;
1408}
1409
1410void free_xbzrle_decoded_buf(void)
1411{
1412    g_free(xbzrle_decoded_buf);
1413    xbzrle_decoded_buf = NULL;
1414}
1415
1416static void migration_bitmap_free(struct BitmapRcu *bmap)
1417{
1418    g_free(bmap->bmap);
1419    g_free(bmap->unsentmap);
1420    g_free(bmap);
1421}
1422
1423static void ram_migration_cleanup(void *opaque)
1424{
1425    /* The caller must hold the iothread lock or be in a bottom half, so
1426     * there is no write race against this migration_bitmap
1427     */
1428    struct BitmapRcu *bitmap = migration_bitmap_rcu;
1429    atomic_rcu_set(&migration_bitmap_rcu, NULL);
1430    if (bitmap) {
1431        memory_global_dirty_log_stop();
1432        call_rcu(bitmap, migration_bitmap_free, rcu);
1433    }
1434
1435    XBZRLE_cache_lock();
1436    if (XBZRLE.cache) {
1437        cache_fini(XBZRLE.cache);
1438        g_free(XBZRLE.encoded_buf);
1439        g_free(XBZRLE.current_buf);
1440        XBZRLE.cache = NULL;
1441        XBZRLE.encoded_buf = NULL;
1442        XBZRLE.current_buf = NULL;
1443    }
1444    XBZRLE_cache_unlock();
1445}
1446
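    /* Reset the RAM-scan state before (re)starting a pass over RAM. */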
1447static void reset_ram_globals(void)
1448{
1449    last_seen_block = NULL;
1450    last_sent_block = NULL;
1451    last_offset = 0;
1452    last_version = ram_list.version;
1453    ram_bulk_stage = true;
1454}
1455
1456#define MAX_WAIT 50 /* ms, half buffered_file limit */
1457
1458void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
1459{
1460    /* called in qemu main thread, so there is
1461     * no writing race against this migration_bitmap
1462     */
1463    if (migration_bitmap_rcu) {
1464        struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap;
1465        bitmap = g_new(struct BitmapRcu, 1);
1466        bitmap->bmap = bitmap_new(new);
1467
1468        /* Prevent bits in the migration_bitmap from being set
1469         * by migration_bitmap_sync_range() at the same time.
1470         * It is safe for migration if bits in migration_bitmap are
1471         * cleared at the same time.
1472         */
1473        qemu_mutex_lock(&migration_bitmap_mutex);
1474        bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
1475        bitmap_set(bitmap->bmap, old, new - old);
1476
1477        /* We don't have a way to safely extend the unsentmap
1478         * with RCU, so mark it as missing; entering postcopy
1479         * will then fail.
1480         */
1481        bitmap->unsentmap = NULL;
1482
1483        atomic_rcu_set(&migration_bitmap_rcu, bitmap);
1484        qemu_mutex_unlock(&migration_bitmap_mutex);
1485        migration_dirty_pages += new - old;
1486        call_rcu(old_bitmap, migration_bitmap_free, rcu);
1487    }
1488}
1489
1490/*
1491 * 'expected' is the value you expect the bitmap mostly to be full
1492 * of; it won't bother printing lines that are all this value.
1493 * If 'todump' is null the migration bitmap is dumped.
1494 */
1495void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
1496{
1497    int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1498
1499    int64_t cur;
1500    int64_t linelen = 128;
1501    char linebuf[129];
1502
1503    if (!todump) {
1504        todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1505    }
1506
1507    for (cur = 0; cur < ram_pages; cur += linelen) {
1508        int64_t curb;
1509        bool found = false;
1510        /*
1511         * Last line; catch the case where the line length
1512         * is longer than remaining ram
1513         */
1514        if (cur + linelen > ram_pages) {
1515            linelen = ram_pages - cur;
1516        }
1517        for (curb = 0; curb < linelen; curb++) {
1518            bool thisbit = test_bit(cur + curb, todump);
1519            linebuf[curb] = thisbit ? '1' : '.';
1520            found = found || (thisbit != expected);
1521        }
1522        if (found) {
1523            linebuf[curb] = '\0';
1524            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1525        }
1526    }
1527}
1528
1529/* **** functions for postcopy ***** */
1530
1531/*
1532 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1533 * Note: At this point the 'unsentmap' is the processed bitmap combined
1534 *       with the dirtymap; so a '1' means it's either dirty or unsent.
1535 * start,length: Indexes into the bitmap for the first bit
1536 *            representing the named block and length in target-pages
1537 */
1538static int postcopy_send_discard_bm_ram(MigrationState *ms,
1539                                        PostcopyDiscardState *pds,
1540                                        unsigned long start,
1541                                        unsigned long length)
1542{
1543    unsigned long end = start + length; /* one after the end */
1544    unsigned long current;
1545    unsigned long *unsentmap;
1546
1547    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1548    for (current = start; current < end; ) {
1549        unsigned long one = find_next_bit(unsentmap, end, current);
1550
1551        if (one <= end) {
1552            unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1553            unsigned long discard_length;
1554
1555            if (zero >= end) {
1556                discard_length = end - one;
1557            } else {
1558                discard_length = zero - one;
1559            }
1560            postcopy_discard_send_range(ms, pds, one, discard_length);
1561            current = one + discard_length;
1562        } else {
1563            current = one;
1564        }
1565    }
1566
1567    return 0;
1568}
1569
1570/*
1571 * Utility for the outgoing postcopy code.
1572 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1573 *   passing it bitmap indexes and name.
1574 * Returns: 0 on success
1575 * (qemu_ram_foreach_block ends up passing unscaled lengths
1576 *  which would mean postcopy code would have to deal with target page)
1577 */
1578static int postcopy_each_ram_send_discard(MigrationState *ms)
1579{
1580    struct RAMBlock *block;
1581    int ret;
1582
1583    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1584        unsigned long first = block->offset >> TARGET_PAGE_BITS;
1585        PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
1586                                                               first,
1587                                                               block->idstr);
1588
1589        /*
1590         * Postcopy sends chunks of bitmap over the wire, but it
1591         * just needs indexes at this point, avoids it having
1592         * target page specific code.
1593         */
1594        ret = postcopy_send_discard_bm_ram(ms, pds, first,
1595                                    block->used_length >> TARGET_PAGE_BITS);
1596        postcopy_discard_send_finish(ms, pds);
1597        if (ret) {
1598            return ret;
1599        }
1600    }
1601
1602    return 0;
1603}
1604
1605/*
1606 * Helper for postcopy_chunk_hostpages; it's called twice to clean up
1607 *   the two bitmaps, which are similar but one is inverted.
1608 *
1609 * We search for runs of target-pages that don't start or end on a
1610 * host page boundary;
1611 * unsent_pass=true: Cleans up partially unsent host pages by searching
1612 *                 the unsentmap
1613 * unsent_pass=false: Cleans up partially dirty host pages by searching
1614 *                 the main migration bitmap
1615 *
1616 */
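/*
 * Worked example (assumed sizes, for illustration): with 4KiB target pages
 * and a 16KiB host page, host_ratio is 4.  A run starting at target page 6
 * gives host_offset == 2, so the run is pulled back to page 4 and pages
 * 4..7 are discarded as one host page (unless already handled) and then
 * re-marked as unsent and dirty so the whole host page is resent.
 */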
1617static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1618                                          RAMBlock *block,
1619                                          PostcopyDiscardState *pds)
1620{
1621    unsigned long *bitmap;
1622    unsigned long *unsentmap;
1623    unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
1624    unsigned long first = block->offset >> TARGET_PAGE_BITS;
1625    unsigned long len = block->used_length >> TARGET_PAGE_BITS;
1626    unsigned long last = first + (len - 1);
1627    unsigned long run_start;
1628
1629    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1630    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1631
1632    if (unsent_pass) {
1633        /* Find a sent page */
1634        run_start = find_next_zero_bit(unsentmap, last + 1, first);
1635    } else {
1636        /* Find a dirty page */
1637        run_start = find_next_bit(bitmap, last + 1, first);
1638    }
1639
1640    while (run_start <= last) {
1641        bool do_fixup = false;
1642        unsigned long fixup_start_addr;
1643        unsigned long host_offset;
1644
1645        /*
1646         * If the start of this run of pages is in the middle of a host
1647         * page, then we need to fixup this host page.
1648         */
1649        host_offset = run_start % host_ratio;
1650        if (host_offset) {
1651            do_fixup = true;
1652            run_start -= host_offset;
1653            fixup_start_addr = run_start;
1654            /* For the next pass */
1655            run_start = run_start + host_ratio;
1656        } else {
1657            /* Find the end of this run */
1658            unsigned long run_end;
1659            if (unsent_pass) {
1660                run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
1661            } else {
1662                run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
1663            }
1664            /*
1665             * If the end isn't at the start of a host page, then the
1666             * run doesn't finish at the end of a host page
1667             * and we need to discard.
1668             */
1669            host_offset = run_end % host_ratio;
1670            if (host_offset) {
1671                do_fixup = true;
1672                fixup_start_addr = run_end - host_offset;
1673                /*
1674                 * This host page has gone, the next loop iteration starts
1675                 * from after the fixup
1676                 */
1677                run_start = fixup_start_addr + host_ratio;
1678            } else {
1679                /*
1680                 * No discards on this iteration, next loop starts from
1681                 * next sent/dirty page
1682                 */
1683                run_start = run_end + 1;
1684            }
1685        }
1686
1687        if (do_fixup) {
1688            unsigned long page;
1689
1690            /* Tell the destination to discard this page */
1691            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1692                /* For the unsent_pass we:
1693                 *     discard partially sent pages
1694                 * For the !unsent_pass (dirty) we:
1695                 *     discard partially dirty pages that were sent
1696                 *     (any partially sent pages were already discarded
1697                 *     by the previous unsent_pass)
1698                 */
1699                postcopy_discard_send_range(ms, pds, fixup_start_addr,
1700                                            host_ratio);
1701            }
1702
1703            /* Clean up the bitmap */
1704            for (page = fixup_start_addr;
1705                 page < fixup_start_addr + host_ratio; page++) {
1706                /* All pages in this host page are now not sent */
1707                set_bit(page, unsentmap);
1708
1709                /*
1710                 * Remark them as dirty, updating the count for any pages
1711                 * that weren't previously dirty.
1712                 */
1713                migration_dirty_pages += !test_and_set_bit(page, bitmap);
1714            }
1715        }
1716
1717        if (unsent_pass) {
1718            /* Find the next sent page for the next iteration */
1719            run_start = find_next_zero_bit(unsentmap, last + 1,
1720                                           run_start);
1721        } else {
1722            /* Find the next dirty page for the next iteration */
1723            run_start = find_next_bit(bitmap, last + 1, run_start);
1724        }
1725    }
1726}
1727
1728/*
1729 * Utility for the outgoing postcopy code.
1730 *
1731 * Discard any partially sent host-page size chunks, mark any partially
1732 * dirty host-page size chunks as all dirty.
1733 *
1734 * Returns: 0 on success
1735 */
1736static int postcopy_chunk_hostpages(MigrationState *ms)
1737{
1738    struct RAMBlock *block;
1739
1740    if (qemu_host_page_size == TARGET_PAGE_SIZE) {
1741        /* Easy case - TPS==HPS - nothing to be done */
1742        return 0;
1743    }
1744
1745    /* Easiest way to make sure we don't resume in the middle of a host-page */
1746    last_seen_block = NULL;
1747    last_sent_block = NULL;
1748    last_offset     = 0;
1749
1750    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1751        unsigned long first = block->offset >> TARGET_PAGE_BITS;
1752
1753        PostcopyDiscardState *pds =
1754                         postcopy_discard_send_init(ms, first, block->idstr);
1755
1756        /* First pass: Discard all partially sent host pages */
1757        postcopy_chunk_hostpages_pass(ms, true, block, pds);
1758        /*
1759         * Second pass: Ensure that all partially dirty host pages are made
1760         * fully dirty.
1761         */
1762        postcopy_chunk_hostpages_pass(ms, false, block, pds);
1763
1764        postcopy_discard_send_finish(ms, pds);
1765    } /* ram_list loop */
1766
1767    return 0;
1768}
1769
1770/*
1771 * Transmit the set of pages to be discarded after precopy to the target;
1772 * these are pages that:
1773 *     a) have previously been transmitted but are now dirty again
1774 *     b) have never been transmitted; this ensures that any pages on the
1775 *        destination that have been mapped by background tasks get
1776 *        discarded (transparent huge pages are the specific concern)
1777 * Hopefully this is pretty sparse
1778 */
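/*
 * Roughly, the sequence below is: a final migration_bitmap_sync(), then
 * postcopy_chunk_hostpages() to repair partial host pages, then
 * unsentmap |= dirty bitmap, and finally postcopy_each_ram_send_discard()
 * to transmit the resulting discard ranges per RAMBlock.
 */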
1779int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1780{
1781    int ret;
1782    unsigned long *bitmap, *unsentmap;
1783
1784    rcu_read_lock();
1785
1786    /* This should be our last sync, the src is now paused */
1787    migration_bitmap_sync();
1788
1789    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
1790    if (!unsentmap) {
1791        /* We don't have a safe way to resize the unsentmap, so
1792         * if the bitmap was resized it will be NULL at this
1793         * point.
1794         */
1795        error_report("migration ram resized during precopy phase");
1796        rcu_read_unlock();
1797        return -EINVAL;
1798    }
1799
1800    /* Deal with TPS != HPS */
1801    ret = postcopy_chunk_hostpages(ms);
1802    if (ret) {
1803        rcu_read_unlock();
1804        return ret;
1805    }
1806
1807    /*
1808     * Update the unsentmap to be unsentmap = unsentmap | dirty
1809     */
1810    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
1811    bitmap_or(unsentmap, unsentmap, bitmap,
1812               last_ram_offset() >> TARGET_PAGE_BITS);
1813
1814
1815    trace_ram_postcopy_send_discard_bitmap();
1816#ifdef DEBUG_POSTCOPY
1817    ram_debug_dump_bitmap(unsentmap, true);
1818#endif
1819
1820    ret = postcopy_each_ram_send_discard(ms);
1821    rcu_read_unlock();
1822
1823    return ret;
1824}
1825
1826/*
1827 * At the start of the postcopy phase of migration, any now-dirty
1828 * precopied pages are discarded.
1829 *
1830 * start, length describe a byte address range within the RAMBlock
1831 *
1832 * Returns 0 on success.
1833 */
1834int ram_discard_range(MigrationIncomingState *mis,
1835                      const char *block_name,
1836                      uint64_t start, size_t length)
1837{
1838    int ret = -1;
1839
1840    rcu_read_lock();
1841    RAMBlock *rb = qemu_ram_block_by_name(block_name);
1842
1843    if (!rb) {
1844        error_report("ram_discard_range: Failed to find block '%s'",
1845                     block_name);
1846        goto err;
1847    }
1848
1849    uint8_t *host_startaddr = rb->host + start;
1850
1851    if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
1852        error_report("ram_discard_range: Unaligned start address: %p",
1853                     host_startaddr);
1854        goto err;
1855    }
1856
1857    if ((start + length) <= rb->used_length) {
1858        uint8_t *host_endaddr = host_startaddr + length;
1859        if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
1860            error_report("ram_discard_range: Unaligned end address: %p",
1861                         host_endaddr);
1862            goto err;
1863        }
1864        ret = postcopy_ram_discard_range(mis, host_startaddr, length);
1865    } else {
1866        error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
1867                     "/%zx/" RAM_ADDR_FMT")",
1868                     block_name, start, length, rb->used_length);
1869    }
1870
1871err:
1872    rcu_read_unlock();
1873
1874    return ret;
1875}
1876
1877
1878/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
1879 * long-running RCU critical section.  When RCU reclaims in the code
1880 * start to become numerous it will be necessary to reduce the
1881 * granularity of these critical sections.
1882 */
1883
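/*
 * Setup stage of RAM migration: initialise the dirty bitmap (plus the
 * unsent bitmap when postcopy is enabled) and the XBZRLE cache if in use,
 * start dirty logging, and write a header that advertises the total RAM
 * size and every RAMBlock to the stream.
 */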
1884static int ram_save_setup(QEMUFile *f, void *opaque)
1885{
1886    RAMBlock *block;
1887    int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */
1888
1889    dirty_rate_high_cnt = 0;
1890    bitmap_sync_count = 0;
1891    migration_bitmap_sync_init();
1892    qemu_mutex_init(&migration_bitmap_mutex);
1893
1894    if (migrate_use_xbzrle()) {
1895        XBZRLE_cache_lock();
1896        XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1897                                  TARGET_PAGE_SIZE,
1898                                  TARGET_PAGE_SIZE);
1899        if (!XBZRLE.cache) {
1900            XBZRLE_cache_unlock();
1901            error_report("Error creating cache");
1902            return -1;
1903        }
1904        XBZRLE_cache_unlock();
1905
1906        /* We prefer not to abort if there is no memory */
1907        XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1908        if (!XBZRLE.encoded_buf) {
1909            error_report("Error allocating encoded_buf");
1910            return -1;
1911        }
1912
1913        XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1914        if (!XBZRLE.current_buf) {
1915            error_report("Error allocating current_buf");
1916            g_free(XBZRLE.encoded_buf);
1917            XBZRLE.encoded_buf = NULL;
1918            return -1;
1919        }
1920
1921        acct_clear();
1922    }
1923
1924    /* For memory_global_dirty_log_start below.  */
1925    qemu_mutex_lock_iothread();
1926
1927    qemu_mutex_lock_ramlist();
1928    rcu_read_lock();
1929    bytes_transferred = 0;
1930    reset_ram_globals();
1931
1932    ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
1933    migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
1934    migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
1935    bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
1936
1937    if (migrate_postcopy_ram()) {
1938        migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
1939        bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
1940    }
1941
1942    /*
1943     * Count the total number of pages used by ram blocks not including any
1944     * gaps due to alignment or unplugs.
1945     */
1946    migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1947
1948    memory_global_dirty_log_start();
1949    migration_bitmap_sync();
1950    qemu_mutex_unlock_ramlist();
1951    qemu_mutex_unlock_iothread();
1952
1953    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1954
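    /*
     * Advertise each RAMBlock: a one-byte idstr length, the idstr itself,
     * then its used_length as a big-endian 64-bit value.  This matches the
     * RAM_SAVE_FLAG_MEM_SIZE parsing in ram_load() below.
     */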
1955    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1956        qemu_put_byte(f, strlen(block->idstr));
1957        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1958        qemu_put_be64(f, block->used_length);
1959    }
1960
1961    rcu_read_unlock();
1962
1963    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1964    ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1965
1966    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1967
1968    return 0;
1969}
1970
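/*
 * Iterative stage: send dirty pages until the rate limit is hit or no dirty
 * pages remain, checking on the first and then every 64th iteration that
 * this round has not taken more than MAX_WAIT milliseconds.
 */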
1971static int ram_save_iterate(QEMUFile *f, void *opaque)
1972{
1973    int ret;
1974    int i;
1975    int64_t t0;
1976    int pages_sent = 0;
1977
1978    rcu_read_lock();
1979    if (ram_list.version != last_version) {
1980        reset_ram_globals();
1981    }
1982
1983    /* Read version before ram_list.blocks */
1984    smp_rmb();
1985
1986    ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1987
1988    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1989    i = 0;
1990    while ((ret = qemu_file_rate_limit(f)) == 0) {
1991        int pages;
1992
1993        pages = ram_find_and_save_block(f, false, &bytes_transferred);
1994        /* no more pages to send */
1995        if (pages == 0) {
1996            break;
1997        }
1998        pages_sent += pages;
1999        acct_info.iterations++;
2000
2001        /* we want to check in the 1st loop, just in case it was the 1st time
2002           and we had to sync the dirty bitmap.
2003           qemu_clock_get_ns() is a bit expensive, so we only check once
2004           every few iterations
2005        */
2006        if ((i & 63) == 0) {
2007            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2008            if (t1 > MAX_WAIT) {
2009                DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
2010                        t1, i);
2011                break;
2012            }
2013        }
2014        i++;
2015    }
2016    flush_compressed_data(f);
2017    rcu_read_unlock();
2018
2019    /*
2020     * Must occur before EOS (or any QEMUFile operation)
2021     * because of RDMA protocol.
2022     */
2023    ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2024
2025    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2026    bytes_transferred += 8;
2027
2028    ret = qemu_file_get_error(f);
2029    if (ret < 0) {
2030        return ret;
2031    }
2032
2033    return pages_sent;
2034}
2035
2036/* Called with iothread lock */
2037static int ram_save_complete(QEMUFile *f, void *opaque)
2038{
2039    rcu_read_lock();
2040
2041    if (!migration_in_postcopy(migrate_get_current())) {
2042        migration_bitmap_sync();
2043    }
2044
2045    ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2046
2047    /* try transferring iterative blocks of memory */
2048
2049    /* flush all remaining blocks regardless of rate limiting */
2050    while (true) {
2051        int pages;
2052
2053        pages = ram_find_and_save_block(f, true, &bytes_transferred);
2054        /* no more blocks to send */
2055        if (pages == 0) {
2056            break;
2057        }
2058    }
2059
2060    flush_compressed_data(f);
2061    ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2062
2063    rcu_read_unlock();
2064
2065    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2066
2067    return 0;
2068}
2069
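/*
 * Estimate how much RAM is left to send.  If the estimate falls below
 * max_size and we are not in postcopy, re-sync the dirty bitmap (under the
 * iothread lock) to refine it; everything counted here is postcopiable.
 */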
2070static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2071                             uint64_t *non_postcopiable_pending,
2072                             uint64_t *postcopiable_pending)
2073{
2074    uint64_t remaining_size;
2075
2076    remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2077
2078    if (!migration_in_postcopy(migrate_get_current()) &&
2079        remaining_size < max_size) {
2080        qemu_mutex_lock_iothread();
2081        rcu_read_lock();
2082        migration_bitmap_sync();
2083        rcu_read_unlock();
2084        qemu_mutex_unlock_iothread();
2085        remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
2086    }
2087
2088    /* We can do postcopy, and all the data is postcopiable */
2089    *postcopiable_pending += remaining_size;
2090}
2091
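/*
 * Load an XBZRLE-encoded page.  The wire format consumed below is a
 * one-byte header flag (which must be ENCODING_FLAG_XBZRLE), a big-endian
 * 16-bit encoded length, then the encoded data, which is decoded on top of
 * the existing contents of 'host'.
 */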
2092static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2093{
2094    unsigned int xh_len;
2095    int xh_flags;
2096    uint8_t *loaded_data;
2097
2098    if (!xbzrle_decoded_buf) {
2099        xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2100    }
2101    loaded_data = xbzrle_decoded_buf;
2102
2103    /* extract RLE header */
2104    xh_flags = qemu_get_byte(f);
2105    xh_len = qemu_get_be16(f);
2106
2107    if (xh_flags != ENCODING_FLAG_XBZRLE) {
2108        error_report("Failed to load XBZRLE page - wrong compression!");
2109        return -1;
2110    }
2111
2112    if (xh_len > TARGET_PAGE_SIZE) {
2113        error_report("Failed to load XBZRLE page - len overflow!");
2114        return -1;
2115    }
2116    /* load data and decode */
2117    qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2118
2119    /* decode RLE */
2120    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2121                             TARGET_PAGE_SIZE) == -1) {
2122        error_report("Failed to load XBZRLE page - decode error!");
2123        return -1;
2124    }
2125
2126    return 0;
2127}
2128
2129/*
2130 * Read a RAMBlock ID from the stream f.
2131 *
2132 * Must be called from within an RCU critical section; returns a pointer
2133 * from within the RCU-protected ram_list.
2134 *
2135 * f: Stream to read from
2136 * flags: Page flags (mostly to see if it's a continuation of previous block)
2137 */
2138static inline RAMBlock *ram_block_from_stream(QEMUFile *f,
2139                                              int flags)
2140{
2141    static RAMBlock *block = NULL;
2142    char id[256];
2143    uint8_t len;
2144
2145    if (flags & RAM_SAVE_FLAG_CONTINUE) {
2146        if (!block) {
2147            error_report("Ack, bad migration stream!");
2148            return NULL;
2149        }
2150        return block;
2151    }
2152
2153    len = qemu_get_byte(f);
2154    qemu_get_buffer(f, (uint8_t *)id, len);
2155    id[len] = 0;
2156
2157    block = qemu_ram_block_by_name(id);
2158    if (!block) {
2159        error_report("Can't find block %s", id);
2160        return NULL;
2161    }
2162
2163    return block;
2164}
2165
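/*
 * Return the host address for 'offset' within 'block', or NULL if the
 * offset is not within the block.
 */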
2166static inline void *host_from_ram_block_offset(RAMBlock *block,
2167                                               ram_addr_t offset)
2168{
2169    if (!offset_in_ramblock(block, offset)) {
2170        return NULL;
2171    }
2172
2173    return block->host + offset;
2174}
2175
2176/*
2177 * If a page (or a whole RDMA chunk) has been
2178 * determined to be zero, then zap it.
2179 */
2180void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2181{
2182    if (ch != 0 || !is_zero_range(host, size)) {
2183        memset(host, ch, size);
2184    }
2185}
2186
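/*
 * Decompression worker thread: sleeps until it is handed a compressed
 * buffer via start_decompression(), then inflates it directly into guest
 * memory at param->des.
 */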
2187static void *do_data_decompress(void *opaque)
2188{
2189    DecompressParam *param = opaque;
2190    unsigned long pagesize;
2191
2192    while (!quit_decomp_thread) {
2193        qemu_mutex_lock(&param->mutex);
2194        while (!param->start && !quit_decomp_thread) {
2195            qemu_cond_wait(&param->cond, &param->mutex);
2196        }
2197        if (!quit_decomp_thread && param->start) {
2198            pagesize = TARGET_PAGE_SIZE;
2199            /* uncompress() can fail in some cases, especially when the
2200             * page was dirtied while it was being compressed; that's not
2201             * a problem because the dirty page will be retransmitted and
2202             * uncompress() won't corrupt the data in other pages.
2203             */
2204            uncompress((Bytef *)param->des, &pagesize,
2205                       (const Bytef *)param->compbuf, param->len);
2206            param->start = false;
2207        }
2208        qemu_mutex_unlock(&param->mutex);
2209    }
2210
2211    return NULL;
2212}
2213
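/*
 * Create migrate_decompress_threads() worker threads, each with its own
 * mutex, condition variable and a compressBound(TARGET_PAGE_SIZE)-sized
 * input buffer.
 */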
2214void migrate_decompress_threads_create(void)
2215{
2216    int i, thread_count;
2217
2218    thread_count = migrate_decompress_threads();
2219    decompress_threads = g_new0(QemuThread, thread_count);
2220    decomp_param = g_new0(DecompressParam, thread_count);
2221    quit_decomp_thread = false;
2222    for (i = 0; i < thread_count; i++) {
2223        qemu_mutex_init(&decomp_param[i].mutex);
2224        qemu_cond_init(&decomp_param[i].cond);
2225        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2226        qemu_thread_create(decompress_threads + i, "decompress",
2227                           do_data_decompress, decomp_param + i,
2228                           QEMU_THREAD_JOINABLE);
2229    }
2230}
2231
2232void migrate_decompress_threads_join(void)
2233{
2234    int i, thread_count;
2235
2236    quit_decomp_thread = true;
2237    thread_count = migrate_decompress_threads();
2238    for (i = 0; i < thread_count; i++) {
2239        qemu_mutex_lock(&decomp_param[i].mutex);
2240        qemu_cond_signal(&decomp_param[i].cond);
2241        qemu_mutex_unlock(&decomp_param[i].mutex);
2242    }
2243    for (i = 0; i < thread_count; i++) {
2244        qemu_thread_join(decompress_threads + i);
2245        qemu_mutex_destroy(&decomp_param[i].mutex);
2246        qemu_cond_destroy(&decomp_param[i].cond);
2247        g_free(decomp_param[i].compbuf);
2248    }
2249    g_free(decompress_threads);
2250    g_free(decomp_param);
2251    decompress_threads = NULL;
2252    decomp_param = NULL;
2253}
2254
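/*
 * Feed 'len' bytes of compressed page data to an idle decompression thread;
 * busy-waits until one of the threads is free to take the work.
 */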
2255static void decompress_data_with_multi_threads(QEMUFile *f,
2256                                               void *host, int len)
2257{
2258    int idx, thread_count;
2259
2260    thread_count = migrate_decompress_threads();
2261    while (true) {
2262        for (idx = 0; idx < thread_count; idx++) {
2263            if (!decomp_param[idx].start) {
2264                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2265                decomp_param[idx].des = host;
2266                decomp_param[idx].len = len;
2267                start_decompression(&decomp_param[idx]);
2268                break;
2269            }
2270        }
2271        if (idx < thread_count) {
2272            break;
2273        }
2274    }
2275}
2276
2277/*
2278 * Allocate data structures etc. needed by incoming migration with
2279 * postcopy-ram; its similarly named postcopy_ram_incoming_init does the work
2280 */
2281int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2282{
2283    size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
2284
2285    return postcopy_ram_incoming_init(mis, ram_pages);
2286}
2287
2288/*
2289 * Called in postcopy mode by ram_load().
2290 * rcu_read_lock is taken prior to this being called.
2291 */
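/*
 * For illustration (assumed sizes): with 4KiB target pages and a 16KiB host
 * page, four consecutive target pages are accumulated into
 * postcopy_host_page, and only the arrival of the fourth one triggers an
 * atomic placement of the whole host page.
 */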
2292static int ram_load_postcopy(QEMUFile *f)
2293{
2294    int flags = 0, ret = 0;
2295    bool place_needed = false;
2296    bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
2297    MigrationIncomingState *mis = migration_incoming_get_current();
2298    /* Temporary page that is later 'placed' */
2299    void *postcopy_host_page = postcopy_get_tmp_page(mis);
2300    void *last_host = NULL;
2301    bool all_zero = false;
2302
2303    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2304        ram_addr_t addr;
2305        void *host = NULL;
2306        void *page_buffer = NULL;
2307        void *place_source = NULL;
2308        uint8_t ch;
2309
2310        addr = qemu_get_be64(f);
2311        flags = addr & ~TARGET_PAGE_MASK;
2312        addr &= TARGET_PAGE_MASK;
2313
2314        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2315        place_needed = false;
2316        if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
2317            RAMBlock *block = ram_block_from_stream(f, flags);
2318
2319            host = host_from_ram_block_offset(block, addr);
2320            if (!host) {
2321                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2322                ret = -EINVAL;
2323                break;
2324            }
2325            page_buffer = host;
2326            /*
2327             * Postcopy requires that we place whole host pages atomically.
2328             * To make it atomic, the data is read into a temporary page
2329             * that's moved into place later.
2330             * The migration protocol uses, possibly smaller, target pages;
2331             * however, the source ensures it always sends all the components
2332             * of a host page in order.
2333             */
2334            page_buffer = postcopy_host_page +
2335                          ((uintptr_t)host & ~qemu_host_page_mask);
2336            /* 1st TP of the HP: if all TPs are zero we can optimise the place */
2337            if (!((uintptr_t)host & ~qemu_host_page_mask)) {
2338                all_zero = true;
2339            } else {
2340                /* not the 1st TP within the HP */
2341                if (host != (last_host + TARGET_PAGE_SIZE)) {
2342                    error_report("Non-sequential target page %p/%p",
2343                                  host, last_host);
2344                    ret = -EINVAL;
2345                    break;
2346                }
2347            }
2348
2349
2350            /*
2351             * If it's the last part of a host page then we place the host
2352             * page
2353             */
2354            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2355                                     ~qemu_host_page_mask) == 0;
2356            place_source = postcopy_host_page;
2357        }
2358        last_host = host;
2359
2360        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2361        case RAM_SAVE_FLAG_COMPRESS:
2362            ch = qemu_get_byte(f);
2363            memset(page_buffer, ch, TARGET_PAGE_SIZE);
2364            if (ch) {
2365                all_zero = false;
2366            }
2367            break;
2368
2369        case RAM_SAVE_FLAG_PAGE:
2370            all_zero = false;
2371            if (!place_needed || !matching_page_sizes) {
2372                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2373            } else {
2374                /* Avoid the qemu_file copy: postcopy is going to copy
2375                 * the page into place later anyway; we can only do this
2376                 * when the read is done in one go (matching page sizes)
2377                 */
2378                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2379                                         TARGET_PAGE_SIZE);
2380            }
2381            break;
2382        case RAM_SAVE_FLAG_EOS:
2383            /* normal exit */
2384            break;
2385        default:
2386            error_report("Unknown combination of migration flags: %#x"
2387                         " (postcopy mode)", flags);
2388            ret = -EINVAL;
2389        }
2390
2391        if (place_needed) {
2392            /* This gets called at the last target page in the host page */
2393            if (all_zero) {
2394                ret = postcopy_place_page_zero(mis,
2395                                               host + TARGET_PAGE_SIZE -
2396                                               qemu_host_page_size);
2397            } else {
2398                ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
2399                                               qemu_host_page_size,
2400                                               place_source);
2401            }
2402        }
2403        if (!ret) {
2404            ret = qemu_file_get_error(f);
2405        }
2406    }
2407
2408    return ret;
2409}
2410
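/*
 * Incoming side entry point.  Once the postcopy listen thread is running the
 * whole stream is handled by ram_load_postcopy(); otherwise each chunk is
 * dispatched on its RAM_SAVE_FLAG_* flags below.
 */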
2411static int ram_load(QEMUFile *f, void *opaque, int version_id)
2412{
2413    int flags = 0, ret = 0;
2414    static uint64_t seq_iter;
2415    int len = 0;
2416    /*
2417     * If the system is running in postcopy mode, page inserts into host
2418     * memory must be atomic
2419     */
2420    bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2421
2422    seq_iter++;
2423
2424    if (version_id != 4) {
2425        ret = -EINVAL;
2426    }
2427
2428    /* This RCU critical section can be very long running.
2429     * When RCU reclaims in the code start to become numerous,
2430     * it will be necessary to reduce the granularity of this
2431     * critical section.
2432     */
2433    rcu_read_lock();
2434
2435    if (postcopy_running) {
2436        ret = ram_load_postcopy(f);
2437    }
2438
2439    while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2440        ram_addr_t addr, total_ram_bytes;
2441        void *host = NULL;
2442        uint8_t ch;
2443
2444        addr = qemu_get_be64(f);
2445        flags = addr & ~TARGET_PAGE_MASK;
2446        addr &= TARGET_PAGE_MASK;
2447
2448        if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
2449                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2450            RAMBlock *block = ram_block_from_stream(f, flags);
2451
2452            host = host_from_ram_block_offset(block, addr);
2453            if (!host) {
2454                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2455                ret = -EINVAL;
2456                break;
2457            }
2458        }
2459
2460        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2461        case RAM_SAVE_FLAG_MEM_SIZE:
2462            /* Synchronize RAM block list */
2463            total_ram_bytes = addr;
2464            while (!ret && total_ram_bytes) {
2465                RAMBlock *block;
2466                char id[256];
2467                ram_addr_t length;
2468
2469                len = qemu_get_byte(f);
2470                qemu_get_buffer(f, (uint8_t *)id, len);
2471                id[len] = 0;
2472                length = qemu_get_be64(f);
2473
2474                block = qemu_ram_block_by_name(id);
2475                if (block) {
2476                    if (length != block->used_length) {
2477                        Error *local_err = NULL;
2478
2479                        ret = qemu_ram_resize(block->offset, length,
2480                                              &local_err);
2481                        if (local_err) {
2482                            error_report_err(local_err);
2483                        }
2484                    }
2485                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2486                                          block->idstr);
2487                } else {
2488                    error_report("Unknown ramblock \"%s\", cannot "
2489                                 "accept migration", id);
2490                    ret = -EINVAL;
2491                }
2492
2493                total_ram_bytes -= length;
2494            }
2495            break;
2496
2497        case RAM_SAVE_FLAG_COMPRESS:
2498            ch = qemu_get_byte(f);
2499            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2500            break;
2501
2502        case RAM_SAVE_FLAG_PAGE:
2503            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2504            break;
2505
2506        case RAM_SAVE_FLAG_COMPRESS_PAGE:
2507            len = qemu_get_be32(f);
2508            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2509                error_report("Invalid compressed data length: %d", len);
2510                ret = -EINVAL;
2511                break;
2512            }
2513            decompress_data_with_multi_threads(f, host, len);
2514            break;
2515
2516        case RAM_SAVE_FLAG_XBZRLE:
2517            if (load_xbzrle(f, addr, host) < 0) {
2518                error_report("Failed to decompress XBZRLE page at "
2519                             RAM_ADDR_FMT, addr);
2520                ret = -EINVAL;
2521                break;
2522            }
2523            break;
2524        case RAM_SAVE_FLAG_EOS:
2525            /* normal exit */
2526            break;
2527        default:
2528            if (flags & RAM_SAVE_FLAG_HOOK) {
2529                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2530            } else {
2531                error_report("Unknown combination of migration flags: %#x",
2532                             flags);
2533                ret = -EINVAL;
2534            }
2535        }
2536        if (!ret) {
2537            ret = qemu_file_get_error(f);
2538        }
2539    }
2540
2541    rcu_read_unlock();
2542    DPRINTF("Completed load of VM with exit code %d seq iteration "
2543            "%" PRIu64 "\n", ret, seq_iter);
2544    return ret;
2545}
2546
2547static SaveVMHandlers savevm_ram_handlers = {
2548    .save_live_setup = ram_save_setup,
2549    .save_live_iterate = ram_save_iterate,
2550    .save_live_complete_postcopy = ram_save_complete,
2551    .save_live_complete_precopy = ram_save_complete,
2552    .save_live_pending = ram_save_pending,
2553    .load_state = ram_load,
2554    .cleanup = ram_migration_cleanup,
2555};
2556
2557void ram_mig_init(void)
2558{
2559    qemu_mutex_init(&XBZRLE.lock);
2560    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
2561}
2562