qemu/migration/ram.c
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2011-2015 Red Hat Inc
   6 *
   7 * Authors:
   8 *  Juan Quintela <quintela@redhat.com>
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28#include "qemu/osdep.h"
  29#include "cpu.h"
  30#include <zlib.h>
  31#include "qapi-event.h"
  32#include "qemu/cutils.h"
  33#include "qemu/bitops.h"
  34#include "qemu/bitmap.h"
  35#include "qemu/main-loop.h"
  36#include "xbzrle.h"
  37#include "ram.h"
  38#include "migration.h"
  39#include "migration/register.h"
  40#include "migration/misc.h"
  41#include "qemu-file.h"
  42#include "postcopy-ram.h"
  43#include "migration/page_cache.h"
  44#include "qemu/error-report.h"
  45#include "trace.h"
  46#include "exec/ram_addr.h"
  47#include "qemu/rcu_queue.h"
  48#include "migration/colo.h"
  49
  50/***********************************************************/
  51/* ram save/restore */
  52
   53/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
   54 * worked for pages that were filled with the same char.  We switched
   55 * it to only search for the zero value.  It was also renamed to avoid
   56 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
   57 */
  58
  59#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  60#define RAM_SAVE_FLAG_ZERO     0x02
  61#define RAM_SAVE_FLAG_MEM_SIZE 0x04
  62#define RAM_SAVE_FLAG_PAGE     0x08
  63#define RAM_SAVE_FLAG_EOS      0x10
  64#define RAM_SAVE_FLAG_CONTINUE 0x20
  65#define RAM_SAVE_FLAG_XBZRLE   0x40
   66/* 0x80 is reserved in migration.h; start at 0x100 for the next flag */
  67#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  68
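/*
 * For illustration: the flags above live in the low bits of the 64-bit
 * value that save_page_header() puts on the wire, which works because
 * page offsets are TARGET_PAGE_SIZE aligned and leave those bits free.
 * For example, with 4K target pages, a normal page at offset 0x3000 of a
 * block that is a continuation of the last one sent is encoded as
 *     0x3000 | RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE == 0x3028
 */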
  69static inline bool is_zero_range(uint8_t *p, uint64_t size)
  70{
  71    return buffer_is_zero(p, size);
  72}
  73
  74XBZRLECacheStats xbzrle_counters;
  75
   76/* This struct contains the XBZRLE cache and a static page
   77   used by the compression */
  78static struct {
  79    /* buffer used for XBZRLE encoding */
  80    uint8_t *encoded_buf;
  81    /* buffer for storing page content */
  82    uint8_t *current_buf;
  83    /* Cache for XBZRLE, Protected by lock. */
  84    PageCache *cache;
  85    QemuMutex lock;
  86    /* it will store a page full of zeros */
  87    uint8_t *zero_target_page;
  88    /* buffer used for XBZRLE decoding */
  89    uint8_t *decoded_buf;
  90} XBZRLE;
  91
  92static void XBZRLE_cache_lock(void)
  93{
  94    if (migrate_use_xbzrle())
  95        qemu_mutex_lock(&XBZRLE.lock);
  96}
  97
  98static void XBZRLE_cache_unlock(void)
  99{
 100    if (migrate_use_xbzrle())
 101        qemu_mutex_unlock(&XBZRLE.lock);
 102}
 103
 104/**
 105 * xbzrle_cache_resize: resize the xbzrle cache
 106 *
 107 * This function is called from qmp_migrate_set_cache_size in main
 108 * thread, possibly while a migration is in progress.  A running
 109 * migration may be using the cache and might finish during this call,
  110 * hence changes to the cache are protected by the XBZRLE.lock mutex.
 111 *
 112 * Returns the new_size or negative in case of error.
 113 *
 114 * @new_size: new cache size
 115 */
 116int64_t xbzrle_cache_resize(int64_t new_size)
 117{
 118    PageCache *new_cache;
 119    int64_t ret;
 120
 121    if (new_size < TARGET_PAGE_SIZE) {
 122        return -1;
 123    }
 124
 125    XBZRLE_cache_lock();
 126
 127    if (XBZRLE.cache != NULL) {
 128        if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
 129            goto out_new_size;
 130        }
 131        new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
 132                                        TARGET_PAGE_SIZE);
 133        if (!new_cache) {
 134            error_report("Error creating cache");
 135            ret = -1;
 136            goto out;
 137        }
 138
 139        cache_fini(XBZRLE.cache);
 140        XBZRLE.cache = new_cache;
 141    }
 142
 143out_new_size:
 144    ret = pow2floor(new_size);
 145out:
 146    XBZRLE_cache_unlock();
 147    return ret;
 148}
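/*
 * A minimal caller sketch (the QMP handler mentioned above is the real
 * user; the names here are only illustrative):
 *
 *     int64_t ret = xbzrle_cache_resize(new_size);
 *     if (ret < 0) {
 *         // resize failed, e.g. new_size < TARGET_PAGE_SIZE
 *     } else {
 *         // ret is the effective cache size, pow2floor(new_size)
 *     }
 */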
 149
 150/*
 151 * An outstanding page request, on the source, having been received
 152 * and queued
 153 */
 154struct RAMSrcPageRequest {
 155    RAMBlock *rb;
 156    hwaddr    offset;
 157    hwaddr    len;
 158
 159    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 160};
 161
 162/* State of RAM for migration */
 163struct RAMState {
 164    /* QEMUFile used for this migration */
 165    QEMUFile *f;
 166    /* Last block that we have visited searching for dirty pages */
 167    RAMBlock *last_seen_block;
 168    /* Last block from where we have sent data */
 169    RAMBlock *last_sent_block;
 170    /* Last dirty target page we have sent */
 171    ram_addr_t last_page;
 172    /* last ram version we have seen */
 173    uint32_t last_version;
 174    /* We are in the first round */
 175    bool ram_bulk_stage;
 176    /* How many times we have dirty too many pages */
 177    int dirty_rate_high_cnt;
 178    /* these variables are used for bitmap sync */
 179    /* last time we did a full bitmap_sync */
 180    int64_t time_last_bitmap_sync;
 181    /* bytes transferred at start_time */
 182    uint64_t bytes_xfer_prev;
 183    /* number of dirty pages since start_time */
 184    uint64_t num_dirty_pages_period;
 185    /* xbzrle misses since the beginning of the period */
 186    uint64_t xbzrle_cache_miss_prev;
 187    /* number of iterations at the beginning of period */
 188    uint64_t iterations_prev;
 189    /* Iterations since start */
 190    uint64_t iterations;
 191    /* number of dirty bits in the bitmap */
 192    uint64_t migration_dirty_pages;
 193    /* protects modification of the bitmap */
 194    QemuMutex bitmap_mutex;
 195    /* The RAMBlock used in the last src_page_requests */
 196    RAMBlock *last_req_rb;
 197    /* Queue of outstanding page requests from the destination */
 198    QemuMutex src_page_req_mutex;
 199    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
 200};
 201typedef struct RAMState RAMState;
 202
 203static RAMState *ram_state;
 204
 205uint64_t ram_bytes_remaining(void)
 206{
 207    return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
 208}
 209
 210MigrationStats ram_counters;
 211
 212/* used by the search for pages to send */
 213struct PageSearchStatus {
 214    /* Current block being searched */
 215    RAMBlock    *block;
 216    /* Current page to search from */
 217    unsigned long page;
 218    /* Set once we wrap around */
 219    bool         complete_round;
 220};
 221typedef struct PageSearchStatus PageSearchStatus;
 222
 223struct CompressParam {
 224    bool done;
 225    bool quit;
 226    QEMUFile *file;
 227    QemuMutex mutex;
 228    QemuCond cond;
 229    RAMBlock *block;
 230    ram_addr_t offset;
 231};
 232typedef struct CompressParam CompressParam;
 233
 234struct DecompressParam {
 235    bool done;
 236    bool quit;
 237    QemuMutex mutex;
 238    QemuCond cond;
 239    void *des;
 240    uint8_t *compbuf;
 241    int len;
 242};
 243typedef struct DecompressParam DecompressParam;
 244
 245static CompressParam *comp_param;
 246static QemuThread *compress_threads;
 247/* comp_done_cond is used to wake up the migration thread when
 248 * one of the compression threads has finished the compression.
  249 * comp_done_lock is used together with comp_done_cond.
 250 */
 251static QemuMutex comp_done_lock;
 252static QemuCond comp_done_cond;
 253/* The empty QEMUFileOps will be used by file in CompressParam */
 254static const QEMUFileOps empty_ops = { };
 255
 256static DecompressParam *decomp_param;
 257static QemuThread *decompress_threads;
 258static QemuMutex decomp_done_lock;
 259static QemuCond decomp_done_cond;
 260
 261static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 262                                ram_addr_t offset);
 263
 264static void *do_data_compress(void *opaque)
 265{
 266    CompressParam *param = opaque;
 267    RAMBlock *block;
 268    ram_addr_t offset;
 269
 270    qemu_mutex_lock(&param->mutex);
 271    while (!param->quit) {
 272        if (param->block) {
 273            block = param->block;
 274            offset = param->offset;
 275            param->block = NULL;
 276            qemu_mutex_unlock(&param->mutex);
 277
 278            do_compress_ram_page(param->file, block, offset);
 279
 280            qemu_mutex_lock(&comp_done_lock);
 281            param->done = true;
 282            qemu_cond_signal(&comp_done_cond);
 283            qemu_mutex_unlock(&comp_done_lock);
 284
 285            qemu_mutex_lock(&param->mutex);
 286        } else {
 287            qemu_cond_wait(&param->cond, &param->mutex);
 288        }
 289    }
 290    qemu_mutex_unlock(&param->mutex);
 291
 292    return NULL;
 293}
 294
 295static inline void terminate_compression_threads(void)
 296{
 297    int idx, thread_count;
 298
 299    thread_count = migrate_compress_threads();
 300
 301    for (idx = 0; idx < thread_count; idx++) {
 302        qemu_mutex_lock(&comp_param[idx].mutex);
 303        comp_param[idx].quit = true;
 304        qemu_cond_signal(&comp_param[idx].cond);
 305        qemu_mutex_unlock(&comp_param[idx].mutex);
 306    }
 307}
 308
 309static void compress_threads_save_cleanup(void)
 310{
 311    int i, thread_count;
 312
 313    if (!migrate_use_compression()) {
 314        return;
 315    }
 316    terminate_compression_threads();
 317    thread_count = migrate_compress_threads();
 318    for (i = 0; i < thread_count; i++) {
 319        qemu_thread_join(compress_threads + i);
 320        qemu_fclose(comp_param[i].file);
 321        qemu_mutex_destroy(&comp_param[i].mutex);
 322        qemu_cond_destroy(&comp_param[i].cond);
 323    }
 324    qemu_mutex_destroy(&comp_done_lock);
 325    qemu_cond_destroy(&comp_done_cond);
 326    g_free(compress_threads);
 327    g_free(comp_param);
 328    compress_threads = NULL;
 329    comp_param = NULL;
 330}
 331
 332static void compress_threads_save_setup(void)
 333{
 334    int i, thread_count;
 335
 336    if (!migrate_use_compression()) {
 337        return;
 338    }
 339    thread_count = migrate_compress_threads();
 340    compress_threads = g_new0(QemuThread, thread_count);
 341    comp_param = g_new0(CompressParam, thread_count);
 342    qemu_cond_init(&comp_done_cond);
 343    qemu_mutex_init(&comp_done_lock);
 344    for (i = 0; i < thread_count; i++) {
 345        /* comp_param[i].file is just used as a dummy buffer to save data,
 346         * set its ops to empty.
 347         */
 348        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 349        comp_param[i].done = true;
 350        comp_param[i].quit = false;
 351        qemu_mutex_init(&comp_param[i].mutex);
 352        qemu_cond_init(&comp_param[i].cond);
 353        qemu_thread_create(compress_threads + i, "compress",
 354                           do_data_compress, comp_param + i,
 355                           QEMU_THREAD_JOINABLE);
 356    }
 357}
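/*
 * Hand-off sketch for the helpers above and below: the migration thread
 * picks an idle thread (comp_param[idx].done == true), flushes its buffer
 * file, then takes comp_param[idx].mutex, fills in block/offset and
 * signals comp_param[idx].cond.  do_data_compress() wakes up, copies the
 * request, drops the mutex while compressing into comp_param[idx].file,
 * and then sets done = true under comp_done_lock and signals
 * comp_done_cond so the migration thread can hand it the next page.
 */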
 358
  359/**
  360 * save_page_header: write page header to wire
  361 *
  362 * If this block differs from the last block sent, it also writes the
  363 * block identification.  Returns the number of bytes written.
  364 *
  365 * @rs: current RAM state
  366 * @f: QEMUFile where to send the data
  367 * @block: block that contains the page we want to send
  368 * @offset: offset inside the block for the page
  369 *          in the lower bits, it contains flags
  370 */
 371static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 372                               ram_addr_t offset)
 373{
 374    size_t size, len;
 375
 376    if (block == rs->last_sent_block) {
 377        offset |= RAM_SAVE_FLAG_CONTINUE;
 378    }
 379    qemu_put_be64(f, offset);
 380    size = 8;
 381
 382    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 383        len = strlen(block->idstr);
 384        qemu_put_byte(f, len);
 385        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 386        size += 1 + len;
 387        rs->last_sent_block = block;
 388    }
 389    return size;
 390}
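/*
 * For example, the header for the first page sent from a hypothetical
 * block named "pc.ram" (not a continuation, so the id is included) is:
 *
 *     8 bytes  be64  offset | flags
 *     1 byte         strlen("pc.ram") == 6
 *     6 bytes        "pc.ram"
 *
 * giving a return value of 8 + 1 + 6 = 15.  Subsequent pages of the same
 * block carry RAM_SAVE_FLAG_CONTINUE and only the 8-byte word.
 */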
 391
 392/**
  393 * mig_throttle_guest_down: throttle down the guest
 394 *
 395 * Reduce amount of guest cpu execution to hopefully slow down memory
 396 * writes. If guest dirty memory rate is reduced below the rate at
 397 * which we can transfer pages to the destination then we should be
 398 * able to complete migration. Some workloads dirty memory way too
 399 * fast and will not effectively converge, even with auto-converge.
 400 */
 401static void mig_throttle_guest_down(void)
 402{
 403    MigrationState *s = migrate_get_current();
 404    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
  405    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 406
 407    /* We have not started throttling yet. Let's start it. */
 408    if (!cpu_throttle_active()) {
 409        cpu_throttle_set(pct_initial);
 410    } else {
 411        /* Throttling already on, just increase the rate */
  412        cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
 413    }
 414}
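/*
 * For instance, assuming cpu_throttle_initial=20 and
 * cpu_throttle_increment=10 (the usual defaults, though both are tunable
 * migration parameters), successive calls throttle the guest at
 * 20%, 30%, 40%, ... of its CPU time until migration converges or the
 * throttle ceiling is reached.
 */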
 415
 416/**
 417 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 418 *
 419 * @rs: current RAM state
 420 * @current_addr: address for the zero page
 421 *
 422 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 423 * The important thing is that a stale (not-yet-0'd) page be replaced
 424 * by the new data.
 425 * As a bonus, if the page wasn't in the cache it gets added so that
 426 * when a small write is made into the 0'd page it gets XBZRLE sent.
 427 */
 428static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 429{
 430    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 431        return;
 432    }
 433
 434    /* We don't care if this fails to allocate a new cache page
 435     * as long as it updated an old one */
 436    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 437                 ram_counters.dirty_sync_count);
 438}
 439
 440#define ENCODING_FLAG_XBZRLE 0x1
 441
 442/**
 443 * save_xbzrle_page: compress and send current page
 444 *
 445 * Returns: 1 means that we wrote the page
 446 *          0 means that page is identical to the one already sent
 447 *          -1 means that xbzrle would be longer than normal
 448 *
 449 * @rs: current RAM state
 450 * @current_data: pointer to the address of the page contents
 451 * @current_addr: addr of the page
 452 * @block: block that contains the page we want to send
 453 * @offset: offset inside the block for the page
 454 * @last_stage: if we are at the completion stage
 455 */
 456static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 457                            ram_addr_t current_addr, RAMBlock *block,
 458                            ram_addr_t offset, bool last_stage)
 459{
 460    int encoded_len = 0, bytes_xbzrle;
 461    uint8_t *prev_cached_page;
 462
 463    if (!cache_is_cached(XBZRLE.cache, current_addr,
 464                         ram_counters.dirty_sync_count)) {
 465        xbzrle_counters.cache_miss++;
 466        if (!last_stage) {
 467            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 468                             ram_counters.dirty_sync_count) == -1) {
 469                return -1;
 470            } else {
 471                /* update *current_data when the page has been
 472                   inserted into cache */
 473                *current_data = get_cached_data(XBZRLE.cache, current_addr);
 474            }
 475        }
 476        return -1;
 477    }
 478
 479    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 480
 481    /* save current buffer into memory */
 482    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 483
 484    /* XBZRLE encoding (if there is no overflow) */
 485    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 486                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 487                                       TARGET_PAGE_SIZE);
 488    if (encoded_len == 0) {
 489        trace_save_xbzrle_page_skipping();
 490        return 0;
 491    } else if (encoded_len == -1) {
 492        trace_save_xbzrle_page_overflow();
 493        xbzrle_counters.overflow++;
 494        /* update data in the cache */
 495        if (!last_stage) {
 496            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
 497            *current_data = prev_cached_page;
 498        }
 499        return -1;
 500    }
 501
  502    /* update the cache so the next encoding is diffed against what we just sent */
 503    if (!last_stage) {
 504        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 505    }
 506
 507    /* Send XBZRLE based compressed page */
 508    bytes_xbzrle = save_page_header(rs, rs->f, block,
 509                                    offset | RAM_SAVE_FLAG_XBZRLE);
 510    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 511    qemu_put_be16(rs->f, encoded_len);
 512    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 513    bytes_xbzrle += encoded_len + 1 + 2;
 514    xbzrle_counters.pages++;
 515    xbzrle_counters.bytes += bytes_xbzrle;
 516    ram_counters.transferred += bytes_xbzrle;
 517
 518    return 1;
 519}
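/*
 * On the wire an XBZRLE page therefore looks like:
 *
 *     save_page_header()  offset | RAM_SAVE_FLAG_XBZRLE (plus block id
 *                         if this is not a continuation)
 *     1 byte              ENCODING_FLAG_XBZRLE
 *     2 bytes  be16       encoded_len
 *     encoded_len bytes   the xbzrle-encoded delta against the cached copy
 *
 * which is why bytes_xbzrle above adds "encoded_len + 1 + 2" to the
 * header size.
 */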
 520
 521/**
 522 * migration_bitmap_find_dirty: find the next dirty page from start
 523 *
 524 * Called with rcu_read_lock() to protect migration_bitmap
 525 *
  526 * Returns the page offset within the memory region of the start of a dirty page
 527 *
 528 * @rs: current RAM state
 529 * @rb: RAMBlock where to search for dirty pages
 530 * @start: page where we start the search
 531 */
 532static inline
 533unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 534                                          unsigned long start)
 535{
 536    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 537    unsigned long *bitmap = rb->bmap;
 538    unsigned long next;
 539
 540    if (rs->ram_bulk_stage && start > 0) {
 541        next = start + 1;
 542    } else {
 543        next = find_next_bit(bitmap, size, start);
 544    }
 545
 546    return next;
 547}
 548
 549static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 550                                                RAMBlock *rb,
 551                                                unsigned long page)
 552{
 553    bool ret;
 554
 555    ret = test_and_clear_bit(page, rb->bmap);
 556
 557    if (ret) {
 558        rs->migration_dirty_pages--;
 559    }
 560    return ret;
 561}
 562
 563static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
 564                                        ram_addr_t start, ram_addr_t length)
 565{
 566    rs->migration_dirty_pages +=
 567        cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
 568                                              &rs->num_dirty_pages_period);
 569}
 570
 571/**
 572 * ram_pagesize_summary: calculate all the pagesizes of a VM
 573 *
 574 * Returns a summary bitmap of the page sizes of all RAMBlocks
 575 *
 576 * For VMs with just normal pages this is equivalent to the host page
 577 * size. If it's got some huge pages then it's the OR of all the
 578 * different page sizes.
 579 */
 580uint64_t ram_pagesize_summary(void)
 581{
 582    RAMBlock *block;
 583    uint64_t summary = 0;
 584
 585    RAMBLOCK_FOREACH(block) {
 586        summary |= block->page_size;
 587    }
 588
 589    return summary;
 590}
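/*
 * For example, a guest whose RAMBlocks use 4KiB pages plus one block
 * backed by 2MiB huge pages would report
 *     0x1000 | 0x200000 == 0x201000
 * so a caller can tell at a glance that more than one page size is in use.
 */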
 591
 592static void migration_bitmap_sync(RAMState *rs)
 593{
 594    RAMBlock *block;
 595    int64_t end_time;
 596    uint64_t bytes_xfer_now;
 597
 598    ram_counters.dirty_sync_count++;
 599
 600    if (!rs->time_last_bitmap_sync) {
 601        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 602    }
 603
 604    trace_migration_bitmap_sync_start();
 605    memory_global_dirty_log_sync();
 606
 607    qemu_mutex_lock(&rs->bitmap_mutex);
 608    rcu_read_lock();
 609    RAMBLOCK_FOREACH(block) {
 610        migration_bitmap_sync_range(rs, block, 0, block->used_length);
 611    }
 612    rcu_read_unlock();
 613    qemu_mutex_unlock(&rs->bitmap_mutex);
 614
 615    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
 616
 617    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 618
  619    /* more than 1 second = 1000 milliseconds */
 620    if (end_time > rs->time_last_bitmap_sync + 1000) {
 621        /* calculate period counters */
 622        ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 623            / (end_time - rs->time_last_bitmap_sync);
 624        bytes_xfer_now = ram_counters.transferred;
 625
 626        if (migrate_auto_converge()) {
  627            /* The following detection logic can be refined later. For now:
  628               check whether the bytes dirtied in this period exceed half of
  629               the bytes transferred since the last time we were in this
  630               routine. If that happens twice, start or increase
  631               throttling. */
 632
 633            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
 634                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
 635                (++rs->dirty_rate_high_cnt >= 2)) {
 636                    trace_migration_throttle();
 637                    rs->dirty_rate_high_cnt = 0;
 638                    mig_throttle_guest_down();
 639            }
 640        }
 641
 642        if (migrate_use_xbzrle()) {
 643            if (rs->iterations_prev != rs->iterations) {
 644                xbzrle_counters.cache_miss_rate =
 645                   (double)(xbzrle_counters.cache_miss -
 646                            rs->xbzrle_cache_miss_prev) /
 647                   (rs->iterations - rs->iterations_prev);
 648            }
 649            rs->iterations_prev = rs->iterations;
 650            rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 651        }
 652
 653        /* reset period counters */
 654        rs->time_last_bitmap_sync = end_time;
 655        rs->num_dirty_pages_period = 0;
 656        rs->bytes_xfer_prev = bytes_xfer_now;
 657    }
 658    if (migrate_use_events()) {
 659        qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
 660    }
 661}
 662
 663/**
 664 * save_zero_page: send the zero page to the stream
 665 *
  666 * Returns the number of pages written (-1 if the page was not a zero page).
 667 *
 668 * @rs: current RAM state
 669 * @block: block that contains the page we want to send
 670 * @offset: offset inside the block for the page
 671 * @p: pointer to the page
 672 */
 673static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
 674                          uint8_t *p)
 675{
 676    int pages = -1;
 677
 678    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 679        ram_counters.duplicate++;
 680        ram_counters.transferred +=
 681            save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
 682        qemu_put_byte(rs->f, 0);
 683        ram_counters.transferred += 1;
 684        pages = 1;
 685    }
 686
 687    return pages;
 688}
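/*
 * A zero page is therefore cheap on the wire: the save_page_header()
 * bytes plus a single 0 byte, e.g. 8 + 1 = 9 bytes for a page of an
 * already-announced block, instead of a full TARGET_PAGE_SIZE payload.
 */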
 689
 690static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
 691{
 692    if (!migrate_release_ram() || !migration_in_postcopy()) {
 693        return;
 694    }
 695
 696    ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
 697}
 698
  699/**
  700 * ram_save_page: send the given page to the stream
  701 *
  702 * Returns the number of pages written.
  703 *          < 0 - error
  704 *          >=0 - Number of pages written - this might legally be 0
  705 *                if xbzrle noticed the page was the same.
  706 *
  707 * @rs: current RAM state
  708 * @pss: data about the page we want to send (block and offset
  709 *       inside the block for the page)
  710 * @last_stage: if we are at the completion stage
  711 */
 712static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
 713{
 714    int pages = -1;
 715    uint64_t bytes_xmit;
 716    ram_addr_t current_addr;
 717    uint8_t *p;
 718    int ret;
 719    bool send_async = true;
 720    RAMBlock *block = pss->block;
 721    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 722
 723    p = block->host + offset;
 724    trace_ram_save_page(block->idstr, (uint64_t)offset, p);
 725
  726    /* When in doubt, send the page as a normal page */
 727    bytes_xmit = 0;
 728    ret = ram_control_save_page(rs->f, block->offset,
 729                           offset, TARGET_PAGE_SIZE, &bytes_xmit);
 730    if (bytes_xmit) {
 731        ram_counters.transferred += bytes_xmit;
 732        pages = 1;
 733    }
 734
 735    XBZRLE_cache_lock();
 736
 737    current_addr = block->offset + offset;
 738
 739    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 740        if (ret != RAM_SAVE_CONTROL_DELAYED) {
 741            if (bytes_xmit > 0) {
 742                ram_counters.normal++;
 743            } else if (bytes_xmit == 0) {
 744                ram_counters.duplicate++;
 745            }
 746        }
 747    } else {
 748        pages = save_zero_page(rs, block, offset, p);
 749        if (pages > 0) {
 750            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 751             * page would be stale
 752             */
 753            xbzrle_cache_zero_page(rs, current_addr);
 754            ram_release_pages(block->idstr, offset, pages);
 755        } else if (!rs->ram_bulk_stage &&
 756                   !migration_in_postcopy() && migrate_use_xbzrle()) {
 757            pages = save_xbzrle_page(rs, &p, current_addr, block,
 758                                     offset, last_stage);
 759            if (!last_stage) {
 760                /* Can't send this cached data async, since the cache page
 761                 * might get updated before it gets to the wire
 762                 */
 763                send_async = false;
 764            }
 765        }
 766    }
 767
 768    /* XBZRLE overflow or normal page */
 769    if (pages == -1) {
 770        ram_counters.transferred +=
 771            save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
 772        if (send_async) {
 773            qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
  774                                  migrate_release_ram() &&
 775                                  migration_in_postcopy());
 776        } else {
 777            qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
 778        }
 779        ram_counters.transferred += TARGET_PAGE_SIZE;
 780        pages = 1;
 781        ram_counters.normal++;
 782    }
 783
 784    XBZRLE_cache_unlock();
 785
 786    return pages;
 787}
 788
 789static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 790                                ram_addr_t offset)
 791{
 792    RAMState *rs = ram_state;
 793    int bytes_sent, blen;
 794    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
 795
 796    bytes_sent = save_page_header(rs, f, block, offset |
 797                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
 798    blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
 799                                     migrate_compress_level());
 800    if (blen < 0) {
 801        bytes_sent = 0;
 802        qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
 803        error_report("compressed data failed!");
 804    } else {
 805        bytes_sent += blen;
 806        ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
 807    }
 808
 809    return bytes_sent;
 810}
 811
 812static void flush_compressed_data(RAMState *rs)
 813{
 814    int idx, len, thread_count;
 815
 816    if (!migrate_use_compression()) {
 817        return;
 818    }
 819    thread_count = migrate_compress_threads();
 820
 821    qemu_mutex_lock(&comp_done_lock);
 822    for (idx = 0; idx < thread_count; idx++) {
 823        while (!comp_param[idx].done) {
 824            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 825        }
 826    }
 827    qemu_mutex_unlock(&comp_done_lock);
 828
 829    for (idx = 0; idx < thread_count; idx++) {
 830        qemu_mutex_lock(&comp_param[idx].mutex);
 831        if (!comp_param[idx].quit) {
 832            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
 833            ram_counters.transferred += len;
 834        }
 835        qemu_mutex_unlock(&comp_param[idx].mutex);
 836    }
 837}
 838
 839static inline void set_compress_params(CompressParam *param, RAMBlock *block,
 840                                       ram_addr_t offset)
 841{
 842    param->block = block;
 843    param->offset = offset;
 844}
 845
 846static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
 847                                           ram_addr_t offset)
 848{
 849    int idx, thread_count, bytes_xmit = -1, pages = -1;
 850
 851    thread_count = migrate_compress_threads();
 852    qemu_mutex_lock(&comp_done_lock);
 853    while (true) {
 854        for (idx = 0; idx < thread_count; idx++) {
 855            if (comp_param[idx].done) {
 856                comp_param[idx].done = false;
 857                bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
 858                qemu_mutex_lock(&comp_param[idx].mutex);
 859                set_compress_params(&comp_param[idx], block, offset);
 860                qemu_cond_signal(&comp_param[idx].cond);
 861                qemu_mutex_unlock(&comp_param[idx].mutex);
 862                pages = 1;
 863                ram_counters.normal++;
 864                ram_counters.transferred += bytes_xmit;
 865                break;
 866            }
 867        }
 868        if (pages > 0) {
 869            break;
 870        } else {
 871            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
 872        }
 873    }
 874    qemu_mutex_unlock(&comp_done_lock);
 875
 876    return pages;
 877}
 878
  879/**
  880 * ram_save_compressed_page: compress the given page and send it to the stream
  881 *
  882 * Returns the number of pages written.
  883 *
  884 * @rs: current RAM state
  885 * @pss: data about the page we want to send (block and offset
  886 *       inside the block for the page)
  887 * @last_stage: if we are at the completion stage
  888 */
 889static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
 890                                    bool last_stage)
 891{
 892    int pages = -1;
 893    uint64_t bytes_xmit = 0;
 894    uint8_t *p;
 895    int ret, blen;
 896    RAMBlock *block = pss->block;
 897    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 898
 899    p = block->host + offset;
 900
 901    ret = ram_control_save_page(rs->f, block->offset,
 902                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
 903    if (bytes_xmit) {
 904        ram_counters.transferred += bytes_xmit;
 905        pages = 1;
 906    }
 907    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 908        if (ret != RAM_SAVE_CONTROL_DELAYED) {
 909            if (bytes_xmit > 0) {
 910                ram_counters.normal++;
 911            } else if (bytes_xmit == 0) {
 912                ram_counters.duplicate++;
 913            }
 914        }
 915    } else {
  916        /* When starting to process a new block, the first page of
  917         * the block must be sent out before other pages in the same
  918         * block, and all the pages in the last block must have been
  919         * sent out already.  Keeping this order is important because
  920         * the 'cont' flag is used to avoid resending the block name.
  921         */
 922        if (block != rs->last_sent_block) {
 923            flush_compressed_data(rs);
 924            pages = save_zero_page(rs, block, offset, p);
 925            if (pages == -1) {
 926                /* Make sure the first page is sent out before other pages */
 927                bytes_xmit = save_page_header(rs, rs->f, block, offset |
 928                                              RAM_SAVE_FLAG_COMPRESS_PAGE);
 929                blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
 930                                                 migrate_compress_level());
 931                if (blen > 0) {
 932                    ram_counters.transferred += bytes_xmit + blen;
 933                    ram_counters.normal++;
 934                    pages = 1;
 935                } else {
 936                    qemu_file_set_error(rs->f, blen);
 937                    error_report("compressed data failed!");
 938                }
 939            }
 940            if (pages > 0) {
 941                ram_release_pages(block->idstr, offset, pages);
 942            }
 943        } else {
 944            pages = save_zero_page(rs, block, offset, p);
 945            if (pages == -1) {
 946                pages = compress_page_with_multi_thread(rs, block, offset);
 947            } else {
 948                ram_release_pages(block->idstr, offset, pages);
 949            }
 950        }
 951    }
 952
 953    return pages;
 954}
 955
 956/**
 957 * find_dirty_block: find the next dirty page and update any state
 958 * associated with the search process.
 959 *
  960 * Returns true if a page is found
 961 *
 962 * @rs: current RAM state
 963 * @pss: data about the state of the current dirty page scan
 964 * @again: set to false if the search has scanned the whole of RAM
 965 */
 966static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
 967{
 968    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
 969    if (pss->complete_round && pss->block == rs->last_seen_block &&
 970        pss->page >= rs->last_page) {
 971        /*
 972         * We've been once around the RAM and haven't found anything.
 973         * Give up.
 974         */
 975        *again = false;
 976        return false;
 977    }
 978    if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
 979        /* Didn't find anything in this RAM Block */
 980        pss->page = 0;
 981        pss->block = QLIST_NEXT_RCU(pss->block, next);
 982        if (!pss->block) {
 983            /* Hit the end of the list */
 984            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
 985            /* Flag that we've looped */
 986            pss->complete_round = true;
 987            rs->ram_bulk_stage = false;
 988            if (migrate_use_xbzrle()) {
 989                /* If xbzrle is on, stop using the data compression at this
 990                 * point. In theory, xbzrle can do better than compression.
 991                 */
 992                flush_compressed_data(rs);
 993            }
 994        }
 995        /* Didn't find anything this time, but try again on the new block */
 996        *again = true;
 997        return false;
 998    } else {
 999        /* Can go around again, but... */
1000        *again = true;
1001        /* We've found something so probably don't need to */
1002        return true;
1003    }
1004}
1005
1006/**
 1007 * unqueue_page: gets a page off the queue
1008 *
1009 * Helper for 'get_queued_page' - gets a page off the queue
1010 *
1011 * Returns the block of the page (or NULL if none available)
1012 *
1013 * @rs: current RAM state
1014 * @offset: used to return the offset within the RAMBlock
1015 */
1016static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1017{
1018    RAMBlock *block = NULL;
1019
1020    qemu_mutex_lock(&rs->src_page_req_mutex);
1021    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1022        struct RAMSrcPageRequest *entry =
1023                                QSIMPLEQ_FIRST(&rs->src_page_requests);
1024        block = entry->rb;
1025        *offset = entry->offset;
1026
1027        if (entry->len > TARGET_PAGE_SIZE) {
1028            entry->len -= TARGET_PAGE_SIZE;
1029            entry->offset += TARGET_PAGE_SIZE;
1030        } else {
1031            memory_region_unref(block->mr);
1032            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1033            g_free(entry);
1034        }
1035    }
1036    qemu_mutex_unlock(&rs->src_page_req_mutex);
1037
1038    return block;
1039}
1040
1041/**
 1042 * get_queued_page: unqueue a page from the postcopy requests
 1043 *
 1044 * Skips pages that are already sent (!dirty)
 1045 *
 1046 * Returns true if a queued page is found
1047 *
1048 * @rs: current RAM state
1049 * @pss: data about the state of the current dirty page scan
1050 */
1051static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1052{
1053    RAMBlock  *block;
1054    ram_addr_t offset;
1055    bool dirty;
1056
1057    do {
1058        block = unqueue_page(rs, &offset);
1059        /*
1060         * We're sending this page, and since it's postcopy nothing else
1061         * will dirty it, and we must make sure it doesn't get sent again
1062         * even if this queue request was received after the background
1063         * search already sent it.
1064         */
1065        if (block) {
1066            unsigned long page;
1067
1068            page = offset >> TARGET_PAGE_BITS;
1069            dirty = test_bit(page, block->bmap);
1070            if (!dirty) {
1071                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1072                       page, test_bit(page, block->unsentmap));
1073            } else {
1074                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1075            }
1076        }
1077
1078    } while (block && !dirty);
1079
1080    if (block) {
1081        /*
 1082         * As soon as we start servicing pages out of order, we have
 1083         * to kill the bulk stage, since the bulk stage assumes
 1084         * (in migration_bitmap_find_dirty) that every page is
 1085         * dirty; that's no longer true.
 1086         */
1087        rs->ram_bulk_stage = false;
1088
1089        /*
1090         * We want the background search to continue from the queued page
1091         * since the guest is likely to want other pages near to the page
1092         * it just requested.
1093         */
1094        pss->block = block;
1095        pss->page = offset >> TARGET_PAGE_BITS;
1096    }
1097
1098    return !!block;
1099}
1100
1101/**
1102 * migration_page_queue_free: drop any remaining pages in the ram
1103 * request queue
1104 *
1105 * It should be empty at the end anyway, but in error cases there may
 1106 * be some left over; if so, we drop them.
1107 *
1108 */
1109static void migration_page_queue_free(RAMState *rs)
1110{
1111    struct RAMSrcPageRequest *mspr, *next_mspr;
1112    /* This queue generally should be empty - but in the case of a failed
1113     * migration might have some droppings in.
1114     */
1115    rcu_read_lock();
1116    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1117        memory_region_unref(mspr->rb->mr);
1118        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1119        g_free(mspr);
1120    }
1121    rcu_read_unlock();
1122}
1123
1124/**
1125 * ram_save_queue_pages: queue the page for transmission
1126 *
1127 * A request from postcopy destination for example.
1128 *
1129 * Returns zero on success or negative on error
1130 *
 1131 * @rbname: Name of the RAMBlock of the request. NULL means the
 1132 *          same as the last one.
1133 * @start: starting address from the start of the RAMBlock
1134 * @len: length (in bytes) to send
1135 */
1136int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1137{
1138    RAMBlock *ramblock;
1139    RAMState *rs = ram_state;
1140
1141    ram_counters.postcopy_requests++;
1142    rcu_read_lock();
1143    if (!rbname) {
1144        /* Reuse last RAMBlock */
1145        ramblock = rs->last_req_rb;
1146
1147        if (!ramblock) {
1148            /*
1149             * Shouldn't happen, we can't reuse the last RAMBlock if
1150             * it's the 1st request.
1151             */
1152            error_report("ram_save_queue_pages no previous block");
1153            goto err;
1154        }
1155    } else {
1156        ramblock = qemu_ram_block_by_name(rbname);
1157
1158        if (!ramblock) {
1159            /* We shouldn't be asked for a non-existent RAMBlock */
1160            error_report("ram_save_queue_pages no block '%s'", rbname);
1161            goto err;
1162        }
1163        rs->last_req_rb = ramblock;
1164    }
1165    trace_ram_save_queue_pages(ramblock->idstr, start, len);
1166    if (start+len > ramblock->used_length) {
1167        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1168                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1169                     __func__, start, len, ramblock->used_length);
1170        goto err;
1171    }
1172
1173    struct RAMSrcPageRequest *new_entry =
1174        g_malloc0(sizeof(struct RAMSrcPageRequest));
1175    new_entry->rb = ramblock;
1176    new_entry->offset = start;
1177    new_entry->len = len;
1178
1179    memory_region_ref(ramblock->mr);
1180    qemu_mutex_lock(&rs->src_page_req_mutex);
1181    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1182    qemu_mutex_unlock(&rs->src_page_req_mutex);
1183    rcu_read_unlock();
1184
1185    return 0;
1186
1187err:
1188    rcu_read_unlock();
1189    return -1;
1190}
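/*
 * A sketch of how this is driven (the block name and addresses are made
 * up for illustration): when the destination faults on a page during
 * postcopy it sends back a request that ends up here, e.g.
 *
 *     ram_save_queue_pages("pc.ram", 0x200000, TARGET_PAGE_SIZE);
 *
 * which queues one entry on rs->src_page_requests for get_queued_page()
 * to service ahead of the background scan.
 */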
1191
1192/**
1193 * ram_save_target_page: save one target page
1194 *
1195 * Returns the number of pages written
1196 *
1197 * @rs: current RAM state
1198 * @ms: current migration state
1199 * @pss: data about the page we want to send
1200 * @last_stage: if we are at the completion stage
1201 */
1202static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1203                                bool last_stage)
1204{
1205    int res = 0;
1206
 1207    /* Check if the page is dirty and, if so, send it */
1208    if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1209        /*
1210         * If xbzrle is on, stop using the data compression after first
1211         * round of migration even if compression is enabled. In theory,
1212         * xbzrle can do better than compression.
1213         */
1214        if (migrate_use_compression() &&
1215            (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1216            res = ram_save_compressed_page(rs, pss, last_stage);
1217        } else {
1218            res = ram_save_page(rs, pss, last_stage);
1219        }
1220
1221        if (res < 0) {
1222            return res;
1223        }
1224        if (pss->block->unsentmap) {
1225            clear_bit(pss->page, pss->block->unsentmap);
1226        }
1227    }
1228
1229    return res;
1230}
1231
1232/**
1233 * ram_save_host_page: save a whole host page
1234 *
1235 * Starting at *offset send pages up to the end of the current host
1236 * page. It's valid for the initial offset to point into the middle of
1237 * a host page in which case the remainder of the hostpage is sent.
1238 * Only dirty target pages are sent. Note that the host page size may
1239 * be a huge page for this block.
1240 * The saving stops at the boundary of the used_length of the block
1241 * if the RAMBlock isn't a multiple of the host page size.
1242 *
1243 * Returns the number of pages written or negative on error
1244 *
1245 * @rs: current RAM state
1246 * @ms: current migration state
1247 * @pss: data about the page we want to send
1248 * @last_stage: if we are at the completion stage
1249 */
1250static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1251                              bool last_stage)
1252{
1253    int tmppages, pages = 0;
1254    size_t pagesize_bits =
1255        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1256
1257    do {
1258        tmppages = ram_save_target_page(rs, pss, last_stage);
1259        if (tmppages < 0) {
1260            return tmppages;
1261        }
1262
1263        pages += tmppages;
1264        pss->page++;
1265    } while ((pss->page & (pagesize_bits - 1)) &&
1266             offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1267
1268    /* The offset we leave with is the last one we looked at */
1269    pss->page--;
1270    return pages;
1271}
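/*
 * For example, for a RAMBlock backed by 2MiB huge pages with 4KiB target
 * pages, pagesize_bits is 512, so a single call can send up to 512
 * target pages; for a normal block pagesize_bits is 1 and exactly one
 * target page is processed per call.
 */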
1272
1273/**
1274 * ram_find_and_save_block: finds a dirty page and sends it to f
1275 *
1276 * Called within an RCU critical section.
1277 *
1278 * Returns the number of pages written where zero means no dirty pages
1279 *
1280 * @rs: current RAM state
1281 * @last_stage: if we are at the completion stage
1282 *
1283 * On systems where host-page-size > target-page-size it will send all the
1284 * pages in a host page that are dirty.
1285 */
1286
1287static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1288{
1289    PageSearchStatus pss;
1290    int pages = 0;
1291    bool again, found;
1292
1293    /* No dirty page as there is zero RAM */
1294    if (!ram_bytes_total()) {
1295        return pages;
1296    }
1297
1298    pss.block = rs->last_seen_block;
1299    pss.page = rs->last_page;
1300    pss.complete_round = false;
1301
1302    if (!pss.block) {
1303        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1304    }
1305
1306    do {
1307        again = true;
1308        found = get_queued_page(rs, &pss);
1309
1310        if (!found) {
1311            /* priority queue empty, so just search for something dirty */
1312            found = find_dirty_block(rs, &pss, &again);
1313        }
1314
1315        if (found) {
1316            pages = ram_save_host_page(rs, &pss, last_stage);
1317        }
1318    } while (!pages && again);
1319
1320    rs->last_seen_block = pss.block;
1321    rs->last_page = pss.page;
1322
1323    return pages;
1324}
1325
1326void acct_update_position(QEMUFile *f, size_t size, bool zero)
1327{
1328    uint64_t pages = size / TARGET_PAGE_SIZE;
1329
1330    if (zero) {
1331        ram_counters.duplicate += pages;
1332    } else {
1333        ram_counters.normal += pages;
1334        ram_counters.transferred += size;
1335        qemu_update_position(f, size);
1336    }
1337}
1338
1339uint64_t ram_bytes_total(void)
1340{
1341    RAMBlock *block;
1342    uint64_t total = 0;
1343
1344    rcu_read_lock();
1345    RAMBLOCK_FOREACH(block) {
1346        total += block->used_length;
1347    }
1348    rcu_read_unlock();
1349    return total;
1350}
1351
1352static void xbzrle_load_setup(void)
1353{
1354    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1355}
1356
1357static void xbzrle_load_cleanup(void)
1358{
1359    g_free(XBZRLE.decoded_buf);
1360    XBZRLE.decoded_buf = NULL;
1361}
1362
1363static void ram_save_cleanup(void *opaque)
1364{
1365    RAMState **rsp = opaque;
1366    RAMBlock *block;
1367
 1368    /* The caller must hold the iothread lock or be in a bottom half, so
 1369     * there is no write race against the migration bitmap.
 1370     */
1371    memory_global_dirty_log_stop();
1372
1373    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1374        g_free(block->bmap);
1375        block->bmap = NULL;
1376        g_free(block->unsentmap);
1377        block->unsentmap = NULL;
1378    }
1379
1380    XBZRLE_cache_lock();
1381    if (XBZRLE.cache) {
1382        cache_fini(XBZRLE.cache);
1383        g_free(XBZRLE.encoded_buf);
1384        g_free(XBZRLE.current_buf);
1385        g_free(XBZRLE.zero_target_page);
1386        XBZRLE.cache = NULL;
1387        XBZRLE.encoded_buf = NULL;
1388        XBZRLE.current_buf = NULL;
1389        XBZRLE.zero_target_page = NULL;
1390    }
1391    XBZRLE_cache_unlock();
1392    migration_page_queue_free(*rsp);
1393    compress_threads_save_cleanup();
1394    g_free(*rsp);
1395    *rsp = NULL;
1396}
1397
1398static void ram_state_reset(RAMState *rs)
1399{
1400    rs->last_seen_block = NULL;
1401    rs->last_sent_block = NULL;
1402    rs->last_page = 0;
1403    rs->last_version = ram_list.version;
1404    rs->ram_bulk_stage = true;
1405}
1406
1407#define MAX_WAIT 50 /* ms, half buffered_file limit */
1408
1409/*
1410 * 'expected' is the value you expect the bitmap mostly to be full
1411 * of; it won't bother printing lines that are all this value.
1412 * If 'todump' is null the migration bitmap is dumped.
1413 */
1414void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1415                           unsigned long pages)
1416{
1417    int64_t cur;
1418    int64_t linelen = 128;
1419    char linebuf[129];
1420
1421    for (cur = 0; cur < pages; cur += linelen) {
1422        int64_t curb;
1423        bool found = false;
1424        /*
1425         * Last line; catch the case where the line length
1426         * is longer than remaining ram
1427         */
1428        if (cur + linelen > pages) {
1429            linelen = pages - cur;
1430        }
1431        for (curb = 0; curb < linelen; curb++) {
1432            bool thisbit = test_bit(cur + curb, todump);
1433            linebuf[curb] = thisbit ? '1' : '.';
1434            found = found || (thisbit != expected);
1435        }
1436        if (found) {
1437            linebuf[curb] = '\0';
1438            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1439        }
1440    }
1441}
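/*
 * Example output line (with expected == false): a line such as
 *
 *     0x00000080 : ..1.............1...   (one character per page,
 *                                           up to 128 per line)
 *
 * marks dirty pages with '1'; lines whose bits all match 'expected'
 * are not printed.
 */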
1442
1443/* **** functions for postcopy ***** */
1444
1445void ram_postcopy_migrated_memory_release(MigrationState *ms)
1446{
1447    struct RAMBlock *block;
1448
1449    RAMBLOCK_FOREACH(block) {
1450        unsigned long *bitmap = block->bmap;
1451        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1452        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1453
1454        while (run_start < range) {
1455            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1456            ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1457                              (run_end - run_start) << TARGET_PAGE_BITS);
1458            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1459        }
1460    }
1461}
1462
1463/**
1464 * postcopy_send_discard_bm_ram: discard a RAMBlock
1465 *
1466 * Returns zero on success
1467 *
1468 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1469 * Note: At this point the 'unsentmap' is the processed bitmap combined
1470 *       with the dirtymap; so a '1' means it's either dirty or unsent.
1471 *
1472 * @ms: current migration state
1473 * @pds: state for postcopy
 1474 * @block: RAMBlock to discard; its unsentmap is walked over the
 1475 *         whole used length of the block
1476 */
1477static int postcopy_send_discard_bm_ram(MigrationState *ms,
1478                                        PostcopyDiscardState *pds,
1479                                        RAMBlock *block)
1480{
1481    unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1482    unsigned long current;
1483    unsigned long *unsentmap = block->unsentmap;
1484
1485    for (current = 0; current < end; ) {
1486        unsigned long one = find_next_bit(unsentmap, end, current);
1487
1488        if (one <= end) {
1489            unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1490            unsigned long discard_length;
1491
1492            if (zero >= end) {
1493                discard_length = end - one;
1494            } else {
1495                discard_length = zero - one;
1496            }
1497            if (discard_length) {
1498                postcopy_discard_send_range(ms, pds, one, discard_length);
1499            }
1500            current = one + discard_length;
1501        } else {
1502            current = one;
1503        }
1504    }
1505
1506    return 0;
1507}
1508
1509/**
1510 * postcopy_each_ram_send_discard: discard all RAMBlocks
1511 *
1512 * Returns 0 for success or negative for error
1513 *
1514 * Utility for the outgoing postcopy code.
1515 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1516 *   passing it bitmap indexes and name.
1517 * (qemu_ram_foreach_block ends up passing unscaled lengths
1518 *  which would mean postcopy code would have to deal with target page)
1519 *
1520 * @ms: current migration state
1521 */
1522static int postcopy_each_ram_send_discard(MigrationState *ms)
1523{
1524    struct RAMBlock *block;
1525    int ret;
1526
1527    RAMBLOCK_FOREACH(block) {
1528        PostcopyDiscardState *pds =
1529            postcopy_discard_send_init(ms, block->idstr);
1530
1531        /*
1532         * Postcopy sends chunks of bitmap over the wire, but it
1533         * just needs indexes at this point, avoids it having
1534         * target page specific code.
1535         */
1536        ret = postcopy_send_discard_bm_ram(ms, pds, block);
1537        postcopy_discard_send_finish(ms, pds);
1538        if (ret) {
1539            return ret;
1540        }
1541    }
1542
1543    return 0;
1544}
1545
1546/**
 1547 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1548 *
1549 * Helper for postcopy_chunk_hostpages; it's called twice to
1550 * canonicalize the two bitmaps, that are similar, but one is
1551 * inverted.
1552 *
1553 * Postcopy requires that all target pages in a hostpage are dirty or
1554 * clean, not a mix.  This function canonicalizes the bitmaps.
1555 *
1556 * @ms: current migration state
1557 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1558 *               otherwise we need to canonicalize partially dirty host pages
1559 * @block: block that contains the page we want to canonicalize
1560 * @pds: state for postcopy
1561 */
1562static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1563                                          RAMBlock *block,
1564                                          PostcopyDiscardState *pds)
1565{
1566    RAMState *rs = ram_state;
1567    unsigned long *bitmap = block->bmap;
1568    unsigned long *unsentmap = block->unsentmap;
1569    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1570    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1571    unsigned long run_start;
1572
1573    if (block->page_size == TARGET_PAGE_SIZE) {
1574        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1575        return;
1576    }
1577
1578    if (unsent_pass) {
1579        /* Find a sent page */
1580        run_start = find_next_zero_bit(unsentmap, pages, 0);
1581    } else {
1582        /* Find a dirty page */
1583        run_start = find_next_bit(bitmap, pages, 0);
1584    }
1585
1586    while (run_start < pages) {
1587        bool do_fixup = false;
1588        unsigned long fixup_start_addr;
1589        unsigned long host_offset;
1590
1591        /*
1592         * If the start of this run of pages is in the middle of a host
1593         * page, then we need to fixup this host page.
1594         */
1595        host_offset = run_start % host_ratio;
1596        if (host_offset) {
1597            do_fixup = true;
1598            run_start -= host_offset;
1599            fixup_start_addr = run_start;
1600            /* For the next pass */
1601            run_start = run_start + host_ratio;
1602        } else {
1603            /* Find the end of this run */
1604            unsigned long run_end;
1605            if (unsent_pass) {
1606                run_end = find_next_bit(unsentmap, pages, run_start + 1);
1607            } else {
1608                run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1609            }
1610            /*
1611             * If the end isn't at the start of a host page, then the
1612             * run doesn't finish at the end of a host page
1613             * and we need to discard.
1614             */
1615            host_offset = run_end % host_ratio;
1616            if (host_offset) {
1617                do_fixup = true;
1618                fixup_start_addr = run_end - host_offset;
1619                /*
1620                 * This host page has gone, the next loop iteration starts
1621                 * from after the fixup
1622                 */
1623                run_start = fixup_start_addr + host_ratio;
1624            } else {
1625                /*
1626                 * No discards on this iteration, next loop starts from
1627                 * next sent/dirty page
1628                 */
1629                run_start = run_end + 1;
1630            }
1631        }
1632
1633        if (do_fixup) {
1634            unsigned long page;
1635
1636            /* Tell the destination to discard this page */
1637            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1638                /* For the unsent_pass we:
1639                 *     discard partially sent pages
1640                 * For the !unsent_pass (dirty) we:
1641                 *     discard partially dirty pages that were sent
1642                 *     (any partially sent pages were already discarded
1643                 *     by the previous unsent_pass)
1644                 */
1645                postcopy_discard_send_range(ms, pds, fixup_start_addr,
1646                                            host_ratio);
1647            }
1648
1649            /* Clean up the bitmap */
1650            for (page = fixup_start_addr;
1651                 page < fixup_start_addr + host_ratio; page++) {
1652                /* All pages in this host page are now not sent */
1653                set_bit(page, unsentmap);
1654
1655                /*
1656                 * Remark them as dirty, updating the count for any pages
1657                 * that weren't previously dirty.
1658                 */
1659                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1660            }
1661        }
1662
1663        if (unsent_pass) {
1664            /* Find the next sent page for the next iteration */
1665            run_start = find_next_zero_bit(unsentmap, pages, run_start);
1666        } else {
1667            /* Find the next dirty page for the next iteration */
1668            run_start = find_next_bit(bitmap, pages, run_start);
1669        }
1670    }
1671}
1672
1673/**
1674 * postcopy_chunk_hostpages: discard any partially sent host page
1675 *
1676 * Utility for the outgoing postcopy code.
1677 *
1678 * Discard any partially sent host-page size chunks, mark any partially
1679 * dirty host-page size chunks as all dirty.  In this case the host-page
1680 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1681 *
1682 * Returns zero on success
1683 *
1684 * @ms: current migration state
1685 * @block: block we want to work with
1686 */
1687static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1688{
1689    PostcopyDiscardState *pds =
1690        postcopy_discard_send_init(ms, block->idstr);
1691
1692    /* First pass: Discard all partially sent host pages */
1693    postcopy_chunk_hostpages_pass(ms, true, block, pds);
1694    /*
1695     * Second pass: Ensure that all partially dirty host pages are made
1696     * fully dirty.
1697     */
1698    postcopy_chunk_hostpages_pass(ms, false, block, pds);
1699
1700    postcopy_discard_send_finish(ms, pds);
1701    return 0;
1702}
1703
1704/**
1705 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1706 *
1707 * Returns zero on success
1708 *
1709 * Transmit the set of pages to be discarded after precopy to the target;
1710 * these are pages that:
1711 *     a) have been previously transmitted but are now dirty again
1712 *     b) have never been transmitted; this ensures that any pages on the
1713 *        destination that have been mapped by background tasks get
1714 *        discarded (transparent huge pages are the specific concern)
1715 * Hopefully this is pretty sparse
1716 *
1717 * @ms: current migration state
1718 */
1719int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1720{
1721    RAMState *rs = ram_state;
1722    RAMBlock *block;
1723    int ret;
1724
1725    rcu_read_lock();
1726
1727    /* This should be our last sync, the src is now paused */
1728    migration_bitmap_sync(rs);
1729
1730    /* Easiest way to make sure we don't resume in the middle of a host-page */
1731    rs->last_seen_block = NULL;
1732    rs->last_sent_block = NULL;
1733    rs->last_page = 0;
1734
1735    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1736        unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1737        unsigned long *bitmap = block->bmap;
1738        unsigned long *unsentmap = block->unsentmap;
1739
1740        if (!unsentmap) {
1741            /* We don't have a safe way to resize the unsentmap, so
1742             * if the bitmap was resized it will be NULL at this
1743             * point.
1744             */
1745            error_report("migration ram resized during precopy phase");
1746            rcu_read_unlock();
1747            return -EINVAL;
1748        }
1749        /* Deal with TPS != HPS and huge pages */
1750        ret = postcopy_chunk_hostpages(ms, block);
1751        if (ret) {
1752            rcu_read_unlock();
1753            return ret;
1754        }
1755
1756        /*
1757         * Update the unsentmap to be unsentmap = unsentmap | dirty
1758         */
1759        bitmap_or(unsentmap, unsentmap, bitmap, pages);
1760#ifdef DEBUG_POSTCOPY
1761        ram_debug_dump_bitmap(unsentmap, true, pages);
1762#endif
1763    }
1764    trace_ram_postcopy_send_discard_bitmap();
1765
1766    ret = postcopy_each_ram_send_discard(ms);
1767    rcu_read_unlock();
1768
1769    return ret;
1770}
1771
1772/**
1773 * ram_discard_range: discard dirtied pages at the beginning of postcopy
1774 *
1775 * Returns zero on success
1776 *
1777 * @rbname: name of the RAMBlock of the request. NULL means the
1778 *          same as the last one.
1779 * @start: start offset within the RAMBlock
1780 * @length: length of the range to discard
1781 */
1782int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1783{
1784    int ret = -1;
1785
1786    trace_ram_discard_range(rbname, start, length);
1787
1788    rcu_read_lock();
1789    RAMBlock *rb = qemu_ram_block_by_name(rbname);
1790
1791    if (!rb) {
1792        error_report("ram_discard_range: Failed to find block '%s'", rbname);
1793        goto err;
1794    }
1795
1796    ret = ram_block_discard_range(rb, start, length);
1797
1798err:
1799    rcu_read_unlock();
1800
1801    return ret;
1802}
1803
1804static int ram_state_init(RAMState **rsp)
1805{
1806    *rsp = g_new0(RAMState, 1);
1807
1808    qemu_mutex_init(&(*rsp)->bitmap_mutex);
1809    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
1810    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
1811
1812    if (migrate_use_xbzrle()) {
1813        XBZRLE_cache_lock();
1814        XBZRLE.zero_target_page = g_malloc0(TARGET_PAGE_SIZE);
1815        XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1816                                  TARGET_PAGE_SIZE,
1817                                  TARGET_PAGE_SIZE);
1818        if (!XBZRLE.cache) {
1819            XBZRLE_cache_unlock();
1820            error_report("Error creating cache");
1821            g_free(*rsp);
1822            *rsp = NULL;
1823            return -1;
1824        }
1825        XBZRLE_cache_unlock();
1826
1827        /* We prefer not to abort if there is no memory */
1828        XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1829        if (!XBZRLE.encoded_buf) {
1830            error_report("Error allocating encoded_buf");
1831            g_free(*rsp);
1832            *rsp = NULL;
1833            return -1;
1834        }
1835
1836        XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1837        if (!XBZRLE.current_buf) {
1838            error_report("Error allocating current_buf");
1839            g_free(XBZRLE.encoded_buf);
1840            XBZRLE.encoded_buf = NULL;
1841            g_free(*rsp);
1842            *rsp = NULL;
1843            return -1;
1844        }
1845    }
1846
1847    /* For memory_global_dirty_log_start below.  */
1848    qemu_mutex_lock_iothread();
1849
1850    qemu_mutex_lock_ramlist();
1851    rcu_read_lock();
1852    ram_state_reset(*rsp);
1853
1854    /* Skip setting bitmap if there is no RAM */
1855    if (ram_bytes_total()) {
1856        RAMBlock *block;
1857
1858        QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1859            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
1860
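                /* Start with every page dirty so the first pass sends all of RAM */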
1861            block->bmap = bitmap_new(pages);
1862            bitmap_set(block->bmap, 0, pages);
1863            if (migrate_postcopy_ram()) {
1864                block->unsentmap = bitmap_new(pages);
1865                bitmap_set(block->unsentmap, 0, pages);
1866            }
1867        }
1868    }
1869
1870    /*
1871     * Count the total number of pages used by ram blocks not including any
1872     * gaps due to alignment or unplugs.
1873     */
1874    (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1875
1876    memory_global_dirty_log_start();
1877    migration_bitmap_sync(*rsp);
1878    qemu_mutex_unlock_ramlist();
1879    qemu_mutex_unlock_iothread();
1880    rcu_read_unlock();
1881
1882    return 0;
1883}
1884
1885/*
1886 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
1887 * long-running RCU critical section.  When RCU reclaims in the code
1888 * start to become numerous, it will be necessary to reduce the
1889 * granularity of these critical sections.
1890 */
1891
1892/**
1893 * ram_save_setup: Setup RAM for migration
1894 *
1895 * Returns zero to indicate success and negative for error
1896 *
1897 * @f: QEMUFile where to send the data
1898 * @opaque: RAMState pointer
1899 */
1900static int ram_save_setup(QEMUFile *f, void *opaque)
1901{
1902    RAMState **rsp = opaque;
1903    RAMBlock *block;
1904
1905    /* migration has already setup the bitmap, reuse it. */
1906    if (!migration_in_colo_state()) {
1907        if (ram_state_init(rsp) != 0) {
1908            return -1;
1909        }
1910    }
1911    (*rsp)->f = f;
1912
1913    rcu_read_lock();
1914
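    /* The total RAM size is sent with the MEM_SIZE flag ORed into its low
     * bits; ram_load() masks the flags off again with TARGET_PAGE_MASK. */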
1915    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1916
1917    RAMBLOCK_FOREACH(block) {
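        /* Per-block header: idstr length byte, idstr, used_length, and (only
         * for postcopy with a non-host page size) the block's page size. */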
1918        qemu_put_byte(f, strlen(block->idstr));
1919        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1920        qemu_put_be64(f, block->used_length);
1921        if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1922            qemu_put_be64(f, block->page_size);
1923        }
1924    }
1925
1926    rcu_read_unlock();
1927    compress_threads_save_setup();
1928
1929    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1930    ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1931
1932    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1933
1934    return 0;
1935}
1936
1937/**
1938 * ram_save_iterate: iterative stage for migration
1939 *
1940 * Returns zero to indicate success and negative for error
1941 *
1942 * @f: QEMUFile where to send the data
1943 * @opaque: RAMState pointer
1944 */
1945static int ram_save_iterate(QEMUFile *f, void *opaque)
1946{
1947    RAMState **temp = opaque;
1948    RAMState *rs = *temp;
1949    int ret;
1950    int i;
1951    int64_t t0;
1952    int done = 0;
1953
1954    rcu_read_lock();
1955    if (ram_list.version != rs->last_version) {
1956        ram_state_reset(rs);
1957    }
1958
1959    /* Read version before ram_list.blocks */
1960    smp_rmb();
1961
1962    ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1963
1964    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1965    i = 0;
1966    while ((ret = qemu_file_rate_limit(f)) == 0) {
1967        int pages;
1968
1969        pages = ram_find_and_save_block(rs, false);
1970        /* no more pages to send */
1971        if (pages == 0) {
1972            done = 1;
1973            break;
1974        }
1975        rs->iterations++;
1976
1977        /* we want to check in the 1st loop, just in case it was the 1st time
1978           and we had to sync the dirty bitmap.
1979           qemu_clock_get_ns() is a bit expensive, so we only check once
1980           every few iterations
1981        */
1982        if ((i & 63) == 0) {
1983            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
1984            if (t1 > MAX_WAIT) {
1985                trace_ram_save_iterate_big_wait(t1, i);
1986                break;
1987            }
1988        }
1989        i++;
1990    }
1991    flush_compressed_data(rs);
1992    rcu_read_unlock();
1993
1994    /*
1995     * Must occur before EOS (or any QEMUFile operation)
1996     * because of RDMA protocol.
1997     */
1998    ram_control_after_iterate(f, RAM_CONTROL_ROUND);
1999
2000    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
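    /* Account for the 8-byte EOS marker just written */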
2001    ram_counters.transferred += 8;
2002
2003    ret = qemu_file_get_error(f);
2004    if (ret < 0) {
2005        return ret;
2006    }
2007
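    /* done is 1 only when ram_find_and_save_block() found nothing left to send */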
2008    return done;
2009}
2010
2011/**
2012 * ram_save_complete: function called to send the remaining amount of ram
2013 *
2014 * Returns zero to indicate success
2015 *
2016 * Called with iothread lock
2017 *
2018 * @f: QEMUFile where to send the data
2019 * @opaque: RAMState pointer
2020 */
2021static int ram_save_complete(QEMUFile *f, void *opaque)
2022{
2023    RAMState **temp = opaque;
2024    RAMState *rs = *temp;
2025
2026    rcu_read_lock();
2027
2028    if (!migration_in_postcopy()) {
2029        migration_bitmap_sync(rs);
2030    }
2031
2032    ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2033
2034    /* try transferring iterative blocks of memory */
2035
2036    /* flush all remaining blocks regardless of rate limiting */
2037    while (true) {
2038        int pages;
2039
2040        pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2041        /* no more blocks to send */
2042        if (pages == 0) {
2043            break;
2044        }
2045    }
2046
2047    flush_compressed_data(rs);
2048    ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2049
2050    rcu_read_unlock();
2051
2052    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2053
2054    return 0;
2055}
2056
2057static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2058                             uint64_t *non_postcopiable_pending,
2059                             uint64_t *postcopiable_pending)
2060{
2061    RAMState **temp = opaque;
2062    RAMState *rs = *temp;
2063    uint64_t remaining_size;
2064
2065    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2066
2067    if (!migration_in_postcopy() &&
2068        remaining_size < max_size) {
2069        qemu_mutex_lock_iothread();
2070        rcu_read_lock();
2071        migration_bitmap_sync(rs);
2072        rcu_read_unlock();
2073        qemu_mutex_unlock_iothread();
2074        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2075    }
2076
2077    /* We can do postcopy, and all the data is postcopiable */
2078    *postcopiable_pending += remaining_size;
2079}
2080
2081static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2082{
2083    unsigned int xh_len;
2084    int xh_flags;
2085    uint8_t *loaded_data;
2086
2087    /* extract RLE header */
2088    xh_flags = qemu_get_byte(f);
2089    xh_len = qemu_get_be16(f);
2090
2091    if (xh_flags != ENCODING_FLAG_XBZRLE) {
2092        error_report("Failed to load XBZRLE page - wrong compression!");
2093        return -1;
2094    }
2095
2096    if (xh_len > TARGET_PAGE_SIZE) {
2097        error_report("Failed to load XBZRLE page - len overflow!");
2098        return -1;
2099    }
2100    loaded_data = XBZRLE.decoded_buf;
2101    /* load data and decode */
2102    /* it can change loaded_data to point to an internal buffer */
2103    qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2104
2105    /* decode RLE */
2106    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2107                             TARGET_PAGE_SIZE) == -1) {
2108        error_report("Failed to load XBZRLE page - decode error!");
2109        return -1;
2110    }
2111
2112    return 0;
2113}
2114
2115/**
2116 * ram_block_from_stream: read a RAMBlock id from the migration stream
2117 *
2118 * Must be called from within an RCU critical section.
2119 *
2120 * Returns a pointer from within the RCU-protected ram_list.
2121 *
2122 * @f: QEMUFile where to read the data from
2123 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2124 */
2125static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2126{
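    /* Cache the last block seen so RAM_SAVE_FLAG_CONTINUE records can omit the id */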
2127    static RAMBlock *block = NULL;
2128    char id[256];
2129    uint8_t len;
2130
2131    if (flags & RAM_SAVE_FLAG_CONTINUE) {
2132        if (!block) {
2133            error_report("Ack, bad migration stream!");
2134            return NULL;
2135        }
2136        return block;
2137    }
2138
2139    len = qemu_get_byte(f);
2140    qemu_get_buffer(f, (uint8_t *)id, len);
2141    id[len] = 0;
2142
2143    block = qemu_ram_block_by_name(id);
2144    if (!block) {
2145        error_report("Can't find block %s", id);
2146        return NULL;
2147    }
2148
2149    return block;
2150}
2151
2152static inline void *host_from_ram_block_offset(RAMBlock *block,
2153                                               ram_addr_t offset)
2154{
2155    if (!offset_in_ramblock(block, offset)) {
2156        return NULL;
2157    }
2158
2159    return block->host + offset;
2160}
2161
2162/**
2163 * ram_handle_compressed: handle the zero page case
2164 *
2165 * If a page (or a whole RDMA chunk) has been
2166 * determined to be zero, then zap it.
2167 *
2168 * @host: host address for the zero page
2169 * @ch: the byte the page is filled with.  We only support zero
2170 * @size: size of the zero page
2171 */
2172void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2173{
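    /* Only memset when something actually changes; an already-zero page is
     * left untouched on the destination. */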
2174    if (ch != 0 || !is_zero_range(host, size)) {
2175        memset(host, ch, size);
2176    }
2177}
2178
2179static void *do_data_decompress(void *opaque)
2180{
2181    DecompressParam *param = opaque;
2182    unsigned long pagesize;
2183    uint8_t *des;
2184    int len;
2185
2186    qemu_mutex_lock(&param->mutex);
2187    while (!param->quit) {
2188        if (param->des) {
2189            des = param->des;
2190            len = param->len;
2191            param->des = 0;
2192            qemu_mutex_unlock(&param->mutex);
2193
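                /* pagesize is in/out for uncompress(): buffer size on entry,
                 * decompressed length on return */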
2194            pagesize = TARGET_PAGE_SIZE;
2195            /* uncompress() can fail in some cases, especially when the
2196             * page was dirtied while it was being compressed.  That's not
2197             * a problem because the dirty page will be retransmitted and
2198             * uncompress() won't corrupt the data in other pages.
2199             */
2200            uncompress((Bytef *)des, &pagesize,
2201                       (const Bytef *)param->compbuf, len);
2202
2203            qemu_mutex_lock(&decomp_done_lock);
2204            param->done = true;
2205            qemu_cond_signal(&decomp_done_cond);
2206            qemu_mutex_unlock(&decomp_done_lock);
2207
2208            qemu_mutex_lock(&param->mutex);
2209        } else {
2210            qemu_cond_wait(&param->cond, &param->mutex);
2211        }
2212    }
2213    qemu_mutex_unlock(&param->mutex);
2214
2215    return NULL;
2216}
2217
2218static void wait_for_decompress_done(void)
2219{
2220    int idx, thread_count;
2221
2222    if (!migrate_use_compression()) {
2223        return;
2224    }
2225
2226    thread_count = migrate_decompress_threads();
2227    qemu_mutex_lock(&decomp_done_lock);
2228    for (idx = 0; idx < thread_count; idx++) {
2229        while (!decomp_param[idx].done) {
2230            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2231        }
2232    }
2233    qemu_mutex_unlock(&decomp_done_lock);
2234}
2235
2236static void compress_threads_load_setup(void)
2237{
2238    int i, thread_count;
2239
2240    if (!migrate_use_compression()) {
2241        return;
2242    }
2243    thread_count = migrate_decompress_threads();
2244    decompress_threads = g_new0(QemuThread, thread_count);
2245    decomp_param = g_new0(DecompressParam, thread_count);
2246    qemu_mutex_init(&decomp_done_lock);
2247    qemu_cond_init(&decomp_done_cond);
2248    for (i = 0; i < thread_count; i++) {
2249        qemu_mutex_init(&decomp_param[i].mutex);
2250        qemu_cond_init(&decomp_param[i].cond);
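            /* compressBound() is zlib's worst-case compressed size for one page */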
2251        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2252        decomp_param[i].done = true;
2253        decomp_param[i].quit = false;
2254        qemu_thread_create(decompress_threads + i, "decompress",
2255                           do_data_decompress, decomp_param + i,
2256                           QEMU_THREAD_JOINABLE);
2257    }
2258}
2259
2260static void compress_threads_load_cleanup(void)
2261{
2262    int i, thread_count;
2263
2264    if (!migrate_use_compression()) {
2265        return;
2266    }
2267    thread_count = migrate_decompress_threads();
2268    for (i = 0; i < thread_count; i++) {
2269        qemu_mutex_lock(&decomp_param[i].mutex);
2270        decomp_param[i].quit = true;
2271        qemu_cond_signal(&decomp_param[i].cond);
2272        qemu_mutex_unlock(&decomp_param[i].mutex);
2273    }
2274    for (i = 0; i < thread_count; i++) {
2275        qemu_thread_join(decompress_threads + i);
2276        qemu_mutex_destroy(&decomp_param[i].mutex);
2277        qemu_cond_destroy(&decomp_param[i].cond);
2278        g_free(decomp_param[i].compbuf);
2279    }
2280    g_free(decompress_threads);
2281    g_free(decomp_param);
2282    decompress_threads = NULL;
2283    decomp_param = NULL;
2284}
2285
2286static void decompress_data_with_multi_threads(QEMUFile *f,
2287                                               void *host, int len)
2288{
2289    int idx, thread_count;
2290
2291    thread_count = migrate_decompress_threads();
2292    qemu_mutex_lock(&decomp_done_lock);
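    /* Look for an idle decompression thread; if all are busy, wait on
     * decomp_done_cond until one finishes. */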
2293    while (true) {
2294        for (idx = 0; idx < thread_count; idx++) {
2295            if (decomp_param[idx].done) {
2296                decomp_param[idx].done = false;
2297                qemu_mutex_lock(&decomp_param[idx].mutex);
2298                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2299                decomp_param[idx].des = host;
2300                decomp_param[idx].len = len;
2301                qemu_cond_signal(&decomp_param[idx].cond);
2302                qemu_mutex_unlock(&decomp_param[idx].mutex);
2303                break;
2304            }
2305        }
2306        if (idx < thread_count) {
2307            break;
2308        } else {
2309            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2310        }
2311    }
2312    qemu_mutex_unlock(&decomp_done_lock);
2313}
2314
2315/**
2316 * ram_load_setup: Setup RAM for migration incoming side
2317 *
2318 * Returns zero to indicate success and negative for error
2319 *
2320 * @f: QEMUFile where to receive the data
2321 * @opaque: RAMState pointer
2322 */
2323static int ram_load_setup(QEMUFile *f, void *opaque)
2324{
2325    xbzrle_load_setup();
2326    compress_threads_load_setup();
2327    return 0;
2328}
2329
2330static int ram_load_cleanup(void *opaque)
2331{
2332    xbzrle_load_cleanup();
2333    compress_threads_load_cleanup();
2334    return 0;
2335}
2336
2337/**
2338 * ram_postcopy_incoming_init: allocate postcopy data structures
2339 *
2340 * Returns 0 for success and negative if there was an error
2341 *
2342 * @mis: current migration incoming state
2343 *
2344 * Allocate data structures etc needed by incoming migration with
2345 * postcopy-ram. postcopy-ram's similarly named
2346 * postcopy_ram_incoming_init does the work.
2347 */
2348int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2349{
2350    unsigned long ram_pages = last_ram_page();
2351
2352    return postcopy_ram_incoming_init(mis, ram_pages);
2353}
2354
2355/**
2356 * ram_load_postcopy: load a page in postcopy case
2357 *
2358 * Returns 0 for success or -errno in case of error
2359 *
2360 * Called in postcopy mode by ram_load().
2361 * rcu_read_lock is taken prior to this being called.
2362 *
2363 * @f: QEMUFile to receive the data from
2364 */
2365static int ram_load_postcopy(QEMUFile *f)
2366{
2367    int flags = 0, ret = 0;
2368    bool place_needed = false;
2369    bool matching_page_sizes = false;
2370    MigrationIncomingState *mis = migration_incoming_get_current();
2371    /* Temporary page that is later 'placed' */
2372    void *postcopy_host_page = postcopy_get_tmp_page(mis);
2373    void *last_host = NULL;
2374    bool all_zero = false;
2375
2376    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2377        ram_addr_t addr;
2378        void *host = NULL;
2379        void *page_buffer = NULL;
2380        void *place_source = NULL;
2381        RAMBlock *block = NULL;
2382        uint8_t ch;
2383
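            /* The page flags travel in the low bits of the target-page aligned address */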
2384        addr = qemu_get_be64(f);
2385        flags = addr & ~TARGET_PAGE_MASK;
2386        addr &= TARGET_PAGE_MASK;
2387
2388        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2389        place_needed = false;
2390        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2391            block = ram_block_from_stream(f, flags);
2392
2393            host = host_from_ram_block_offset(block, addr);
2394            if (!host) {
2395                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2396                ret = -EINVAL;
2397                break;
2398            }
2399            matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2400            /*
2401             * Postcopy requires that we place whole host pages atomically;
2402             * these may be huge pages for RAMBlocks that are backed by
2403             * hugetlbfs.
2404             * To make it atomic, the data is read into a temporary page
2405             * that's moved into place later.
2406             * The migration protocol uses (possibly smaller) target pages;
2407             * however, the source ensures it always sends all the components
2408             * of a host page in order.
2409             */
2410            page_buffer = postcopy_host_page +
2411                          ((uintptr_t)host & (block->page_size - 1));
2412            /* If all TP are zero then we can optimise the place */
2413            if (!((uintptr_t)host & (block->page_size - 1))) {
2414                all_zero = true;
2415            } else {
2416                /* not the 1st TP within the HP */
2417                if (host != (last_host + TARGET_PAGE_SIZE)) {
2418                    error_report("Non-sequential target page %p/%p",
2419                                  host, last_host);
2420                    ret = -EINVAL;
2421                    break;
2422                }
2423            }
2424
2425
2426            /*
2427             * If it's the last part of a host page then we place the host
2428             * page
2429             */
2430            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2431                                     (block->page_size - 1)) == 0;
2432            place_source = postcopy_host_page;
2433        }
2434        last_host = host;
2435
2436        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2437        case RAM_SAVE_FLAG_ZERO:
2438            ch = qemu_get_byte(f);
2439            memset(page_buffer, ch, TARGET_PAGE_SIZE);
2440            if (ch) {
2441                all_zero = false;
2442            }
2443            break;
2444
2445        case RAM_SAVE_FLAG_PAGE:
2446            all_zero = false;
2447            if (!place_needed || !matching_page_sizes) {
2448                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2449            } else {
2450                /* Avoid an extra copy out of the QEMUFile buffer; postcopy
2451                 * is going to copy the page into place later anyway.  We can
2452                 * only do this when the read is done in one go (matching page sizes)
2453                 */
2454                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2455                                         TARGET_PAGE_SIZE);
2456            }
2457            break;
2458        case RAM_SAVE_FLAG_EOS:
2459            /* normal exit */
2460            break;
2461        default:
2462            error_report("Unknown combination of migration flags: %#x"
2463                         " (postcopy mode)", flags);
2464            ret = -EINVAL;
2465        }
2466
2467        if (place_needed) {
2468            /* This gets called at the last target page in the host page */
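                /* host is the last target page of the host page, so step one
                 * target page forward and one host page back to reach the
                 * start of the host page. */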
2469            void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2470
2471            if (all_zero) {
2472                ret = postcopy_place_page_zero(mis, place_dest,
2473                                               block->page_size);
2474            } else {
2475                ret = postcopy_place_page(mis, place_dest,
2476                                          place_source, block->page_size);
2477            }
2478        }
2479        if (!ret) {
2480            ret = qemu_file_get_error(f);
2481        }
2482    }
2483
2484    return ret;
2485}
2486
2487static int ram_load(QEMUFile *f, void *opaque, int version_id)
2488{
2489    int flags = 0, ret = 0, invalid_flags = 0;
2490    static uint64_t seq_iter;
2491    int len = 0;
2492    /*
2493     * If the system is running in postcopy mode, page inserts to host memory must
2494     * be atomic
2495     */
2496    bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2497    /* ADVISE is earlier; it shows the source has the postcopy capability on */
2498    bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2499
2500    seq_iter++;
2501
2502    if (version_id != 4) {
2503        ret = -EINVAL;
2504    }
2505
2506    if (!migrate_use_compression()) {
2507        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2508    }
2509    /* This RCU critical section can be very long running.
2510     * When RCU reclaims in the code start to become numerous,
2511     * it will be necessary to reduce the granularity of this
2512     * critical section.
2513     */
2514    rcu_read_lock();
2515
2516    if (postcopy_running) {
2517        ret = ram_load_postcopy(f);
2518    }
2519
2520    while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2521        ram_addr_t addr, total_ram_bytes;
2522        void *host = NULL;
2523        uint8_t ch;
2524
2525        addr = qemu_get_be64(f);
2526        flags = addr & ~TARGET_PAGE_MASK;
2527        addr &= TARGET_PAGE_MASK;
2528
2529        if (flags & invalid_flags) {
2530            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2531                error_report("Received an unexpected compressed page");
2532            }
2533
2534            ret = -EINVAL;
2535            break;
2536        }
2537
2538        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2539                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2540            RAMBlock *block = ram_block_from_stream(f, flags);
2541
2542            host = host_from_ram_block_offset(block, addr);
2543            if (!host) {
2544                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2545                ret = -EINVAL;
2546                break;
2547            }
2548            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2549        }
2550
2551        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2552        case RAM_SAVE_FLAG_MEM_SIZE:
2553            /* Synchronize RAM block list */
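                /* For MEM_SIZE the addr field carries the total RAM size sent
                 * by ram_save_setup(). */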
2554            total_ram_bytes = addr;
2555            while (!ret && total_ram_bytes) {
2556                RAMBlock *block;
2557                char id[256];
2558                ram_addr_t length;
2559
2560                len = qemu_get_byte(f);
2561                qemu_get_buffer(f, (uint8_t *)id, len);
2562                id[len] = 0;
2563                length = qemu_get_be64(f);
2564
2565                block = qemu_ram_block_by_name(id);
2566                if (block) {
2567                    if (length != block->used_length) {
2568                        Error *local_err = NULL;
2569
2570                        ret = qemu_ram_resize(block, length,
2571                                              &local_err);
2572                        if (local_err) {
2573                            error_report_err(local_err);
2574                        }
2575                    }
2576                    /* For postcopy we need to check hugepage sizes match */
2577                    if (postcopy_advised &&
2578                        block->page_size != qemu_host_page_size) {
2579                        uint64_t remote_page_size = qemu_get_be64(f);
2580                        if (remote_page_size != block->page_size) {
2581                            error_report("Mismatched RAM page size %s "
2582                                         "(local) %zd != %" PRId64,
2583                                         id, block->page_size,
2584                                         remote_page_size);
2585                            ret = -EINVAL;
2586                        }
2587                    }
2588                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2589                                          block->idstr);
2590                } else {
2591                    error_report("Unknown ramblock \"%s\", cannot "
2592                                 "accept migration", id);
2593                    ret = -EINVAL;
2594                }
2595
2596                total_ram_bytes -= length;
2597            }
2598            break;
2599
2600        case RAM_SAVE_FLAG_ZERO:
2601            ch = qemu_get_byte(f);
2602            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2603            break;
2604
2605        case RAM_SAVE_FLAG_PAGE:
2606            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2607            break;
2608
2609        case RAM_SAVE_FLAG_COMPRESS_PAGE:
2610            len = qemu_get_be32(f);
2611            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2612                error_report("Invalid compressed data length: %d", len);
2613                ret = -EINVAL;
2614                break;
2615            }
2616            decompress_data_with_multi_threads(f, host, len);
2617            break;
2618
2619        case RAM_SAVE_FLAG_XBZRLE:
2620            if (load_xbzrle(f, addr, host) < 0) {
2621                error_report("Failed to decompress XBZRLE page at "
2622                             RAM_ADDR_FMT, addr);
2623                ret = -EINVAL;
2624                break;
2625            }
2626            break;
2627        case RAM_SAVE_FLAG_EOS:
2628            /* normal exit */
2629            break;
2630        default:
2631            if (flags & RAM_SAVE_FLAG_HOOK) {
2632                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2633            } else {
2634                error_report("Unknown combination of migration flags: %#x",
2635                             flags);
2636                ret = -EINVAL;
2637            }
2638        }
2639        if (!ret) {
2640            ret = qemu_file_get_error(f);
2641        }
2642    }
2643
2644    wait_for_decompress_done();
2645    rcu_read_unlock();
2646    trace_ram_load_complete(ret, seq_iter);
2647    return ret;
2648}
2649
2650static SaveVMHandlers savevm_ram_handlers = {
2651    .save_setup = ram_save_setup,
2652    .save_live_iterate = ram_save_iterate,
2653    .save_live_complete_postcopy = ram_save_complete,
2654    .save_live_complete_precopy = ram_save_complete,
2655    .save_live_pending = ram_save_pending,
2656    .load_state = ram_load,
2657    .save_cleanup = ram_save_cleanup,
2658    .load_setup = ram_load_setup,
2659    .load_cleanup = ram_load_cleanup,
2660};
2661
2662void ram_mig_init(void)
2663{
2664    qemu_mutex_init(&XBZRLE.lock);
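    /* The section version (4) must match the version_id check in ram_load() */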
2665    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
2666}
2667