qemu/migration/ram.c
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2011-2015 Red Hat Inc
   6 *
   7 * Authors:
   8 *  Juan Quintela <quintela@redhat.com>
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28#include "qemu/osdep.h"
  29#include "cpu.h"
  30#include <zlib.h>
  31#include "qapi-event.h"
  32#include "qemu/cutils.h"
  33#include "qemu/bitops.h"
  34#include "qemu/bitmap.h"
  35#include "qemu/main-loop.h"
  36#include "xbzrle.h"
  37#include "ram.h"
  38#include "migration.h"
  39#include "migration/register.h"
  40#include "migration/misc.h"
  41#include "qemu-file.h"
  42#include "postcopy-ram.h"
  43#include "migration/page_cache.h"
  44#include "qemu/error-report.h"
  45#include "qapi/qmp/qerror.h"
  46#include "trace.h"
  47#include "exec/ram_addr.h"
  48#include "exec/target_page.h"
  49#include "qemu/rcu_queue.h"
  50#include "migration/colo.h"
  51#include "migration/block.h"
  52
  53/***********************************************************/
  54/* ram save/restore */
  55
  56/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS.  It
  57 * worked for pages that were filled with the same char.  We switched
  58 * it to only search for the zero value, and renamed it to avoid
  59 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
  60 */
  61
  62#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  63#define RAM_SAVE_FLAG_ZERO     0x02
  64#define RAM_SAVE_FLAG_MEM_SIZE 0x04
  65#define RAM_SAVE_FLAG_PAGE     0x08
  66#define RAM_SAVE_FLAG_EOS      0x10
  67#define RAM_SAVE_FLAG_CONTINUE 0x20
  68#define RAM_SAVE_FLAG_XBZRLE   0x40
  69/* 0x80 is reserved in migration.h; the next free flag is 0x100 */
  70#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
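/*
 * Editor's illustrative sketch (not part of the original file): the
 * RAM_SAVE_FLAG_* values above travel in the low bits of the 64-bit
 * offset word written by save_page_header(), while the page-aligned
 * address sits in the high bits.  A receiver splits the word roughly
 * like this (the helper name is hypothetical):
 *
 *     static void example_split_header(uint64_t word,
 *                                      ram_addr_t *addr, int *flags)
 *     {
 *         *flags = word & ~TARGET_PAGE_MASK;   // RAM_SAVE_FLAG_* bits
 *         *addr  = word & TARGET_PAGE_MASK;    // page-aligned offset
 *     }
 */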
  71
  72static inline bool is_zero_range(uint8_t *p, uint64_t size)
  73{
  74    return buffer_is_zero(p, size);
  75}
  76
  77XBZRLECacheStats xbzrle_counters;
  78
  79/* This struct contains the XBZRLE cache and a static page
  80   used by the compression */
  81static struct {
  82    /* buffer used for XBZRLE encoding */
  83    uint8_t *encoded_buf;
  84    /* buffer for storing page content */
  85    uint8_t *current_buf;
  86    /* Cache for XBZRLE, Protected by lock. */
  87    PageCache *cache;
  88    QemuMutex lock;
  89    /* it will store a page full of zeros */
  90    uint8_t *zero_target_page;
  91    /* buffer used for XBZRLE decoding */
  92    uint8_t *decoded_buf;
  93} XBZRLE;
  94
  95static void XBZRLE_cache_lock(void)
  96{
  97    if (migrate_use_xbzrle())
  98        qemu_mutex_lock(&XBZRLE.lock);
  99}
 100
 101static void XBZRLE_cache_unlock(void)
 102{
 103    if (migrate_use_xbzrle())
 104        qemu_mutex_unlock(&XBZRLE.lock);
 105}
 106
 107/**
 108 * xbzrle_cache_resize: resize the xbzrle cache
 109 *
 110 * This function is called from qmp_migrate_set_cache_size in the main
 111 * thread, possibly while a migration is in progress.  A running
 112 * migration may be using the cache and might finish during this call,
 113 * hence changes to the cache are protected by XBZRLE.lock.
 114 *
 115 * Returns 0 for success or -1 for error
 116 *
 117 * @new_size: new cache size
 118 * @errp: set *errp if the check failed, with reason
 119 */
 120int xbzrle_cache_resize(int64_t new_size, Error **errp)
 121{
 122    PageCache *new_cache;
 123    int64_t ret = 0;
 124
 125    /* Check for truncation */
 126    if (new_size != (size_t)new_size) {
 127        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 128                   "exceeding address space");
 129        return -1;
 130    }
 131
 132    if (new_size == migrate_xbzrle_cache_size()) {
 133        /* nothing to do */
 134        return 0;
 135    }
 136
 137    XBZRLE_cache_lock();
 138
 139    if (XBZRLE.cache != NULL) {
 140        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 141        if (!new_cache) {
 142            ret = -1;
 143            goto out;
 144        }
 145
 146        cache_fini(XBZRLE.cache);
 147        XBZRLE.cache = new_cache;
 148    }
 149out:
 150    XBZRLE_cache_unlock();
 151    return ret;
 152}
 153
 154static void ramblock_recv_map_init(void)
 155{
 156    RAMBlock *rb;
 157
 158    RAMBLOCK_FOREACH(rb) {
 159        assert(!rb->receivedmap);
 160        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 161    }
 162}
 163
 164int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 165{
 166    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 167                    rb->receivedmap);
 168}
 169
 170void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 171{
 172    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 173}
 174
 175void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 176                                    size_t nr)
 177{
 178    bitmap_set_atomic(rb->receivedmap,
 179                      ramblock_recv_bitmap_offset(host_addr, rb),
 180                      nr);
 181}
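/*
 * Editor's illustrative sketch (not part of the original file): each bit
 * of rb->receivedmap covers one target page of the block.  Assuming
 * ramblock_recv_bitmap_offset() maps a host pointer to a page index as
 * (host_addr - rb->host) >> TARGET_PAGE_BITS, marking one received page
 * amounts to:
 *
 *     long bit = ((uintptr_t)host_addr - (uintptr_t)rb->host)
 *                >> TARGET_PAGE_BITS;
 *     set_bit_atomic(bit, rb->receivedmap);
 */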
 182
 183/*
 184 * An outstanding page request, on the source, having been received
 185 * and queued
 186 */
 187struct RAMSrcPageRequest {
 188    RAMBlock *rb;
 189    hwaddr    offset;
 190    hwaddr    len;
 191
 192    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 193};
 194
 195/* State of RAM for migration */
 196struct RAMState {
 197    /* QEMUFile used for this migration */
 198    QEMUFile *f;
 199    /* Last block that we have visited searching for dirty pages */
 200    RAMBlock *last_seen_block;
 201    /* Last block from where we have sent data */
 202    RAMBlock *last_sent_block;
 203    /* Last dirty target page we have sent */
 204    ram_addr_t last_page;
 205    /* last ram version we have seen */
 206    uint32_t last_version;
 207    /* We are in the first round */
 208    bool ram_bulk_stage;
 209    /* How many times we have dirty too many pages */
 210    int dirty_rate_high_cnt;
 211    /* these variables are used for bitmap sync */
 212    /* last time we did a full bitmap_sync */
 213    int64_t time_last_bitmap_sync;
 214    /* bytes transferred at start_time */
 215    uint64_t bytes_xfer_prev;
 216    /* number of dirty pages since start_time */
 217    uint64_t num_dirty_pages_period;
 218    /* xbzrle misses since the beginning of the period */
 219    uint64_t xbzrle_cache_miss_prev;
 220    /* number of iterations at the beginning of period */
 221    uint64_t iterations_prev;
 222    /* Iterations since start */
 223    uint64_t iterations;
 224    /* number of dirty bits in the bitmap */
 225    uint64_t migration_dirty_pages;
 226    /* protects modification of the bitmap */
 227    QemuMutex bitmap_mutex;
 228    /* The RAMBlock used in the last src_page_requests */
 229    RAMBlock *last_req_rb;
 230    /* Queue of outstanding page requests from the destination */
 231    QemuMutex src_page_req_mutex;
 232    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
 233};
 234typedef struct RAMState RAMState;
 235
 236static RAMState *ram_state;
 237
 238uint64_t ram_bytes_remaining(void)
 239{
 240    return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
 241}
 242
 243MigrationStats ram_counters;
 244
 245/* used by the search for pages to send */
 246struct PageSearchStatus {
 247    /* Current block being searched */
 248    RAMBlock    *block;
 249    /* Current page to search from */
 250    unsigned long page;
 251    /* Set once we wrap around */
 252    bool         complete_round;
 253};
 254typedef struct PageSearchStatus PageSearchStatus;
 255
 256struct CompressParam {
 257    bool done;
 258    bool quit;
 259    QEMUFile *file;
 260    QemuMutex mutex;
 261    QemuCond cond;
 262    RAMBlock *block;
 263    ram_addr_t offset;
 264};
 265typedef struct CompressParam CompressParam;
 266
 267struct DecompressParam {
 268    bool done;
 269    bool quit;
 270    QemuMutex mutex;
 271    QemuCond cond;
 272    void *des;
 273    uint8_t *compbuf;
 274    int len;
 275};
 276typedef struct DecompressParam DecompressParam;
 277
 278static CompressParam *comp_param;
 279static QemuThread *compress_threads;
 280/* comp_done_cond is used to wake up the migration thread when
 281 * one of the compression threads has finished the compression.
 282 * comp_done_lock is used together with comp_done_cond.
 283 */
 284static QemuMutex comp_done_lock;
 285static QemuCond comp_done_cond;
 286/* The empty QEMUFileOps will be used by file in CompressParam */
 287static const QEMUFileOps empty_ops = { };
 288
 289static DecompressParam *decomp_param;
 290static QemuThread *decompress_threads;
 291static QemuMutex decomp_done_lock;
 292static QemuCond decomp_done_cond;
 293
 294static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 295                                ram_addr_t offset);
 296
 297static void *do_data_compress(void *opaque)
 298{
 299    CompressParam *param = opaque;
 300    RAMBlock *block;
 301    ram_addr_t offset;
 302
 303    qemu_mutex_lock(&param->mutex);
 304    while (!param->quit) {
 305        if (param->block) {
 306            block = param->block;
 307            offset = param->offset;
 308            param->block = NULL;
 309            qemu_mutex_unlock(&param->mutex);
 310
 311            do_compress_ram_page(param->file, block, offset);
 312
 313            qemu_mutex_lock(&comp_done_lock);
 314            param->done = true;
 315            qemu_cond_signal(&comp_done_cond);
 316            qemu_mutex_unlock(&comp_done_lock);
 317
 318            qemu_mutex_lock(&param->mutex);
 319        } else {
 320            qemu_cond_wait(&param->cond, &param->mutex);
 321        }
 322    }
 323    qemu_mutex_unlock(&param->mutex);
 324
 325    return NULL;
 326}
 327
 328static inline void terminate_compression_threads(void)
 329{
 330    int idx, thread_count;
 331
 332    thread_count = migrate_compress_threads();
 333
 334    for (idx = 0; idx < thread_count; idx++) {
 335        qemu_mutex_lock(&comp_param[idx].mutex);
 336        comp_param[idx].quit = true;
 337        qemu_cond_signal(&comp_param[idx].cond);
 338        qemu_mutex_unlock(&comp_param[idx].mutex);
 339    }
 340}
 341
 342static void compress_threads_save_cleanup(void)
 343{
 344    int i, thread_count;
 345
 346    if (!migrate_use_compression()) {
 347        return;
 348    }
 349    terminate_compression_threads();
 350    thread_count = migrate_compress_threads();
 351    for (i = 0; i < thread_count; i++) {
 352        qemu_thread_join(compress_threads + i);
 353        qemu_fclose(comp_param[i].file);
 354        qemu_mutex_destroy(&comp_param[i].mutex);
 355        qemu_cond_destroy(&comp_param[i].cond);
 356    }
 357    qemu_mutex_destroy(&comp_done_lock);
 358    qemu_cond_destroy(&comp_done_cond);
 359    g_free(compress_threads);
 360    g_free(comp_param);
 361    compress_threads = NULL;
 362    comp_param = NULL;
 363}
 364
 365static void compress_threads_save_setup(void)
 366{
 367    int i, thread_count;
 368
 369    if (!migrate_use_compression()) {
 370        return;
 371    }
 372    thread_count = migrate_compress_threads();
 373    compress_threads = g_new0(QemuThread, thread_count);
 374    comp_param = g_new0(CompressParam, thread_count);
 375    qemu_cond_init(&comp_done_cond);
 376    qemu_mutex_init(&comp_done_lock);
 377    for (i = 0; i < thread_count; i++) {
 378        /* comp_param[i].file is just used as a dummy buffer to save data,
 379         * set its ops to empty.
 380         */
 381        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 382        comp_param[i].done = true;
 383        comp_param[i].quit = false;
 384        qemu_mutex_init(&comp_param[i].mutex);
 385        qemu_cond_init(&comp_param[i].cond);
 386        qemu_thread_create(compress_threads + i, "compress",
 387                           do_data_compress, comp_param + i,
 388                           QEMU_THREAD_JOINABLE);
 389    }
 390}
 391
 392/* Multiple fd's */
 393
 394struct MultiFDSendParams {
 395    uint8_t id;
 396    char *name;
 397    QemuThread thread;
 398    QemuSemaphore sem;
 399    QemuMutex mutex;
 400    bool quit;
 401};
 402typedef struct MultiFDSendParams MultiFDSendParams;
 403
 404struct {
 405    MultiFDSendParams *params;
 406    /* number of created threads */
 407    int count;
 408} *multifd_send_state;
 409
 410static void terminate_multifd_send_threads(Error *errp)
 411{
 412    int i;
 413
 414    for (i = 0; i < multifd_send_state->count; i++) {
 415        MultiFDSendParams *p = &multifd_send_state->params[i];
 416
 417        qemu_mutex_lock(&p->mutex);
 418        p->quit = true;
 419        qemu_sem_post(&p->sem);
 420        qemu_mutex_unlock(&p->mutex);
 421    }
 422}
 423
 424int multifd_save_cleanup(Error **errp)
 425{
 426    int i;
 427    int ret = 0;
 428
 429    if (!migrate_use_multifd()) {
 430        return 0;
 431    }
 432    terminate_multifd_send_threads(NULL);
 433    for (i = 0; i < multifd_send_state->count; i++) {
 434        MultiFDSendParams *p = &multifd_send_state->params[i];
 435
 436        qemu_thread_join(&p->thread);
 437        qemu_mutex_destroy(&p->mutex);
 438        qemu_sem_destroy(&p->sem);
 439        g_free(p->name);
 440        p->name = NULL;
 441    }
 442    g_free(multifd_send_state->params);
 443    multifd_send_state->params = NULL;
 444    g_free(multifd_send_state);
 445    multifd_send_state = NULL;
 446    return ret;
 447}
 448
 449static void *multifd_send_thread(void *opaque)
 450{
 451    MultiFDSendParams *p = opaque;
 452
 453    while (true) {
 454        qemu_mutex_lock(&p->mutex);
 455        if (p->quit) {
 456            qemu_mutex_unlock(&p->mutex);
 457            break;
 458        }
 459        qemu_mutex_unlock(&p->mutex);
 460        qemu_sem_wait(&p->sem);
 461    }
 462
 463    return NULL;
 464}
 465
 466int multifd_save_setup(void)
 467{
 468    int thread_count;
 469    uint8_t i;
 470
 471    if (!migrate_use_multifd()) {
 472        return 0;
 473    }
 474    thread_count = migrate_multifd_channels();
 475    multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
 476    multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
 477    multifd_send_state->count = 0;
 478    for (i = 0; i < thread_count; i++) {
 479        MultiFDSendParams *p = &multifd_send_state->params[i];
 480
 481        qemu_mutex_init(&p->mutex);
 482        qemu_sem_init(&p->sem, 0);
 483        p->quit = false;
 484        p->id = i;
 485        p->name = g_strdup_printf("multifdsend_%d", i);
 486        qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
 487                           QEMU_THREAD_JOINABLE);
 488
 489        multifd_send_state->count++;
 490    }
 491    return 0;
 492}
 493
 494struct MultiFDRecvParams {
 495    uint8_t id;
 496    char *name;
 497    QemuThread thread;
 498    QemuSemaphore sem;
 499    QemuMutex mutex;
 500    bool quit;
 501};
 502typedef struct MultiFDRecvParams MultiFDRecvParams;
 503
 504struct {
 505    MultiFDRecvParams *params;
 506    /* number of created threads */
 507    int count;
 508} *multifd_recv_state;
 509
 510static void terminate_multifd_recv_threads(Error *errp)
 511{
 512    int i;
 513
 514    for (i = 0; i < multifd_recv_state->count; i++) {
 515        MultiFDRecvParams *p = &multifd_recv_state->params[i];
 516
 517        qemu_mutex_lock(&p->mutex);
 518        p->quit = true;
 519        qemu_sem_post(&p->sem);
 520        qemu_mutex_unlock(&p->mutex);
 521    }
 522}
 523
 524int multifd_load_cleanup(Error **errp)
 525{
 526    int i;
 527    int ret = 0;
 528
 529    if (!migrate_use_multifd()) {
 530        return 0;
 531    }
 532    terminate_multifd_recv_threads(NULL);
 533    for (i = 0; i < multifd_recv_state->count; i++) {
 534        MultiFDRecvParams *p = &multifd_recv_state->params[i];
 535
 536        qemu_thread_join(&p->thread);
 537        qemu_mutex_destroy(&p->mutex);
 538        qemu_sem_destroy(&p->sem);
 539        g_free(p->name);
 540        p->name = NULL;
 541    }
 542    g_free(multifd_recv_state->params);
 543    multifd_recv_state->params = NULL;
 544    g_free(multifd_recv_state);
 545    multifd_recv_state = NULL;
 546
 547    return ret;
 548}
 549
 550static void *multifd_recv_thread(void *opaque)
 551{
 552    MultiFDRecvParams *p = opaque;
 553
 554    while (true) {
 555        qemu_mutex_lock(&p->mutex);
 556        if (p->quit) {
 557            qemu_mutex_unlock(&p->mutex);
 558            break;
 559        }
 560        qemu_mutex_unlock(&p->mutex);
 561        qemu_sem_wait(&p->sem);
 562    }
 563
 564    return NULL;
 565}
 566
 567int multifd_load_setup(void)
 568{
 569    int thread_count;
 570    uint8_t i;
 571
 572    if (!migrate_use_multifd()) {
 573        return 0;
 574    }
 575    thread_count = migrate_multifd_channels();
 576    multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
 577    multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
 578    multifd_recv_state->count = 0;
 579    for (i = 0; i < thread_count; i++) {
 580        MultiFDRecvParams *p = &multifd_recv_state->params[i];
 581
 582        qemu_mutex_init(&p->mutex);
 583        qemu_sem_init(&p->sem, 0);
 584        p->quit = false;
 585        p->id = i;
 586        p->name = g_strdup_printf("multifdrecv_%d", i);
 587        qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
 588                           QEMU_THREAD_JOINABLE);
 589        multifd_recv_state->count++;
 590    }
 591    return 0;
 592}
 593
 594/**
 595 * save_page_header: write page header to wire
 596 *
 597 * If the block differs from the last one sent, it also writes the block identification
 598 *
 599 * Returns the number of bytes written
 600 *
 601 * @f: QEMUFile where to send the data
 602 * @block: block that contains the page we want to send
 603 * @offset: offset inside the block for the page
 604 *          in the lower bits, it contains flags
 605 */
 606static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 607                               ram_addr_t offset)
 608{
 609    size_t size, len;
 610
 611    if (block == rs->last_sent_block) {
 612        offset |= RAM_SAVE_FLAG_CONTINUE;
 613    }
 614    qemu_put_be64(f, offset);
 615    size = 8;
 616
 617    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 618        len = strlen(block->idstr);
 619        qemu_put_byte(f, len);
 620        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 621        size += 1 + len;
 622        rs->last_sent_block = block;
 623    }
 624    return size;
 625}
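/*
 * Editor's illustrative sketch (not part of the original file): the
 * header produced above is consumed on the load side roughly like this
 * (simplified; 'idstr' is a local buffer in this sketch):
 *
 *     uint64_t word = qemu_get_be64(f);              // offset | flags
 *     if (!(word & RAM_SAVE_FLAG_CONTINUE)) {
 *         char idstr[256];
 *         int len = qemu_get_byte(f);                // idstr length
 *         qemu_get_buffer(f, (uint8_t *)idstr, len); // block name
 *         idstr[len] = '\0';
 *     }
 */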
 626
 627/**
 628 * mig_throttle_guest_down: throttle down the guest
 629 *
 630 * Reduce the amount of guest CPU execution to hopefully slow down memory
 631 * writes. If the guest dirty memory rate is reduced below the rate at
 632 * which we can transfer pages to the destination, then we should be
 633 * able to complete migration. Some workloads dirty memory way too
 634 * fast and will not effectively converge, even with auto-converge.
 635 */
 636static void mig_throttle_guest_down(void)
 637{
 638    MigrationState *s = migrate_get_current();
 639    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 640    uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 641
 642    /* We have not started throttling yet. Let's start it. */
 643    if (!cpu_throttle_active()) {
 644        cpu_throttle_set(pct_initial);
 645    } else {
 646        /* Throttling already on, just increase the rate */
 647        cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
 648    }
 649}
 650
 651/**
 652 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 653 *
 654 * @rs: current RAM state
 655 * @current_addr: address for the zero page
 656 *
 657 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 658 * The important thing is that a stale (not-yet-0'd) page be replaced
 659 * by the new data.
 660 * As a bonus, if the page wasn't in the cache it gets added so that
 661 * when a small write is made into the 0'd page it gets XBZRLE sent.
 662 */
 663static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 664{
 665    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 666        return;
 667    }
 668
 669    /* We don't care if this fails to allocate a new cache page
 670     * as long as it updates an old one */
 671    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 672                 ram_counters.dirty_sync_count);
 673}
 674
 675#define ENCODING_FLAG_XBZRLE 0x1
 676
 677/**
 678 * save_xbzrle_page: compress and send current page
 679 *
 680 * Returns: 1 means that we wrote the page
 681 *          0 means that page is identical to the one already sent
 682 *          -1 means that xbzrle would be longer than normal
 683 *
 684 * @rs: current RAM state
 685 * @current_data: pointer to the address of the page contents
 686 * @current_addr: addr of the page
 687 * @block: block that contains the page we want to send
 688 * @offset: offset inside the block for the page
 689 * @last_stage: if we are at the completion stage
 690 */
 691static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 692                            ram_addr_t current_addr, RAMBlock *block,
 693                            ram_addr_t offset, bool last_stage)
 694{
 695    int encoded_len = 0, bytes_xbzrle;
 696    uint8_t *prev_cached_page;
 697
 698    if (!cache_is_cached(XBZRLE.cache, current_addr,
 699                         ram_counters.dirty_sync_count)) {
 700        xbzrle_counters.cache_miss++;
 701        if (!last_stage) {
 702            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 703                             ram_counters.dirty_sync_count) == -1) {
 704                return -1;
 705            } else {
 706                /* update *current_data when the page has been
 707                   inserted into cache */
 708                *current_data = get_cached_data(XBZRLE.cache, current_addr);
 709            }
 710        }
 711        return -1;
 712    }
 713
 714    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 715
 716    /* save current buffer into memory */
 717    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 718
 719    /* XBZRLE encoding (if there is no overflow) */
 720    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 721                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 722                                       TARGET_PAGE_SIZE);
 723    if (encoded_len == 0) {
 724        trace_save_xbzrle_page_skipping();
 725        return 0;
 726    } else if (encoded_len == -1) {
 727        trace_save_xbzrle_page_overflow();
 728        xbzrle_counters.overflow++;
 729        /* update data in the cache */
 730        if (!last_stage) {
 731            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
 732            *current_data = prev_cached_page;
 733        }
 734        return -1;
 735    }
 736
 737    /* Update the cached data so that it matches what was just sent */
 738    if (!last_stage) {
 739        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 740    }
 741
 742    /* Send XBZRLE based compressed page */
 743    bytes_xbzrle = save_page_header(rs, rs->f, block,
 744                                    offset | RAM_SAVE_FLAG_XBZRLE);
 745    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 746    qemu_put_be16(rs->f, encoded_len);
 747    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 748    bytes_xbzrle += encoded_len + 1 + 2;
 749    xbzrle_counters.pages++;
 750    xbzrle_counters.bytes += bytes_xbzrle;
 751    ram_counters.transferred += bytes_xbzrle;
 752
 753    return 1;
 754}
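/*
 * Editor's illustrative sketch (not part of the original file): the
 * xbzrle_encode_buffer()/xbzrle_decode_buffer() pair from xbzrle.h works
 * against the previously cached copy of the page, so a stand-alone
 * round trip looks like:
 *
 *     uint8_t enc[TARGET_PAGE_SIZE];
 *     int n = xbzrle_encode_buffer(old_page, new_page, TARGET_PAGE_SIZE,
 *                                  enc, TARGET_PAGE_SIZE);
 *     if (n > 0) {
 *         // n bytes cover only the runs that changed; applying them to
 *         // another copy of old_page reconstructs new_page
 *         xbzrle_decode_buffer(enc, n, old_page, TARGET_PAGE_SIZE);
 *     }
 *     // n == 0: pages identical, n == -1: delta would not fit (overflow)
 */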
 755
 756/**
 757 * migration_bitmap_find_dirty: find the next dirty page from start
 758 *
 759 * Called with rcu_read_lock() to protect migration_bitmap
 760 *
 761 * Returns the byte offset within memory region of the start of a dirty page
 762 *
 763 * @rs: current RAM state
 764 * @rb: RAMBlock where to search for dirty pages
 765 * @start: page where we start the search
 766 */
 767static inline
 768unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 769                                          unsigned long start)
 770{
 771    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 772    unsigned long *bitmap = rb->bmap;
 773    unsigned long next;
 774
 775    if (rs->ram_bulk_stage && start > 0) {
 776        next = start + 1;
 777    } else {
 778        next = find_next_bit(bitmap, size, start);
 779    }
 780
 781    return next;
 782}
 783
 784static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 785                                                RAMBlock *rb,
 786                                                unsigned long page)
 787{
 788    bool ret;
 789
 790    ret = test_and_clear_bit(page, rb->bmap);
 791
 792    if (ret) {
 793        rs->migration_dirty_pages--;
 794    }
 795    return ret;
 796}
 797
 798static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
 799                                        ram_addr_t start, ram_addr_t length)
 800{
 801    rs->migration_dirty_pages +=
 802        cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
 803                                              &rs->num_dirty_pages_period);
 804}
 805
 806/**
 807 * ram_pagesize_summary: calculate all the pagesizes of a VM
 808 *
 809 * Returns a summary bitmap of the page sizes of all RAMBlocks
 810 *
 811 * For VMs with just normal pages this is equivalent to the host page
 812 * size. If it's got some huge pages then it's the OR of all the
 813 * different page sizes.
 814 */
 815uint64_t ram_pagesize_summary(void)
 816{
 817    RAMBlock *block;
 818    uint64_t summary = 0;
 819
 820    RAMBLOCK_FOREACH(block) {
 821        summary |= block->page_size;
 822    }
 823
 824    return summary;
 825}
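/*
 * Editor's note (illustrative, not part of the original file): for a
 * guest backed by ordinary 4 KiB pages plus a 2 MiB hugetlbfs region,
 * the summary would be 0x1000 | 0x200000 == 0x201000, e.g. so the
 * destination can verify it can provide the same page sizes.
 */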
 826
 827static void migration_bitmap_sync(RAMState *rs)
 828{
 829    RAMBlock *block;
 830    int64_t end_time;
 831    uint64_t bytes_xfer_now;
 832
 833    ram_counters.dirty_sync_count++;
 834
 835    if (!rs->time_last_bitmap_sync) {
 836        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 837    }
 838
 839    trace_migration_bitmap_sync_start();
 840    memory_global_dirty_log_sync();
 841
 842    qemu_mutex_lock(&rs->bitmap_mutex);
 843    rcu_read_lock();
 844    RAMBLOCK_FOREACH(block) {
 845        migration_bitmap_sync_range(rs, block, 0, block->used_length);
 846    }
 847    rcu_read_unlock();
 848    qemu_mutex_unlock(&rs->bitmap_mutex);
 849
 850    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
 851
 852    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 853
 854    /* more than 1 second = 1000 milliseconds */
 855    if (end_time > rs->time_last_bitmap_sync + 1000) {
 856        /* calculate period counters */
 857        ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 858            / (end_time - rs->time_last_bitmap_sync);
 859        bytes_xfer_now = ram_counters.transferred;
 860
 861        /* During block migration the auto-converge logic incorrectly detects
 862         * that ram migration makes no progress. Avoid this by disabling the
 863         * throttling logic during the bulk phase of block migration. */
 864        if (migrate_auto_converge() && !blk_mig_bulk_active()) {
 865            /* The following detection logic can be refined later. For now:
 866               check whether the bytes dirtied in this period exceed half of
 867               the approximate amount of bytes that were transferred since the
 868               last time we were in this routine. If that happens twice in a
 869               row, start or increase throttling. */
 870
 871            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
 872                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
 873                (++rs->dirty_rate_high_cnt >= 2)) {
 874                    trace_migration_throttle();
 875                    rs->dirty_rate_high_cnt = 0;
 876                    mig_throttle_guest_down();
 877            }
 878        }
 879
 880        if (migrate_use_xbzrle()) {
 881            if (rs->iterations_prev != rs->iterations) {
 882                xbzrle_counters.cache_miss_rate =
 883                   (double)(xbzrle_counters.cache_miss -
 884                            rs->xbzrle_cache_miss_prev) /
 885                   (rs->iterations - rs->iterations_prev);
 886            }
 887            rs->iterations_prev = rs->iterations;
 888            rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 889        }
 890
 891        /* reset period counters */
 892        rs->time_last_bitmap_sync = end_time;
 893        rs->num_dirty_pages_period = 0;
 894        rs->bytes_xfer_prev = bytes_xfer_now;
 895    }
 896    if (migrate_use_events()) {
 897        qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
 898    }
 899}
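/*
 * Editor's worked example (illustrative, not part of the original file):
 * suppose a sync period transferred 200 MiB while the guest dirtied
 * 30000 pages of 4 KiB (~117 MiB).  Since 117 MiB > 200 MiB / 2, the
 * dirty-rate-high counter is bumped; if the next period looks the same,
 * mig_throttle_guest_down() starts throttling at cpu_throttle_initial
 * percent and adds cpu_throttle_increment on every further trigger.
 */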
 900
 901/**
 902 * save_zero_page: send the zero page to the stream
 903 *
 904 * Returns the number of pages written.
 905 *
 906 * @rs: current RAM state
 907 * @block: block that contains the page we want to send
 908 * @offset: offset inside the block for the page
 909 * @p: pointer to the page
 910 */
 911static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
 912                          uint8_t *p)
 913{
 914    int pages = -1;
 915
 916    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 917        ram_counters.duplicate++;
 918        ram_counters.transferred +=
 919            save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
 920        qemu_put_byte(rs->f, 0);
 921        ram_counters.transferred += 1;
 922        pages = 1;
 923    }
 924
 925    return pages;
 926}
 927
 928static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
 929{
 930    if (!migrate_release_ram() || !migration_in_postcopy()) {
 931        return;
 932    }
 933
 934    ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
 935}
 936
 937/**
 938 * ram_save_page: send the given page to the stream
 939 *
 940 * Returns the number of pages written.
 941 *          < 0 - error
 942 *          >=0 - Number of pages written - this might legally be 0
 943 *                if xbzrle noticed the page was the same.
 944 *
 945 * @rs: current RAM state
 946 * @block: block that contains the page we want to send
 947 * @offset: offset inside the block for the page
 948 * @last_stage: if we are at the completion stage
 949 */
 950static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
 951{
 952    int pages = -1;
 953    uint64_t bytes_xmit;
 954    ram_addr_t current_addr;
 955    uint8_t *p;
 956    int ret;
 957    bool send_async = true;
 958    RAMBlock *block = pss->block;
 959    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 960
 961    p = block->host + offset;
 962    trace_ram_save_page(block->idstr, (uint64_t)offset, p);
 963
 964    /* When in doubt, send the page as a normal page */
 965    bytes_xmit = 0;
 966    ret = ram_control_save_page(rs->f, block->offset,
 967                           offset, TARGET_PAGE_SIZE, &bytes_xmit);
 968    if (bytes_xmit) {
 969        ram_counters.transferred += bytes_xmit;
 970        pages = 1;
 971    }
 972
 973    XBZRLE_cache_lock();
 974
 975    current_addr = block->offset + offset;
 976
 977    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 978        if (ret != RAM_SAVE_CONTROL_DELAYED) {
 979            if (bytes_xmit > 0) {
 980                ram_counters.normal++;
 981            } else if (bytes_xmit == 0) {
 982                ram_counters.duplicate++;
 983            }
 984        }
 985    } else {
 986        pages = save_zero_page(rs, block, offset, p);
 987        if (pages > 0) {
 988            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 989             * page would be stale
 990             */
 991            xbzrle_cache_zero_page(rs, current_addr);
 992            ram_release_pages(block->idstr, offset, pages);
 993        } else if (!rs->ram_bulk_stage &&
 994                   !migration_in_postcopy() && migrate_use_xbzrle()) {
 995            pages = save_xbzrle_page(rs, &p, current_addr, block,
 996                                     offset, last_stage);
 997            if (!last_stage) {
 998                /* Can't send this cached data async, since the cache page
 999                 * might get updated before it gets to the wire
1000                 */
1001                send_async = false;
1002            }
1003        }
1004    }
1005
1006    /* XBZRLE overflow or normal page */
1007    if (pages == -1) {
1008        ram_counters.transferred +=
1009            save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
1010        if (send_async) {
1011            qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
1012                                  migrate_release_ram() &
1013                                  migration_in_postcopy());
1014        } else {
1015            qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
1016        }
1017        ram_counters.transferred += TARGET_PAGE_SIZE;
1018        pages = 1;
1019        ram_counters.normal++;
1020    }
1021
1022    XBZRLE_cache_unlock();
1023
1024    return pages;
1025}
1026
1027static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
1028                                ram_addr_t offset)
1029{
1030    RAMState *rs = ram_state;
1031    int bytes_sent, blen;
1032    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1033
1034    bytes_sent = save_page_header(rs, f, block, offset |
1035                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
1036    blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1037                                     migrate_compress_level());
1038    if (blen < 0) {
1039        bytes_sent = 0;
1040        qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1041        error_report("compressed data failed!");
1042    } else {
1043        bytes_sent += blen;
1044        ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1045    }
1046
1047    return bytes_sent;
1048}
1049
1050static void flush_compressed_data(RAMState *rs)
1051{
1052    int idx, len, thread_count;
1053
1054    if (!migrate_use_compression()) {
1055        return;
1056    }
1057    thread_count = migrate_compress_threads();
1058
1059    qemu_mutex_lock(&comp_done_lock);
1060    for (idx = 0; idx < thread_count; idx++) {
1061        while (!comp_param[idx].done) {
1062            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1063        }
1064    }
1065    qemu_mutex_unlock(&comp_done_lock);
1066
1067    for (idx = 0; idx < thread_count; idx++) {
1068        qemu_mutex_lock(&comp_param[idx].mutex);
1069        if (!comp_param[idx].quit) {
1070            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1071            ram_counters.transferred += len;
1072        }
1073        qemu_mutex_unlock(&comp_param[idx].mutex);
1074    }
1075}
1076
1077static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1078                                       ram_addr_t offset)
1079{
1080    param->block = block;
1081    param->offset = offset;
1082}
1083
1084static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1085                                           ram_addr_t offset)
1086{
1087    int idx, thread_count, bytes_xmit = -1, pages = -1;
1088
1089    thread_count = migrate_compress_threads();
1090    qemu_mutex_lock(&comp_done_lock);
1091    while (true) {
1092        for (idx = 0; idx < thread_count; idx++) {
1093            if (comp_param[idx].done) {
1094                comp_param[idx].done = false;
1095                bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1096                qemu_mutex_lock(&comp_param[idx].mutex);
1097                set_compress_params(&comp_param[idx], block, offset);
1098                qemu_cond_signal(&comp_param[idx].cond);
1099                qemu_mutex_unlock(&comp_param[idx].mutex);
1100                pages = 1;
1101                ram_counters.normal++;
1102                ram_counters.transferred += bytes_xmit;
1103                break;
1104            }
1105        }
1106        if (pages > 0) {
1107            break;
1108        } else {
1109            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1110        }
1111    }
1112    qemu_mutex_unlock(&comp_done_lock);
1113
1114    return pages;
1115}
1116
1117/**
1118 * ram_save_compressed_page: compress the given page and send it to the stream
1119 *
1120 * Returns the number of pages written.
1121 *
1122 * @rs: current RAM state
1123 * @block: block that contains the page we want to send
1124 * @offset: offset inside the block for the page
1125 * @last_stage: if we are at the completion stage
1126 */
1127static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
1128                                    bool last_stage)
1129{
1130    int pages = -1;
1131    uint64_t bytes_xmit = 0;
1132    uint8_t *p;
1133    int ret, blen;
1134    RAMBlock *block = pss->block;
1135    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1136
1137    p = block->host + offset;
1138
1139    ret = ram_control_save_page(rs->f, block->offset,
1140                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
1141    if (bytes_xmit) {
1142        ram_counters.transferred += bytes_xmit;
1143        pages = 1;
1144    }
1145    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
1146        if (ret != RAM_SAVE_CONTROL_DELAYED) {
1147            if (bytes_xmit > 0) {
1148                ram_counters.normal++;
1149            } else if (bytes_xmit == 0) {
1150                ram_counters.duplicate++;
1151            }
1152        }
1153    } else {
1154        /* When starting the process of a new block, the first page of
1155         * the block should be sent out before other pages in the same
1156         * block, and all the pages of the last block should have been
1157         * sent out.  Keeping this order is important, because the 'cont'
1158         * flag is used to avoid resending the block name.
1159         */
1160        if (block != rs->last_sent_block) {
1161            flush_compressed_data(rs);
1162            pages = save_zero_page(rs, block, offset, p);
1163            if (pages == -1) {
1164                /* Make sure the first page is sent out before other pages */
1165                bytes_xmit = save_page_header(rs, rs->f, block, offset |
1166                                              RAM_SAVE_FLAG_COMPRESS_PAGE);
1167                blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1168                                                 migrate_compress_level());
1169                if (blen > 0) {
1170                    ram_counters.transferred += bytes_xmit + blen;
1171                    ram_counters.normal++;
1172                    pages = 1;
1173                } else {
1174                    qemu_file_set_error(rs->f, blen);
1175                    error_report("compressed data failed!");
1176                }
1177            }
1178            if (pages > 0) {
1179                ram_release_pages(block->idstr, offset, pages);
1180            }
1181        } else {
1182            pages = save_zero_page(rs, block, offset, p);
1183            if (pages == -1) {
1184                pages = compress_page_with_multi_thread(rs, block, offset);
1185            } else {
1186                ram_release_pages(block->idstr, offset, pages);
1187            }
1188        }
1189    }
1190
1191    return pages;
1192}
1193
1194/**
1195 * find_dirty_block: find the next dirty page and update any state
1196 * associated with the search process.
1197 *
1198 * Returns true if a page was found
1199 *
1200 * @rs: current RAM state
1201 * @pss: data about the state of the current dirty page scan
1202 * @again: set to false if the search has scanned the whole of RAM
1203 */
1204static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1205{
1206    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1207    if (pss->complete_round && pss->block == rs->last_seen_block &&
1208        pss->page >= rs->last_page) {
1209        /*
1210         * We've been once around the RAM and haven't found anything.
1211         * Give up.
1212         */
1213        *again = false;
1214        return false;
1215    }
1216    if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1217        /* Didn't find anything in this RAM Block */
1218        pss->page = 0;
1219        pss->block = QLIST_NEXT_RCU(pss->block, next);
1220        if (!pss->block) {
1221            /* Hit the end of the list */
1222            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1223            /* Flag that we've looped */
1224            pss->complete_round = true;
1225            rs->ram_bulk_stage = false;
1226            if (migrate_use_xbzrle()) {
1227                /* If xbzrle is on, stop using the data compression at this
1228                 * point. In theory, xbzrle can do better than compression.
1229                 */
1230                flush_compressed_data(rs);
1231            }
1232        }
1233        /* Didn't find anything this time, but try again on the new block */
1234        *again = true;
1235        return false;
1236    } else {
1237        /* Can go around again, but... */
1238        *again = true;
1239        /* We've found something so probably don't need to */
1240        return true;
1241    }
1242}
1243
1244/**
1245 * unqueue_page: gets a page off the queue
1246 *
1247 * Helper for 'get_queued_page' - gets a page off the queue
1248 *
1249 * Returns the block of the page (or NULL if none available)
1250 *
1251 * @rs: current RAM state
1252 * @offset: used to return the offset within the RAMBlock
1253 */
1254static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1255{
1256    RAMBlock *block = NULL;
1257
1258    qemu_mutex_lock(&rs->src_page_req_mutex);
1259    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1260        struct RAMSrcPageRequest *entry =
1261                                QSIMPLEQ_FIRST(&rs->src_page_requests);
1262        block = entry->rb;
1263        *offset = entry->offset;
1264
1265        if (entry->len > TARGET_PAGE_SIZE) {
1266            entry->len -= TARGET_PAGE_SIZE;
1267            entry->offset += TARGET_PAGE_SIZE;
1268        } else {
1269            memory_region_unref(block->mr);
1270            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1271            g_free(entry);
1272        }
1273    }
1274    qemu_mutex_unlock(&rs->src_page_req_mutex);
1275
1276    return block;
1277}
1278
1279/**
1280 * get_queued_page: unqueue a page from the postcopy requests
1281 *
1282 * Skips pages that are already sent (!dirty)
1283 *
1284 * Returns true if a queued page was found
1285 *
1286 * @rs: current RAM state
1287 * @pss: data about the state of the current dirty page scan
1288 */
1289static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1290{
1291    RAMBlock  *block;
1292    ram_addr_t offset;
1293    bool dirty;
1294
1295    do {
1296        block = unqueue_page(rs, &offset);
1297        /*
1298         * We're sending this page, and since it's postcopy nothing else
1299         * will dirty it, and we must make sure it doesn't get sent again
1300         * even if this queue request was received after the background
1301         * search already sent it.
1302         */
1303        if (block) {
1304            unsigned long page;
1305
1306            page = offset >> TARGET_PAGE_BITS;
1307            dirty = test_bit(page, block->bmap);
1308            if (!dirty) {
1309                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1310                       page, test_bit(page, block->unsentmap));
1311            } else {
1312                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1313            }
1314        }
1315
1316    } while (block && !dirty);
1317
1318    if (block) {
1319        /*
1320         * As soon as we start servicing pages out of order, we have
1321         * to kill the bulk stage, since the bulk stage assumes
1322         * (in migration_bitmap_find_dirty) that every page is
1323         * dirty, and that is no longer true.
1324         */
1325        rs->ram_bulk_stage = false;
1326
1327        /*
1328         * We want the background search to continue from the queued page
1329         * since the guest is likely to want other pages near to the page
1330         * it just requested.
1331         */
1332        pss->block = block;
1333        pss->page = offset >> TARGET_PAGE_BITS;
1334    }
1335
1336    return !!block;
1337}
1338
1339/**
1340 * migration_page_queue_free: drop any remaining pages in the ram
1341 * request queue
1342 *
1343 * It should be empty at the end anyway, but in error cases there may
1344 * be some left.  in case that there is any page left, we drop it.
1345 *
1346 */
1347static void migration_page_queue_free(RAMState *rs)
1348{
1349    struct RAMSrcPageRequest *mspr, *next_mspr;
1350    /* This queue should generally be empty - but in the case of a failed
1351     * migration it might have some leftover entries.
1352     */
1353    rcu_read_lock();
1354    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1355        memory_region_unref(mspr->rb->mr);
1356        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1357        g_free(mspr);
1358    }
1359    rcu_read_unlock();
1360}
1361
1362/**
1363 * ram_save_queue_pages: queue the page for transmission
1364 *
1365 * A request from postcopy destination for example.
1366 *
1367 * Returns zero on success or negative on error
1368 *
1369 * @rbname: Name of the RAMBlock of the request. NULL means the
1370 *          same as the last one.
1371 * @start: starting address from the start of the RAMBlock
1372 * @len: length (in bytes) to send
1373 */
1374int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1375{
1376    RAMBlock *ramblock;
1377    RAMState *rs = ram_state;
1378
1379    ram_counters.postcopy_requests++;
1380    rcu_read_lock();
1381    if (!rbname) {
1382        /* Reuse last RAMBlock */
1383        ramblock = rs->last_req_rb;
1384
1385        if (!ramblock) {
1386            /*
1387             * Shouldn't happen, we can't reuse the last RAMBlock if
1388             * it's the 1st request.
1389             */
1390            error_report("ram_save_queue_pages no previous block");
1391            goto err;
1392        }
1393    } else {
1394        ramblock = qemu_ram_block_by_name(rbname);
1395
1396        if (!ramblock) {
1397            /* We shouldn't be asked for a non-existent RAMBlock */
1398            error_report("ram_save_queue_pages no block '%s'", rbname);
1399            goto err;
1400        }
1401        rs->last_req_rb = ramblock;
1402    }
1403    trace_ram_save_queue_pages(ramblock->idstr, start, len);
1404    if (start+len > ramblock->used_length) {
1405        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1406                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1407                     __func__, start, len, ramblock->used_length);
1408        goto err;
1409    }
1410
1411    struct RAMSrcPageRequest *new_entry =
1412        g_malloc0(sizeof(struct RAMSrcPageRequest));
1413    new_entry->rb = ramblock;
1414    new_entry->offset = start;
1415    new_entry->len = len;
1416
1417    memory_region_ref(ramblock->mr);
1418    qemu_mutex_lock(&rs->src_page_req_mutex);
1419    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1420    qemu_mutex_unlock(&rs->src_page_req_mutex);
1421    rcu_read_unlock();
1422
1423    return 0;
1424
1425err:
1426    rcu_read_unlock();
1427    return -1;
1428}
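/*
 * Editor's note (illustrative, not part of the original file): a queued
 * request is consumed one target page at a time by unqueue_page(), so a
 * 16 KiB request at offset 0x200000 with 4 KiB target pages is handed
 * out as four pages at 0x200000, 0x201000, 0x202000 and 0x203000 before
 * the entry is freed and the RAMBlock reference dropped.
 */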
1429
1430/**
1431 * ram_save_target_page: save one target page
1432 *
1433 * Returns the number of pages written
1434 *
1435 * @rs: current RAM state
1436 * @ms: current migration state
1437 * @pss: data about the page we want to send
1438 * @last_stage: if we are at the completion stage
1439 */
1440static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1441                                bool last_stage)
1442{
1443    int res = 0;
1444
1445    /* Check whether the page is dirty and, if so, send it */
1446    if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1447        /*
1448         * If xbzrle is on, stop using the data compression after first
1449         * round of migration even if compression is enabled. In theory,
1450         * xbzrle can do better than compression.
1451         */
1452        if (migrate_use_compression() &&
1453            (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1454            res = ram_save_compressed_page(rs, pss, last_stage);
1455        } else {
1456            res = ram_save_page(rs, pss, last_stage);
1457        }
1458
1459        if (res < 0) {
1460            return res;
1461        }
1462        if (pss->block->unsentmap) {
1463            clear_bit(pss->page, pss->block->unsentmap);
1464        }
1465    }
1466
1467    return res;
1468}
1469
1470/**
1471 * ram_save_host_page: save a whole host page
1472 *
1473 * Starting at pss->page, send pages up to the end of the current host
1474 * page. It's valid for the initial offset to point into the middle of
1475 * a host page, in which case the remainder of the host page is sent.
1476 * Only dirty target pages are sent. Note that the host page size may
1477 * be a huge page for this block.
1478 * The saving stops at the boundary of the used_length of the block
1479 * if the RAMBlock isn't a multiple of the host page size.
1480 *
1481 * Returns the number of pages written or negative on error
1482 *
1483 * @rs: current RAM state
1484 * @ms: current migration state
1485 * @pss: data about the page we want to send
1486 * @last_stage: if we are at the completion stage
1487 */
1488static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1489                              bool last_stage)
1490{
1491    int tmppages, pages = 0;
1492    size_t pagesize_bits =
1493        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1494
1495    do {
1496        tmppages = ram_save_target_page(rs, pss, last_stage);
1497        if (tmppages < 0) {
1498            return tmppages;
1499        }
1500
1501        pages += tmppages;
1502        pss->page++;
1503    } while ((pss->page & (pagesize_bits - 1)) &&
1504             offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1505
1506    /* The offset we leave with is the last one we looked at */
1507    pss->page--;
1508    return pages;
1509}
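/*
 * Editor's worked example (illustrative, not part of the original file):
 * with a 2 MiB hugepage-backed block and 4 KiB target pages,
 * pagesize_bits is 512.  If pss->page enters at index 130, the loop
 * walks indexes 130..511 (sending only the ones still dirty) and leaves
 * pss->page at 511, the last index it looked at.
 */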
1510
1511/**
1512 * ram_find_and_save_block: finds a dirty page and sends it to f
1513 *
1514 * Called within an RCU critical section.
1515 *
1516 * Returns the number of pages written where zero means no dirty pages
1517 *
1518 * @rs: current RAM state
1519 * @last_stage: if we are at the completion stage
1520 *
1521 * On systems where host-page-size > target-page-size it will send all the
1522 * pages in a host page that are dirty.
1523 */
1524
1525static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1526{
1527    PageSearchStatus pss;
1528    int pages = 0;
1529    bool again, found;
1530
1531    /* No dirty page as there is zero RAM */
1532    if (!ram_bytes_total()) {
1533        return pages;
1534    }
1535
1536    pss.block = rs->last_seen_block;
1537    pss.page = rs->last_page;
1538    pss.complete_round = false;
1539
1540    if (!pss.block) {
1541        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1542    }
1543
1544    do {
1545        again = true;
1546        found = get_queued_page(rs, &pss);
1547
1548        if (!found) {
1549            /* priority queue empty, so just search for something dirty */
1550            found = find_dirty_block(rs, &pss, &again);
1551        }
1552
1553        if (found) {
1554            pages = ram_save_host_page(rs, &pss, last_stage);
1555        }
1556    } while (!pages && again);
1557
1558    rs->last_seen_block = pss.block;
1559    rs->last_page = pss.page;
1560
1561    return pages;
1562}
1563
1564void acct_update_position(QEMUFile *f, size_t size, bool zero)
1565{
1566    uint64_t pages = size / TARGET_PAGE_SIZE;
1567
1568    if (zero) {
1569        ram_counters.duplicate += pages;
1570    } else {
1571        ram_counters.normal += pages;
1572        ram_counters.transferred += size;
1573        qemu_update_position(f, size);
1574    }
1575}
1576
1577uint64_t ram_bytes_total(void)
1578{
1579    RAMBlock *block;
1580    uint64_t total = 0;
1581
1582    rcu_read_lock();
1583    RAMBLOCK_FOREACH(block) {
1584        total += block->used_length;
1585    }
1586    rcu_read_unlock();
1587    return total;
1588}
1589
1590static void xbzrle_load_setup(void)
1591{
1592    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1593}
1594
1595static void xbzrle_load_cleanup(void)
1596{
1597    g_free(XBZRLE.decoded_buf);
1598    XBZRLE.decoded_buf = NULL;
1599}
1600
1601static void ram_state_cleanup(RAMState **rsp)
1602{
1603    migration_page_queue_free(*rsp);
1604    qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1605    qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1606    g_free(*rsp);
1607    *rsp = NULL;
1608}
1609
1610static void xbzrle_cleanup(void)
1611{
1612    XBZRLE_cache_lock();
1613    if (XBZRLE.cache) {
1614        cache_fini(XBZRLE.cache);
1615        g_free(XBZRLE.encoded_buf);
1616        g_free(XBZRLE.current_buf);
1617        g_free(XBZRLE.zero_target_page);
1618        XBZRLE.cache = NULL;
1619        XBZRLE.encoded_buf = NULL;
1620        XBZRLE.current_buf = NULL;
1621        XBZRLE.zero_target_page = NULL;
1622    }
1623    XBZRLE_cache_unlock();
1624}
1625
1626static void ram_save_cleanup(void *opaque)
1627{
1628    RAMState **rsp = opaque;
1629    RAMBlock *block;
1630
1631    /* The caller must hold the iothread lock or be in a bottom half, so
1632     * there is no write race against this migration bitmap.
1633     */
1634    memory_global_dirty_log_stop();
1635
1636    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1637        g_free(block->bmap);
1638        block->bmap = NULL;
1639        g_free(block->unsentmap);
1640        block->unsentmap = NULL;
1641    }
1642
1643    xbzrle_cleanup();
1644    compress_threads_save_cleanup();
1645    ram_state_cleanup(rsp);
1646}
1647
1648static void ram_state_reset(RAMState *rs)
1649{
1650    rs->last_seen_block = NULL;
1651    rs->last_sent_block = NULL;
1652    rs->last_page = 0;
1653    rs->last_version = ram_list.version;
1654    rs->ram_bulk_stage = true;
1655}
1656
1657#define MAX_WAIT 50 /* ms, half buffered_file limit */
1658
1659/*
1660 * 'expected' is the value you expect the bitmap mostly to be full
1661 * of; it won't bother printing lines that are all this value.
1662 * 'todump' is the bitmap to dump and 'pages' its length in bits.
1663 */
1664void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1665                           unsigned long pages)
1666{
1667    int64_t cur;
1668    int64_t linelen = 128;
1669    char linebuf[129];
1670
1671    for (cur = 0; cur < pages; cur += linelen) {
1672        int64_t curb;
1673        bool found = false;
1674        /*
1675         * Last line; catch the case where the line length
1676         * is longer than remaining ram
1677         */
1678        if (cur + linelen > pages) {
1679            linelen = pages - cur;
1680        }
1681        for (curb = 0; curb < linelen; curb++) {
1682            bool thisbit = test_bit(cur + curb, todump);
1683            linebuf[curb] = thisbit ? '1' : '.';
1684            found = found || (thisbit != expected);
1685        }
1686        if (found) {
1687            linebuf[curb] = '\0';
1688            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1689        }
1690    }
1691}
1692
1693/* **** functions for postcopy ***** */
1694
1695void ram_postcopy_migrated_memory_release(MigrationState *ms)
1696{
1697    struct RAMBlock *block;
1698
1699    RAMBLOCK_FOREACH(block) {
1700        unsigned long *bitmap = block->bmap;
1701        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1702        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1703
1704        while (run_start < range) {
1705            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1706            ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1707                              (run_end - run_start) << TARGET_PAGE_BITS);
1708            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1709        }
1710    }
1711}
1712
1713/**
1714 * postcopy_send_discard_bm_ram: discard a RAMBlock
1715 *
1716 * Returns zero on success
1717 *
1718 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1719 * Note: At this point the 'unsentmap' is the processed bitmap combined
1720 *       with the dirtymap; so a '1' means it's either dirty or unsent.
1721 *
1722 * @ms: current migration state
1723 * @pds: state for postcopy
1724 * @block: RAMBlock to send discards for
1726 */
1727static int postcopy_send_discard_bm_ram(MigrationState *ms,
1728                                        PostcopyDiscardState *pds,
1729                                        RAMBlock *block)
1730{
1731    unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1732    unsigned long current;
1733    unsigned long *unsentmap = block->unsentmap;
1734
1735    for (current = 0; current < end; ) {
1736        unsigned long one = find_next_bit(unsentmap, end, current);
1737
1738        if (one <= end) {
1739            unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1740            unsigned long discard_length;
1741
1742            if (zero >= end) {
1743                discard_length = end - one;
1744            } else {
1745                discard_length = zero - one;
1746            }
1747            if (discard_length) {
1748                postcopy_discard_send_range(ms, pds, one, discard_length);
1749            }
1750            current = one + discard_length;
1751        } else {
1752            current = one;
1753        }
1754    }
1755
1756    return 0;
1757}
1758
1759/**
1760 * postcopy_each_ram_send_discard: discard all RAMBlocks
1761 *
1762 * Returns 0 for success or negative for error
1763 *
1764 * Utility for the outgoing postcopy code.
1765 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1766 *   passing it bitmap indexes and name.
1767 * (qemu_ram_foreach_block ends up passing unscaled lengths
1768 *  which would mean postcopy code would have to deal with target page)
1769 *
1770 * @ms: current migration state
1771 */
1772static int postcopy_each_ram_send_discard(MigrationState *ms)
1773{
1774    struct RAMBlock *block;
1775    int ret;
1776
1777    RAMBLOCK_FOREACH(block) {
1778        PostcopyDiscardState *pds =
1779            postcopy_discard_send_init(ms, block->idstr);
1780
1781        /*
1782         * Postcopy sends chunks of bitmap over the wire, but it
1783         * just needs indexes at this point, which avoids it having
1784         * target-page-specific code.
1785         */
1786        ret = postcopy_send_discard_bm_ram(ms, pds, block);
1787        postcopy_discard_send_finish(ms, pds);
1788        if (ret) {
1789            return ret;
1790        }
1791    }
1792
1793    return 0;
1794}
1795
1796/**
1797 * postcopy_chunk_hostpages_pass: canonicalize bitmap in host pages
1798 *
1799 * Helper for postcopy_chunk_hostpages; it's called twice to
1800 * canonicalize the two bitmaps, which are similar but one of them
1801 * is inverted.
1802 *
1803 * Postcopy requires that all target pages in a hostpage are dirty or
1804 * clean, not a mix.  This function canonicalizes the bitmaps.
1805 *
1806 * @ms: current migration state
1807 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1808 *               otherwise we need to canonicalize partially dirty host pages
1809 * @block: block that contains the page we want to canonicalize
1810 * @pds: state for postcopy
1811 */
1812static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1813                                          RAMBlock *block,
1814                                          PostcopyDiscardState *pds)
1815{
1816    RAMState *rs = ram_state;
1817    unsigned long *bitmap = block->bmap;
1818    unsigned long *unsentmap = block->unsentmap;
1819    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1820    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1821    unsigned long run_start;
1822
1823    if (block->page_size == TARGET_PAGE_SIZE) {
1824        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1825        return;
1826    }
1827
1828    if (unsent_pass) {
1829        /* Find a sent page */
1830        run_start = find_next_zero_bit(unsentmap, pages, 0);
1831    } else {
1832        /* Find a dirty page */
1833        run_start = find_next_bit(bitmap, pages, 0);
1834    }
1835
1836    while (run_start < pages) {
1837        bool do_fixup = false;
1838        unsigned long fixup_start_addr;
1839        unsigned long host_offset;
1840
1841        /*
1842         * If the start of this run of pages is in the middle of a host
1843         * page, then we need to fixup this host page.
1844         */
1845        host_offset = run_start % host_ratio;
1846        if (host_offset) {
1847            do_fixup = true;
1848            run_start -= host_offset;
1849            fixup_start_addr = run_start;
1850            /* For the next pass */
1851            run_start = run_start + host_ratio;
1852        } else {
1853            /* Find the end of this run */
1854            unsigned long run_end;
1855            if (unsent_pass) {
1856                run_end = find_next_bit(unsentmap, pages, run_start + 1);
1857            } else {
1858                run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1859            }
1860            /*
1861             * If the end isn't at the start of a host page, then the
1862             * run doesn't finish at the end of a host page
1863             * and we need to discard.
1864             */
1865            host_offset = run_end % host_ratio;
1866            if (host_offset) {
1867                do_fixup = true;
1868                fixup_start_addr = run_end - host_offset;
1869                /*
1870                 * This host page has gone, the next loop iteration starts
1871                 * from after the fixup
1872                 */
1873                run_start = fixup_start_addr + host_ratio;
1874            } else {
1875                /*
1876                 * No discards on this iteration, next loop starts from
1877                 * next sent/dirty page
1878                 */
1879                run_start = run_end + 1;
1880            }
1881        }
1882
1883        if (do_fixup) {
1884            unsigned long page;
1885
1886            /* Tell the destination to discard this page */
1887            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1888                /* For the unsent_pass we:
1889                 *     discard partially sent pages
1890                 * For the !unsent_pass (dirty) we:
1891                 *     discard partially dirty pages that were sent
1892                 *     (any partially sent pages were already discarded
1893                 *     by the previous unsent_pass)
1894                 */
1895                postcopy_discard_send_range(ms, pds, fixup_start_addr,
1896                                            host_ratio);
1897            }
1898
1899            /* Clean up the bitmap */
1900            for (page = fixup_start_addr;
1901                 page < fixup_start_addr + host_ratio; page++) {
1902                /* All pages in this host page are now not sent */
1903                set_bit(page, unsentmap);
1904
1905                /*
1906                 * Remark them as dirty, updating the count for any pages
1907                 * that weren't previously dirty.
1908                 */
1909                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1910            }
1911        }
1912
1913        if (unsent_pass) {
1914            /* Find the next sent page for the next iteration */
1915            run_start = find_next_zero_bit(unsentmap, pages, run_start);
1916        } else {
1917            /* Find the next dirty page for the next iteration */
1918            run_start = find_next_bit(bitmap, pages, run_start);
1919        }
1920    }
1921}
1922
1923/**
1924 * postcopy_chunk_hostpages: discard any partially sent host page
1925 *
1926 * Utility for the outgoing postcopy code.
1927 *
1928 * Discard any partially sent host-page size chunks, mark any partially
1929 * dirty host-page size chunks as all dirty.  Here the host page size is
1930 * that of the particular RAMBlock, i.e. it might be a huge page.
1931 *
1932 * Returns zero on success
1933 *
1934 * @ms: current migration state
1935 * @block: block we want to work with
1936 */
1937static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1938{
1939    PostcopyDiscardState *pds =
1940        postcopy_discard_send_init(ms, block->idstr);
1941
1942    /* First pass: Discard all partially sent host pages */
1943    postcopy_chunk_hostpages_pass(ms, true, block, pds);
1944    /*
1945     * Second pass: Ensure that all partially dirty host pages are made
1946     * fully dirty.
1947     */
1948    postcopy_chunk_hostpages_pass(ms, false, block, pds);
1949
1950    postcopy_discard_send_finish(ms, pds);
1951    return 0;
1952}
1953
1954/**
1955 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1956 *
1957 * Returns zero on success
1958 *
1959 * Transmit the set of pages to be discarded after precopy to the target;
1960 * these are pages that:
1961 *     a) have been previously transmitted but are now dirty again
1962 *     b) have never been transmitted; this ensures that any pages on the
1963 *        destination that have been mapped by background tasks get
1964 *        discarded (transparent huge pages are the specific concern)
1965 * Hopefully this is pretty sparse
1966 *
1967 * @ms: current migration state
1968 */
1969int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1970{
1971    RAMState *rs = ram_state;
1972    RAMBlock *block;
1973    int ret;
1974
1975    rcu_read_lock();
1976
1977    /* This should be our last sync, the src is now paused */
1978    migration_bitmap_sync(rs);
1979
1980    /* Easiest way to make sure we don't resume in the middle of a host-page */
1981    rs->last_seen_block = NULL;
1982    rs->last_sent_block = NULL;
1983    rs->last_page = 0;
1984
1985    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1986        unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1987        unsigned long *bitmap = block->bmap;
1988        unsigned long *unsentmap = block->unsentmap;
1989
1990        if (!unsentmap) {
1991            /* We don't have a safe way to resize the unsentmap, so
1992             * if the bitmap was resized it will be NULL at this
1993             * point.
1994             */
1995            error_report("migration ram resized during precopy phase");
1996            rcu_read_unlock();
1997            return -EINVAL;
1998        }
1999        /* Deal with TPS != HPS and huge pages */
2000        ret = postcopy_chunk_hostpages(ms, block);
2001        if (ret) {
2002            rcu_read_unlock();
2003            return ret;
2004        }
2005
2006        /*
2007         * Update the unsentmap to be unsentmap = unsentmap | dirty
2008         */
2009        bitmap_or(unsentmap, unsentmap, bitmap, pages);
2010#ifdef DEBUG_POSTCOPY
2011        ram_debug_dump_bitmap(unsentmap, true, pages);
2012#endif
2013    }
2014    trace_ram_postcopy_send_discard_bitmap();
2015
2016    ret = postcopy_each_ram_send_discard(ms);
2017    rcu_read_unlock();
2018
2019    return ret;
2020}
2021
2022/**
2023 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2024 *
2025 * Returns zero on success
2026 *
2027 * @rbname: name of the RAMBlock of the request. NULL means the
2028 *          same as the last one.
2029 * @start: byte offset of the range within the RAMBlock
2030 * @length: length in bytes of the range to discard
2031 */
2032int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2033{
2034    int ret = -1;
2035
2036    trace_ram_discard_range(rbname, start, length);
2037
2038    rcu_read_lock();
2039    RAMBlock *rb = qemu_ram_block_by_name(rbname);
2040
2041    if (!rb) {
2042        error_report("ram_discard_range: Failed to find block '%s'", rbname);
2043        goto err;
2044    }
2045
2046    bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2047                 length >> qemu_target_page_bits());
2048    ret = ram_block_discard_range(rb, start, length);
2049
2050err:
2051    rcu_read_unlock();
2052
2053    return ret;
2054}
2055
2056/*
2057 * For every allocation here we try not to crash the VM if the
2058 * allocation fails.
2059 */
2060static int xbzrle_init(void)
2061{
2062    Error *local_err = NULL;
2063
2064    if (!migrate_use_xbzrle()) {
2065        return 0;
2066    }
2067
2068    XBZRLE_cache_lock();
2069
2070    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2071    if (!XBZRLE.zero_target_page) {
2072        error_report("%s: Error allocating zero page", __func__);
2073        goto err_out;
2074    }
2075
2076    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2077                              TARGET_PAGE_SIZE, &local_err);
2078    if (!XBZRLE.cache) {
2079        error_report_err(local_err);
2080        goto free_zero_page;
2081    }
2082
2083    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2084    if (!XBZRLE.encoded_buf) {
2085        error_report("%s: Error allocating encoded_buf", __func__);
2086        goto free_cache;
2087    }
2088
2089    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2090    if (!XBZRLE.current_buf) {
2091        error_report("%s: Error allocating current_buf", __func__);
2092        goto free_encoded_buf;
2093    }
2094
2095    /* We are all good */
2096    XBZRLE_cache_unlock();
2097    return 0;
2098
2099free_encoded_buf:
2100    g_free(XBZRLE.encoded_buf);
2101    XBZRLE.encoded_buf = NULL;
2102free_cache:
2103    cache_fini(XBZRLE.cache);
2104    XBZRLE.cache = NULL;
2105free_zero_page:
2106    g_free(XBZRLE.zero_target_page);
2107    XBZRLE.zero_target_page = NULL;
2108err_out:
2109    XBZRLE_cache_unlock();
2110    return -ENOMEM;
2111}
2112
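    /**
     * ram_state_init: allocate and initialize the RAMState
     *
     * Returns 0 on success and -1 if the allocation fails
     *
     * The dirty page count starts at the total number of RAM pages, since
     * everything is considered dirty before the first bitmap sync.
     *
     * @rsp: where to store the new RAMState pointer
     */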
2113static int ram_state_init(RAMState **rsp)
2114{
2115    *rsp = g_try_new0(RAMState, 1);
2116
2117    if (!*rsp) {
2118        error_report("%s: Init ramstate fail", __func__);
2119        return -1;
2120    }
2121
2122    qemu_mutex_init(&(*rsp)->bitmap_mutex);
2123    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2124    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2125
2126    /*
2127     * Count the total number of pages used by ram blocks not including any
2128     * gaps due to alignment or unplugs.
2129     */
2130    (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2131
2132    ram_state_reset(*rsp);
2133
2134    return 0;
2135}
2136
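    /**
     * ram_list_init_bitmaps: allocate the per-RAMBlock migration bitmaps
     *
     * Every page starts out dirty (and unsent when postcopy is enabled)
     * so that the bulk phase sends all of RAM.
     */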
2137static void ram_list_init_bitmaps(void)
2138{
2139    RAMBlock *block;
2140    unsigned long pages;
2141
2142    /* Skip setting bitmap if there is no RAM */
2143    if (ram_bytes_total()) {
2144        QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2145            pages = block->max_length >> TARGET_PAGE_BITS;
2146            block->bmap = bitmap_new(pages);
2147            bitmap_set(block->bmap, 0, pages);
2148            if (migrate_postcopy_ram()) {
2149                block->unsentmap = bitmap_new(pages);
2150                bitmap_set(block->unsentmap, 0, pages);
2151            }
2152        }
2153    }
2154}
2155
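    /**
     * ram_init_bitmaps: set up dirty tracking on the outgoing side
     *
     * Takes the iothread, ramlist and RCU locks, allocates the bitmaps,
     * starts global dirty logging and performs the first bitmap sync.
     *
     * @rs: current RAM state
     */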
2156static void ram_init_bitmaps(RAMState *rs)
2157{
2158    /* For memory_global_dirty_log_start below.  */
2159    qemu_mutex_lock_iothread();
2160    qemu_mutex_lock_ramlist();
2161    rcu_read_lock();
2162
2163    ram_list_init_bitmaps();
2164    memory_global_dirty_log_start();
2165    migration_bitmap_sync(rs);
2166
2167    rcu_read_unlock();
2168    qemu_mutex_unlock_ramlist();
2169    qemu_mutex_unlock_iothread();
2170}
2171
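    /**
     * ram_init_all: initialize everything needed to save RAM
     *
     * Returns 0 on success and -1 on error; a failed XBZRLE init rolls
     * back the RAMState allocation.
     *
     * @rsp: where to store the new RAMState pointer
     */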
2172static int ram_init_all(RAMState **rsp)
2173{
2174    if (ram_state_init(rsp)) {
2175        return -1;
2176    }
2177
2178    if (xbzrle_init()) {
2179        ram_state_cleanup(rsp);
2180        return -1;
2181    }
2182
2183    ram_init_bitmaps(*rsp);
2184
2185    return 0;
2186}
2187
2188/*
2189 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
2190 * long-running RCU critical section.  When RCU reclaims in the code
2191 * start to become numerous it will be necessary to reduce the
2192 * granularity of these critical sections.
2193 */
2194
2195/**
2196 * ram_save_setup: Setup RAM for migration
2197 *
2198 * Returns zero to indicate success and negative for error
2199 *
2200 * @f: QEMUFile where to send the data
2201 * @opaque: RAMState pointer
2202 */
2203static int ram_save_setup(QEMUFile *f, void *opaque)
2204{
2205    RAMState **rsp = opaque;
2206    RAMBlock *block;
2207
2208    /* migration has already setup the bitmap, reuse it. */
2209    if (!migration_in_colo_state()) {
2210        if (ram_init_all(rsp) != 0) {
2211            return -1;
2212        }
2213    }
2214    (*rsp)->f = f;
2215
2216    rcu_read_lock();
2217
2218    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2219
2220    RAMBLOCK_FOREACH(block) {
2221        qemu_put_byte(f, strlen(block->idstr));
2222        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2223        qemu_put_be64(f, block->used_length);
2224        if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2225            qemu_put_be64(f, block->page_size);
2226        }
2227    }
2228
2229    rcu_read_unlock();
2230    compress_threads_save_setup();
2231
2232    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2233    ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2234
2235    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2236
2237    return 0;
2238}
2239
2240/**
2241 * ram_save_iterate: iterative stage for migration
2242 *
2243 * Returns zero to indicate success and negative for error
2244 *
2245 * @f: QEMUFile where to send the data
2246 * @opaque: RAMState pointer
2247 */
2248static int ram_save_iterate(QEMUFile *f, void *opaque)
2249{
2250    RAMState **temp = opaque;
2251    RAMState *rs = *temp;
2252    int ret;
2253    int i;
2254    int64_t t0;
2255    int done = 0;
2256
2257    rcu_read_lock();
2258    if (ram_list.version != rs->last_version) {
2259        ram_state_reset(rs);
2260    }
2261
2262    /* Read version before ram_list.blocks */
2263    smp_rmb();
2264
2265    ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2266
2267    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2268    i = 0;
2269    while ((ret = qemu_file_rate_limit(f)) == 0) {
2270        int pages;
2271
2272        pages = ram_find_and_save_block(rs, false);
2273        /* no more pages to send */
2274        if (pages == 0) {
2275            done = 1;
2276            break;
2277        }
2278        rs->iterations++;
2279
2280        /* we want to check in the 1st loop, just in case it was the 1st time
2281           and we had to sync the dirty bitmap.
2282           qemu_clock_get_ns() is a bit expensive, so we only check once
2283           every few iterations.
2284        */
2285        if ((i & 63) == 0) {
2286            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2287            if (t1 > MAX_WAIT) {
2288                trace_ram_save_iterate_big_wait(t1, i);
2289                break;
2290            }
2291        }
2292        i++;
2293    }
2294    flush_compressed_data(rs);
2295    rcu_read_unlock();
2296
2297    /*
2298     * Must occur before EOS (or any QEMUFile operation)
2299     * because of RDMA protocol.
2300     */
2301    ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2302
2303    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2304    ram_counters.transferred += 8;
2305
2306    ret = qemu_file_get_error(f);
2307    if (ret < 0) {
2308        return ret;
2309    }
2310
2311    return done;
2312}
2313
2314/**
2315 * ram_save_complete: function called to send the remaining amount of ram
2316 *
2317 * Returns zero to indicate success
2318 *
2319 * Called with iothread lock
2320 *
2321 * @f: QEMUFile where to send the data
2322 * @opaque: RAMState pointer
2323 */
2324static int ram_save_complete(QEMUFile *f, void *opaque)
2325{
2326    RAMState **temp = opaque;
2327    RAMState *rs = *temp;
2328
2329    rcu_read_lock();
2330
2331    if (!migration_in_postcopy()) {
2332        migration_bitmap_sync(rs);
2333    }
2334
2335    ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2336
2337    /* try transferring iterative blocks of memory */
2338
2339    /* flush all remaining blocks regardless of rate limiting */
2340    while (true) {
2341        int pages;
2342
2343        pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2344        /* no more blocks to send */
2345        if (pages == 0) {
2346            break;
2347        }
2348    }
2349
2350    flush_compressed_data(rs);
2351    ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2352
2353    rcu_read_unlock();
2354
2355    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2356
2357    return 0;
2358}
2359
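    /**
     * ram_save_pending: estimate how much data remains to be sent
     *
     * When not yet in postcopy and the estimate drops below @max_size,
     * the dirty bitmap is re-synced (under the iothread lock) so the
     * decision to stop the guest is based on fresh numbers.  The result
     * is reported as postcopiable when postcopy RAM is enabled, otherwise
     * as precopy-only.
     *
     * @f: QEMUFile where to send the data
     * @opaque: RAMState pointer
     * @max_size: threshold below which the bitmap is re-synced
     * @non_postcopiable_pending: data that must be sent before stopping
     * @postcopiable_pending: data that can still be sent during postcopy
     */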
2360static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2361                             uint64_t *non_postcopiable_pending,
2362                             uint64_t *postcopiable_pending)
2363{
2364    RAMState **temp = opaque;
2365    RAMState *rs = *temp;
2366    uint64_t remaining_size;
2367
2368    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2369
2370    if (!migration_in_postcopy() &&
2371        remaining_size < max_size) {
2372        qemu_mutex_lock_iothread();
2373        rcu_read_lock();
2374        migration_bitmap_sync(rs);
2375        rcu_read_unlock();
2376        qemu_mutex_unlock_iothread();
2377        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2378    }
2379
2380    if (migrate_postcopy_ram()) {
2381        /* We can do postcopy, and all the data is postcopiable */
2382        *postcopiable_pending += remaining_size;
2383    } else {
2384        *non_postcopiable_pending += remaining_size;
2385    }
2386}
2387
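    /**
     * load_xbzrle: decode one XBZRLE page from the stream
     *
     * Returns 0 on success and -1 on a bad header, oversized length or
     * decode failure
     *
     * @f: QEMUFile to read from
     * @addr: address of the page being loaded
     * @host: host address to decode the page into
     */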
2388static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2389{
2390    unsigned int xh_len;
2391    int xh_flags;
2392    uint8_t *loaded_data;
2393
2394    /* extract RLE header */
2395    xh_flags = qemu_get_byte(f);
2396    xh_len = qemu_get_be16(f);
2397
2398    if (xh_flags != ENCODING_FLAG_XBZRLE) {
2399        error_report("Failed to load XBZRLE page - wrong compression!");
2400        return -1;
2401    }
2402
2403    if (xh_len > TARGET_PAGE_SIZE) {
2404        error_report("Failed to load XBZRLE page - len overflow!");
2405        return -1;
2406    }
2407    loaded_data = XBZRLE.decoded_buf;
2408    /* load data and decode */
2409    /* it can change loaded_data to point to an internal buffer */
2410    qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2411
2412    /* decode RLE */
2413    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2414                             TARGET_PAGE_SIZE) == -1) {
2415        error_report("Failed to load XBZRLE page - decode error!");
2416        return -1;
2417    }
2418
2419    return 0;
2420}
2421
2422/**
2423 * ram_block_from_stream: read a RAMBlock id from the migration stream
2424 *
2425 * Must be called from within a rcu critical section.
2426 *
2427 * Returns a pointer from within the RCU-protected ram_list.
2428 *
2429 * @f: QEMUFile where to read the data from
2430 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2431 */
2432static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2433{
2434    static RAMBlock *block = NULL;
2435    char id[256];
2436    uint8_t len;
2437
2438    if (flags & RAM_SAVE_FLAG_CONTINUE) {
2439        if (!block) {
2440            error_report("Ack, bad migration stream!");
2441            return NULL;
2442        }
2443        return block;
2444    }
2445
2446    len = qemu_get_byte(f);
2447    qemu_get_buffer(f, (uint8_t *)id, len);
2448    id[len] = 0;
2449
2450    block = qemu_ram_block_by_name(id);
2451    if (!block) {
2452        error_report("Can't find block %s", id);
2453        return NULL;
2454    }
2455
2456    return block;
2457}
2458
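    /* Return the host pointer for @offset within @block, or NULL if the
     * offset lies outside the block's used length.
     */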
2459static inline void *host_from_ram_block_offset(RAMBlock *block,
2460                                               ram_addr_t offset)
2461{
2462    if (!offset_in_ramblock(block, offset)) {
2463        return NULL;
2464    }
2465
2466    return block->host + offset;
2467}
2468
2469/**
2470 * ram_handle_compressed: handle the zero page case
2471 *
2472 * If a page (or a whole RDMA chunk) has been
2473 * determined to be zero, then zap it.
2474 *
2475 * @host: host address for the zero page
2476 * @ch: the byte the page is filled with.  We only support zero
2477 * @size: size of the zero page
2478 */
2479void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2480{
2481    if (ch != 0 || !is_zero_range(host, size)) {
2482        memset(host, ch, size);
2483    }
2484}
2485
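    /**
     * do_data_decompress: decompression worker thread
     *
     * Waits on its DecompressParam for work, inflates the compressed
     * buffer straight into guest memory, then marks itself done and
     * signals the waiter.  Exits when 'quit' is set.
     *
     * @opaque: DecompressParam for this thread
     */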
2486static void *do_data_decompress(void *opaque)
2487{
2488    DecompressParam *param = opaque;
2489    unsigned long pagesize;
2490    uint8_t *des;
2491    int len;
2492
2493    qemu_mutex_lock(&param->mutex);
2494    while (!param->quit) {
2495        if (param->des) {
2496            des = param->des;
2497            len = param->len;
2498            param->des = 0;
2499            qemu_mutex_unlock(&param->mutex);
2500
2501            pagesize = TARGET_PAGE_SIZE;
2502            /* uncompress() can fail in some cases, especially when the
2503             * page was dirtied while it was being compressed.  That is
2504             * not a problem because the dirty page will be retransmitted
2505             * and uncompress() won't corrupt the data in other pages.
2506             */
2507            uncompress((Bytef *)des, &pagesize,
2508                       (const Bytef *)param->compbuf, len);
2509
2510            qemu_mutex_lock(&decomp_done_lock);
2511            param->done = true;
2512            qemu_cond_signal(&decomp_done_cond);
2513            qemu_mutex_unlock(&decomp_done_lock);
2514
2515            qemu_mutex_lock(&param->mutex);
2516        } else {
2517            qemu_cond_wait(&param->cond, &param->mutex);
2518        }
2519    }
2520    qemu_mutex_unlock(&param->mutex);
2521
2522    return NULL;
2523}
2524
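    /* Wait until every decompression thread has finished its current page;
     * no-op when compression is not in use.
     */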
2525static void wait_for_decompress_done(void)
2526{
2527    int idx, thread_count;
2528
2529    if (!migrate_use_compression()) {
2530        return;
2531    }
2532
2533    thread_count = migrate_decompress_threads();
2534    qemu_mutex_lock(&decomp_done_lock);
2535    for (idx = 0; idx < thread_count; idx++) {
2536        while (!decomp_param[idx].done) {
2537            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2538        }
2539    }
2540    qemu_mutex_unlock(&decomp_done_lock);
2541}
2542
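    /**
     * compress_threads_load_setup: start the decompression threads
     *
     * Allocates per-thread state and spawns one joinable thread per
     * configured decompression thread; no-op when compression is not
     * in use.
     */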
2543static void compress_threads_load_setup(void)
2544{
2545    int i, thread_count;
2546
2547    if (!migrate_use_compression()) {
2548        return;
2549    }
2550    thread_count = migrate_decompress_threads();
2551    decompress_threads = g_new0(QemuThread, thread_count);
2552    decomp_param = g_new0(DecompressParam, thread_count);
2553    qemu_mutex_init(&decomp_done_lock);
2554    qemu_cond_init(&decomp_done_cond);
2555    for (i = 0; i < thread_count; i++) {
2556        qemu_mutex_init(&decomp_param[i].mutex);
2557        qemu_cond_init(&decomp_param[i].cond);
2558        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2559        decomp_param[i].done = true;
2560        decomp_param[i].quit = false;
2561        qemu_thread_create(decompress_threads + i, "decompress",
2562                           do_data_decompress, decomp_param + i,
2563                           QEMU_THREAD_JOINABLE);
2564    }
2565}
2566
2567static void compress_threads_load_cleanup(void)
2568{
2569    int i, thread_count;
2570
2571    if (!migrate_use_compression()) {
2572        return;
2573    }
2574    thread_count = migrate_decompress_threads();
2575    for (i = 0; i < thread_count; i++) {
2576        qemu_mutex_lock(&decomp_param[i].mutex);
2577        decomp_param[i].quit = true;
2578        qemu_cond_signal(&decomp_param[i].cond);
2579        qemu_mutex_unlock(&decomp_param[i].mutex);
2580    }
2581    for (i = 0; i < thread_count; i++) {
2582        qemu_thread_join(decompress_threads + i);
2583        qemu_mutex_destroy(&decomp_param[i].mutex);
2584        qemu_cond_destroy(&decomp_param[i].cond);
2585        g_free(decomp_param[i].compbuf);
2586    }
2587    g_free(decompress_threads);
2588    g_free(decomp_param);
2589    decompress_threads = NULL;
2590    decomp_param = NULL;
2591}
2592
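    /**
     * decompress_data_with_multi_threads: queue a compressed page
     *
     * Hands the compressed data to an idle decompression thread, waiting
     * on decomp_done_cond until one becomes free.
     *
     * @f: QEMUFile to read the compressed data from
     * @host: host address the page will be decompressed into
     * @len: length of the compressed data
     */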
2593static void decompress_data_with_multi_threads(QEMUFile *f,
2594                                               void *host, int len)
2595{
2596    int idx, thread_count;
2597
2598    thread_count = migrate_decompress_threads();
2599    qemu_mutex_lock(&decomp_done_lock);
2600    while (true) {
2601        for (idx = 0; idx < thread_count; idx++) {
2602            if (decomp_param[idx].done) {
2603                decomp_param[idx].done = false;
2604                qemu_mutex_lock(&decomp_param[idx].mutex);
2605                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2606                decomp_param[idx].des = host;
2607                decomp_param[idx].len = len;
2608                qemu_cond_signal(&decomp_param[idx].cond);
2609                qemu_mutex_unlock(&decomp_param[idx].mutex);
2610                break;
2611            }
2612        }
2613        if (idx < thread_count) {
2614            break;
2615        } else {
2616            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2617        }
2618    }
2619    qemu_mutex_unlock(&decomp_done_lock);
2620}
2621
2622/**
2623 * ram_load_setup: Setup RAM for migration incoming side
2624 *
2625 * Returns zero to indicate success and negative for error
2626 *
2627 * @f: QEMUFile where to receive the data
2628 * @opaque: RAMState pointer
2629 */
2630static int ram_load_setup(QEMUFile *f, void *opaque)
2631{
2632    xbzrle_load_setup();
2633    compress_threads_load_setup();
2634    ramblock_recv_map_init();
2635    return 0;
2636}
2637
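    /* Incoming side cleanup: release the XBZRLE and decompression resources
     * and free every RAMBlock's received-page bitmap.
     */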
2638static int ram_load_cleanup(void *opaque)
2639{
2640    RAMBlock *rb;
2641    xbzrle_load_cleanup();
2642    compress_threads_load_cleanup();
2643
2644    RAMBLOCK_FOREACH(rb) {
2645        g_free(rb->receivedmap);
2646        rb->receivedmap = NULL;
2647    }
2648    return 0;
2649}
2650
2651/**
2652 * ram_postcopy_incoming_init: allocate postcopy data structures
2653 *
2654 * Returns 0 for success and negative if there was one error
2655 *
2656 * @mis: current migration incoming state
2657 *
2658 * Allocate data structures etc needed by incoming migration with
2659 * postcopy-ram.  postcopy-ram's similarly named
2660 * postcopy_ram_incoming_init does the work.
2661 */
2662int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2663{
2664    unsigned long ram_pages = last_ram_page();
2665
2666    return postcopy_ram_incoming_init(mis, ram_pages);
2667}
2668
2669/**
2670 * ram_load_postcopy: load a page in postcopy case
2671 *
2672 * Returns 0 for success or -errno in case of error
2673 *
2674 * Called in postcopy mode by ram_load().
2675 * rcu_read_lock is taken prior to this being called.
2676 *
2677 * @f: QEMUFile where to send the data
2678 */
2679static int ram_load_postcopy(QEMUFile *f)
2680{
2681    int flags = 0, ret = 0;
2682    bool place_needed = false;
2683    bool matching_page_sizes = false;
2684    MigrationIncomingState *mis = migration_incoming_get_current();
2685    /* Temporary page that is later 'placed' */
2686    void *postcopy_host_page = postcopy_get_tmp_page(mis);
2687    void *last_host = NULL;
2688    bool all_zero = false;
2689
2690    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2691        ram_addr_t addr;
2692        void *host = NULL;
2693        void *page_buffer = NULL;
2694        void *place_source = NULL;
2695        RAMBlock *block = NULL;
2696        uint8_t ch;
2697
2698        addr = qemu_get_be64(f);
2699        flags = addr & ~TARGET_PAGE_MASK;
2700        addr &= TARGET_PAGE_MASK;
2701
2702        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2703        place_needed = false;
2704        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2705            block = ram_block_from_stream(f, flags);
2706
2707            host = host_from_ram_block_offset(block, addr);
2708            if (!host) {
2709                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2710                ret = -EINVAL;
2711                break;
2712            }
2713            matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2714            /*
2715             * Postcopy requires that we place whole host pages atomically;
2716             * these may be huge pages for RAMBlocks that are backed by
2717             * hugetlbfs.
2718             * To make it atomic, the data is read into a temporary page
2719             * that's moved into place later.
2720             * The migration protocol uses (possibly smaller) target pages;
2721             * however, the source ensures it always sends all the components
2722             * of a host page in order.
2723             */
2724            page_buffer = postcopy_host_page +
2725                          ((uintptr_t)host & (block->page_size - 1));
2726            /* Start of a host page: assume all-zero so the place can be optimised */
2727            if (!((uintptr_t)host & (block->page_size - 1))) {
2728                all_zero = true;
2729            } else {
2730                /* not the 1st TP within the HP */
2731                if (host != (last_host + TARGET_PAGE_SIZE)) {
2732                    error_report("Non-sequential target page %p/%p",
2733                                  host, last_host);
2734                    ret = -EINVAL;
2735                    break;
2736                }
2737            }
2738
2739
2740            /*
2741             * If it's the last part of a host page then we place the host
2742             * page
2743             */
2744            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2745                                     (block->page_size - 1)) == 0;
2746            place_source = postcopy_host_page;
2747        }
2748        last_host = host;
2749
2750        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2751        case RAM_SAVE_FLAG_ZERO:
2752            ch = qemu_get_byte(f);
2753            memset(page_buffer, ch, TARGET_PAGE_SIZE);
2754            if (ch) {
2755                all_zero = false;
2756            }
2757            break;
2758
2759        case RAM_SAVE_FLAG_PAGE:
2760            all_zero = false;
2761            if (!place_needed || !matching_page_sizes) {
2762                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2763            } else {
2764                /* Avoids the qemu_file copy during postcopy, which is
2765                 * going to do a copy later; can only do it when we
2766                 * do this read in one go (matching page sizes)
2767                 */
2768                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2769                                         TARGET_PAGE_SIZE);
2770            }
2771            break;
2772        case RAM_SAVE_FLAG_EOS:
2773            /* normal exit */
2774            break;
2775        default:
2776            error_report("Unknown combination of migration flags: %#x"
2777                         " (postcopy mode)", flags);
2778            ret = -EINVAL;
2779        }
2780
2781        if (place_needed) {
2782            /* This gets called at the last target page in the host page */
2783            void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2784
2785            if (all_zero) {
2786                ret = postcopy_place_page_zero(mis, place_dest,
2787                                               block);
2788            } else {
2789                ret = postcopy_place_page(mis, place_dest,
2790                                          place_source, block);
2791            }
2792        }
2793        if (!ret) {
2794            ret = qemu_file_get_error(f);
2795        }
2796    }
2797
2798    return ret;
2799}
2800
2801static bool postcopy_is_advised(void)
2802{
2803    PostcopyState ps = postcopy_state_get();
2804    return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
2805}
2806
2807static bool postcopy_is_running(void)
2808{
2809    PostcopyState ps = postcopy_state_get();
2810    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
2811}
2812
2813static int ram_load(QEMUFile *f, void *opaque, int version_id)
2814{
2815    int flags = 0, ret = 0, invalid_flags = 0;
2816    static uint64_t seq_iter;
2817    int len = 0;
2818    /*
2819     * If the system is running in postcopy mode, page inserts to host memory
2820     * must be atomic.
2821     */
2822    bool postcopy_running = postcopy_is_running();
2823    /* ADVISE is earlier, it shows the source has the postcopy capability on */
2824    bool postcopy_advised = postcopy_is_advised();
2825
2826    seq_iter++;
2827
2828    if (version_id != 4) {
2829        ret = -EINVAL;
2830    }
2831
2832    if (!migrate_use_compression()) {
2833        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2834    }
2835    /* This RCU critical section can be very long running.
2836     * When RCU reclaims in the code start to become numerous,
2837     * it will be necessary to reduce the granularity of this
2838     * critical section.
2839     */
2840    rcu_read_lock();
2841
2842    if (postcopy_running) {
2843        ret = ram_load_postcopy(f);
2844    }
2845
2846    while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2847        ram_addr_t addr, total_ram_bytes;
2848        void *host = NULL;
2849        uint8_t ch;
2850
2851        addr = qemu_get_be64(f);
2852        flags = addr & ~TARGET_PAGE_MASK;
2853        addr &= TARGET_PAGE_MASK;
2854
2855        if (flags & invalid_flags) {
2856            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2857                error_report("Received an unexpected compressed page");
2858            }
2859
2860            ret = -EINVAL;
2861            break;
2862        }
2863
2864        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2865                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2866            RAMBlock *block = ram_block_from_stream(f, flags);
2867
2868            host = host_from_ram_block_offset(block, addr);
2869            if (!host) {
2870                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2871                ret = -EINVAL;
2872                break;
2873            }
2874            ramblock_recv_bitmap_set(block, host);
2875            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2876        }
2877
2878        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2879        case RAM_SAVE_FLAG_MEM_SIZE:
2880            /* Synchronize RAM block list */
2881            total_ram_bytes = addr;
2882            while (!ret && total_ram_bytes) {
2883                RAMBlock *block;
2884                char id[256];
2885                ram_addr_t length;
2886
2887                len = qemu_get_byte(f);
2888                qemu_get_buffer(f, (uint8_t *)id, len);
2889                id[len] = 0;
2890                length = qemu_get_be64(f);
2891
2892                block = qemu_ram_block_by_name(id);
2893                if (block) {
2894                    if (length != block->used_length) {
2895                        Error *local_err = NULL;
2896
2897                        ret = qemu_ram_resize(block, length,
2898                                              &local_err);
2899                        if (local_err) {
2900                            error_report_err(local_err);
2901                        }
2902                    }
2903                    /* For postcopy we need to check hugepage sizes match */
2904                    if (postcopy_advised &&
2905                        block->page_size != qemu_host_page_size) {
2906                        uint64_t remote_page_size = qemu_get_be64(f);
2907                        if (remote_page_size != block->page_size) {
2908                            error_report("Mismatched RAM page size %s "
2909                                         "(local) %zd != %" PRId64,
2910                                         id, block->page_size,
2911                                         remote_page_size);
2912                            ret = -EINVAL;
2913                        }
2914                    }
2915                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2916                                          block->idstr);
2917                } else {
2918                    error_report("Unknown ramblock \"%s\", cannot "
2919                                 "accept migration", id);
2920                    ret = -EINVAL;
2921                }
2922
2923                total_ram_bytes -= length;
2924            }
2925            break;
2926
2927        case RAM_SAVE_FLAG_ZERO:
2928            ch = qemu_get_byte(f);
2929            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2930            break;
2931
2932        case RAM_SAVE_FLAG_PAGE:
2933            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2934            break;
2935
2936        case RAM_SAVE_FLAG_COMPRESS_PAGE:
2937            len = qemu_get_be32(f);
2938            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2939                error_report("Invalid compressed data length: %d", len);
2940                ret = -EINVAL;
2941                break;
2942            }
2943            decompress_data_with_multi_threads(f, host, len);
2944            break;
2945
2946        case RAM_SAVE_FLAG_XBZRLE:
2947            if (load_xbzrle(f, addr, host) < 0) {
2948                error_report("Failed to decompress XBZRLE page at "
2949                             RAM_ADDR_FMT, addr);
2950                ret = -EINVAL;
2951                break;
2952            }
2953            break;
2954        case RAM_SAVE_FLAG_EOS:
2955            /* normal exit */
2956            break;
2957        default:
2958            if (flags & RAM_SAVE_FLAG_HOOK) {
2959                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2960            } else {
2961                error_report("Unknown combination of migration flags: %#x",
2962                             flags);
2963                ret = -EINVAL;
2964            }
2965        }
2966        if (!ret) {
2967            ret = qemu_file_get_error(f);
2968        }
2969    }
2970
2971    wait_for_decompress_done();
2972    rcu_read_unlock();
2973    trace_ram_load_complete(ret, seq_iter);
2974    return ret;
2975}
2976
2977static bool ram_has_postcopy(void *opaque)
2978{
2979    return migrate_postcopy_ram();
2980}
2981
2982static SaveVMHandlers savevm_ram_handlers = {
2983    .save_setup = ram_save_setup,
2984    .save_live_iterate = ram_save_iterate,
2985    .save_live_complete_postcopy = ram_save_complete,
2986    .save_live_complete_precopy = ram_save_complete,
2987    .has_postcopy = ram_has_postcopy,
2988    .save_live_pending = ram_save_pending,
2989    .load_state = ram_load,
2990    .save_cleanup = ram_save_cleanup,
2991    .load_setup = ram_load_setup,
2992    .load_cleanup = ram_load_cleanup,
2993};
2994
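    /* Initialise the XBZRLE cache lock and register the "ram" savevm
     * section (version 4) with the migration framework.
     */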
2995void ram_mig_init(void)
2996{
2997    qemu_mutex_init(&XBZRLE.lock);
2998    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
2999}
3000