qemu/migration/ram.c
   1/*
   2 * QEMU System Emulator
   3 *
   4 * Copyright (c) 2003-2008 Fabrice Bellard
   5 * Copyright (c) 2011-2015 Red Hat Inc
   6 *
   7 * Authors:
   8 *  Juan Quintela <quintela@redhat.com>
   9 *
  10 * Permission is hereby granted, free of charge, to any person obtaining a copy
  11 * of this software and associated documentation files (the "Software"), to deal
  12 * in the Software without restriction, including without limitation the rights
  13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14 * copies of the Software, and to permit persons to whom the Software is
  15 * furnished to do so, subject to the following conditions:
  16 *
  17 * The above copyright notice and this permission notice shall be included in
  18 * all copies or substantial portions of the Software.
  19 *
  20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26 * THE SOFTWARE.
  27 */
  28
  29#include "qemu/osdep.h"
  30#include "cpu.h"
  31#include <zlib.h>
  32#include "qemu/cutils.h"
  33#include "qemu/bitops.h"
  34#include "qemu/bitmap.h"
  35#include "qemu/main-loop.h"
  36#include "xbzrle.h"
  37#include "ram.h"
  38#include "migration.h"
  39#include "migration/register.h"
  40#include "migration/misc.h"
  41#include "qemu-file.h"
  42#include "postcopy-ram.h"
  43#include "migration/page_cache.h"
  44#include "qemu/error-report.h"
  45#include "qapi/error.h"
  46#include "qapi/qapi-events-migration.h"
  47#include "qapi/qmp/qerror.h"
  48#include "trace.h"
  49#include "exec/ram_addr.h"
  50#include "exec/target_page.h"
  51#include "qemu/rcu_queue.h"
  52#include "migration/colo.h"
  53#include "migration/block.h"
  54
  55/***********************************************************/
  56/* ram save/restore */
  57
   58/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, and it
   59 * worked for pages that were filled with the same char.  We switched
   60 * it to only search for the zero value.  It was renamed to avoid
   61 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
   62 */
  63
  64#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  65#define RAM_SAVE_FLAG_ZERO     0x02
  66#define RAM_SAVE_FLAG_MEM_SIZE 0x04
  67#define RAM_SAVE_FLAG_PAGE     0x08
  68#define RAM_SAVE_FLAG_EOS      0x10
  69#define RAM_SAVE_FLAG_CONTINUE 0x20
  70#define RAM_SAVE_FLAG_XBZRLE   0x40
   71/* 0x80 is reserved in migration.h; start at 0x100 for the next flag */
  72#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  73
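/*
 * Because pages are TARGET_PAGE_SIZE aligned, the low bits of the page
 * offset are free on the wire and carry the flags above.  As a small
 * illustrative sketch (this is essentially what save_zero_page() below
 * emits through save_page_header()):
 *
 *     qemu_put_be64(f, offset | RAM_SAVE_FLAG_ZERO);
 */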
  74static inline bool is_zero_range(uint8_t *p, uint64_t size)
  75{
  76    return buffer_is_zero(p, size);
  77}
  78
  79XBZRLECacheStats xbzrle_counters;
  80
   81/* This struct contains the XBZRLE cache and a static page
   82   used by the compression */
  83static struct {
  84    /* buffer used for XBZRLE encoding */
  85    uint8_t *encoded_buf;
  86    /* buffer for storing page content */
  87    uint8_t *current_buf;
  88    /* Cache for XBZRLE, Protected by lock. */
  89    PageCache *cache;
  90    QemuMutex lock;
  91    /* it will store a page full of zeros */
  92    uint8_t *zero_target_page;
  93    /* buffer used for XBZRLE decoding */
  94    uint8_t *decoded_buf;
  95} XBZRLE;
  96
  97static void XBZRLE_cache_lock(void)
  98{
  99    if (migrate_use_xbzrle())
 100        qemu_mutex_lock(&XBZRLE.lock);
 101}
 102
 103static void XBZRLE_cache_unlock(void)
 104{
 105    if (migrate_use_xbzrle())
 106        qemu_mutex_unlock(&XBZRLE.lock);
 107}
 108
 109/**
 110 * xbzrle_cache_resize: resize the xbzrle cache
 111 *
  112 * This function is called from qmp_migrate_set_cache_size in the main
  113 * thread, possibly while a migration is in progress.  A running
  114 * migration may be using the cache and might finish during this call,
  115 * hence changes to the cache are protected by XBZRLE.lock.
 116 *
 117 * Returns 0 for success or -1 for error
 118 *
 119 * @new_size: new cache size
 120 * @errp: set *errp if the check failed, with reason
 121 */
 122int xbzrle_cache_resize(int64_t new_size, Error **errp)
 123{
 124    PageCache *new_cache;
 125    int64_t ret = 0;
 126
 127    /* Check for truncation */
 128    if (new_size != (size_t)new_size) {
 129        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 130                   "exceeding address space");
 131        return -1;
 132    }
 133
 134    if (new_size == migrate_xbzrle_cache_size()) {
 135        /* nothing to do */
 136        return 0;
 137    }
 138
 139    XBZRLE_cache_lock();
 140
 141    if (XBZRLE.cache != NULL) {
 142        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 143        if (!new_cache) {
 144            ret = -1;
 145            goto out;
 146        }
 147
 148        cache_fini(XBZRLE.cache);
 149        XBZRLE.cache = new_cache;
 150    }
 151out:
 152    XBZRLE_cache_unlock();
 153    return ret;
 154}
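/*
 * For context, a hedged usage sketch (not part of the original file): the
 * path into xbzrle_cache_resize() is the cache-size setting command, which
 * at the time of this code looks roughly like
 *
 *     { "execute": "migrate-set-cache-size",
 *       "arguments": { "value": 536870912 } }
 *
 * in QMP (or "migrate_set_cache_size 512M" in HMP);
 * qmp_migrate_set_cache_size() then calls this function with the new size
 * in bytes.
 */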
 155
 156static void ramblock_recv_map_init(void)
 157{
 158    RAMBlock *rb;
 159
 160    RAMBLOCK_FOREACH(rb) {
 161        assert(!rb->receivedmap);
 162        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 163    }
 164}
 165
 166int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 167{
 168    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 169                    rb->receivedmap);
 170}
 171
 172bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 173{
 174    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 175}
 176
 177void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 178{
 179    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 180}
 181
 182void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 183                                    size_t nr)
 184{
 185    bitmap_set_atomic(rb->receivedmap,
 186                      ramblock_recv_bitmap_offset(host_addr, rb),
 187                      nr);
 188}
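/*
 * A minimal usage sketch of the receivedmap helpers above (not part of the
 * original file), assuming 'rb' is a RAMBlock whose receivedmap was set up
 * by ramblock_recv_map_init() and 'host' is the host address of a page just
 * written on the destination:
 *
 *     ramblock_recv_bitmap_set(rb, host);           // mark one page received
 *     ramblock_recv_bitmap_set_range(rb, host, n);  // or n consecutive pages
 *     if (ramblock_recv_bitmap_test(rb, host)) {
 *         // the page has been received
 *     }
 */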
 189
 190/*
 191 * An outstanding page request, on the source, having been received
 192 * and queued
 193 */
 194struct RAMSrcPageRequest {
 195    RAMBlock *rb;
 196    hwaddr    offset;
 197    hwaddr    len;
 198
 199    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 200};
 201
 202/* State of RAM for migration */
 203struct RAMState {
 204    /* QEMUFile used for this migration */
 205    QEMUFile *f;
 206    /* Last block that we have visited searching for dirty pages */
 207    RAMBlock *last_seen_block;
 208    /* Last block from where we have sent data */
 209    RAMBlock *last_sent_block;
 210    /* Last dirty target page we have sent */
 211    ram_addr_t last_page;
 212    /* last ram version we have seen */
 213    uint32_t last_version;
 214    /* We are in the first round */
 215    bool ram_bulk_stage;
  216    /* How many times we have dirtied too many pages */
 217    int dirty_rate_high_cnt;
 218    /* these variables are used for bitmap sync */
 219    /* last time we did a full bitmap_sync */
 220    int64_t time_last_bitmap_sync;
  221    /* bytes transferred at the start of the current period */
  222    uint64_t bytes_xfer_prev;
  223    /* number of dirty pages accumulated during the current period */
 224    uint64_t num_dirty_pages_period;
 225    /* xbzrle misses since the beginning of the period */
 226    uint64_t xbzrle_cache_miss_prev;
 227    /* number of iterations at the beginning of period */
 228    uint64_t iterations_prev;
 229    /* Iterations since start */
 230    uint64_t iterations;
 231    /* number of dirty bits in the bitmap */
 232    uint64_t migration_dirty_pages;
 233    /* protects modification of the bitmap */
 234    QemuMutex bitmap_mutex;
 235    /* The RAMBlock used in the last src_page_requests */
 236    RAMBlock *last_req_rb;
 237    /* Queue of outstanding page requests from the destination */
 238    QemuMutex src_page_req_mutex;
 239    QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
 240};
 241typedef struct RAMState RAMState;
 242
 243static RAMState *ram_state;
 244
 245uint64_t ram_bytes_remaining(void)
 246{
 247    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 248                       0;
 249}
 250
 251MigrationStats ram_counters;
 252
 253/* used by the search for pages to send */
 254struct PageSearchStatus {
 255    /* Current block being searched */
 256    RAMBlock    *block;
 257    /* Current page to search from */
 258    unsigned long page;
 259    /* Set once we wrap around */
 260    bool         complete_round;
 261};
 262typedef struct PageSearchStatus PageSearchStatus;
 263
 264struct CompressParam {
 265    bool done;
 266    bool quit;
 267    QEMUFile *file;
 268    QemuMutex mutex;
 269    QemuCond cond;
 270    RAMBlock *block;
 271    ram_addr_t offset;
 272};
 273typedef struct CompressParam CompressParam;
 274
 275struct DecompressParam {
 276    bool done;
 277    bool quit;
 278    QemuMutex mutex;
 279    QemuCond cond;
 280    void *des;
 281    uint8_t *compbuf;
 282    int len;
 283};
 284typedef struct DecompressParam DecompressParam;
 285
 286static CompressParam *comp_param;
 287static QemuThread *compress_threads;
 288/* comp_done_cond is used to wake up the migration thread when
 289 * one of the compression threads has finished the compression.
  290 * comp_done_lock is used together with comp_done_cond.
 291 */
 292static QemuMutex comp_done_lock;
 293static QemuCond comp_done_cond;
  294/* The empty QEMUFileOps will be used by the file member of CompressParam */
 295static const QEMUFileOps empty_ops = { };
 296
 297static DecompressParam *decomp_param;
 298static QemuThread *decompress_threads;
 299static QemuMutex decomp_done_lock;
 300static QemuCond decomp_done_cond;
 301
 302static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
 303                                ram_addr_t offset);
 304
 305static void *do_data_compress(void *opaque)
 306{
 307    CompressParam *param = opaque;
 308    RAMBlock *block;
 309    ram_addr_t offset;
 310
 311    qemu_mutex_lock(&param->mutex);
 312    while (!param->quit) {
 313        if (param->block) {
 314            block = param->block;
 315            offset = param->offset;
 316            param->block = NULL;
 317            qemu_mutex_unlock(&param->mutex);
 318
 319            do_compress_ram_page(param->file, block, offset);
 320
 321            qemu_mutex_lock(&comp_done_lock);
 322            param->done = true;
 323            qemu_cond_signal(&comp_done_cond);
 324            qemu_mutex_unlock(&comp_done_lock);
 325
 326            qemu_mutex_lock(&param->mutex);
 327        } else {
 328            qemu_cond_wait(&param->cond, &param->mutex);
 329        }
 330    }
 331    qemu_mutex_unlock(&param->mutex);
 332
 333    return NULL;
 334}
 335
 336static inline void terminate_compression_threads(void)
 337{
 338    int idx, thread_count;
 339
 340    thread_count = migrate_compress_threads();
 341
 342    for (idx = 0; idx < thread_count; idx++) {
 343        qemu_mutex_lock(&comp_param[idx].mutex);
 344        comp_param[idx].quit = true;
 345        qemu_cond_signal(&comp_param[idx].cond);
 346        qemu_mutex_unlock(&comp_param[idx].mutex);
 347    }
 348}
 349
 350static void compress_threads_save_cleanup(void)
 351{
 352    int i, thread_count;
 353
 354    if (!migrate_use_compression()) {
 355        return;
 356    }
 357    terminate_compression_threads();
 358    thread_count = migrate_compress_threads();
 359    for (i = 0; i < thread_count; i++) {
 360        qemu_thread_join(compress_threads + i);
 361        qemu_fclose(comp_param[i].file);
 362        qemu_mutex_destroy(&comp_param[i].mutex);
 363        qemu_cond_destroy(&comp_param[i].cond);
 364    }
 365    qemu_mutex_destroy(&comp_done_lock);
 366    qemu_cond_destroy(&comp_done_cond);
 367    g_free(compress_threads);
 368    g_free(comp_param);
 369    compress_threads = NULL;
 370    comp_param = NULL;
 371}
 372
 373static void compress_threads_save_setup(void)
 374{
 375    int i, thread_count;
 376
 377    if (!migrate_use_compression()) {
 378        return;
 379    }
 380    thread_count = migrate_compress_threads();
 381    compress_threads = g_new0(QemuThread, thread_count);
 382    comp_param = g_new0(CompressParam, thread_count);
 383    qemu_cond_init(&comp_done_cond);
 384    qemu_mutex_init(&comp_done_lock);
 385    for (i = 0; i < thread_count; i++) {
 386        /* comp_param[i].file is just used as a dummy buffer to save data,
 387         * set its ops to empty.
 388         */
 389        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 390        comp_param[i].done = true;
 391        comp_param[i].quit = false;
 392        qemu_mutex_init(&comp_param[i].mutex);
 393        qemu_cond_init(&comp_param[i].cond);
 394        qemu_thread_create(compress_threads + i, "compress",
 395                           do_data_compress, comp_param + i,
 396                           QEMU_THREAD_JOINABLE);
 397    }
 398}
 399
 400/* Multiple fd's */
 401
 402struct MultiFDSendParams {
 403    uint8_t id;
 404    char *name;
 405    QemuThread thread;
 406    QemuSemaphore sem;
 407    QemuMutex mutex;
 408    bool quit;
 409};
 410typedef struct MultiFDSendParams MultiFDSendParams;
 411
 412struct {
 413    MultiFDSendParams *params;
 414    /* number of created threads */
 415    int count;
 416} *multifd_send_state;
 417
 418static void terminate_multifd_send_threads(Error *errp)
 419{
 420    int i;
 421
 422    for (i = 0; i < multifd_send_state->count; i++) {
 423        MultiFDSendParams *p = &multifd_send_state->params[i];
 424
 425        qemu_mutex_lock(&p->mutex);
 426        p->quit = true;
 427        qemu_sem_post(&p->sem);
 428        qemu_mutex_unlock(&p->mutex);
 429    }
 430}
 431
 432int multifd_save_cleanup(Error **errp)
 433{
 434    int i;
 435    int ret = 0;
 436
 437    if (!migrate_use_multifd()) {
 438        return 0;
 439    }
 440    terminate_multifd_send_threads(NULL);
 441    for (i = 0; i < multifd_send_state->count; i++) {
 442        MultiFDSendParams *p = &multifd_send_state->params[i];
 443
 444        qemu_thread_join(&p->thread);
 445        qemu_mutex_destroy(&p->mutex);
 446        qemu_sem_destroy(&p->sem);
 447        g_free(p->name);
 448        p->name = NULL;
 449    }
 450    g_free(multifd_send_state->params);
 451    multifd_send_state->params = NULL;
 452    g_free(multifd_send_state);
 453    multifd_send_state = NULL;
 454    return ret;
 455}
 456
 457static void *multifd_send_thread(void *opaque)
 458{
 459    MultiFDSendParams *p = opaque;
 460
 461    while (true) {
 462        qemu_mutex_lock(&p->mutex);
 463        if (p->quit) {
 464            qemu_mutex_unlock(&p->mutex);
 465            break;
 466        }
 467        qemu_mutex_unlock(&p->mutex);
 468        qemu_sem_wait(&p->sem);
 469    }
 470
 471    return NULL;
 472}
 473
 474int multifd_save_setup(void)
 475{
 476    int thread_count;
 477    uint8_t i;
 478
 479    if (!migrate_use_multifd()) {
 480        return 0;
 481    }
 482    thread_count = migrate_multifd_channels();
 483    multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
 484    multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
 485    multifd_send_state->count = 0;
 486    for (i = 0; i < thread_count; i++) {
 487        MultiFDSendParams *p = &multifd_send_state->params[i];
 488
 489        qemu_mutex_init(&p->mutex);
 490        qemu_sem_init(&p->sem, 0);
 491        p->quit = false;
 492        p->id = i;
 493        p->name = g_strdup_printf("multifdsend_%d", i);
 494        qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
 495                           QEMU_THREAD_JOINABLE);
 496
 497        multifd_send_state->count++;
 498    }
 499    return 0;
 500}
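/*
 * At this stage the multifd send threads are placeholders:
 * multifd_send_thread() only waits on p->sem and exits once p->quit is set.
 * The lifecycle, as a hedged sketch:
 *
 *     multifd_save_setup();        // spawn migrate_multifd_channels() threads
 *     ...                          // migration runs
 *     multifd_save_cleanup(NULL);  // ask the threads to quit, join, free them
 */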
 501
 502struct MultiFDRecvParams {
 503    uint8_t id;
 504    char *name;
 505    QemuThread thread;
 506    QemuSemaphore sem;
 507    QemuMutex mutex;
 508    bool quit;
 509};
 510typedef struct MultiFDRecvParams MultiFDRecvParams;
 511
 512struct {
 513    MultiFDRecvParams *params;
 514    /* number of created threads */
 515    int count;
 516} *multifd_recv_state;
 517
 518static void terminate_multifd_recv_threads(Error *errp)
 519{
 520    int i;
 521
 522    for (i = 0; i < multifd_recv_state->count; i++) {
 523        MultiFDRecvParams *p = &multifd_recv_state->params[i];
 524
 525        qemu_mutex_lock(&p->mutex);
 526        p->quit = true;
 527        qemu_sem_post(&p->sem);
 528        qemu_mutex_unlock(&p->mutex);
 529    }
 530}
 531
 532int multifd_load_cleanup(Error **errp)
 533{
 534    int i;
 535    int ret = 0;
 536
 537    if (!migrate_use_multifd()) {
 538        return 0;
 539    }
 540    terminate_multifd_recv_threads(NULL);
 541    for (i = 0; i < multifd_recv_state->count; i++) {
 542        MultiFDRecvParams *p = &multifd_recv_state->params[i];
 543
 544        qemu_thread_join(&p->thread);
 545        qemu_mutex_destroy(&p->mutex);
 546        qemu_sem_destroy(&p->sem);
 547        g_free(p->name);
 548        p->name = NULL;
 549    }
 550    g_free(multifd_recv_state->params);
 551    multifd_recv_state->params = NULL;
 552    g_free(multifd_recv_state);
 553    multifd_recv_state = NULL;
 554
 555    return ret;
 556}
 557
 558static void *multifd_recv_thread(void *opaque)
 559{
 560    MultiFDRecvParams *p = opaque;
 561
 562    while (true) {
 563        qemu_mutex_lock(&p->mutex);
 564        if (p->quit) {
 565            qemu_mutex_unlock(&p->mutex);
 566            break;
 567        }
 568        qemu_mutex_unlock(&p->mutex);
 569        qemu_sem_wait(&p->sem);
 570    }
 571
 572    return NULL;
 573}
 574
 575int multifd_load_setup(void)
 576{
 577    int thread_count;
 578    uint8_t i;
 579
 580    if (!migrate_use_multifd()) {
 581        return 0;
 582    }
 583    thread_count = migrate_multifd_channels();
 584    multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
 585    multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
 586    multifd_recv_state->count = 0;
 587    for (i = 0; i < thread_count; i++) {
 588        MultiFDRecvParams *p = &multifd_recv_state->params[i];
 589
 590        qemu_mutex_init(&p->mutex);
 591        qemu_sem_init(&p->sem, 0);
 592        p->quit = false;
 593        p->id = i;
 594        p->name = g_strdup_printf("multifdrecv_%d", i);
 595        qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
 596                           QEMU_THREAD_JOINABLE);
 597        multifd_recv_state->count++;
 598    }
 599    return 0;
 600}
 601
 602/**
 603 * save_page_header: write page header to wire
 604 *
  605 * If the block is not the same as the last block sent, it also writes the block identification
 606 *
 607 * Returns the number of bytes written
 608 *
 609 * @f: QEMUFile where to send the data
 610 * @block: block that contains the page we want to send
 611 * @offset: offset inside the block for the page
 612 *          in the lower bits, it contains flags
 613 */
 614static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 615                               ram_addr_t offset)
 616{
 617    size_t size, len;
 618
 619    if (block == rs->last_sent_block) {
 620        offset |= RAM_SAVE_FLAG_CONTINUE;
 621    }
 622    qemu_put_be64(f, offset);
 623    size = 8;
 624
 625    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 626        len = strlen(block->idstr);
 627        qemu_put_byte(f, len);
 628        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 629        size += 1 + len;
 630        rs->last_sent_block = block;
 631    }
 632    return size;
 633}
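/*
 * For reference, a hedged sketch (not part of the original file) of how a
 * receiver undoes this header, mirroring the load side of the migration
 * stream:
 *
 *     uint64_t addr = qemu_get_be64(f);
 *     uint64_t flags = addr & ~TARGET_PAGE_MASK;
 *
 *     addr &= TARGET_PAGE_MASK;
 *     if (!(flags & RAM_SAVE_FLAG_CONTINUE)) {
 *         char idstr[256];
 *         uint8_t len = qemu_get_byte(f);
 *
 *         qemu_get_buffer(f, (uint8_t *)idstr, len);
 *         idstr[len] = 0;
 *         // look the RAMBlock up by idstr and remember it for CONTINUE pages
 *     }
 */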
 634
 635/**
  636 * mig_throttle_guest_down: throttle down the guest
 637 *
 638 * Reduce amount of guest cpu execution to hopefully slow down memory
 639 * writes. If guest dirty memory rate is reduced below the rate at
 640 * which we can transfer pages to the destination then we should be
 641 * able to complete migration. Some workloads dirty memory way too
 642 * fast and will not effectively converge, even with auto-converge.
 643 */
 644static void mig_throttle_guest_down(void)
 645{
 646    MigrationState *s = migrate_get_current();
 647    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
  648    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 649
 650    /* We have not started throttling yet. Let's start it. */
 651    if (!cpu_throttle_active()) {
 652        cpu_throttle_set(pct_initial);
 653    } else {
 654        /* Throttling already on, just increase the rate */
  655        cpu_throttle_set(cpu_throttle_get_percentage() + pct_increment);
 656    }
 657}
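/*
 * Worked example (illustrative only): with cpu_throttle_initial = 20 and
 * cpu_throttle_increment = 10, the first call throttles the vCPUs at 20%,
 * and each further call while throttling is active raises that to 30%, 40%,
 * and so on; cpu_throttle_set() bounds the final percentage itself.
 */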
 658
 659/**
 660 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 661 *
 662 * @rs: current RAM state
 663 * @current_addr: address for the zero page
 664 *
 665 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 666 * The important thing is that a stale (not-yet-0'd) page be replaced
 667 * by the new data.
 668 * As a bonus, if the page wasn't in the cache it gets added so that
 669 * when a small write is made into the 0'd page it gets XBZRLE sent.
 670 */
 671static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 672{
 673    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 674        return;
 675    }
 676
 677    /* We don't care if this fails to allocate a new cache page
 678     * as long as it updated an old one */
 679    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 680                 ram_counters.dirty_sync_count);
 681}
 682
 683#define ENCODING_FLAG_XBZRLE 0x1
 684
 685/**
 686 * save_xbzrle_page: compress and send current page
 687 *
 688 * Returns: 1 means that we wrote the page
 689 *          0 means that page is identical to the one already sent
 690 *          -1 means that xbzrle would be longer than normal
 691 *
 692 * @rs: current RAM state
 693 * @current_data: pointer to the address of the page contents
 694 * @current_addr: addr of the page
 695 * @block: block that contains the page we want to send
 696 * @offset: offset inside the block for the page
 697 * @last_stage: if we are at the completion stage
 698 */
 699static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 700                            ram_addr_t current_addr, RAMBlock *block,
 701                            ram_addr_t offset, bool last_stage)
 702{
 703    int encoded_len = 0, bytes_xbzrle;
 704    uint8_t *prev_cached_page;
 705
 706    if (!cache_is_cached(XBZRLE.cache, current_addr,
 707                         ram_counters.dirty_sync_count)) {
 708        xbzrle_counters.cache_miss++;
 709        if (!last_stage) {
 710            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 711                             ram_counters.dirty_sync_count) == -1) {
 712                return -1;
 713            } else {
 714                /* update *current_data when the page has been
 715                   inserted into cache */
 716                *current_data = get_cached_data(XBZRLE.cache, current_addr);
 717            }
 718        }
 719        return -1;
 720    }
 721
 722    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 723
 724    /* save current buffer into memory */
 725    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 726
 727    /* XBZRLE encoding (if there is no overflow) */
 728    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 729                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 730                                       TARGET_PAGE_SIZE);
 731    if (encoded_len == 0) {
 732        trace_save_xbzrle_page_skipping();
 733        return 0;
 734    } else if (encoded_len == -1) {
 735        trace_save_xbzrle_page_overflow();
 736        xbzrle_counters.overflow++;
 737        /* update data in the cache */
 738        if (!last_stage) {
 739            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
 740            *current_data = prev_cached_page;
 741        }
 742        return -1;
 743    }
 744
 745    /* we need to update the data in the cache, in order to get the same data */
 746    if (!last_stage) {
 747        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 748    }
 749
 750    /* Send XBZRLE based compressed page */
 751    bytes_xbzrle = save_page_header(rs, rs->f, block,
 752                                    offset | RAM_SAVE_FLAG_XBZRLE);
 753    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 754    qemu_put_be16(rs->f, encoded_len);
 755    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 756    bytes_xbzrle += encoded_len + 1 + 2;
 757    xbzrle_counters.pages++;
 758    xbzrle_counters.bytes += bytes_xbzrle;
 759    ram_counters.transferred += bytes_xbzrle;
 760
 761    return 1;
 762}
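/*
 * The XBZRLE record produced above is therefore: the usual page header (with
 * RAM_SAVE_FLAG_XBZRLE set), one ENCODING_FLAG_XBZRLE byte, a be16 length and
 * the encoded bytes.  A hedged sketch of the matching decode on the
 * destination, assuming 'host' points at the target page being loaded:
 *
 *     if (qemu_get_byte(f) != ENCODING_FLAG_XBZRLE) {
 *         // stream error
 *     }
 *     int xh_len = qemu_get_be16(f);
 *
 *     qemu_get_buffer(f, XBZRLE.decoded_buf, xh_len);
 *     xbzrle_decode_buffer(XBZRLE.decoded_buf, xh_len, host, TARGET_PAGE_SIZE);
 */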
 763
 764/**
 765 * migration_bitmap_find_dirty: find the next dirty page from start
 766 *
 767 * Called with rcu_read_lock() to protect migration_bitmap
 768 *
 769 * Returns the byte offset within memory region of the start of a dirty page
 770 *
 771 * @rs: current RAM state
 772 * @rb: RAMBlock where to search for dirty pages
 773 * @start: page where we start the search
 774 */
 775static inline
 776unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 777                                          unsigned long start)
 778{
 779    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 780    unsigned long *bitmap = rb->bmap;
 781    unsigned long next;
 782
 783    if (rs->ram_bulk_stage && start > 0) {
 784        next = start + 1;
 785    } else {
 786        next = find_next_bit(bitmap, size, start);
 787    }
 788
 789    return next;
 790}
 791
 792static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 793                                                RAMBlock *rb,
 794                                                unsigned long page)
 795{
 796    bool ret;
 797
 798    ret = test_and_clear_bit(page, rb->bmap);
 799
 800    if (ret) {
 801        rs->migration_dirty_pages--;
 802    }
 803    return ret;
 804}
 805
 806static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
 807                                        ram_addr_t start, ram_addr_t length)
 808{
 809    rs->migration_dirty_pages +=
 810        cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
 811                                              &rs->num_dirty_pages_period);
 812}
 813
 814/**
 815 * ram_pagesize_summary: calculate all the pagesizes of a VM
 816 *
 817 * Returns a summary bitmap of the page sizes of all RAMBlocks
 818 *
 819 * For VMs with just normal pages this is equivalent to the host page
 820 * size. If it's got some huge pages then it's the OR of all the
 821 * different page sizes.
 822 */
 823uint64_t ram_pagesize_summary(void)
 824{
 825    RAMBlock *block;
 826    uint64_t summary = 0;
 827
 828    RAMBLOCK_FOREACH(block) {
 829        summary |= block->page_size;
 830    }
 831
 832    return summary;
 833}
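/*
 * Worked example: a guest whose RAMBlocks all use 4 KiB pages except for one
 * block backed by 2 MiB huge pages yields 0x1000 | 0x200000 = 0x201000, so a
 * caller can tell at a glance that more than one page size (including a huge
 * page size) is in use.
 */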
 834
 835static void migration_bitmap_sync(RAMState *rs)
 836{
 837    RAMBlock *block;
 838    int64_t end_time;
 839    uint64_t bytes_xfer_now;
 840
 841    ram_counters.dirty_sync_count++;
 842
 843    if (!rs->time_last_bitmap_sync) {
 844        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 845    }
 846
 847    trace_migration_bitmap_sync_start();
 848    memory_global_dirty_log_sync();
 849
 850    qemu_mutex_lock(&rs->bitmap_mutex);
 851    rcu_read_lock();
 852    RAMBLOCK_FOREACH(block) {
 853        migration_bitmap_sync_range(rs, block, 0, block->used_length);
 854    }
 855    rcu_read_unlock();
 856    qemu_mutex_unlock(&rs->bitmap_mutex);
 857
 858    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
 859
 860    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 861
  862    /* more than 1 second = 1000 milliseconds */
 863    if (end_time > rs->time_last_bitmap_sync + 1000) {
 864        /* calculate period counters */
 865        ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 866            / (end_time - rs->time_last_bitmap_sync);
 867        bytes_xfer_now = ram_counters.transferred;
 868
 869        /* During block migration the auto-converge logic incorrectly detects
 870         * that ram migration makes no progress. Avoid this by disabling the
 871         * throttling logic during the bulk phase of block migration. */
 872        if (migrate_auto_converge() && !blk_mig_bulk_active()) {
  873            /* The following detection logic can be refined later. For now:
  874               check whether the bytes dirtied in this period exceed 50% of
  875               the bytes transferred since the last time we were in this
  876               routine. If that happens twice, start or increase
  877               throttling */
 878
 879            if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
 880                   (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
 881                (++rs->dirty_rate_high_cnt >= 2)) {
 882                    trace_migration_throttle();
 883                    rs->dirty_rate_high_cnt = 0;
 884                    mig_throttle_guest_down();
 885            }
 886        }
 887
 888        if (migrate_use_xbzrle()) {
 889            if (rs->iterations_prev != rs->iterations) {
 890                xbzrle_counters.cache_miss_rate =
 891                   (double)(xbzrle_counters.cache_miss -
 892                            rs->xbzrle_cache_miss_prev) /
 893                   (rs->iterations - rs->iterations_prev);
 894            }
 895            rs->iterations_prev = rs->iterations;
 896            rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 897        }
 898
 899        /* reset period counters */
 900        rs->time_last_bitmap_sync = end_time;
 901        rs->num_dirty_pages_period = 0;
 902        rs->bytes_xfer_prev = bytes_xfer_now;
 903    }
 904    if (migrate_use_events()) {
 905        qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
 906    }
 907}
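/*
 * Worked example of the throttle trigger above (illustrative only): if about
 * 256 MB were transferred since the previous sync and more than 128 MB were
 * dirtied in the same interval, dirty_rate_high_cnt is bumped; the second
 * such period calls mig_throttle_guest_down() and resets the counter.
 */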
 908
 909/**
 910 * save_zero_page: send the zero page to the stream
 911 *
 912 * Returns the number of pages written.
 913 *
 914 * @rs: current RAM state
 915 * @block: block that contains the page we want to send
 916 * @offset: offset inside the block for the page
 917 */
 918static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
 919{
 920    uint8_t *p = block->host + offset;
 921    int pages = -1;
 922
 923    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 924        ram_counters.duplicate++;
 925        ram_counters.transferred +=
 926            save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
 927        qemu_put_byte(rs->f, 0);
 928        ram_counters.transferred += 1;
 929        pages = 1;
 930    }
 931
 932    return pages;
 933}
 934
 935static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
 936{
 937    if (!migrate_release_ram() || !migration_in_postcopy()) {
 938        return;
 939    }
 940
 941    ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
 942}
 943
 944/**
 945 * ram_save_page: send the given page to the stream
 946 *
 947 * Returns the number of pages written.
 948 *          < 0 - error
 949 *          >=0 - Number of pages written - this might legally be 0
 950 *                if xbzrle noticed the page was the same.
 951 *
 952 * @rs: current RAM state
  953 * @pss: data about the page we want to send
 955 * @last_stage: if we are at the completion stage
 956 */
 957static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
 958{
 959    int pages = -1;
 960    uint64_t bytes_xmit;
 961    ram_addr_t current_addr;
 962    uint8_t *p;
 963    int ret;
 964    bool send_async = true;
 965    RAMBlock *block = pss->block;
 966    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
 967
 968    p = block->host + offset;
 969    trace_ram_save_page(block->idstr, (uint64_t)offset, p);
 970
  971    /* When in doubt, send the page as normal */
 972    bytes_xmit = 0;
 973    ret = ram_control_save_page(rs->f, block->offset,
 974                           offset, TARGET_PAGE_SIZE, &bytes_xmit);
 975    if (bytes_xmit) {
 976        ram_counters.transferred += bytes_xmit;
 977        pages = 1;
 978    }
 979
 980    XBZRLE_cache_lock();
 981
 982    current_addr = block->offset + offset;
 983
 984    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 985        if (ret != RAM_SAVE_CONTROL_DELAYED) {
 986            if (bytes_xmit > 0) {
 987                ram_counters.normal++;
 988            } else if (bytes_xmit == 0) {
 989                ram_counters.duplicate++;
 990            }
 991        }
 992    } else {
 993        pages = save_zero_page(rs, block, offset);
 994        if (pages > 0) {
 995            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
 996             * page would be stale
 997             */
 998            xbzrle_cache_zero_page(rs, current_addr);
 999            ram_release_pages(block->idstr, offset, pages);
1000        } else if (!rs->ram_bulk_stage &&
1001                   !migration_in_postcopy() && migrate_use_xbzrle()) {
1002            pages = save_xbzrle_page(rs, &p, current_addr, block,
1003                                     offset, last_stage);
1004            if (!last_stage) {
1005                /* Can't send this cached data async, since the cache page
1006                 * might get updated before it gets to the wire
1007                 */
1008                send_async = false;
1009            }
1010        }
1011    }
1012
1013    /* XBZRLE overflow or normal page */
1014    if (pages == -1) {
1015        ram_counters.transferred +=
1016            save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
1017        if (send_async) {
1018            qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
 1019                                  migrate_release_ram() &&
 1020                                  migration_in_postcopy());
1021        } else {
1022            qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
1023        }
1024        ram_counters.transferred += TARGET_PAGE_SIZE;
1025        pages = 1;
1026        ram_counters.normal++;
1027    }
1028
1029    XBZRLE_cache_unlock();
1030
1031    return pages;
1032}
1033
1034static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
1035                                ram_addr_t offset)
1036{
1037    RAMState *rs = ram_state;
1038    int bytes_sent, blen;
1039    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1040
1041    bytes_sent = save_page_header(rs, f, block, offset |
1042                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
1043    blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1044                                     migrate_compress_level());
1045    if (blen < 0) {
1046        bytes_sent = 0;
1047        qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1048        error_report("compressed data failed!");
1049    } else {
1050        bytes_sent += blen;
1051        ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1052    }
1053
1054    return bytes_sent;
1055}
1056
1057static void flush_compressed_data(RAMState *rs)
1058{
1059    int idx, len, thread_count;
1060
1061    if (!migrate_use_compression()) {
1062        return;
1063    }
1064    thread_count = migrate_compress_threads();
1065
1066    qemu_mutex_lock(&comp_done_lock);
1067    for (idx = 0; idx < thread_count; idx++) {
1068        while (!comp_param[idx].done) {
1069            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1070        }
1071    }
1072    qemu_mutex_unlock(&comp_done_lock);
1073
1074    for (idx = 0; idx < thread_count; idx++) {
1075        qemu_mutex_lock(&comp_param[idx].mutex);
1076        if (!comp_param[idx].quit) {
1077            len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1078            ram_counters.transferred += len;
1079        }
1080        qemu_mutex_unlock(&comp_param[idx].mutex);
1081    }
1082}
1083
1084static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1085                                       ram_addr_t offset)
1086{
1087    param->block = block;
1088    param->offset = offset;
1089}
1090
1091static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1092                                           ram_addr_t offset)
1093{
1094    int idx, thread_count, bytes_xmit = -1, pages = -1;
1095
1096    thread_count = migrate_compress_threads();
1097    qemu_mutex_lock(&comp_done_lock);
1098    while (true) {
1099        for (idx = 0; idx < thread_count; idx++) {
1100            if (comp_param[idx].done) {
1101                comp_param[idx].done = false;
1102                bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1103                qemu_mutex_lock(&comp_param[idx].mutex);
1104                set_compress_params(&comp_param[idx], block, offset);
1105                qemu_cond_signal(&comp_param[idx].cond);
1106                qemu_mutex_unlock(&comp_param[idx].mutex);
1107                pages = 1;
1108                ram_counters.normal++;
1109                ram_counters.transferred += bytes_xmit;
1110                break;
1111            }
1112        }
1113        if (pages > 0) {
1114            break;
1115        } else {
1116            qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1117        }
1118    }
1119    qemu_mutex_unlock(&comp_done_lock);
1120
1121    return pages;
1122}
1123
1124/**
1125 * ram_save_compressed_page: compress the given page and send it to the stream
1126 *
1127 * Returns the number of pages written.
1128 *
1129 * @rs: current RAM state
 1130 * @pss: data about the page we want to send
1132 * @last_stage: if we are at the completion stage
1133 */
1134static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
1135                                    bool last_stage)
1136{
1137    int pages = -1;
1138    uint64_t bytes_xmit = 0;
1139    uint8_t *p;
1140    int ret, blen;
1141    RAMBlock *block = pss->block;
1142    ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1143
1144    p = block->host + offset;
1145
1146    ret = ram_control_save_page(rs->f, block->offset,
1147                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
1148    if (bytes_xmit) {
1149        ram_counters.transferred += bytes_xmit;
1150        pages = 1;
1151    }
1152    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
1153        if (ret != RAM_SAVE_CONTROL_DELAYED) {
1154            if (bytes_xmit > 0) {
1155                ram_counters.normal++;
1156            } else if (bytes_xmit == 0) {
1157                ram_counters.duplicate++;
1158            }
1159        }
1160    } else {
 1161        /* When starting on a new block, the first page of the block must
 1162         * be sent out before any other page in the same block, and all
 1163         * pages of the previous block must already have been sent out.
 1164         * Keeping this order matters because the 'cont' flag is used to
 1165         * avoid resending the block name.
 1166         */
1167        if (block != rs->last_sent_block) {
1168            flush_compressed_data(rs);
1169            pages = save_zero_page(rs, block, offset);
1170            if (pages == -1) {
1171                /* Make sure the first page is sent out before other pages */
1172                bytes_xmit = save_page_header(rs, rs->f, block, offset |
1173                                              RAM_SAVE_FLAG_COMPRESS_PAGE);
1174                blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1175                                                 migrate_compress_level());
1176                if (blen > 0) {
1177                    ram_counters.transferred += bytes_xmit + blen;
1178                    ram_counters.normal++;
1179                    pages = 1;
1180                } else {
1181                    qemu_file_set_error(rs->f, blen);
1182                    error_report("compressed data failed!");
1183                }
1184            }
1185            if (pages > 0) {
1186                ram_release_pages(block->idstr, offset, pages);
1187            }
1188        } else {
1189            pages = save_zero_page(rs, block, offset);
1190            if (pages == -1) {
1191                pages = compress_page_with_multi_thread(rs, block, offset);
1192            } else {
1193                ram_release_pages(block->idstr, offset, pages);
1194            }
1195        }
1196    }
1197
1198    return pages;
1199}
1200
1201/**
1202 * find_dirty_block: find the next dirty page and update any state
1203 * associated with the search process.
1204 *
 1205 * Returns whether a page was found
1206 *
1207 * @rs: current RAM state
1208 * @pss: data about the state of the current dirty page scan
1209 * @again: set to false if the search has scanned the whole of RAM
1210 */
1211static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1212{
1213    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1214    if (pss->complete_round && pss->block == rs->last_seen_block &&
1215        pss->page >= rs->last_page) {
1216        /*
1217         * We've been once around the RAM and haven't found anything.
1218         * Give up.
1219         */
1220        *again = false;
1221        return false;
1222    }
1223    if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1224        /* Didn't find anything in this RAM Block */
1225        pss->page = 0;
1226        pss->block = QLIST_NEXT_RCU(pss->block, next);
1227        if (!pss->block) {
1228            /* Hit the end of the list */
1229            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1230            /* Flag that we've looped */
1231            pss->complete_round = true;
1232            rs->ram_bulk_stage = false;
1233            if (migrate_use_xbzrle()) {
1234                /* If xbzrle is on, stop using the data compression at this
1235                 * point. In theory, xbzrle can do better than compression.
1236                 */
1237                flush_compressed_data(rs);
1238            }
1239        }
1240        /* Didn't find anything this time, but try again on the new block */
1241        *again = true;
1242        return false;
1243    } else {
1244        /* Can go around again, but... */
1245        *again = true;
1246        /* We've found something so probably don't need to */
1247        return true;
1248    }
1249}
1250
1251/**
 1252 * unqueue_page: gets a page off the queue
1253 *
1254 * Helper for 'get_queued_page' - gets a page off the queue
1255 *
1256 * Returns the block of the page (or NULL if none available)
1257 *
1258 * @rs: current RAM state
1259 * @offset: used to return the offset within the RAMBlock
1260 */
1261static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1262{
1263    RAMBlock *block = NULL;
1264
1265    qemu_mutex_lock(&rs->src_page_req_mutex);
1266    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1267        struct RAMSrcPageRequest *entry =
1268                                QSIMPLEQ_FIRST(&rs->src_page_requests);
1269        block = entry->rb;
1270        *offset = entry->offset;
1271
1272        if (entry->len > TARGET_PAGE_SIZE) {
1273            entry->len -= TARGET_PAGE_SIZE;
1274            entry->offset += TARGET_PAGE_SIZE;
1275        } else {
1276            memory_region_unref(block->mr);
1277            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1278            g_free(entry);
1279        }
1280    }
1281    qemu_mutex_unlock(&rs->src_page_req_mutex);
1282
1283    return block;
1284}
1285
1286/**
 1287 * get_queued_page: unqueue a page from the postcopy requests
1288 *
1289 * Skips pages that are already sent (!dirty)
1290 *
 1291 * Returns whether a queued page was found
1292 *
1293 * @rs: current RAM state
1294 * @pss: data about the state of the current dirty page scan
1295 */
1296static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1297{
1298    RAMBlock  *block;
1299    ram_addr_t offset;
1300    bool dirty;
1301
1302    do {
1303        block = unqueue_page(rs, &offset);
1304        /*
1305         * We're sending this page, and since it's postcopy nothing else
1306         * will dirty it, and we must make sure it doesn't get sent again
1307         * even if this queue request was received after the background
1308         * search already sent it.
1309         */
1310        if (block) {
1311            unsigned long page;
1312
1313            page = offset >> TARGET_PAGE_BITS;
1314            dirty = test_bit(page, block->bmap);
1315            if (!dirty) {
1316                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1317                       page, test_bit(page, block->unsentmap));
1318            } else {
1319                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1320            }
1321        }
1322
1323    } while (block && !dirty);
1324
1325    if (block) {
1326        /*
 1327         * As soon as we start servicing pages out of order, we have to
 1328         * kill the bulk stage, since the bulk stage assumes (in
 1329         * migration_bitmap_find_dirty) that every page is dirty, and
 1330         * that is no longer true.
1331         */
1332        rs->ram_bulk_stage = false;
1333
1334        /*
1335         * We want the background search to continue from the queued page
1336         * since the guest is likely to want other pages near to the page
1337         * it just requested.
1338         */
1339        pss->block = block;
1340        pss->page = offset >> TARGET_PAGE_BITS;
1341    }
1342
1343    return !!block;
1344}
1345
1346/**
1347 * migration_page_queue_free: drop any remaining pages in the ram
1348 * request queue
1349 *
1350 * It should be empty at the end anyway, but in error cases there may
 1351 * be some left.  If any pages are left, we drop them.
1352 *
1353 */
1354static void migration_page_queue_free(RAMState *rs)
1355{
1356    struct RAMSrcPageRequest *mspr, *next_mspr;
1357    /* This queue generally should be empty - but in the case of a failed
1358     * migration might have some droppings in.
1359     */
1360    rcu_read_lock();
1361    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1362        memory_region_unref(mspr->rb->mr);
1363        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1364        g_free(mspr);
1365    }
1366    rcu_read_unlock();
1367}
1368
1369/**
1370 * ram_save_queue_pages: queue the page for transmission
1371 *
1372 * A request from postcopy destination for example.
1373 *
1374 * Returns zero on success or negative on error
1375 *
 1376 * @rbname: Name of the RAMBlock of the request. NULL means the
 1377 *          same as the last one.
1378 * @start: starting address from the start of the RAMBlock
1379 * @len: length (in bytes) to send
1380 */
1381int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1382{
1383    RAMBlock *ramblock;
1384    RAMState *rs = ram_state;
1385
1386    ram_counters.postcopy_requests++;
1387    rcu_read_lock();
1388    if (!rbname) {
1389        /* Reuse last RAMBlock */
1390        ramblock = rs->last_req_rb;
1391
1392        if (!ramblock) {
1393            /*
1394             * Shouldn't happen, we can't reuse the last RAMBlock if
1395             * it's the 1st request.
1396             */
1397            error_report("ram_save_queue_pages no previous block");
1398            goto err;
1399        }
1400    } else {
1401        ramblock = qemu_ram_block_by_name(rbname);
1402
1403        if (!ramblock) {
1404            /* We shouldn't be asked for a non-existent RAMBlock */
1405            error_report("ram_save_queue_pages no block '%s'", rbname);
1406            goto err;
1407        }
1408        rs->last_req_rb = ramblock;
1409    }
1410    trace_ram_save_queue_pages(ramblock->idstr, start, len);
1411    if (start+len > ramblock->used_length) {
1412        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1413                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1414                     __func__, start, len, ramblock->used_length);
1415        goto err;
1416    }
1417
1418    struct RAMSrcPageRequest *new_entry =
1419        g_malloc0(sizeof(struct RAMSrcPageRequest));
1420    new_entry->rb = ramblock;
1421    new_entry->offset = start;
1422    new_entry->len = len;
1423
1424    memory_region_ref(ramblock->mr);
1425    qemu_mutex_lock(&rs->src_page_req_mutex);
1426    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1427    qemu_mutex_unlock(&rs->src_page_req_mutex);
1428    rcu_read_unlock();
1429
1430    return 0;
1431
1432err:
1433    rcu_read_unlock();
1434    return -1;
1435}
1436
1437/**
1438 * ram_save_target_page: save one target page
1439 *
1440 * Returns the number of pages written
1441 *
1442 * @rs: current RAM state
1444 * @pss: data about the page we want to send
1445 * @last_stage: if we are at the completion stage
1446 */
1447static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1448                                bool last_stage)
1449{
1450    int res = 0;
1451
 1452    /* Check whether the page is dirty and, if it is, send it */
1453    if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1454        /*
1455         * If xbzrle is on, stop using the data compression after first
1456         * round of migration even if compression is enabled. In theory,
1457         * xbzrle can do better than compression.
1458         */
1459        if (migrate_use_compression() &&
1460            (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1461            res = ram_save_compressed_page(rs, pss, last_stage);
1462        } else {
1463            res = ram_save_page(rs, pss, last_stage);
1464        }
1465
1466        if (res < 0) {
1467            return res;
1468        }
1469        if (pss->block->unsentmap) {
1470            clear_bit(pss->page, pss->block->unsentmap);
1471        }
1472    }
1473
1474    return res;
1475}
1476
1477/**
1478 * ram_save_host_page: save a whole host page
1479 *
 1480 * Starting at the page indicated by pss, send pages up to the end of the
 1481 * current host page. It's valid for the starting page to point into the
 1482 * middle of a host page, in which case the remainder of the host page is sent.
1483 * Only dirty target pages are sent. Note that the host page size may
1484 * be a huge page for this block.
1485 * The saving stops at the boundary of the used_length of the block
1486 * if the RAMBlock isn't a multiple of the host page size.
1487 *
1488 * Returns the number of pages written or negative on error
1489 *
1490 * @rs: current RAM state
1492 * @pss: data about the page we want to send
1493 * @last_stage: if we are at the completion stage
1494 */
1495static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1496                              bool last_stage)
1497{
1498    int tmppages, pages = 0;
1499    size_t pagesize_bits =
1500        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1501
1502    do {
1503        tmppages = ram_save_target_page(rs, pss, last_stage);
1504        if (tmppages < 0) {
1505            return tmppages;
1506        }
1507
1508        pages += tmppages;
1509        pss->page++;
1510    } while ((pss->page & (pagesize_bits - 1)) &&
1511             offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1512
1513    /* The offset we leave with is the last one we looked at */
1514    pss->page--;
1515    return pages;
1516}
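/*
 * Worked example (illustrative only): for a RAMBlock backed by 2 MiB huge
 * pages with 4 KiB target pages, pagesize_bits is 512.  If pss->page enters
 * pointing at target page 130 within a huge page, the loop above walks
 * pages 130..511 of that huge page (sending only the ones still dirty) and
 * leaves pss->page at 511, the last page it looked at.
 */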
1517
1518/**
1519 * ram_find_and_save_block: finds a dirty page and sends it to f
1520 *
1521 * Called within an RCU critical section.
1522 *
1523 * Returns the number of pages written where zero means no dirty pages
1524 *
1525 * @rs: current RAM state
1526 * @last_stage: if we are at the completion stage
1527 *
1528 * On systems where host-page-size > target-page-size it will send all the
1529 * pages in a host page that are dirty.
1530 */
1531
1532static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1533{
1534    PageSearchStatus pss;
1535    int pages = 0;
1536    bool again, found;
1537
1538    /* No dirty page as there is zero RAM */
1539    if (!ram_bytes_total()) {
1540        return pages;
1541    }
1542
1543    pss.block = rs->last_seen_block;
1544    pss.page = rs->last_page;
1545    pss.complete_round = false;
1546
1547    if (!pss.block) {
1548        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1549    }
1550
1551    do {
1552        again = true;
1553        found = get_queued_page(rs, &pss);
1554
1555        if (!found) {
1556            /* priority queue empty, so just search for something dirty */
1557            found = find_dirty_block(rs, &pss, &again);
1558        }
1559
1560        if (found) {
1561            pages = ram_save_host_page(rs, &pss, last_stage);
1562        }
1563    } while (!pages && again);
1564
1565    rs->last_seen_block = pss.block;
1566    rs->last_page = pss.page;
1567
1568    return pages;
1569}
1570
1571void acct_update_position(QEMUFile *f, size_t size, bool zero)
1572{
1573    uint64_t pages = size / TARGET_PAGE_SIZE;
1574
1575    if (zero) {
1576        ram_counters.duplicate += pages;
1577    } else {
1578        ram_counters.normal += pages;
1579        ram_counters.transferred += size;
1580        qemu_update_position(f, size);
1581    }
1582}
1583
1584uint64_t ram_bytes_total(void)
1585{
1586    RAMBlock *block;
1587    uint64_t total = 0;
1588
1589    rcu_read_lock();
1590    RAMBLOCK_FOREACH(block) {
1591        total += block->used_length;
1592    }
1593    rcu_read_unlock();
1594    return total;
1595}
1596
1597static void xbzrle_load_setup(void)
1598{
1599    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1600}
1601
1602static void xbzrle_load_cleanup(void)
1603{
1604    g_free(XBZRLE.decoded_buf);
1605    XBZRLE.decoded_buf = NULL;
1606}
1607
1608static void ram_state_cleanup(RAMState **rsp)
1609{
1610    if (*rsp) {
1611        migration_page_queue_free(*rsp);
1612        qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1613        qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1614        g_free(*rsp);
1615        *rsp = NULL;
1616    }
1617}
1618
1619static void xbzrle_cleanup(void)
1620{
1621    XBZRLE_cache_lock();
1622    if (XBZRLE.cache) {
1623        cache_fini(XBZRLE.cache);
1624        g_free(XBZRLE.encoded_buf);
1625        g_free(XBZRLE.current_buf);
1626        g_free(XBZRLE.zero_target_page);
1627        XBZRLE.cache = NULL;
1628        XBZRLE.encoded_buf = NULL;
1629        XBZRLE.current_buf = NULL;
1630        XBZRLE.zero_target_page = NULL;
1631    }
1632    XBZRLE_cache_unlock();
1633}
1634
1635static void ram_save_cleanup(void *opaque)
1636{
1637    RAMState **rsp = opaque;
1638    RAMBlock *block;
1639
 1640    /* The caller must hold the iothread lock or be in a bottom half, so
 1641     * there is no race writing to this migration bitmap
1642     */
1643    memory_global_dirty_log_stop();
1644
1645    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1646        g_free(block->bmap);
1647        block->bmap = NULL;
1648        g_free(block->unsentmap);
1649        block->unsentmap = NULL;
1650    }
1651
1652    xbzrle_cleanup();
1653    compress_threads_save_cleanup();
1654    ram_state_cleanup(rsp);
1655}
1656
1657static void ram_state_reset(RAMState *rs)
1658{
1659    rs->last_seen_block = NULL;
1660    rs->last_sent_block = NULL;
1661    rs->last_page = 0;
1662    rs->last_version = ram_list.version;
1663    rs->ram_bulk_stage = true;
1664}
1665
1666#define MAX_WAIT 50 /* ms, half buffered_file limit */
1667
1668/*
1669 * 'expected' is the value you expect the bitmap mostly to be full
1670 * of; it won't bother printing lines that are all this value.
1672 */
1673void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1674                           unsigned long pages)
1675{
1676    int64_t cur;
1677    int64_t linelen = 128;
1678    char linebuf[129];
1679
1680    for (cur = 0; cur < pages; cur += linelen) {
1681        int64_t curb;
1682        bool found = false;
1683        /*
1684         * Last line; catch the case where the line length
1685         * is longer than remaining ram
1686         */
1687        if (cur + linelen > pages) {
1688            linelen = pages - cur;
1689        }
1690        for (curb = 0; curb < linelen; curb++) {
1691            bool thisbit = test_bit(cur + curb, todump);
1692            linebuf[curb] = thisbit ? '1' : '.';
1693            found = found || (thisbit != expected);
1694        }
1695        if (found) {
1696            linebuf[curb] = '\0';
1697            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1698        }
1699    }
1700}
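
/*
 * Illustrative sketch (not built): the same dumping idea in standalone C.
 * test_bit_ul() is a hypothetical helper standing in for QEMU's test_bit();
 * everything else is plain libc.
 */
#if 0
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool test_bit_ul(const unsigned long *map, uint64_t nr)
{
    return (map[nr / (8 * sizeof(unsigned long))] >>
            (nr % (8 * sizeof(unsigned long)))) & 1;
}

static void dump_bitmap(const unsigned long *map, bool expected,
                        uint64_t pages)
{
    char line[129];

    for (uint64_t cur = 0; cur < pages; cur += 128) {
        uint64_t len = (pages - cur < 128) ? pages - cur : 128;
        bool interesting = false;

        for (uint64_t i = 0; i < len; i++) {
            bool bit = test_bit_ul(map, cur + i);
            line[i] = bit ? '1' : '.';
            interesting |= (bit != expected);
        }
        /* only lines that differ from 'expected' somewhere get printed */
        if (interesting) {
            line[len] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, line);
        }
    }
}
#endif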
1701
1702/* **** functions for postcopy ***** */
1703
1704void ram_postcopy_migrated_memory_release(MigrationState *ms)
1705{
1706    struct RAMBlock *block;
1707
1708    RAMBLOCK_FOREACH(block) {
1709        unsigned long *bitmap = block->bmap;
1710        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1711        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1712
1713        while (run_start < range) {
1714            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1715            ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1716                              (run_end - run_start) << TARGET_PAGE_BITS);
1717            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1718        }
1719    }
1720}
1721
1722/**
1723 * postcopy_send_discard_bm_ram: discard a RAMBlock
1724 *
1725 * Returns zero on success
1726 *
1727 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1728 * Note: At this point the 'unsentmap' is the processed bitmap combined
1729 *       with the dirtymap; so a '1' means it's either dirty or unsent.
1730 *
1731 * @ms: current migration state
1732 * @pds: state for postcopy
1733 * @block: RAMBlock to discard
1735 */
1736static int postcopy_send_discard_bm_ram(MigrationState *ms,
1737                                        PostcopyDiscardState *pds,
1738                                        RAMBlock *block)
1739{
1740    unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1741    unsigned long current;
1742    unsigned long *unsentmap = block->unsentmap;
1743
1744    for (current = 0; current < end; ) {
1745        unsigned long one = find_next_bit(unsentmap, end, current);
1746
1747        if (one <= end) {
1748            unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1749            unsigned long discard_length;
1750
1751            if (zero >= end) {
1752                discard_length = end - one;
1753            } else {
1754                discard_length = zero - one;
1755            }
1756            if (discard_length) {
1757                postcopy_discard_send_range(ms, pds, one, discard_length);
1758            }
1759            current = one + discard_length;
1760        } else {
1761            current = one;
1762        }
1763    }
1764
1765    return 0;
1766}
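
/*
 * Illustrative sketch (not built): the run extraction above reduced to plain
 * C - walk a bit array and report each run of set bits as (start, length).
 * get_bit() and dump_runs() are hypothetical names; QEMU uses find_next_bit()
 * and find_next_zero_bit() for the same walk.
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

static bool get_bit(const unsigned char *map, unsigned long nr)
{
    return (map[nr / 8] >> (nr % 8)) & 1;
}

static void dump_runs(const unsigned char *map, unsigned long nbits)
{
    unsigned long cur = 0;

    while (cur < nbits) {
        unsigned long one, zero;

        /* find the next set bit, i.e. the start of a run */
        for (one = cur; one < nbits && !get_bit(map, one); one++) {
            /* skip clear bits */
        }
        if (one == nbits) {
            break;
        }
        /* find where the run of set bits ends */
        for (zero = one + 1; zero < nbits && get_bit(map, zero); zero++) {
            /* extend the run of set bits */
        }
        printf("discard start=%lu length=%lu\n", one, zero - one);
        cur = zero;
    }
}
#endif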
1767
1768/**
1769 * postcopy_each_ram_send_discard: discard all RAMBlocks
1770 *
1771 * Returns 0 for success or negative for error
1772 *
1773 * Utility for the outgoing postcopy code.
1774 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1775 *   passing it bitmap indexes and name.
1776 * (qemu_ram_foreach_block ends up passing unscaled lengths, which
1777 *  would mean the postcopy code would have to deal with target page sizes)
1778 *
1779 * @ms: current migration state
1780 */
1781static int postcopy_each_ram_send_discard(MigrationState *ms)
1782{
1783    struct RAMBlock *block;
1784    int ret;
1785
1786    RAMBLOCK_FOREACH(block) {
1787        PostcopyDiscardState *pds =
1788            postcopy_discard_send_init(ms, block->idstr);
1789
1790        /*
1791         * Postcopy sends chunks of bitmap over the wire, but it
1792         * just needs indexes at this point, which avoids it having
1793         * target-page-specific code.
1794         */
1795        ret = postcopy_send_discard_bm_ram(ms, pds, block);
1796        postcopy_discard_send_finish(ms, pds);
1797        if (ret) {
1798            return ret;
1799        }
1800    }
1801
1802    return 0;
1803}
1804
1805/**
1806 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1807 *
1808 * Helper for postcopy_chunk_hostpages; it's called twice to
1809 * canonicalize the two bitmaps, which are similar but one is
1810 * inverted.
1811 *
1812 * Postcopy requires that all target pages in a hostpage are dirty or
1813 * clean, not a mix.  This function canonicalizes the bitmaps.
1814 *
1815 * @ms: current migration state
1816 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1817 *               otherwise we need to canonicalize partially dirty host pages
1818 * @block: block that contains the page we want to canonicalize
1819 * @pds: state for postcopy
1820 */
1821static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1822                                          RAMBlock *block,
1823                                          PostcopyDiscardState *pds)
1824{
1825    RAMState *rs = ram_state;
1826    unsigned long *bitmap = block->bmap;
1827    unsigned long *unsentmap = block->unsentmap;
1828    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1829    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1830    unsigned long run_start;
1831
1832    if (block->page_size == TARGET_PAGE_SIZE) {
1833        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1834        return;
1835    }
1836
1837    if (unsent_pass) {
1838        /* Find a sent page */
1839        run_start = find_next_zero_bit(unsentmap, pages, 0);
1840    } else {
1841        /* Find a dirty page */
1842        run_start = find_next_bit(bitmap, pages, 0);
1843    }
1844
1845    while (run_start < pages) {
1846        bool do_fixup = false;
1847        unsigned long fixup_start_addr;
1848        unsigned long host_offset;
1849
1850        /*
1851         * If the start of this run of pages is in the middle of a host
1852         * page, then we need to fixup this host page.
1853         */
1854        host_offset = run_start % host_ratio;
1855        if (host_offset) {
1856            do_fixup = true;
1857            run_start -= host_offset;
1858            fixup_start_addr = run_start;
1859            /* For the next pass */
1860            run_start = run_start + host_ratio;
1861        } else {
1862            /* Find the end of this run */
1863            unsigned long run_end;
1864            if (unsent_pass) {
1865                run_end = find_next_bit(unsentmap, pages, run_start + 1);
1866            } else {
1867                run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1868            }
1869            /*
1870             * If the end isn't at the start of a host page, then the
1871             * run doesn't finish at the end of a host page
1872             * and we need to discard.
1873             */
1874            host_offset = run_end % host_ratio;
1875            if (host_offset) {
1876                do_fixup = true;
1877                fixup_start_addr = run_end - host_offset;
1878                /*
1879                 * This host page has gone, the next loop iteration starts
1880                 * from after the fixup
1881                 */
1882                run_start = fixup_start_addr + host_ratio;
1883            } else {
1884                /*
1885                 * No discards on this iteration, next loop starts from
1886                 * next sent/dirty page
1887                 */
1888                run_start = run_end + 1;
1889            }
1890        }
1891
1892        if (do_fixup) {
1893            unsigned long page;
1894
1895            /* Tell the destination to discard this page */
1896            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1897                /* For the unsent_pass we:
1898                 *     discard partially sent pages
1899                 * For the !unsent_pass (dirty) we:
1900                 *     discard partially dirty pages that were sent
1901                 *     (any partially sent pages were already discarded
1902                 *     by the previous unsent_pass)
1903                 */
1904                postcopy_discard_send_range(ms, pds, fixup_start_addr,
1905                                            host_ratio);
1906            }
1907
1908            /* Clean up the bitmap */
1909            for (page = fixup_start_addr;
1910                 page < fixup_start_addr + host_ratio; page++) {
1911                /* All pages in this host page are now not sent */
1912                set_bit(page, unsentmap);
1913
1914                /*
1915                 * Remark them as dirty, updating the count for any pages
1916                 * that weren't previously dirty.
1917                 */
1918                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1919            }
1920        }
1921
1922        if (unsent_pass) {
1923            /* Find the next sent page for the next iteration */
1924            run_start = find_next_zero_bit(unsentmap, pages, run_start);
1925        } else {
1926            /* Find the next dirty page for the next iteration */
1927            run_start = find_next_bit(bitmap, pages, run_start);
1928        }
1929    }
1930}
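
/*
 * Illustrative sketch (not built): the boundary rounding used by the pass
 * above, with made-up numbers.  With host_ratio target pages per host page,
 * a run that starts or ends in the middle of a host page is widened to cover
 * the whole host page, which is then discarded and re-dirtied.
 */
#if 0
#include <stdio.h>

int main(void)
{
    const unsigned long host_ratio = 4;       /* 4 target pages per host page */
    unsigned long run_start = 6, run_end = 9; /* dirty target pages [6, 9) */

    /* Does the run start in the middle of a host page? */
    if (run_start % host_ratio) {
        unsigned long fixup = run_start - (run_start % host_ratio);
        printf("fix up host page at target page %lu\n", fixup);  /* prints 4 */
    }
    /* Does the run end in the middle of a host page? */
    if (run_end % host_ratio) {
        unsigned long fixup = run_end - (run_end % host_ratio);
        printf("fix up host page at target page %lu\n", fixup);  /* prints 8 */
    }
    return 0;
}
#endif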
1931
1932/**
1933 * postcopy_chunk_hostpages: discard any partially sent host page
1934 *
1935 * Utility for the outgoing postcopy code.
1936 *
1937 * Discard any partially sent host-page size chunks, mark any partially
1938 * dirty host-page size chunks as all dirty.  In this case the host-page
1939 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1940 *
1941 * Returns zero on success
1942 *
1943 * @ms: current migration state
1944 * @block: block we want to work with
1945 */
1946static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1947{
1948    PostcopyDiscardState *pds =
1949        postcopy_discard_send_init(ms, block->idstr);
1950
1951    /* First pass: Discard all partially sent host pages */
1952    postcopy_chunk_hostpages_pass(ms, true, block, pds);
1953    /*
1954     * Second pass: Ensure that all partially dirty host pages are made
1955     * fully dirty.
1956     */
1957    postcopy_chunk_hostpages_pass(ms, false, block, pds);
1958
1959    postcopy_discard_send_finish(ms, pds);
1960    return 0;
1961}
1962
1963/**
1964 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1965 *
1966 * Returns zero on success
1967 *
1968 * Transmit the set of pages to be discarded after precopy to the target;
1969 * these are pages that:
1970 *     a) Have been previously transmitted but are now dirty again
1971 *     b) Have never been transmitted; this ensures that any pages on the
1972 *        destination that have been mapped by background tasks get
1973 *        discarded (transparent huge pages are the specific concern)
1974 * Hopefully this is pretty sparse
1975 *
1976 * @ms: current migration state
1977 */
1978int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1979{
1980    RAMState *rs = ram_state;
1981    RAMBlock *block;
1982    int ret;
1983
1984    rcu_read_lock();
1985
1986    /* This should be our last sync, the src is now paused */
1987    migration_bitmap_sync(rs);
1988
1989    /* Easiest way to make sure we don't resume in the middle of a host-page */
1990    rs->last_seen_block = NULL;
1991    rs->last_sent_block = NULL;
1992    rs->last_page = 0;
1993
1994    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1995        unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1996        unsigned long *bitmap = block->bmap;
1997        unsigned long *unsentmap = block->unsentmap;
1998
1999        if (!unsentmap) {
2000            /* We don't have a safe way to resize the unsentmap, so
2001             * if the bitmap was resized it will be NULL at this
2002             * point.
2003             */
2004            error_report("migration ram resized during precopy phase");
2005            rcu_read_unlock();
2006            return -EINVAL;
2007        }
2008        /* Deal with TPS != HPS and huge pages */
2009        ret = postcopy_chunk_hostpages(ms, block);
2010        if (ret) {
2011            rcu_read_unlock();
2012            return ret;
2013        }
2014
2015        /*
2016         * Update the unsentmap to be unsentmap = unsentmap | dirty
2017         */
2018        bitmap_or(unsentmap, unsentmap, bitmap, pages);
2019#ifdef DEBUG_POSTCOPY
2020        ram_debug_dump_bitmap(unsentmap, true, pages);
2021#endif
2022    }
2023    trace_ram_postcopy_send_discard_bitmap();
2024
2025    ret = postcopy_each_ram_send_discard(ms);
2026    rcu_read_unlock();
2027
2028    return ret;
2029}
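
/*
 * Illustrative sketch (not built): the "unsentmap |= dirty" step above is
 * essentially a word-wise OR over the two bitmaps, as sketched here with a
 * made-up helper name.
 */
#if 0
static void or_bitmap_words(unsigned long *dst, const unsigned long *a,
                            const unsigned long *b, unsigned long nwords)
{
    for (unsigned long i = 0; i < nwords; i++) {
        dst[i] = a[i] | b[i];
    }
}
#endif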
2030
2031/**
2032 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2033 *
2034 * Returns zero on success
2035 *
2036 * @rbname: name of the RAMBlock of the request. NULL means the
2037 *          same as the last one.
2038 * @start: RAMBlock starting page
2039 * @length: RAMBlock size
2040 */
2041int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2042{
2043    int ret = -1;
2044
2045    trace_ram_discard_range(rbname, start, length);
2046
2047    rcu_read_lock();
2048    RAMBlock *rb = qemu_ram_block_by_name(rbname);
2049
2050    if (!rb) {
2051        error_report("ram_discard_range: Failed to find block '%s'", rbname);
2052        goto err;
2053    }
2054
2055    bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2056                 length >> qemu_target_page_bits());
2057    ret = ram_block_discard_range(rb, start, length);
2058
2059err:
2060    rcu_read_unlock();
2061
2062    return ret;
2063}
2064
2065/*
2066 * For every allocation, we try not to crash the VM if the
2067 * allocation fails.
2068 */
2069static int xbzrle_init(void)
2070{
2071    Error *local_err = NULL;
2072
2073    if (!migrate_use_xbzrle()) {
2074        return 0;
2075    }
2076
2077    XBZRLE_cache_lock();
2078
2079    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2080    if (!XBZRLE.zero_target_page) {
2081        error_report("%s: Error allocating zero page", __func__);
2082        goto err_out;
2083    }
2084
2085    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2086                              TARGET_PAGE_SIZE, &local_err);
2087    if (!XBZRLE.cache) {
2088        error_report_err(local_err);
2089        goto free_zero_page;
2090    }
2091
2092    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2093    if (!XBZRLE.encoded_buf) {
2094        error_report("%s: Error allocating encoded_buf", __func__);
2095        goto free_cache;
2096    }
2097
2098    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2099    if (!XBZRLE.current_buf) {
2100        error_report("%s: Error allocating current_buf", __func__);
2101        goto free_encoded_buf;
2102    }
2103
2104    /* We are all good */
2105    XBZRLE_cache_unlock();
2106    return 0;
2107
2108free_encoded_buf:
2109    g_free(XBZRLE.encoded_buf);
2110    XBZRLE.encoded_buf = NULL;
2111free_cache:
2112    cache_fini(XBZRLE.cache);
2113    XBZRLE.cache = NULL;
2114free_zero_page:
2115    g_free(XBZRLE.zero_target_page);
2116    XBZRLE.zero_target_page = NULL;
2117err_out:
2118    XBZRLE_cache_unlock();
2119    return -ENOMEM;
2120}
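
/*
 * Illustrative sketch (not built): the same goto-based unwinding idiom as
 * xbzrle_init() above, reduced to plain malloc().  Each failure path frees
 * only what has already been allocated, falling through in reverse order.
 */
#if 0
#include <stdlib.h>

static int alloc_three(void **a, void **b, void **c, size_t size)
{
    *a = malloc(size);
    if (!*a) {
        goto err_out;
    }
    *b = malloc(size);
    if (!*b) {
        goto free_a;
    }
    *c = malloc(size);
    if (!*c) {
        goto free_b;
    }
    return 0;

free_b:
    free(*b);
    *b = NULL;
free_a:
    free(*a);
    *a = NULL;
err_out:
    return -1;
}
#endif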
2121
2122static int ram_state_init(RAMState **rsp)
2123{
2124    *rsp = g_try_new0(RAMState, 1);
2125
2126    if (!*rsp) {
2127        error_report("%s: Init ramstate fail", __func__);
2128        return -1;
2129    }
2130
2131    qemu_mutex_init(&(*rsp)->bitmap_mutex);
2132    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2133    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2134
2135    /*
2136     * Count the total number of pages used by ram blocks not including any
2137     * gaps due to alignment or unplugs.
2138     */
2139    (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2140
2141    ram_state_reset(*rsp);
2142
2143    return 0;
2144}
2145
2146static void ram_list_init_bitmaps(void)
2147{
2148    RAMBlock *block;
2149    unsigned long pages;
2150
2151    /* Skip setting bitmap if there is no RAM */
2152    if (ram_bytes_total()) {
2153        QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2154            pages = block->max_length >> TARGET_PAGE_BITS;
2155            block->bmap = bitmap_new(pages);
2156            bitmap_set(block->bmap, 0, pages);
2157            if (migrate_postcopy_ram()) {
2158                block->unsentmap = bitmap_new(pages);
2159                bitmap_set(block->unsentmap, 0, pages);
2160            }
2161        }
2162    }
2163}
2164
2165static void ram_init_bitmaps(RAMState *rs)
2166{
2167    /* For memory_global_dirty_log_start below.  */
2168    qemu_mutex_lock_iothread();
2169    qemu_mutex_lock_ramlist();
2170    rcu_read_lock();
2171
2172    ram_list_init_bitmaps();
2173    memory_global_dirty_log_start();
2174    migration_bitmap_sync(rs);
2175
2176    rcu_read_unlock();
2177    qemu_mutex_unlock_ramlist();
2178    qemu_mutex_unlock_iothread();
2179}
2180
2181static int ram_init_all(RAMState **rsp)
2182{
2183    if (ram_state_init(rsp)) {
2184        return -1;
2185    }
2186
2187    if (xbzrle_init()) {
2188        ram_state_cleanup(rsp);
2189        return -1;
2190    }
2191
2192    ram_init_bitmaps(*rsp);
2193
2194    return 0;
2195}
2196
2197/*
2198 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2199 * a long-running RCU critical section.  When RCU reclaims in the code
2200 * start to become numerous, it will be necessary to reduce the
2201 * granularity of these critical sections.
2202 */
2203
2204/**
2205 * ram_save_setup: Setup RAM for migration
2206 *
2207 * Returns zero to indicate success and negative for error
2208 *
2209 * @f: QEMUFile where to send the data
2210 * @opaque: RAMState pointer
2211 */
2212static int ram_save_setup(QEMUFile *f, void *opaque)
2213{
2214    RAMState **rsp = opaque;
2215    RAMBlock *block;
2216
2217    /* migration has already set up the bitmap, reuse it. */
2218    if (!migration_in_colo_state()) {
2219        if (ram_init_all(rsp) != 0) {
2220            return -1;
2221        }
2222    }
2223    (*rsp)->f = f;
2224
2225    rcu_read_lock();
2226
2227    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2228
2229    RAMBLOCK_FOREACH(block) {
2230        qemu_put_byte(f, strlen(block->idstr));
2231        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2232        qemu_put_be64(f, block->used_length);
2233        if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2234            qemu_put_be64(f, block->page_size);
2235        }
2236    }
2237
2238    rcu_read_unlock();
2239    compress_threads_save_setup();
2240
2241    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2242    ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2243
2244    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2245
2246    return 0;
2247}
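
/*
 * Illustrative sketch (not built): the wire layout that ram_save_setup()
 * emits, reproduced with stdio.  put_be64() and write_setup_section() are
 * hypothetical helpers; the flag values 0x04 and 0x10 are
 * RAM_SAVE_FLAG_MEM_SIZE and RAM_SAVE_FLAG_EOS from this file.
 */
#if 0
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void put_be64(FILE *f, uint64_t v)
{
    for (int shift = 56; shift >= 0; shift -= 8) {
        fputc((int)((v >> shift) & 0xff), f);
    }
}

static void write_setup_section(FILE *f, uint64_t total_ram_bytes,
                                const char *idstr, uint64_t used_length)
{
    put_be64(f, total_ram_bytes | 0x04);          /* total size + MEM_SIZE */
    /* one entry like this per RAMBlock: */
    fputc((int)strlen(idstr), f);                 /* idstr length */
    fwrite(idstr, 1, strlen(idstr), f);           /* idstr, no trailing NUL */
    put_be64(f, used_length);                     /* block->used_length */
    /* a be64 page size follows only for postcopy with non-host page sizes */
    put_be64(f, 0x10);                            /* RAM_SAVE_FLAG_EOS */
}
#endif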
2248
2249/**
2250 * ram_save_iterate: iterative stage for migration
2251 *
2252 * Returns zero to indicate success and negative for error
2253 *
2254 * @f: QEMUFile where to send the data
2255 * @opaque: RAMState pointer
2256 */
2257static int ram_save_iterate(QEMUFile *f, void *opaque)
2258{
2259    RAMState **temp = opaque;
2260    RAMState *rs = *temp;
2261    int ret;
2262    int i;
2263    int64_t t0;
2264    int done = 0;
2265
2266    if (blk_mig_bulk_active()) {
2267        /* Avoid transferring ram during bulk phase of block migration as
2268         * the bulk phase will usually take a long time and transferring
2269         * ram updates during that time is pointless. */
2270        goto out;
2271    }
2272
2273    rcu_read_lock();
2274    if (ram_list.version != rs->last_version) {
2275        ram_state_reset(rs);
2276    }
2277
2278    /* Read version before ram_list.blocks */
2279    smp_rmb();
2280
2281    ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2282
2283    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2284    i = 0;
2285    while ((ret = qemu_file_rate_limit(f)) == 0) {
2286        int pages;
2287
2288        pages = ram_find_and_save_block(rs, false);
2289        /* no more pages to send */
2290        if (pages == 0) {
2291            done = 1;
2292            break;
2293        }
2294        rs->iterations++;
2295
2296        /* we want to check in the 1st loop, just in case it was the 1st time
2297           and we had to sync the dirty bitmap.
2298           qemu_clock_get_ns() is a bit expensive, so we only check every
2299           few iterations
2300        */
2301        if ((i & 63) == 0) {
2302            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2303            if (t1 > MAX_WAIT) {
2304                trace_ram_save_iterate_big_wait(t1, i);
2305                break;
2306            }
2307        }
2308        i++;
2309    }
2310    flush_compressed_data(rs);
2311    rcu_read_unlock();
2312
2313    /*
2314     * Must occur before EOS (or any QEMUFile operation)
2315     * because of RDMA protocol.
2316     */
2317    ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2318
2319out:
2320    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2321    ram_counters.transferred += 8;
2322
2323    ret = qemu_file_get_error(f);
2324    if (ret < 0) {
2325        return ret;
2326    }
2327
2328    return done;
2329}
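
/*
 * Illustrative sketch (not built): the "sample the clock only every 64
 * iterations" pattern from the loop above, using clock_gettime().  The
 * function and parameter names are made up for the example.
 */
#if 0
#include <stdint.h>
#include <time.h>

static uint64_t now_ms(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000 + (uint64_t)ts.tv_nsec / 1000000;
}

static void bounded_work_loop(void (*do_one_unit)(void), uint64_t budget_ms)
{
    uint64_t t0 = now_ms();

    for (unsigned i = 0; ; i++) {
        do_one_unit();
        /* the clock is relatively expensive, so only sample it now and then */
        if ((i & 63) == 0 && now_ms() - t0 > budget_ms) {
            break;
        }
    }
}
#endif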
2330
2331/**
2332 * ram_save_complete: function called to send the remaining amount of ram
2333 *
2334 * Returns zero to indicate success
2335 *
2336 * Called with iothread lock
2337 *
2338 * @f: QEMUFile where to send the data
2339 * @opaque: RAMState pointer
2340 */
2341static int ram_save_complete(QEMUFile *f, void *opaque)
2342{
2343    RAMState **temp = opaque;
2344    RAMState *rs = *temp;
2345
2346    rcu_read_lock();
2347
2348    if (!migration_in_postcopy()) {
2349        migration_bitmap_sync(rs);
2350    }
2351
2352    ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2353
2354    /* try transferring iterative blocks of memory */
2355
2356    /* flush all remaining blocks regardless of rate limiting */
2357    while (true) {
2358        int pages;
2359
2360        pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2361        /* no more blocks to send */
2362        if (pages == 0) {
2363            break;
2364        }
2365    }
2366
2367    flush_compressed_data(rs);
2368    ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2369
2370    rcu_read_unlock();
2371
2372    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2373
2374    return 0;
2375}
2376
2377static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2378                             uint64_t *res_precopy_only,
2379                             uint64_t *res_compatible,
2380                             uint64_t *res_postcopy_only)
2381{
2382    RAMState **temp = opaque;
2383    RAMState *rs = *temp;
2384    uint64_t remaining_size;
2385
2386    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2387
2388    if (!migration_in_postcopy() &&
2389        remaining_size < max_size) {
2390        qemu_mutex_lock_iothread();
2391        rcu_read_lock();
2392        migration_bitmap_sync(rs);
2393        rcu_read_unlock();
2394        qemu_mutex_unlock_iothread();
2395        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2396    }
2397
2398    if (migrate_postcopy_ram()) {
2399        /* We can do postcopy, and all the data is postcopiable */
2400        *res_compatible += remaining_size;
2401    } else {
2402        *res_precopy_only += remaining_size;
2403    }
2404}
2405
2406static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2407{
2408    unsigned int xh_len;
2409    int xh_flags;
2410    uint8_t *loaded_data;
2411
2412    /* extract RLE header */
2413    xh_flags = qemu_get_byte(f);
2414    xh_len = qemu_get_be16(f);
2415
2416    if (xh_flags != ENCODING_FLAG_XBZRLE) {
2417        error_report("Failed to load XBZRLE page - wrong compression!");
2418        return -1;
2419    }
2420
2421    if (xh_len > TARGET_PAGE_SIZE) {
2422        error_report("Failed to load XBZRLE page - len overflow!");
2423        return -1;
2424    }
2425    loaded_data = XBZRLE.decoded_buf;
2426    /* load data and decode */
2427    /* it can change loaded_data to point to an internal buffer */
2428    qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2429
2430    /* decode RLE */
2431    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2432                             TARGET_PAGE_SIZE) == -1) {
2433        error_report("Failed to load XBZRLE page - decode error!");
2434        return -1;
2435    }
2436
2437    return 0;
2438}
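
/*
 * Illustrative sketch (not built): parsing the XBZRLE record header the way
 * load_xbzrle() does - one flags byte, a big-endian 16-bit encoded length,
 * then the payload.  read_xbzrle_header() is a hypothetical helper; the
 * caller would pass ENCODING_FLAG_XBZRLE as expected_flag.
 */
#if 0
#include <stdio.h>

static int read_xbzrle_header(FILE *f, unsigned page_size, int expected_flag,
                              unsigned *len_out)
{
    int flags = fgetc(f);
    int hi = fgetc(f);
    int lo = fgetc(f);

    if (flags != expected_flag || hi == EOF || lo == EOF) {
        return -1;                 /* wrong compression or truncated stream */
    }
    *len_out = ((unsigned)hi << 8) | (unsigned)lo;
    if (*len_out > page_size) {
        return -1;                 /* length overflow */
    }
    return 0;                      /* *len_out bytes of encoded data follow */
}
#endif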
2439
2440/**
2441 * ram_block_from_stream: read a RAMBlock id from the migration stream
2442 *
2443 * Must be called from within a rcu critical section.
2444 *
2445 * Returns a pointer from within the RCU-protected ram_list.
2446 *
2447 * @f: QEMUFile where to read the data from
2448 * @flags: Page flags (mostly to see if it's a continuation of the previous block)
2449 */
2450static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2451{
2452    static RAMBlock *block = NULL;
2453    char id[256];
2454    uint8_t len;
2455
2456    if (flags & RAM_SAVE_FLAG_CONTINUE) {
2457        if (!block) {
2458            error_report("Ack, bad migration stream!");
2459            return NULL;
2460        }
2461        return block;
2462    }
2463
2464    len = qemu_get_byte(f);
2465    qemu_get_buffer(f, (uint8_t *)id, len);
2466    id[len] = 0;
2467
2468    block = qemu_ram_block_by_name(id);
2469    if (!block) {
2470        error_report("Can't find block %s", id);
2471        return NULL;
2472    }
2473
2474    return block;
2475}
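
/*
 * Illustrative sketch (not built): the sender-side idea that the static
 * cache above relies on - a block id only needs to go on the wire when the
 * block changes; later pages from the same block just carry
 * RAM_SAVE_FLAG_CONTINUE.  need_block_id() and struct blk are made up for
 * the example.
 */
#if 0
#include <stdbool.h>

struct blk { char idstr[256]; };

static bool need_block_id(const struct blk *block, const struct blk **last_sent)
{
    if (block == *last_sent) {
        return false;    /* receiver still has this block cached */
    }
    *last_sent = block;
    return true;         /* send strlen(idstr) + idstr once, without CONTINUE */
}
#endif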
2476
2477static inline void *host_from_ram_block_offset(RAMBlock *block,
2478                                               ram_addr_t offset)
2479{
2480    if (!offset_in_ramblock(block, offset)) {
2481        return NULL;
2482    }
2483
2484    return block->host + offset;
2485}
2486
2487/**
2488 * ram_handle_compressed: handle the zero page case
2489 *
2490 * If a page (or a whole RDMA chunk) has been
2491 * determined to be zero, then zap it.
2492 *
2493 * @host: host address for the zero page
2494 * @ch: what the page is filled with; only zero is supported
2495 * @size: size of the zero page
2496 */
2497void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2498{
2499    if (ch != 0 || !is_zero_range(host, size)) {
2500        memset(host, ch, size);
2501    }
2502}
2503
2504static void *do_data_decompress(void *opaque)
2505{
2506    DecompressParam *param = opaque;
2507    unsigned long pagesize;
2508    uint8_t *des;
2509    int len;
2510
2511    qemu_mutex_lock(&param->mutex);
2512    while (!param->quit) {
2513        if (param->des) {
2514            des = param->des;
2515            len = param->len;
2516            param->des = 0;
2517            qemu_mutex_unlock(&param->mutex);
2518
2519            pagesize = TARGET_PAGE_SIZE;
2520            /* uncompress() can fail in some cases, especially when the
2521             * page was dirtied while it was being compressed.  That is not
2522             * a problem because the dirty page will be retransferred and
2523             * uncompress() won't corrupt the data in other pages.
2524             */
2525            uncompress((Bytef *)des, &pagesize,
2526                       (const Bytef *)param->compbuf, len);
2527
2528            qemu_mutex_lock(&decomp_done_lock);
2529            param->done = true;
2530            qemu_cond_signal(&decomp_done_cond);
2531            qemu_mutex_unlock(&decomp_done_lock);
2532
2533            qemu_mutex_lock(&param->mutex);
2534        } else {
2535            qemu_cond_wait(&param->cond, &param->mutex);
2536        }
2537    }
2538    qemu_mutex_unlock(&param->mutex);
2539
2540    return NULL;
2541}
2542
2543static void wait_for_decompress_done(void)
2544{
2545    int idx, thread_count;
2546
2547    if (!migrate_use_compression()) {
2548        return;
2549    }
2550
2551    thread_count = migrate_decompress_threads();
2552    qemu_mutex_lock(&decomp_done_lock);
2553    for (idx = 0; idx < thread_count; idx++) {
2554        while (!decomp_param[idx].done) {
2555            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2556        }
2557    }
2558    qemu_mutex_unlock(&decomp_done_lock);
2559}
2560
2561static void compress_threads_load_setup(void)
2562{
2563    int i, thread_count;
2564
2565    if (!migrate_use_compression()) {
2566        return;
2567    }
2568    thread_count = migrate_decompress_threads();
2569    decompress_threads = g_new0(QemuThread, thread_count);
2570    decomp_param = g_new0(DecompressParam, thread_count);
2571    qemu_mutex_init(&decomp_done_lock);
2572    qemu_cond_init(&decomp_done_cond);
2573    for (i = 0; i < thread_count; i++) {
2574        qemu_mutex_init(&decomp_param[i].mutex);
2575        qemu_cond_init(&decomp_param[i].cond);
2576        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2577        decomp_param[i].done = true;
2578        decomp_param[i].quit = false;
2579        qemu_thread_create(decompress_threads + i, "decompress",
2580                           do_data_decompress, decomp_param + i,
2581                           QEMU_THREAD_JOINABLE);
2582    }
2583}
2584
2585static void compress_threads_load_cleanup(void)
2586{
2587    int i, thread_count;
2588
2589    if (!migrate_use_compression()) {
2590        return;
2591    }
2592    thread_count = migrate_decompress_threads();
2593    for (i = 0; i < thread_count; i++) {
2594        qemu_mutex_lock(&decomp_param[i].mutex);
2595        decomp_param[i].quit = true;
2596        qemu_cond_signal(&decomp_param[i].cond);
2597        qemu_mutex_unlock(&decomp_param[i].mutex);
2598    }
2599    for (i = 0; i < thread_count; i++) {
2600        qemu_thread_join(decompress_threads + i);
2601        qemu_mutex_destroy(&decomp_param[i].mutex);
2602        qemu_cond_destroy(&decomp_param[i].cond);
2603        g_free(decomp_param[i].compbuf);
2604    }
2605    g_free(decompress_threads);
2606    g_free(decomp_param);
2607    decompress_threads = NULL;
2608    decomp_param = NULL;
2609}
2610
2611static void decompress_data_with_multi_threads(QEMUFile *f,
2612                                               void *host, int len)
2613{
2614    int idx, thread_count;
2615
2616    thread_count = migrate_decompress_threads();
2617    qemu_mutex_lock(&decomp_done_lock);
2618    while (true) {
2619        for (idx = 0; idx < thread_count; idx++) {
2620            if (decomp_param[idx].done) {
2621                decomp_param[idx].done = false;
2622                qemu_mutex_lock(&decomp_param[idx].mutex);
2623                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2624                decomp_param[idx].des = host;
2625                decomp_param[idx].len = len;
2626                qemu_cond_signal(&decomp_param[idx].cond);
2627                qemu_mutex_unlock(&decomp_param[idx].mutex);
2628                break;
2629            }
2630        }
2631        if (idx < thread_count) {
2632            break;
2633        } else {
2634            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2635        }
2636    }
2637    qemu_mutex_unlock(&decomp_done_lock);
2638}
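
/*
 * Illustrative sketch (not built): the worker side of the hand-off used
 * above, reduced to one worker and plain pthreads.  The feeder (not shown)
 * locks the worker, stores a job and signals its condition; the worker
 * processes it, then marks itself idle under the shared done lock.  All
 * names here are made up for the example.
 */
#if 0
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct worker {
    pthread_mutex_t mutex;
    pthread_cond_t cond;
    bool quit;
    void *job;                        /* NULL means "no work queued" */
};

static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_cond = PTHREAD_COND_INITIALIZER;
static bool worker_idle = true;

static void *worker_thread(void *opaque)
{
    struct worker *w = opaque;

    pthread_mutex_lock(&w->mutex);
    while (!w->quit) {
        if (w->job) {
            void *job = w->job;
            w->job = NULL;
            pthread_mutex_unlock(&w->mutex);

            (void)job;                /* ... decompress the page here ... */

            pthread_mutex_lock(&done_lock);
            worker_idle = true;
            pthread_cond_signal(&done_cond);
            pthread_mutex_unlock(&done_lock);

            pthread_mutex_lock(&w->mutex);
        } else {
            pthread_cond_wait(&w->cond, &w->mutex);
        }
    }
    pthread_mutex_unlock(&w->mutex);
    return NULL;
}
#endif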
2639
2640/**
2641 * ram_load_setup: Setup RAM for migration incoming side
2642 *
2643 * Returns zero to indicate success and negative for error
2644 *
2645 * @f: QEMUFile where to receive the data
2646 * @opaque: RAMState pointer
2647 */
2648static int ram_load_setup(QEMUFile *f, void *opaque)
2649{
2650    xbzrle_load_setup();
2651    compress_threads_load_setup();
2652    ramblock_recv_map_init();
2653    return 0;
2654}
2655
2656static int ram_load_cleanup(void *opaque)
2657{
2658    RAMBlock *rb;
2659    xbzrle_load_cleanup();
2660    compress_threads_load_cleanup();
2661
2662    RAMBLOCK_FOREACH(rb) {
2663        g_free(rb->receivedmap);
2664        rb->receivedmap = NULL;
2665    }
2666    return 0;
2667}
2668
2669/**
2670 * ram_postcopy_incoming_init: allocate postcopy data structures
2671 *
2672 * Returns 0 for success and negative on error
2673 *
2674 * @mis: current migration incoming state
2675 *
2676 * Allocate the data structures etc needed by incoming migration with
2677 * postcopy-ram.  The similarly named postcopy_ram_incoming_init()
2678 * in postcopy-ram does the work.
2679 */
2680int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2681{
2682    unsigned long ram_pages = last_ram_page();
2683
2684    return postcopy_ram_incoming_init(mis, ram_pages);
2685}
2686
2687/**
2688 * ram_load_postcopy: load a page in postcopy case
2689 *
2690 * Returns 0 for success or -errno in case of error
2691 *
2692 * Called in postcopy mode by ram_load().
2693 * rcu_read_lock is taken prior to this being called.
2694 *
2695 * @f: QEMUFile to receive the data from
2696 */
2697static int ram_load_postcopy(QEMUFile *f)
2698{
2699    int flags = 0, ret = 0;
2700    bool place_needed = false;
2701    bool matching_page_sizes = false;
2702    MigrationIncomingState *mis = migration_incoming_get_current();
2703    /* Temporary page that is later 'placed' */
2704    void *postcopy_host_page = postcopy_get_tmp_page(mis);
2705    void *last_host = NULL;
2706    bool all_zero = false;
2707
2708    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2709        ram_addr_t addr;
2710        void *host = NULL;
2711        void *page_buffer = NULL;
2712        void *place_source = NULL;
2713        RAMBlock *block = NULL;
2714        uint8_t ch;
2715
2716        addr = qemu_get_be64(f);
2717
2718        /*
2719         * If there is a QEMUFile error, we should stop here; "addr"
2720         * may be invalid
2721         */
2722        ret = qemu_file_get_error(f);
2723        if (ret) {
2724            break;
2725        }
2726
2727        flags = addr & ~TARGET_PAGE_MASK;
2728        addr &= TARGET_PAGE_MASK;
2729
2730        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2731        place_needed = false;
2732        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2733            block = ram_block_from_stream(f, flags);
2734
2735            host = host_from_ram_block_offset(block, addr);
2736            if (!host) {
2737                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2738                ret = -EINVAL;
2739                break;
2740            }
2741            matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2742            /*
2743             * Postcopy requires that we place whole host pages atomically;
2744             * these may be huge pages for RAMBlocks that are backed by
2745             * hugetlbfs.
2746             * To make it atomic, the data is read into a temporary page
2747             * that's moved into place later.
2748             * The migration protocol uses (possibly smaller) target pages;
2749             * however, the source ensures it always sends all the components
2750             * of a host page in order.
2751             */
2752            page_buffer = postcopy_host_page +
2753                          ((uintptr_t)host & (block->page_size - 1));
2754            /* If all TP are zero then we can optimise the place */
2755            if (!((uintptr_t)host & (block->page_size - 1))) {
2756                all_zero = true;
2757            } else {
2758                /* not the 1st TP within the HP */
2759                if (host != (last_host + TARGET_PAGE_SIZE)) {
2760                    error_report("Non-sequential target page %p/%p",
2761                                  host, last_host);
2762                    ret = -EINVAL;
2763                    break;
2764                }
2765            }
2766
2767
2768            /*
2769             * If it's the last part of a host page then we place the host
2770             * page
2771             */
2772            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2773                                     (block->page_size - 1)) == 0;
2774            place_source = postcopy_host_page;
2775        }
2776        last_host = host;
2777
2778        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2779        case RAM_SAVE_FLAG_ZERO:
2780            ch = qemu_get_byte(f);
2781            memset(page_buffer, ch, TARGET_PAGE_SIZE);
2782            if (ch) {
2783                all_zero = false;
2784            }
2785            break;
2786
2787        case RAM_SAVE_FLAG_PAGE:
2788            all_zero = false;
2789            if (!place_needed || !matching_page_sizes) {
2790                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2791            } else {
2792                /* Avoids the qemu_file copy during postcopy, which is
2793                 * going to do a copy later; can only do it when we
2794                 * do this read in one go (matching page sizes)
2795                 */
2796                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2797                                         TARGET_PAGE_SIZE);
2798            }
2799            break;
2800        case RAM_SAVE_FLAG_EOS:
2801            /* normal exit */
2802            break;
2803        default:
2804            error_report("Unknown combination of migration flags: %#x"
2805                         " (postcopy mode)", flags);
2806            ret = -EINVAL;
2807            break;
2808        }
2809
2810        /* Detect any possible file errors */
2811        if (!ret && qemu_file_get_error(f)) {
2812            ret = qemu_file_get_error(f);
2813        }
2814
2815        if (!ret && place_needed) {
2816            /* This gets called at the last target page in the host page */
2817            void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2818
2819            if (all_zero) {
2820                ret = postcopy_place_page_zero(mis, place_dest,
2821                                               block);
2822            } else {
2823                ret = postcopy_place_page(mis, place_dest,
2824                                          place_source, block);
2825            }
2826        }
2827    }
2828
2829    return ret;
2830}
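
/*
 * Illustrative sketch (not built): assembling a host page from target pages
 * the way the loop above does.  Target pages of one host page arrive in
 * order, each is copied into a temporary buffer at its offset, and only the
 * last one triggers the atomic "place" of the whole host page.  TP_SIZE,
 * HP_SIZE and the function names are made up for the example.
 */
#if 0
#include <stdint.h>
#include <string.h>

#define TP_SIZE 4096u                 /* example target page size */
#define HP_SIZE (4 * TP_SIZE)         /* example huge host page size */

static void receive_target_page(uint8_t *tmp_host_page, uintptr_t host_addr,
                                const uint8_t *data,
                                void (*place)(uintptr_t hp_start,
                                              const uint8_t *hp_data))
{
    uintptr_t offset_in_hp = host_addr & (HP_SIZE - 1);

    memcpy(tmp_host_page + offset_in_hp, data, TP_SIZE);

    /* last target page of this host page?  Then place it atomically. */
    if (offset_in_hp + TP_SIZE == HP_SIZE) {
        place(host_addr + TP_SIZE - HP_SIZE, tmp_host_page);
    }
}
#endif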
2831
2832static bool postcopy_is_advised(void)
2833{
2834    PostcopyState ps = postcopy_state_get();
2835    return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
2836}
2837
2838static bool postcopy_is_running(void)
2839{
2840    PostcopyState ps = postcopy_state_get();
2841    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
2842}
2843
2844static int ram_load(QEMUFile *f, void *opaque, int version_id)
2845{
2846    int flags = 0, ret = 0, invalid_flags = 0;
2847    static uint64_t seq_iter;
2848    int len = 0;
2849    /*
2850     * If the system is running in postcopy mode, page inserts to host
2851     * memory must be atomic
2852     */
2853    bool postcopy_running = postcopy_is_running();
2854    /* ADVISE is earlier; it shows the source has the postcopy capability on */
2855    bool postcopy_advised = postcopy_is_advised();
2856
2857    seq_iter++;
2858
2859    if (version_id != 4) {
2860        ret = -EINVAL;
2861    }
2862
2863    if (!migrate_use_compression()) {
2864        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2865    }
2866    /* This RCU critical section can be very long running.
2867     * When RCU reclaims in the code start to become numerous,
2868     * it will be necessary to reduce the granularity of this
2869     * critical section.
2870     */
2871    rcu_read_lock();
2872
2873    if (postcopy_running) {
2874        ret = ram_load_postcopy(f);
2875    }
2876
2877    while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2878        ram_addr_t addr, total_ram_bytes;
2879        void *host = NULL;
2880        uint8_t ch;
2881
2882        addr = qemu_get_be64(f);
2883        flags = addr & ~TARGET_PAGE_MASK;
2884        addr &= TARGET_PAGE_MASK;
2885
2886        if (flags & invalid_flags) {
2887            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2888                error_report("Received an unexpected compressed page");
2889            }
2890
2891            ret = -EINVAL;
2892            break;
2893        }
2894
2895        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2896                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2897            RAMBlock *block = ram_block_from_stream(f, flags);
2898
2899            host = host_from_ram_block_offset(block, addr);
2900            if (!host) {
2901                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2902                ret = -EINVAL;
2903                break;
2904            }
2905            ramblock_recv_bitmap_set(block, host);
2906            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2907        }
2908
2909        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2910        case RAM_SAVE_FLAG_MEM_SIZE:
2911            /* Synchronize RAM block list */
2912            total_ram_bytes = addr;
2913            while (!ret && total_ram_bytes) {
2914                RAMBlock *block;
2915                char id[256];
2916                ram_addr_t length;
2917
2918                len = qemu_get_byte(f);
2919                qemu_get_buffer(f, (uint8_t *)id, len);
2920                id[len] = 0;
2921                length = qemu_get_be64(f);
2922
2923                block = qemu_ram_block_by_name(id);
2924                if (block) {
2925                    if (length != block->used_length) {
2926                        Error *local_err = NULL;
2927
2928                        ret = qemu_ram_resize(block, length,
2929                                              &local_err);
2930                        if (local_err) {
2931                            error_report_err(local_err);
2932                        }
2933                    }
2934                    /* For postcopy we need to check hugepage sizes match */
2935                    if (postcopy_advised &&
2936                        block->page_size != qemu_host_page_size) {
2937                        uint64_t remote_page_size = qemu_get_be64(f);
2938                        if (remote_page_size != block->page_size) {
2939                            error_report("Mismatched RAM page size %s "
2940                                         "(local) %zd != %" PRId64,
2941                                         id, block->page_size,
2942                                         remote_page_size);
2943                            ret = -EINVAL;
2944                        }
2945                    }
2946                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2947                                          block->idstr);
2948                } else {
2949                    error_report("Unknown ramblock \"%s\", cannot "
2950                                 "accept migration", id);
2951                    ret = -EINVAL;
2952                }
2953
2954                total_ram_bytes -= length;
2955            }
2956            break;
2957
2958        case RAM_SAVE_FLAG_ZERO:
2959            ch = qemu_get_byte(f);
2960            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2961            break;
2962
2963        case RAM_SAVE_FLAG_PAGE:
2964            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2965            break;
2966
2967        case RAM_SAVE_FLAG_COMPRESS_PAGE:
2968            len = qemu_get_be32(f);
2969            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2970                error_report("Invalid compressed data length: %d", len);
2971                ret = -EINVAL;
2972                break;
2973            }
2974            decompress_data_with_multi_threads(f, host, len);
2975            break;
2976
2977        case RAM_SAVE_FLAG_XBZRLE:
2978            if (load_xbzrle(f, addr, host) < 0) {
2979                error_report("Failed to decompress XBZRLE page at "
2980                             RAM_ADDR_FMT, addr);
2981                ret = -EINVAL;
2982                break;
2983            }
2984            break;
2985        case RAM_SAVE_FLAG_EOS:
2986            /* normal exit */
2987            break;
2988        default:
2989            if (flags & RAM_SAVE_FLAG_HOOK) {
2990                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2991            } else {
2992                error_report("Unknown combination of migration flags: %#x",
2993                             flags);
2994                ret = -EINVAL;
2995            }
2996        }
2997        if (!ret) {
2998            ret = qemu_file_get_error(f);
2999        }
3000    }
3001
3002    wait_for_decompress_done();
3003    rcu_read_unlock();
3004    trace_ram_load_complete(ret, seq_iter);
3005    return ret;
3006}
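
/*
 * Illustrative sketch (not built): every page record in the stream starts
 * with one be64 in which the page-aligned address and the RAM_SAVE_FLAG_*
 * bits share the same word, split exactly as in the loop above.  The mask
 * value is an example for 4K target pages.
 */
#if 0
#include <stdint.h>

#define EX_PAGE_MASK (~(uint64_t)0xfff)     /* example 4K target page mask */

static void split_addr_flags(uint64_t header, uint64_t *addr, int *flags)
{
    *flags = (int)(header & ~EX_PAGE_MASK); /* low bits: flags */
    *addr = header & EX_PAGE_MASK;          /* upper bits: page address */
}
#endif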
3007
3008static bool ram_has_postcopy(void *opaque)
3009{
3010    return migrate_postcopy_ram();
3011}
3012
3013static SaveVMHandlers savevm_ram_handlers = {
3014    .save_setup = ram_save_setup,
3015    .save_live_iterate = ram_save_iterate,
3016    .save_live_complete_postcopy = ram_save_complete,
3017    .save_live_complete_precopy = ram_save_complete,
3018    .has_postcopy = ram_has_postcopy,
3019    .save_live_pending = ram_save_pending,
3020    .load_state = ram_load,
3021    .save_cleanup = ram_save_cleanup,
3022    .load_setup = ram_load_setup,
3023    .load_cleanup = ram_load_cleanup,
3024};
3025
3026void ram_mig_init(void)
3027{
3028    qemu_mutex_init(&XBZRLE.lock);
3029    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
3030}
3031