qemu/migration/postcopy-ram.c
   1/*
   2 * Postcopy migration for RAM
   3 *
   4 * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
   5 *
   6 * Authors:
   7 *  Dave Gilbert  <dgilbert@redhat.com>
   8 *
   9 * This work is licensed under the terms of the GNU GPL, version 2 or later.
  10 * See the COPYING file in the top-level directory.
  11 *
  12 */
  13
  14/*
  15 * Postcopy is a migration technique where the execution flips from the
  16 * source to the destination before all the data has been copied.
  17 */
  18
  19#include "qemu/osdep.h"
  20#include "exec/target_page.h"
  21#include "migration.h"
  22#include "qemu-file.h"
  23#include "savevm.h"
  24#include "postcopy-ram.h"
  25#include "ram.h"
  26#include "qapi/error.h"
  27#include "qemu/notify.h"
  28#include "qemu/rcu.h"
  29#include "sysemu/sysemu.h"
  30#include "sysemu/balloon.h"
  31#include "qemu/error-report.h"
  32#include "trace.h"
  33#include "hw/boards.h"
  34
  35/* Arbitrary limit on the size of each discard command; it
  36 * keeps them around ~200 bytes each.
  37 */
  38#define MAX_DISCARDS_PER_COMMAND 12
  39
  40struct PostcopyDiscardState {
  41    const char *ramblock_name;
  42    uint16_t cur_entry;
  43    /*
  44     * Start and length of a discard range (bytes)
  45     */
  46    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
  47    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
  48    unsigned int nsentwords;
  49    unsigned int nsentcmds;
  50};
  51
  52static NotifierWithReturnList postcopy_notifier_list;
  53
  54void postcopy_infrastructure_init(void)
  55{
  56    notifier_with_return_list_init(&postcopy_notifier_list);
  57}
  58
  59void postcopy_add_notifier(NotifierWithReturn *nn)
  60{
  61    notifier_with_return_list_add(&postcopy_notifier_list, nn);
  62}
  63
  64void postcopy_remove_notifier(NotifierWithReturn *n)
  65{
  66    notifier_with_return_remove(n);
  67}
  68
  69int postcopy_notify(enum PostcopyNotifyReason reason, Error **errp)
  70{
  71    struct PostcopyNotifyData pnd;
  72    pnd.reason = reason;
  73    pnd.errp = errp;
  74
  75    return notifier_with_return_list_notify(&postcopy_notifier_list,
  76                                            &pnd);
  77}
  78
  79/* Postcopy needs to detect accesses to pages that haven't yet been copied
  80 * across, and to efficiently map new pages in; the techniques for doing
  81 * this are target OS specific.
  82 */
  83#if defined(__linux__)
  84
  85#include <poll.h>
  86#include <sys/ioctl.h>
  87#include <sys/syscall.h>
  88#include <asm/types.h> /* for __u64 */
  89#endif
  90
  91#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
  92#include <sys/eventfd.h>
  93#include <linux/userfaultfd.h>
  94
  95typedef struct PostcopyBlocktimeContext {
  96    /* time when page fault initiated per vCPU */
  97    uint32_t *page_fault_vcpu_time;
  98    /* page address per vCPU */
  99    uintptr_t *vcpu_addr;
 100    uint32_t total_blocktime;
 101    /* blocktime per vCPU */
 102    uint32_t *vcpu_blocktime;
 103    /* point in time when last page fault was initiated */
 104    uint32_t last_begin;
 106    /* number of vCPUs currently suspended */
 106    int smp_cpus_down;
 107    uint64_t start_time;
 108
 109    /*
 110     * Handler for exit event, necessary for
 111     * releasing whole blocktime_ctx
 112     */
 113    Notifier exit_notifier;
 114} PostcopyBlocktimeContext;
 115
 116static void destroy_blocktime_context(struct PostcopyBlocktimeContext *ctx)
 117{
 118    g_free(ctx->page_fault_vcpu_time);
 119    g_free(ctx->vcpu_addr);
 120    g_free(ctx->vcpu_blocktime);
 121    g_free(ctx);
 122}
 123
 124static void migration_exit_cb(Notifier *n, void *data)
 125{
 126    PostcopyBlocktimeContext *ctx = container_of(n, PostcopyBlocktimeContext,
 127                                                 exit_notifier);
 128    destroy_blocktime_context(ctx);
 129}
 130
 131static struct PostcopyBlocktimeContext *blocktime_context_new(void)
 132{
 133    MachineState *ms = MACHINE(qdev_get_machine());
 134    unsigned int smp_cpus = ms->smp.cpus;
 135    PostcopyBlocktimeContext *ctx = g_new0(PostcopyBlocktimeContext, 1);
 136    ctx->page_fault_vcpu_time = g_new0(uint32_t, smp_cpus);
 137    ctx->vcpu_addr = g_new0(uintptr_t, smp_cpus);
 138    ctx->vcpu_blocktime = g_new0(uint32_t, smp_cpus);
 139
 140    ctx->exit_notifier.notify = migration_exit_cb;
 141    ctx->start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 142    qemu_add_exit_notifier(&ctx->exit_notifier);
 143    return ctx;
 144}
 145
 146static uint32List *get_vcpu_blocktime_list(PostcopyBlocktimeContext *ctx)
 147{
 148    MachineState *ms = MACHINE(qdev_get_machine());
 149    uint32List *list = NULL, *entry = NULL;
 150    int i;
 151
 152    for (i = ms->smp.cpus - 1; i >= 0; i--) {
 153        entry = g_new0(uint32List, 1);
 154        entry->value = ctx->vcpu_blocktime[i];
 155        entry->next = list;
 156        list = entry;
 157    }
 158
 159    return list;
 160}
 161
 162/*
 163 * This function just populates MigrationInfo from postcopy's
 164 * blocktime context. It will not populate MigrationInfo unless the
 165 * postcopy-blocktime capability was set.
 166 *
 167 * @info: pointer to MigrationInfo to populate
 168 */
 169void fill_destination_postcopy_migration_info(MigrationInfo *info)
 170{
 171    MigrationIncomingState *mis = migration_incoming_get_current();
 172    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
 173
 174    if (!bc) {
 175        return;
 176    }
 177
 178    info->has_postcopy_blocktime = true;
 179    info->postcopy_blocktime = bc->total_blocktime;
 180    info->has_postcopy_vcpu_blocktime = true;
 181    info->postcopy_vcpu_blocktime = get_vcpu_blocktime_list(bc);
 182}
 183
 184static uint32_t get_postcopy_total_blocktime(void)
 185{
 186    MigrationIncomingState *mis = migration_incoming_get_current();
 187    PostcopyBlocktimeContext *bc = mis->blocktime_ctx;
 188
 189    if (!bc) {
 190        return 0;
 191    }
 192
 193    return bc->total_blocktime;
 194}
 195
 196/**
 197 * receive_ufd_features: query the userfaultfd features supported by the
 198 * kernel, so that only supported features are requested later.
 199 *
 200 * Returns: true on success
 201 *
 202 * __NR_userfaultfd must have been checked to exist before calling this.
 203 * @features: out parameter, filled with uffdio_api.features as provided by
 204 *            the kernel on success
 205 */
 206static bool receive_ufd_features(uint64_t *features)
 207{
 208    struct uffdio_api api_struct = {0};
 209    int ufd;
 210    bool ret = true;
 211
 212    /* if we got here, __NR_userfaultfd should exist */
 213    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
 214    if (ufd == -1) {
 215        error_report("%s: syscall __NR_userfaultfd failed: %s", __func__,
 216                     strerror(errno));
 217        return false;
 218    }
 219
 220    /* ask features */
 221    api_struct.api = UFFD_API;
 222    api_struct.features = 0;
 223    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
 224        error_report("%s: UFFDIO_API failed: %s", __func__,
 225                     strerror(errno));
 226        ret = false;
 227        goto release_ufd;
 228    }
 229
 230    *features = api_struct.features;
 231
 232release_ufd:
 233    close(ufd);
 234    return ret;
 235}
 236
 237/**
 238 * request_ufd_features: this function should be called only once on a newly
 239 * opened ufd; subsequent calls will lead to an error.
 240 *
 241 * Returns: true on success
 242 *
 243 * @ufd: fd obtained from the userfaultfd syscall
 244 * @features: bit mask of requested features, see UFFD_API_FEATURES
 245 */
 246static bool request_ufd_features(int ufd, uint64_t features)
 247{
 248    struct uffdio_api api_struct = {0};
 249    uint64_t ioctl_mask;
 250
 251    api_struct.api = UFFD_API;
 252    api_struct.features = features;
 253    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
 254        error_report("%s failed: UFFDIO_API failed: %s", __func__,
 255                     strerror(errno));
 256        return false;
 257    }
 258
 259    ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
 260                 (__u64)1 << _UFFDIO_UNREGISTER;
 261    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
 262        error_report("Missing userfault features: %" PRIx64,
 263                     (uint64_t)(~api_struct.ioctls & ioctl_mask));
 264        return false;
 265    }
 266
 267    return true;
 268}
 269
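/*
 * ufd_check_and_apply: negotiate UFFD_API on @ufd and request the features
 * we want.  Currently the only optional feature requested is the thread-id
 * reporting needed by postcopy-blocktime, when that capability is enabled
 * and the kernel supports it.  Also verifies that hugepage faults can be
 * handled if any RAM block uses huge pages.
 *
 * Returns: true on success
 */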
 270static bool ufd_check_and_apply(int ufd, MigrationIncomingState *mis)
 271{
 272    uint64_t asked_features = 0;
 273    static uint64_t supported_features;
 274
 275    /*
 276     * It's not possible to request UFFD_API twice on the same fd,
 277     * and the userfaultfd features are persistent, so only query
 278     * them once.
 279     */
 280    if (!supported_features) {
 281        if (!receive_ufd_features(&supported_features)) {
 282            error_report("%s failed", __func__);
 283            return false;
 284        }
 285    }
 286
 287#ifdef UFFD_FEATURE_THREAD_ID
 288    if (migrate_postcopy_blocktime() && mis &&
 289        UFFD_FEATURE_THREAD_ID & supported_features) {
 290        /* kernel supports that feature */
 291        /* don't create blocktime_context if it exists */
 292        if (!mis->blocktime_ctx) {
 293            mis->blocktime_ctx = blocktime_context_new();
 294        }
 295
 296        asked_features |= UFFD_FEATURE_THREAD_ID;
 297    }
 298#endif
 299
 300    /*
 301     * Request the features, even if asked_features is 0, because the
 302     * kernel expects UFFD_API before UFFDIO_REGISTER on each
 303     * userfault file descriptor.
 304     */
 305    if (!request_ufd_features(ufd, asked_features)) {
 306        error_report("%s failed: features %" PRIu64, __func__,
 307                     asked_features);
 308        return false;
 309    }
 310
 311    if (qemu_real_host_page_size != ram_pagesize_summary()) {
 312        bool have_hp = false;
 313        /* We've got a huge page */
 314#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
 315        have_hp = supported_features & UFFD_FEATURE_MISSING_HUGETLBFS;
 316#endif
 317        if (!have_hp) {
 318            error_report("Userfault on this host does not support huge pages");
 319            return false;
 320        }
 321    }
 322    return true;
 323}
 324
 325/* Callback from postcopy_ram_supported_by_host block iterator.
 326 */
 327static int test_ramblock_postcopiable(RAMBlock *rb, void *opaque)
 328{
 329    const char *block_name = qemu_ram_get_idstr(rb);
 330    ram_addr_t length = qemu_ram_get_used_length(rb);
 331    size_t pagesize = qemu_ram_pagesize(rb);
 332
 333    if (length % pagesize) {
 334        error_report("Postcopy requires RAM blocks to be a page size multiple,"
 335                     " block %s is 0x" RAM_ADDR_FMT " bytes with a "
 336                     "page size of 0x%zx", block_name, length, pagesize);
 337        return 1;
 338    }
 339    return 0;
 340}
 341
 342/*
 343 * Note: this has the side effect of munlock'ing all of RAM; that's
 344 * normally fine, since if the postcopy succeeds mlock is turned back on
 345 * at the end.
 346 */
 347bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
 348{
 349    long pagesize = qemu_real_host_page_size;
 350    int ufd = -1;
 351    bool ret = false; /* Error unless we change it */
 352    void *testarea = NULL;
 353    struct uffdio_register reg_struct;
 354    struct uffdio_range range_struct;
 355    uint64_t feature_mask;
 356    Error *local_err = NULL;
 357
 358    if (qemu_target_page_size() > pagesize) {
 359        error_report("Target page size bigger than host page size");
 360        goto out;
 361    }
 362
 363    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
 364    if (ufd == -1) {
 365        error_report("%s: userfaultfd not available: %s", __func__,
 366                     strerror(errno));
 367        goto out;
 368    }
 369
 370    /* Give devices a chance to object */
 371    if (postcopy_notify(POSTCOPY_NOTIFY_PROBE, &local_err)) {
 372        error_report_err(local_err);
 373        goto out;
 374    }
 375
 376    /* Version and features check */
 377    if (!ufd_check_and_apply(ufd, mis)) {
 378        goto out;
 379    }
 380
 381    /* We don't support postcopy with shared RAM yet */
 382    if (foreach_not_ignored_block(test_ramblock_postcopiable, NULL)) {
 383        goto out;
 384    }
 385
 386    /*
 387     * userfault and mlock don't go together; we'll put it back later if
 388     * it was enabled.
 389     */
 390    if (munlockall()) {
 391        error_report("%s: munlockall: %s", __func__,  strerror(errno));
 392        goto out; /* returning -1 from a bool function would read as true */
 393    }
 394
 395    /*
 396     *  We need to check that the ops we need are supported on anon memory
 397     *  To do that we need to register a chunk and see the flags that
 398     *  are returned.
 399     */
 400    testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
 401                                    MAP_ANONYMOUS, -1, 0);
 402    if (testarea == MAP_FAILED) {
 403        error_report("%s: Failed to map test area: %s", __func__,
 404                     strerror(errno));
 405        goto out;
 406    }
 407    g_assert(((size_t)testarea & (pagesize-1)) == 0);
 408
 409    reg_struct.range.start = (uintptr_t)testarea;
 410    reg_struct.range.len = pagesize;
 411    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
 412
 413    if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
 414        error_report("%s userfault register: %s", __func__, strerror(errno));
 415        goto out;
 416    }
 417
 418    range_struct.start = (uintptr_t)testarea;
 419    range_struct.len = pagesize;
 420    if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
 421        error_report("%s userfault unregister: %s", __func__, strerror(errno));
 422        goto out;
 423    }
 424
 425    feature_mask = (__u64)1 << _UFFDIO_WAKE |
 426                   (__u64)1 << _UFFDIO_COPY |
 427                   (__u64)1 << _UFFDIO_ZEROPAGE;
 428    if ((reg_struct.ioctls & feature_mask) != feature_mask) {
 429        error_report("Missing userfault map features: %" PRIx64,
 430                     (uint64_t)(~reg_struct.ioctls & feature_mask));
 431        goto out;
 432    }
 433
 434    /* Success! */
 435    ret = true;
 436out:
 437    if (testarea) {
 438        munmap(testarea, pagesize);
 439    }
 440    if (ufd != -1) {
 441        close(ufd);
 442    }
 443    return ret;
 444}
 445
 446/*
 447 * Set up an area of RAM so that it *can* be used for postcopy later; this
 448 * must be done right at the start prior to pre-copy.
 449 * The opaque argument is currently unused.
 450 */
 451static int init_range(RAMBlock *rb, void *opaque)
 452{
 453    const char *block_name = qemu_ram_get_idstr(rb);
 454    void *host_addr = qemu_ram_get_host_addr(rb);
 455    ram_addr_t offset = qemu_ram_get_offset(rb);
 456    ram_addr_t length = qemu_ram_get_used_length(rb);
 457    trace_postcopy_init_range(block_name, host_addr, offset, length);
 458
 459    /*
 460     * We need the whole of RAM to be truly empty for postcopy, so things
 461     * like ROMs and any data tables built during init must be zero'd
 462     * - we're going to get the copy from the source anyway.
 463     * (Precopy will just overwrite this data, so doesn't need the discard)
 464     */
 465    if (ram_discard_range(block_name, 0, length)) {
 466        return -1;
 467    }
 468
 469    return 0;
 470}
 471
 472/*
 473 * At the end of migration, undo the effects of init_range
 474 * opaque should be the MIS.
 475 */
 476static int cleanup_range(RAMBlock *rb, void *opaque)
 477{
 478    const char *block_name = qemu_ram_get_idstr(rb);
 479    void *host_addr = qemu_ram_get_host_addr(rb);
 480    ram_addr_t offset = qemu_ram_get_offset(rb);
 481    ram_addr_t length = qemu_ram_get_used_length(rb);
 482    MigrationIncomingState *mis = opaque;
 483    struct uffdio_range range_struct;
 484    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
 485
 486    /*
 487     * We turned off hugepages for the precopy stage when postcopy was
 488     * enabled; we can turn them back on now.
 489     */
 490    qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);
 491
 492    /*
 493     * We can also turn off userfault now since we should have all the
 494     * pages.   It can be useful to leave it on to debug postcopy
 495     * if you're not sure it's always getting every page.
 496     */
 497    range_struct.start = (uintptr_t)host_addr;
 498    range_struct.len = length;
 499
 500    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
 501        error_report("%s: userfault unregister %s", __func__, strerror(errno));
 502
 503        return -1;
 504    }
 505
 506    return 0;
 507}
 508
 509/*
 510 * Initialise postcopy-ram, setting the RAM to a state where we can go into
 511 * postcopy later; must be called prior to any precopy.
 512 * called from arch_init's similarly named ram_postcopy_incoming_init
 513 */
 514int postcopy_ram_incoming_init(MigrationIncomingState *mis)
 515{
 516    if (foreach_not_ignored_block(init_range, NULL)) {
 517        return -1;
 518    }
 519
 520    return 0;
 521}
 522
 523/*
 524 * Manage a single vote to the QEMU balloon inhibitor for all postcopy usage,
 525 * last caller wins.
 526 */
 527static void postcopy_balloon_inhibit(bool state)
 528{
 529    static bool cur_state = false;
 530
 531    if (state != cur_state) {
 532        qemu_balloon_inhibit(state);
 533        cur_state = state;
 534    }
 535}
 536
 537/*
 538 * At the end of a migration where postcopy_ram_incoming_init was called.
 539 */
 540int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
 541{
 542    trace_postcopy_ram_incoming_cleanup_entry();
 543
 544    if (mis->have_fault_thread) {
 545        Error *local_err = NULL;
 546
 547        /* Let the fault thread quit */
 548        atomic_set(&mis->fault_thread_quit, 1);
 549        postcopy_fault_thread_notify(mis);
 550        trace_postcopy_ram_incoming_cleanup_join();
 551        qemu_thread_join(&mis->fault_thread);
 552
 553        if (postcopy_notify(POSTCOPY_NOTIFY_INBOUND_END, &local_err)) {
 554            error_report_err(local_err);
 555            return -1;
 556        }
 557
 558        if (foreach_not_ignored_block(cleanup_range, mis)) {
 559            return -1;
 560        }
 561
 562        trace_postcopy_ram_incoming_cleanup_closeuf();
 563        close(mis->userfault_fd);
 564        close(mis->userfault_event_fd);
 565        mis->have_fault_thread = false;
 566    }
 567
 568    postcopy_balloon_inhibit(false);
 569
 570    if (enable_mlock) {
 571        if (os_mlock() < 0) {
 572            error_report("mlock: %s", strerror(errno));
 573            /*
 574             * It doesn't feel right to fail at this point, we have a valid
 575             * VM state.
 576             */
 577        }
 578    }
 579
 580    if (mis->postcopy_tmp_page) {
 581        munmap(mis->postcopy_tmp_page, mis->largest_page_size);
 582        mis->postcopy_tmp_page = NULL;
 583    }
 584    if (mis->postcopy_tmp_zero_page) {
 585        munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
 586        mis->postcopy_tmp_zero_page = NULL;
 587    }
 588    trace_postcopy_ram_incoming_cleanup_blocktime(
 589            get_postcopy_total_blocktime());
 590
 591    trace_postcopy_ram_incoming_cleanup_exit();
 592    return 0;
 593}
 594
 595/*
 596 * Disable huge pages on an area
 597 */
 598static int nhp_range(RAMBlock *rb, void *opaque)
 599{
 600    const char *block_name = qemu_ram_get_idstr(rb);
 601    void *host_addr = qemu_ram_get_host_addr(rb);
 602    ram_addr_t offset = qemu_ram_get_offset(rb);
 603    ram_addr_t length = qemu_ram_get_used_length(rb);
 604    trace_postcopy_nhp_range(block_name, host_addr, offset, length);
 605
 606    /*
 607     * Before we do discards we need to ensure those discards really
 608     * do delete areas of the page, even if THP thinks a hugepage would
 609     * be a good idea, so force hugepages off.
 610     */
 611    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);
 612
 613    return 0;
 614}
 615
 616/*
 617 * Userfault requires us to mark RAM as NOHUGEPAGE prior to discarding;
 618 * however, leaving it until after precopy means that most of the precopy
 619 * data is still THP'd.
 620 */
 621int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
 622{
 623    if (foreach_not_ignored_block(nhp_range, mis)) {
 624        return -1;
 625    }
 626
 627    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);
 628
 629    return 0;
 630}
 631
 632/*
 633 * Mark the given RAMBlock as requiring notification of accesses to areas
 634 * that have not yet been written.
 635 * Used as a callback on foreach_not_ignored_block.
 636 *   rb: the RAMBlock to register with the userfault fd
 637 *   opaque: MigrationIncomingState pointer
 638 *
 639 * Returns 0 on success
 640 */
 641static int ram_block_enable_notify(RAMBlock *rb, void *opaque)
 642{
 643    MigrationIncomingState *mis = opaque;
 644    struct uffdio_register reg_struct;
 645
 646    reg_struct.range.start = (uintptr_t)qemu_ram_get_host_addr(rb);
 647    reg_struct.range.len = qemu_ram_get_used_length(rb);
 648    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
 649
 650    /* Now tell our userfault_fd that it's responsible for this area */
 651    if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
 652        error_report("%s userfault register: %s", __func__, strerror(errno));
 653        return -1;
 654    }
 655    if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
 656        error_report("%s userfault: Region doesn't support COPY", __func__);
 657        return -1;
 658    }
 659    if (reg_struct.ioctls & ((__u64)1 << _UFFDIO_ZEROPAGE)) {
 660        qemu_ram_set_uf_zeroable(rb);
 661    }
 662
 663    return 0;
 664}
 665
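/*
 * postcopy_wake_shared: wake any thread waiting on @client_addr within @rb
 * by issuing UFFDIO_WAKE on the shared userfault fd @pcfd.
 * Returns 0 on success.
 */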
 666int postcopy_wake_shared(struct PostCopyFD *pcfd,
 667                         uint64_t client_addr,
 668                         RAMBlock *rb)
 669{
 670    size_t pagesize = qemu_ram_pagesize(rb);
 671    struct uffdio_range range;
 672    int ret;
 673    trace_postcopy_wake_shared(client_addr, qemu_ram_get_idstr(rb));
 674    range.start = client_addr & ~(pagesize - 1);
 675    range.len = pagesize;
 676    ret = ioctl(pcfd->fd, UFFDIO_WAKE, &range);
 677    if (ret) {
 678        error_report("%s: Failed to wake: %zx in %s (%s)",
 679                     __func__, (size_t)client_addr, qemu_ram_get_idstr(rb),
 680                     strerror(errno));
 681    }
 682    return ret;
 683}
 684
 685/*
 686 * Callback from shared fault handlers to ask for a page; the page must
 687 * be specified by a RAMBlock and an offset in that rb.
 688 * Note: Only for use by shared fault handlers (in fault thread)
 689 */
 690int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
 691                                 uint64_t client_addr, uint64_t rb_offset)
 692{
 693    size_t pagesize = qemu_ram_pagesize(rb);
 694    uint64_t aligned_rbo = rb_offset & ~(pagesize - 1);
 695    MigrationIncomingState *mis = migration_incoming_get_current();
 696
 697    trace_postcopy_request_shared_page(pcfd->idstr, qemu_ram_get_idstr(rb),
 698                                       rb_offset);
 699    if (ramblock_recv_bitmap_test_byte_offset(rb, aligned_rbo)) {
 700        trace_postcopy_request_shared_page_present(pcfd->idstr,
 701                                        qemu_ram_get_idstr(rb), rb_offset);
 702        return postcopy_wake_shared(pcfd, client_addr, rb);
 703    }
 704    if (rb != mis->last_rb) {
 705        mis->last_rb = rb;
 706        migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
 707                                  aligned_rbo, pagesize);
 708    } else {
 709        /* Save some space */
 710        migrate_send_rp_req_pages(mis, NULL, aligned_rbo, pagesize);
 711    }
 712    return 0;
 713}
 714
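/*
 * Map a faulting thread id (as reported by the kernel when
 * UFFD_FEATURE_THREAD_ID is enabled) to a vCPU index; returns -1 if the
 * thread id does not belong to a vCPU.
 */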
 715static int get_mem_fault_cpu_index(uint32_t pid)
 716{
 717    CPUState *cpu_iter;
 718
 719    CPU_FOREACH(cpu_iter) {
 720        if (cpu_iter->thread_id == pid) {
 721            trace_get_mem_fault_cpu_index(cpu_iter->cpu_index, pid);
 722            return cpu_iter->cpu_index;
 723        }
 724    }
 725    trace_get_mem_fault_cpu_index(-1, pid);
 726    return -1;
 727}
 728
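/*
 * Blocktime timestamps are kept as 32-bit millisecond offsets from
 * dc->start_time; 0 is reserved to mean "no fault outstanding", which is
 * why the offset below is clamped to a minimum of 1.
 */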
 729static uint32_t get_low_time_offset(PostcopyBlocktimeContext *dc)
 730{
 731    int64_t start_time_offset = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
 732                                    dc->start_time;
 733    return start_time_offset < 1 ? 1 : start_time_offset & UINT32_MAX;
 734}
 735
 736/*
 737 * This function is called when a page fault occurs. It tracks
 738 * how long the vCPU is blocked.
 739 *
 740 * @addr: faulted host virtual address
 741 * @ptid: faulted process thread id
 742 * @rb: ramblock appropriate to addr
 743 */
 744static void mark_postcopy_blocktime_begin(uintptr_t addr, uint32_t ptid,
 745                                          RAMBlock *rb)
 746{
 747    int cpu, already_received;
 748    MigrationIncomingState *mis = migration_incoming_get_current();
 749    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
 750    uint32_t low_time_offset;
 751
 752    if (!dc || ptid == 0) {
 753        return;
 754    }
 755    cpu = get_mem_fault_cpu_index(ptid);
 756    if (cpu < 0) {
 757        return;
 758    }
 759
 760    low_time_offset = get_low_time_offset(dc);
 761    if (dc->vcpu_addr[cpu] == 0) {
 762        atomic_inc(&dc->smp_cpus_down);
 763    }
 764
 765    atomic_xchg(&dc->last_begin, low_time_offset);
 766    atomic_xchg(&dc->page_fault_vcpu_time[cpu], low_time_offset);
 767    atomic_xchg(&dc->vcpu_addr[cpu], addr);
 768
 769    /*
 770     * Check it here, not at the beginning of the function, because
 771     * the check could otherwise happen earlier than the bitmap_set in
 772     * qemu_ufd_copy_ioctl.
 773     */
 774    already_received = ramblock_recv_bitmap_test(rb, (void *)addr);
 775    if (already_received) {
 776        atomic_xchg(&dc->vcpu_addr[cpu], 0);
 777        atomic_xchg(&dc->page_fault_vcpu_time[cpu], 0);
 778        atomic_dec(&dc->smp_cpus_down);
 779    }
 780    trace_mark_postcopy_blocktime_begin(addr, dc, dc->page_fault_vcpu_time[cpu],
 781                                        cpu, already_received);
 782}
 783
 784/*
 785 *  This function just provides the per-vCPU blocktime and traces it.
 786 *  Total blocktime is calculated in mark_postcopy_blocktime_end.
 787 *
 788 *
 789 * Assume we have 3 CPUs
 790 *
 791 *      S1        E1           S1               E1
 792 * -----***********------------xxx***************------------------------> CPU1
 793 *
 794 *             S2                E2
 795 * ------------****************xxx---------------------------------------> CPU2
 796 *
 797 *                         S3            E3
 798 * ------------------------****xxx********-------------------------------> CPU3
 799 *
 800 * We have the sequence S1,S2,E1,S3,S1,E2,E3,E1
 801 * S2,E1 - doesn't match the condition, because the sequence S1,S2,E1 does
 802 *         not include CPU3
 803 * S3,S1,E2 - includes all CPUs, so the overlap S1,E2 is part of the
 804 *            total blocktime.  S1 here is last_begin.
 805 * The legend of the picture is as follows:
 806 *              * - means blocktime per vCPU
 807 *              x - means overlapped blocktime (total blocktime)
 808 *
 809 * @addr: host virtual address
 810 */
 811static void mark_postcopy_blocktime_end(uintptr_t addr)
 812{
 813    MigrationIncomingState *mis = migration_incoming_get_current();
 814    PostcopyBlocktimeContext *dc = mis->blocktime_ctx;
 815    MachineState *ms = MACHINE(qdev_get_machine());
 816    unsigned int smp_cpus = ms->smp.cpus;
 817    int i, affected_cpu = 0;
 818    bool vcpu_total_blocktime = false;
 819    uint32_t read_vcpu_time, low_time_offset;
 820
 821    if (!dc) {
 822        return;
 823    }
 824
 825    low_time_offset = get_low_time_offset(dc);
 826    /* Look up the vCPU(s) to clear.  This linear scan looks
 827     * straightforward but it's not optimal; a better algorithm would
 828     * keep a tree or hash keyed by address, with a list of vCPUs as
 829     * the value. */
 830    for (i = 0; i < smp_cpus; i++) {
 831        uint32_t vcpu_blocktime = 0;
 832
 833        read_vcpu_time = atomic_fetch_add(&dc->page_fault_vcpu_time[i], 0);
 834        if (atomic_fetch_add(&dc->vcpu_addr[i], 0) != addr ||
 835            read_vcpu_time == 0) {
 836            continue;
 837        }
 838        atomic_xchg(&dc->vcpu_addr[i], 0);
 839        vcpu_blocktime = low_time_offset - read_vcpu_time;
 840        affected_cpu += 1;
 841        /* We need to know whether mark_postcopy_blocktime_end was called
 842         * for a faulted page; the other possible case is a prefetched
 843         * page, and in that case we shouldn't be here */
 844        if (!vcpu_total_blocktime &&
 845            atomic_fetch_add(&dc->smp_cpus_down, 0) == smp_cpus) {
 846            vcpu_total_blocktime = true;
 847        }
 848        /* continue the loop, since one page could affect several vCPUs */
 849        dc->vcpu_blocktime[i] += vcpu_blocktime;
 850    }
 851
 852    atomic_sub(&dc->smp_cpus_down, affected_cpu);
 853    if (vcpu_total_blocktime) {
 854        dc->total_blocktime += low_time_offset - atomic_fetch_add(
 855                &dc->last_begin, 0);
 856    }
 857    trace_mark_postcopy_blocktime_end(addr, dc, dc->total_blocktime,
 858                                      affected_cpu);
 859}
 860
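/*
 * Block the fault thread on postcopy_pause_sem_fault until the migration
 * is resumed; returns true once it has been woken up.
 */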
 861static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
 862{
 863    trace_postcopy_pause_fault_thread();
 864
 865    qemu_sem_wait(&mis->postcopy_pause_sem_fault);
 866
 867    trace_postcopy_pause_fault_thread_continued();
 868
 869    return true;
 870}
 871
 872/*
 873 * Handle faults detected by the USERFAULT markings
 874 */
 875static void *postcopy_ram_fault_thread(void *opaque)
 876{
 877    MigrationIncomingState *mis = opaque;
 878    struct uffd_msg msg;
 879    int ret;
 880    size_t index;
 881    RAMBlock *rb = NULL;
 882
 883    trace_postcopy_ram_fault_thread_entry();
 884    rcu_register_thread();
 885    mis->last_rb = NULL; /* last RAMBlock we sent part of */
 886    qemu_sem_post(&mis->fault_thread_sem);
 887
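    /*
     * Poll set layout: pfd[0] is our own userfault_fd, pfd[1] is the quit
     * eventfd, and any further slots are the userfault fds registered by
     * external sharers (e.g. vhost-user backends) via
     * postcopy_register_shared_ufd().
     */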
 888    struct pollfd *pfd;
 889    size_t pfd_len = 2 + mis->postcopy_remote_fds->len;
 890
 891    pfd = g_new0(struct pollfd, pfd_len);
 892
 893    pfd[0].fd = mis->userfault_fd;
 894    pfd[0].events = POLLIN;
 895    pfd[1].fd = mis->userfault_event_fd;
 896    pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
 897    trace_postcopy_ram_fault_thread_fds_core(pfd[0].fd, pfd[1].fd);
 898    for (index = 0; index < mis->postcopy_remote_fds->len; index++) {
 899        struct PostCopyFD *pcfd = &g_array_index(mis->postcopy_remote_fds,
 900                                                 struct PostCopyFD, index);
 901        pfd[2 + index].fd = pcfd->fd;
 902        pfd[2 + index].events = POLLIN;
 903        trace_postcopy_ram_fault_thread_fds_extra(2 + index, pcfd->idstr,
 904                                                  pcfd->fd);
 905    }
 906
 907    while (true) {
 908        ram_addr_t rb_offset;
 909        int poll_result;
 910
 911        /*
 912         * We're mainly waiting for the kernel to give us a faulting HVA;
 913         * however, we can be told to quit via userfault_event_fd, which
 914         * is an eventfd
 915         */
 916
 917        poll_result = poll(pfd, pfd_len, -1 /* Wait forever */);
 918        if (poll_result == -1) {
 919            error_report("%s: userfault poll: %s", __func__, strerror(errno));
 920            break;
 921        }
 922
 923        if (!mis->to_src_file) {
 924            /*
 925             * Possibly someone has told us via the event that the
 926             * return path is already broken. We should hold until
 927             * the channel is rebuilt.
 928             */
 929            if (postcopy_pause_fault_thread(mis)) {
 930                mis->last_rb = NULL;
 931                /* Continue to read the userfaultfd */
 932            } else {
 933                error_report("%s: paused but don't allow to continue",
 934                             __func__);
 935                break;
 936            }
 937        }
 938
 939        if (pfd[1].revents) {
 940            uint64_t tmp64 = 0;
 941
 942            /* Consume the signal */
 943            if (read(mis->userfault_event_fd, &tmp64, 8) != 8) {
 944                /* Nothing obviously nicer than posting this error. */
 945                error_report("%s: read() failed", __func__);
 946            }
 947
 948            if (atomic_read(&mis->fault_thread_quit)) {
 949                trace_postcopy_ram_fault_thread_quit();
 950                break;
 951            }
 952        }
 953
 954        if (pfd[0].revents) {
 955            poll_result--;
 956            ret = read(mis->userfault_fd, &msg, sizeof(msg));
 957            if (ret != sizeof(msg)) {
 958                if (errno == EAGAIN) {
 959                    /*
 960                     * if a wake up happens on the other thread just after
 961                     * the poll, there is nothing to read.
 962                     */
 963                    continue;
 964                }
 965                if (ret < 0) {
 966                    error_report("%s: Failed to read full userfault "
 967                                 "message: %s",
 968                                 __func__, strerror(errno));
 969                    break;
 970                } else {
 971                    error_report("%s: Read %d bytes from userfaultfd "
 972                                 "expected %zd",
 973                                 __func__, ret, sizeof(msg));
 974                    break; /* Lost alignment, don't know what we'd read next */
 975                }
 976            }
 977            if (msg.event != UFFD_EVENT_PAGEFAULT) {
 978                error_report("%s: Read unexpected event %ud from userfaultfd",
 979                             __func__, msg.event);
 980                continue; /* It's not a page fault, shouldn't happen */
 981            }
 982
 983            rb = qemu_ram_block_from_host(
 984                     (void *)(uintptr_t)msg.arg.pagefault.address,
 985                     true, &rb_offset);
 986            if (!rb) {
 987                error_report("postcopy_ram_fault_thread: Fault outside guest: %"
 988                             PRIx64, (uint64_t)msg.arg.pagefault.address);
 989                break;
 990            }
 991
 992            rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
 993            trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
 994                                                qemu_ram_get_idstr(rb),
 995                                                rb_offset,
 996                                                msg.arg.pagefault.feat.ptid);
 997            mark_postcopy_blocktime_begin(
 998                    (uintptr_t)(msg.arg.pagefault.address),
 999                                msg.arg.pagefault.feat.ptid, rb);
1000
1001retry:
1002            /*
1003             * Send the request to the source - we want to request one
1004             * of our host page sizes (which is >= TPS)
1005             */
1006            if (rb != mis->last_rb) {
1007                mis->last_rb = rb;
1008                ret = migrate_send_rp_req_pages(mis,
1009                                                qemu_ram_get_idstr(rb),
1010                                                rb_offset,
1011                                                qemu_ram_pagesize(rb));
1012            } else {
1013                /* Save some space */
1014                ret = migrate_send_rp_req_pages(mis,
1015                                                NULL,
1016                                                rb_offset,
1017                                                qemu_ram_pagesize(rb));
1018            }
1019
1020            if (ret) {
1021                /* May be network failure, try to wait for recovery */
1022                if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
1023                    /* We got reconnected somehow, try to continue */
1024                    mis->last_rb = NULL;
1025                    goto retry;
1026                } else {
1027                    /* This is an unavoidable fault */
1028                    error_report("%s: migrate_send_rp_req_pages() get %d",
1029                                 __func__, ret);
1030                    break;
1031                }
1032            }
1033        }
1034
1035        /* Now handle any requests from external processes on shared memory */
1036        /* TODO: May need to handle devices deregistering during postcopy */
1037        for (index = 2; index < pfd_len && poll_result; index++) {
1038            if (pfd[index].revents) {
1039                struct PostCopyFD *pcfd =
1040                    &g_array_index(mis->postcopy_remote_fds,
1041                                   struct PostCopyFD, index - 2);
1042
1043                poll_result--;
1044                if (pfd[index].revents & POLLERR) {
1045                    error_report("%s: POLLERR on poll %zd fd=%d",
1046                                 __func__, index, pcfd->fd);
1047                    pfd[index].events = 0;
1048                    continue;
1049                }
1050
1051                ret = read(pcfd->fd, &msg, sizeof(msg));
1052                if (ret != sizeof(msg)) {
1053                    if (errno == EAGAIN) {
1054                        /*
1055                         * if a wake up happens on the other thread just after
1056                         * the poll, there is nothing to read.
1057                         */
1058                        continue;
1059                    }
1060                    if (ret < 0) {
1061                        error_report("%s: Failed to read full userfault "
1062                                     "message: %s (shared) revents=%d",
1063                                     __func__, strerror(errno),
1064                                     pfd[index].revents);
1065                        /*TODO: Could just disable this sharer */
1066                        break;
1067                    } else {
1068                        error_report("%s: Read %d bytes from userfaultfd "
1069                                     "expected %zd (shared)",
1070                                     __func__, ret, sizeof(msg));
1071                        /*TODO: Could just disable this sharer */
1072                        break; /*Lost alignment,don't know what we'd read next*/
1073                    }
1074                }
1075                if (msg.event != UFFD_EVENT_PAGEFAULT) {
1076                    error_report("%s: Read unexpected event %ud "
1077                                 "from userfaultfd (shared)",
1078                                 __func__, msg.event);
1079                    continue; /* It's not a page fault, shouldn't happen */
1080                }
1081                /* Call the device handler registered with us */
1082                ret = pcfd->handler(pcfd, &msg);
1083                if (ret) {
1084                    error_report("%s: Failed to resolve shared fault on %zd/%s",
1085                                 __func__, index, pcfd->idstr);
1086                    /* TODO: Fail? Disable this sharer? */
1087                }
1088            }
1089        }
1090    }
1091    rcu_unregister_thread();
1092    trace_postcopy_ram_fault_thread_exit();
1093    g_free(pfd);
1094    return NULL;
1095}
1096
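/*
 * Set up the incoming side for postcopy: open the userfaultfd, redo the
 * API/feature handshake on it, start the fault thread, register every
 * RAMBlock with the fd, and allocate the temporary pages used when placing
 * incoming pages.  Returns 0 on success.
 */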
1097int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
1098{
1099    /* Open the fd for the kernel to give us userfaults */
1100    mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
1101    if (mis->userfault_fd == -1) {
1102        error_report("%s: Failed to open userfault fd: %s", __func__,
1103                     strerror(errno));
1104        return -1;
1105    }
1106
1107    /*
1108     * Although the host check already tested the API, we need to
1109     * do the check again as an ABI handshake on the new fd.
1110     */
1111    if (!ufd_check_and_apply(mis->userfault_fd, mis)) {
1112        return -1;
1113    }
1114
1115    /* Now an eventfd we use to tell the fault-thread to quit */
1116    mis->userfault_event_fd = eventfd(0, EFD_CLOEXEC);
1117    if (mis->userfault_event_fd == -1) {
1118        error_report("%s: Opening userfault_event_fd: %s", __func__,
1119                     strerror(errno));
1120        close(mis->userfault_fd);
1121        return -1;
1122    }
1123
1124    qemu_sem_init(&mis->fault_thread_sem, 0);
1125    qemu_thread_create(&mis->fault_thread, "postcopy/fault",
1126                       postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
1127    qemu_sem_wait(&mis->fault_thread_sem);
1128    qemu_sem_destroy(&mis->fault_thread_sem);
1129    mis->have_fault_thread = true;
1130
1131    /* Mark so that we get notified of accesses to unwritten areas */
1132    if (foreach_not_ignored_block(ram_block_enable_notify, mis)) {
1133        error_report("ram_block_enable_notify failed");
1134        return -1;
1135    }
1136
1137    mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
1138                                  PROT_READ | PROT_WRITE, MAP_PRIVATE |
1139                                  MAP_ANONYMOUS, -1, 0);
1140    if (mis->postcopy_tmp_page == MAP_FAILED) {
1141        mis->postcopy_tmp_page = NULL;
1142        error_report("%s: Failed to map postcopy_tmp_page %s",
1143                     __func__, strerror(errno));
1144        return -1;
1145    }
1146
1147    /*
1148     * Map large zero page when kernel can't use UFFDIO_ZEROPAGE for hugepages
1149     */
1150    mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
1151                                       PROT_READ | PROT_WRITE,
1152                                       MAP_PRIVATE | MAP_ANONYMOUS,
1153                                       -1, 0);
1154    if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
1155        int e = errno;
1156        mis->postcopy_tmp_zero_page = NULL;
1157        error_report("%s: Failed to map large zero page %s",
1158                     __func__, strerror(e));
1159        return -e;
1160    }
1161    memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
1162
1163    /*
1164     * Ballooning can mark pages as absent while we're postcopying;
1165     * that would cause false userfaults.
1166     */
1167    postcopy_balloon_inhibit(true);
1168
1169    trace_postcopy_ram_enable_notify();
1170
1171    return 0;
1172}
1173
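/*
 * Atomically place one page: UFFDIO_COPY from @from_addr, or
 * UFFDIO_ZEROPAGE when @from_addr is NULL.  On success the page is marked
 * as received and any blocktime accounting for it is ended.
 */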
1174static int qemu_ufd_copy_ioctl(int userfault_fd, void *host_addr,
1175                               void *from_addr, uint64_t pagesize, RAMBlock *rb)
1176{
1177    int ret;
1178    if (from_addr) {
1179        struct uffdio_copy copy_struct;
1180        copy_struct.dst = (uint64_t)(uintptr_t)host_addr;
1181        copy_struct.src = (uint64_t)(uintptr_t)from_addr;
1182        copy_struct.len = pagesize;
1183        copy_struct.mode = 0;
1184        ret = ioctl(userfault_fd, UFFDIO_COPY, &copy_struct);
1185    } else {
1186        struct uffdio_zeropage zero_struct;
1187        zero_struct.range.start = (uint64_t)(uintptr_t)host_addr;
1188        zero_struct.range.len = pagesize;
1189        zero_struct.mode = 0;
1190        ret = ioctl(userfault_fd, UFFDIO_ZEROPAGE, &zero_struct);
1191    }
1192    if (!ret) {
1193        ramblock_recv_bitmap_set_range(rb, host_addr,
1194                                       pagesize / qemu_target_page_size());
1195        mark_postcopy_blocktime_end((uintptr_t)host_addr);
1196
1197    }
1198    return ret;
1199}
1200
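/*
 * Notify every registered external sharer that the page at @offset in @rb
 * has been placed, by calling its waker callback.  Returns the first
 * non-zero waker return value, or 0 if they all succeed.
 */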
1201int postcopy_notify_shared_wake(RAMBlock *rb, uint64_t offset)
1202{
1203    int i;
1204    MigrationIncomingState *mis = migration_incoming_get_current();
1205    GArray *pcrfds = mis->postcopy_remote_fds;
1206
1207    for (i = 0; i < pcrfds->len; i++) {
1208        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1209        int ret = cur->waker(cur, rb, offset);
1210        if (ret) {
1211            return ret;
1212        }
1213    }
1214    return 0;
1215}
1216
1217/*
1218 * Place a host page (from) at (host) atomically
1219 * returns 0 on success
1220 */
1221int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
1222                        RAMBlock *rb)
1223{
1224    size_t pagesize = qemu_ram_pagesize(rb);
1225
1226    /* The copy also acks to the kernel, waking up the stalled thread.
1227     * TODO: We can inhibit that ack and only do it if it was requested
1228     * which would be slightly cheaper, but we'd have to be careful
1229     * of the order of updating our page state.
1230     */
1231    if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, from, pagesize, rb)) {
1232        int e = errno;
1233        error_report("%s: %s copy host: %p from: %p (size: %zd)",
1234                     __func__, strerror(e), host, from, pagesize);
1235
1236        return -e;
1237    }
1238
1239    trace_postcopy_place_page(host);
1240    return postcopy_notify_shared_wake(rb,
1241                                       qemu_ram_block_host_offset(rb, host));
1242}
1243
1244/*
1245 * Place a zero page at (host) atomically
1246 * returns 0 on success
1247 */
1248int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
1249                             RAMBlock *rb)
1250{
1251    size_t pagesize = qemu_ram_pagesize(rb);
1252    trace_postcopy_place_page_zero(host);
1253
1254    /* Normal RAMBlocks can zero a page using UFFDIO_ZEROPAGE
1255     * but it's not available for everything (e.g. hugetlbpages)
1256     */
1257    if (qemu_ram_is_uf_zeroable(rb)) {
1258        if (qemu_ufd_copy_ioctl(mis->userfault_fd, host, NULL, pagesize, rb)) {
1259            int e = errno;
1260            error_report("%s: %s zero host: %p",
1261                         __func__, strerror(e), host);
1262
1263            return -e;
1264        }
1265        return postcopy_notify_shared_wake(rb,
1266                                           qemu_ram_block_host_offset(rb,
1267                                                                      host));
1268    } else {
1269        return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page, rb);
1270    }
1271}
1272
1273#else
1274/* No target OS support, stubs just fail */
1275void fill_destination_postcopy_migration_info(MigrationInfo *info)
1276{
1277}
1278
1279bool postcopy_ram_supported_by_host(MigrationIncomingState *mis)
1280{
1281    error_report("%s: No OS support", __func__);
1282    return false;
1283}
1284
1285int postcopy_ram_incoming_init(MigrationIncomingState *mis)
1286{
1287    error_report("postcopy_ram_incoming_init: No OS support");
1288    return -1;
1289}
1290
1291int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
1292{
1293    assert(0);
1294    return -1;
1295}
1296
1297int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
1298{
1299    assert(0);
1300    return -1;
1301}
1302
1303int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
1304                                 uint64_t client_addr, uint64_t rb_offset)
1305{
1306    assert(0);
1307    return -1;
1308}
1309
1310int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
1311{
1312    assert(0);
1313    return -1;
1314}
1315
1316int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
1317                        RAMBlock *rb)
1318{
1319    assert(0);
1320    return -1;
1321}
1322
1323int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
1324                        RAMBlock *rb)
1325{
1326    assert(0);
1327    return -1;
1328}
1329
1330int postcopy_wake_shared(struct PostCopyFD *pcfd,
1331                         uint64_t client_addr,
1332                         RAMBlock *rb)
1333{
1334    assert(0);
1335    return -1;
1336}
1337#endif
1338
1339/* ------------------------------------------------------------------------- */
1340
1341void postcopy_fault_thread_notify(MigrationIncomingState *mis)
1342{
1343    uint64_t tmp64 = 1;
1344
1345    /*
1346     * Wakeup the fault_thread.  It's an eventfd that should currently
1347     * be at 0, we're going to increment it to 1
1348     */
1349    if (write(mis->userfault_event_fd, &tmp64, 8) != 8) {
1350        /* Not much we can do here, but may as well report it */
1351        error_report("%s: incrementing failed: %s", __func__,
1352                     strerror(errno));
1353    }
1354}
1355
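/*
 * The postcopy_discard_send_* helpers below batch discard ranges into
 * MAX_DISCARDS_PER_COMMAND-sized commands.  A typical caller walks the
 * migration bitmap per RAMBlock, roughly (a sketch, not a verbatim caller):
 *
 *     postcopy_discard_send_init(ms, block->idstr);
 *     for each run of pages to discard:
 *         postcopy_discard_send_range(ms, start, length);
 *     postcopy_discard_send_finish(ms);
 */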
1356static PostcopyDiscardState pds = {0};
1357
1358/**
1359 * postcopy_discard_send_init: Called at the start of each RAMBlock before
1360 *   asking to discard individual ranges.
1361 *
1362 * @ms: The current migration state.
1363 * @name: RAMBlock that discards will operate on.
1364 */
1365void postcopy_discard_send_init(MigrationState *ms, const char *name)
1366{
1367    pds.ramblock_name = name;
1368    pds.cur_entry = 0;
1369    pds.nsentwords = 0;
1370    pds.nsentcmds = 0;
1371}
1372
1373/**
1374 * postcopy_discard_send_range: Called by the bitmap code for each chunk to
1375 *   discard. May send a discard message, may just leave it queued to
1376 *   be sent later.
1377 *
1378 * @ms: Current migration state.
1379 * @start,@length: a range of pages in the migration bitmap in the
1380 *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
1381 */
1382void postcopy_discard_send_range(MigrationState *ms, unsigned long start,
1383                                 unsigned long length)
1384{
1385    size_t tp_size = qemu_target_page_size();
1386    /* Convert to byte offsets within the RAM block */
1387    pds.start_list[pds.cur_entry] = start  * tp_size;
1388    pds.length_list[pds.cur_entry] = length * tp_size;
1389    trace_postcopy_discard_send_range(pds.ramblock_name, start, length);
1390    pds.cur_entry++;
1391    pds.nsentwords++;
1392
1393    if (pds.cur_entry == MAX_DISCARDS_PER_COMMAND) {
1394        /* Full set, ship it! */
1395        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
1396                                              pds.ramblock_name,
1397                                              pds.cur_entry,
1398                                              pds.start_list,
1399                                              pds.length_list);
1400        pds.nsentcmds++;
1401        pds.cur_entry = 0;
1402    }
1403}
1404
1405/**
1406 * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
1407 * bitmap code. Sends any outstanding discard messages.
1408 *
1409 * @ms: Current migration state.
1410 */
1411void postcopy_discard_send_finish(MigrationState *ms)
1412{
1413    /* Anything unsent? */
1414    if (pds.cur_entry) {
1415        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
1416                                              pds.ramblock_name,
1417                                              pds.cur_entry,
1418                                              pds.start_list,
1419                                              pds.length_list);
1420        pds.nsentcmds++;
1421    }
1422
1423    trace_postcopy_discard_send_finish(pds.ramblock_name, pds.nsentwords,
1424                                       pds.nsentcmds);
1425}
1426
1427/*
1428 * Current state of incoming postcopy; note this is not part of
1429 * MigrationIncomingState since this state is used during cleanup
1430 * at the end as MIS is being freed.
1431 */
1432static PostcopyState incoming_postcopy_state;
1433
1434PostcopyState  postcopy_state_get(void)
1435{
1436    return atomic_mb_read(&incoming_postcopy_state);
1437}
1438
1439/* Set the state and return the old state */
1440PostcopyState postcopy_state_set(PostcopyState new_state)
1441{
1442    return atomic_xchg(&incoming_postcopy_state, new_state);
1443}
1444
1445/* Register a handler for external shared memory postcopy;
1446 * called on the destination.
1447 */
1448void postcopy_register_shared_ufd(struct PostCopyFD *pcfd)
1449{
1450    MigrationIncomingState *mis = migration_incoming_get_current();
1451
1452    mis->postcopy_remote_fds = g_array_append_val(mis->postcopy_remote_fds,
1453                                                  *pcfd);
1454}
1455
1456/* Unregister a handler for external shared memory postcopy
1457 */
1458void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
1459{
1460    guint i;
1461    MigrationIncomingState *mis = migration_incoming_get_current();
1462    GArray *pcrfds = mis->postcopy_remote_fds;
1463
1464    for (i = 0; i < pcrfds->len; i++) {
1465        struct PostCopyFD *cur = &g_array_index(pcrfds, struct PostCopyFD, i);
1466        if (cur->fd == pcfd->fd) {
1467            mis->postcopy_remote_fds = g_array_remove_index(pcrfds, i);
1468            return;
1469        }
1470    }
1471}
1472