qemu/migration/postcopy-ram.c
/*
 * Postcopy migration for RAM
 *
 * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
 *
 * Authors:
 *  Dave Gilbert  <dgilbert@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

/*
 * Postcopy is a migration technique where the execution flips from the
 * source to the destination before all the data has been copied.
 */
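
/*
 * Overview of the destination-side flow implemented in this file:
 *  - postcopy_ram_supported_by_host() checks the kernel's userfaultfd
 *    support before migration starts.
 *  - postcopy_ram_incoming_init() and postcopy_ram_prepare_discard()
 *    prepare guest RAM (discard it, disable THP) during the precopy phase.
 *  - postcopy_ram_enable_notify() registers all of RAM with userfaultfd
 *    and starts the fault thread.
 *  - The fault thread requests missing pages from the source, and
 *    postcopy_place_page()/postcopy_place_page_zero() install them
 *    atomically via UFFDIO_COPY/UFFDIO_ZEROPAGE.
 * The PostcopyDiscardState helpers at the end of the file are used on the
 * source side to batch up discard commands.
 */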

#include "qemu/osdep.h"

#include "qemu-common.h"
#include "migration/migration.h"
#include "migration/postcopy-ram.h"
#include "sysemu/sysemu.h"
#include "sysemu/balloon.h"
#include "qemu/error-report.h"
#include "trace.h"

/* Arbitrary limit on size of each discard command,
 * keeps them around ~200 bytes: each entry is a 64-bit start plus a 64-bit
 * length (16 bytes), so 12 entries give 192 bytes of range data per command.
 */
#define MAX_DISCARDS_PER_COMMAND 12

struct PostcopyDiscardState {
    const char *ramblock_name;
    uint64_t offset; /* Bitmap entry for the 1st bit of this RAMBlock */
    uint16_t cur_entry; /* Index of the next free slot in the lists below */
    /*
     * Start and length of a discard range (bytes)
     */
    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
    unsigned int nsentwords; /* Number of ranges added via _send_range */
    unsigned int nsentcmds;  /* Number of discard commands actually sent */
};

/* Postcopy needs to detect accesses to pages that haven't yet been copied
 * across, and efficiently map new pages in; the techniques for doing this
 * are target OS specific.
 */
#if defined(__linux__)

#include <poll.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <asm/types.h> /* for __u64 */
#endif

#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
#include <sys/eventfd.h>
#include <linux/userfaultfd.h>

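/*
 * Do the UFFD_API handshake on a userfaultfd and check that the ioctls
 * we rely on (register/unregister) are all supported.
 */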
static bool ufd_version_check(int ufd)
{
    struct uffdio_api api_struct;
    uint64_t ioctl_mask;

    api_struct.api = UFFD_API;
    api_struct.features = 0;
    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
        error_report("postcopy_ram_supported_by_host: UFFDIO_API failed: %s",
                     strerror(errno));
        return false;
    }

    ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
                 (__u64)1 << _UFFDIO_UNREGISTER;
    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
        error_report("Missing userfault features: %" PRIx64,
                     (uint64_t)(~api_struct.ioctls & ioctl_mask));
        return false;
    }

    return true;
}

/*
 * Check for things that postcopy won't support; returns 0 if the block
 * is fine.
 */
static int check_range(const char *block_name, void *host_addr,
                      ram_addr_t offset, ram_addr_t length, void *opaque)
{
    RAMBlock *rb = qemu_ram_block_by_name(block_name);

    if (qemu_ram_pagesize(rb) > getpagesize()) {
        error_report("Postcopy doesn't support large page sizes yet (%s)",
                     block_name);
        return -E2BIG;
    }

    return 0;
}

/*
 * Note: This has the side effect of munlock'ing all of RAM; that's
 * normally fine since, if the postcopy succeeds, mlock gets turned back
 * on at the end.
 */
bool postcopy_ram_supported_by_host(void)
{
    long pagesize = getpagesize();
    int ufd = -1;
    bool ret = false; /* Error unless we change it */
    void *testarea = NULL;
    struct uffdio_register reg_struct;
    struct uffdio_range range_struct;
    uint64_t feature_mask;

    if ((1ul << qemu_target_page_bits()) > pagesize) {
        error_report("Target page size bigger than host page size");
        goto out;
    }

    /* Check for anything about the RAMBlocks we don't support */
    if (qemu_ram_foreach_block(check_range, NULL)) {
        /* check_range will have printed its own error */
        goto out;
    }

    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
    if (ufd == -1) {
        error_report("%s: userfaultfd not available: %s", __func__,
                     strerror(errno));
        goto out;
    }

    /* Version and features check */
    if (!ufd_version_check(ufd)) {
        goto out;
    }

    /*
     * userfault and mlock don't go together; we'll put it back later if
     * it was enabled.
     */
    if (munlockall()) {
        error_report("%s: munlockall: %s", __func__, strerror(errno));
        goto out;
    }

    /*
     *  We need to check that the ops we need are supported on anon memory
     *  To do that we need to register a chunk and see the flags that
     *  are returned.
     */
    testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
                                    MAP_ANONYMOUS, -1, 0);
    if (testarea == MAP_FAILED) {
        error_report("%s: Failed to map test area: %s", __func__,
                     strerror(errno));
        goto out;
    }
    g_assert(((size_t)testarea & (pagesize-1)) == 0);

    reg_struct.range.start = (uintptr_t)testarea;
    reg_struct.range.len = pagesize;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        goto out;
    }

    range_struct.start = (uintptr_t)testarea;
    range_struct.len = pagesize;
    if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s userfault unregister: %s", __func__, strerror(errno));
        goto out;
    }

    feature_mask = (__u64)1 << _UFFDIO_WAKE |
                   (__u64)1 << _UFFDIO_COPY |
                   (__u64)1 << _UFFDIO_ZEROPAGE;
    if ((reg_struct.ioctls & feature_mask) != feature_mask) {
        error_report("Missing userfault map features: %" PRIx64,
                     (uint64_t)(~reg_struct.ioctls & feature_mask));
        goto out;
    }

    /* Success! */
    ret = true;
out:
    if (testarea) {
        munmap(testarea, pagesize);
    }
    if (ufd != -1) {
        close(ufd);
    }
    return ret;
}

/**
 * postcopy_ram_discard_range: Discard a range of memory.
 * We can assume that if we've been called then
 * postcopy_ram_supported_by_host returned true.
 *
 * @mis: Current incoming migration state.
 * @start, @length: range of memory to discard.
 *
 * returns: 0 on success.
 */
int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
                               size_t length)
{
    trace_postcopy_ram_discard_range(start, length);
    if (madvise(start, length, MADV_DONTNEED)) {
        error_report("%s MADV_DONTNEED: %s", __func__, strerror(errno));
        return -1;
    }

    return 0;
}

/*
 * Set up an area of RAM so that it *can* be used for postcopy later; this
 * must be done right at the start prior to pre-copy.
 * opaque should be the MIS.
 */
static int init_range(const char *block_name, void *host_addr,
                      ram_addr_t offset, ram_addr_t length, void *opaque)
{
    MigrationIncomingState *mis = opaque;

    trace_postcopy_init_range(block_name, host_addr, offset, length);

    /*
     * We need the whole of RAM to be truly empty for postcopy, so things
     * like ROMs and any data tables built during init must be zero'd
     * - we're going to get the copy from the source anyway.
     * (Precopy will just overwrite this data, so doesn't need the discard)
     */
    if (postcopy_ram_discard_range(mis, host_addr, length)) {
        return -1;
    }

    return 0;
}

/*
 * At the end of migration, undo the effects of init_range
 * opaque should be the MIS.
 */
static int cleanup_range(const char *block_name, void *host_addr,
                        ram_addr_t offset, ram_addr_t length, void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffdio_range range_struct;
    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);

    /*
     * We turned off hugepages for the precopy stage with postcopy enabled;
     * we can turn them back on now.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE);

    /*
     * We can also turn off userfault now since we should have all the
     * pages.   It can be useful to leave it on to debug postcopy
     * if you're not sure it's always getting every page.
     */
    range_struct.start = (uintptr_t)host_addr;
    range_struct.len = length;

    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
        error_report("%s: userfault unregister %s", __func__, strerror(errno));

        return -1;
    }

    return 0;
}

/*
 * Initialise postcopy-ram, setting the RAM to a state where we can go into
 * postcopy later; must be called prior to any precopy.
 * Called from arch_init's similarly named ram_postcopy_incoming_init.
 */
int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
{
    if (qemu_ram_foreach_block(init_range, mis)) {
        return -1;
    }

    return 0;
}

/*
 * At the end of a migration where postcopy_ram_incoming_init was called.
 */
int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    trace_postcopy_ram_incoming_cleanup_entry();

    if (mis->have_fault_thread) {
        uint64_t tmp64;

        if (qemu_ram_foreach_block(cleanup_range, mis)) {
            return -1;
        }
        /*
         * Tell the fault_thread to exit: userfault_quit_fd is an eventfd
         * that should currently be at 0; we're going to increment it to 1.
         */
        tmp64 = 1;
        if (write(mis->userfault_quit_fd, &tmp64, 8) == 8) {
            trace_postcopy_ram_incoming_cleanup_join();
            qemu_thread_join(&mis->fault_thread);
        } else {
            /* Not much we can do here, but may as well report it */
            error_report("%s: incrementing userfault_quit_fd: %s", __func__,
                         strerror(errno));
        }
        trace_postcopy_ram_incoming_cleanup_closeuf();
        close(mis->userfault_fd);
        close(mis->userfault_quit_fd);
        mis->have_fault_thread = false;
    }

    qemu_balloon_inhibit(false);

    if (enable_mlock) {
        if (os_mlock() < 0) {
            error_report("mlock: %s", strerror(errno));
            /*
             * It doesn't feel right to fail at this point, we have a valid
             * VM state.
             */
        }
    }

    postcopy_state_set(POSTCOPY_INCOMING_END);
    migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);

    if (mis->postcopy_tmp_page) {
        munmap(mis->postcopy_tmp_page, getpagesize());
        mis->postcopy_tmp_page = NULL;
    }
    trace_postcopy_ram_incoming_cleanup_exit();
    return 0;
}

/*
 * Disable huge pages on an area
 */
static int nhp_range(const char *block_name, void *host_addr,
                    ram_addr_t offset, ram_addr_t length, void *opaque)
{
    trace_postcopy_nhp_range(block_name, host_addr, offset, length);

    /*
     * Before we do discards we need to ensure those discards really
     * do delete areas of the page, even if THP thinks a hugepage would
     * be a good idea, so force hugepages off.
     */
    qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE);

    return 0;
}

/*
 * Userfault requires us to mark RAM as NOHUGEPAGE prior to discarding;
 * however, leaving it until after precopy means that most of the precopy
 * data is still THP'd.
 */
int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    if (qemu_ram_foreach_block(nhp_range, mis)) {
        return -1;
    }

    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);

    return 0;
}

/*
 * Mark the given area of RAM so that accesses to its not-yet-copied pages
 * raise userfault notifications.
 * Used as a callback on qemu_ram_foreach_block.
 *   host_addr: Base of area to mark
 *   offset: Offset in the whole ram arena
 *   length: Length of the section
 *   opaque: MigrationIncomingState pointer
 * Returns 0 on success
 */
static int ram_block_enable_notify(const char *block_name, void *host_addr,
                                   ram_addr_t offset, ram_addr_t length,
                                   void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffdio_register reg_struct;

    reg_struct.range.start = (uintptr_t)host_addr;
    reg_struct.range.len = length;
    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;

    /* Now tell our userfault_fd that it's responsible for this area */
    if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
        error_report("%s userfault register: %s", __func__, strerror(errno));
        return -1;
    }

    return 0;
}

/*
 * Handle faults detected by the USERFAULT markings
 */
static void *postcopy_ram_fault_thread(void *opaque)
{
    MigrationIncomingState *mis = opaque;
    struct uffd_msg msg;
    int ret;
    size_t hostpagesize = getpagesize();
    RAMBlock *rb = NULL;
    RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */

    trace_postcopy_ram_fault_thread_entry();
    qemu_sem_post(&mis->fault_thread_sem);

    while (true) {
        ram_addr_t rb_offset;
        struct pollfd pfd[2];

        /*
         * We're mainly waiting for the kernel to give us a faulting HVA;
         * however, we can be told to quit via userfault_quit_fd, which is
         * an eventfd.
         */
        pfd[0].fd = mis->userfault_fd;
        pfd[0].events = POLLIN;
        pfd[0].revents = 0;
        pfd[1].fd = mis->userfault_quit_fd;
        pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
        pfd[1].revents = 0;

        if (poll(pfd, 2, -1 /* Wait forever */) == -1) {
            error_report("%s: userfault poll: %s", __func__, strerror(errno));
            break;
        }

        if (pfd[1].revents) {
            trace_postcopy_ram_fault_thread_quit();
            break;
        }

        ret = read(mis->userfault_fd, &msg, sizeof(msg));
        if (ret != sizeof(msg)) {
            if (errno == EAGAIN) {
                /*
                 * if a wake up happens on the other thread just after
                 * the poll, there is nothing to read.
                 */
                continue;
            }
            if (ret < 0) {
                error_report("%s: Failed to read full userfault message: %s",
                             __func__, strerror(errno));
                break;
            } else {
                error_report("%s: Read %d bytes from userfaultfd expected %zd",
                             __func__, ret, sizeof(msg));
                break; /* Lost alignment, don't know what we'd read next */
            }
        }
        if (msg.event != UFFD_EVENT_PAGEFAULT) {
            error_report("%s: Read unexpected event %u from userfaultfd",
                         __func__, msg.event);
            continue; /* It's not a page fault, shouldn't happen */
        }

        rb = qemu_ram_block_from_host(
                 (void *)(uintptr_t)msg.arg.pagefault.address,
                 true, &rb_offset);
        if (!rb) {
            error_report("postcopy_ram_fault_thread: Fault outside guest: %"
                         PRIx64, (uint64_t)msg.arg.pagefault.address);
            break;
        }

        rb_offset &= ~(hostpagesize - 1);
        trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
                                                qemu_ram_get_idstr(rb),
                                                rb_offset);

        /*
         * Send the request to the source - we want to request one
         * of our host page sizes (which is >= TPS)
         */
        if (rb != last_rb) {
            last_rb = rb;
            migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
                                     rb_offset, hostpagesize);
        } else {
            /* Save some space */
            migrate_send_rp_req_pages(mis, NULL,
                                     rb_offset, hostpagesize);
        }
    }
    trace_postcopy_ram_fault_thread_exit();
    return NULL;
}

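/*
 * Open the userfaultfd, start the fault thread, and register all of RAM
 * with it so that missing pages are reported to us; returns 0 on success.
 */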
int postcopy_ram_enable_notify(MigrationIncomingState *mis)
{
    /* Open the fd for the kernel to give us userfaults */
    mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
    if (mis->userfault_fd == -1) {
        error_report("%s: Failed to open userfault fd: %s", __func__,
                     strerror(errno));
        return -1;
    }

    /*
     * Although the host check already tested the API, we need to
     * do the check again as an ABI handshake on the new fd.
     */
    if (!ufd_version_check(mis->userfault_fd)) {
        return -1;
    }

    /* Now an eventfd we use to tell the fault-thread to quit */
    mis->userfault_quit_fd = eventfd(0, EFD_CLOEXEC);
    if (mis->userfault_quit_fd == -1) {
        error_report("%s: Opening userfault_quit_fd: %s", __func__,
                     strerror(errno));
        close(mis->userfault_fd);
        return -1;
    }

    qemu_sem_init(&mis->fault_thread_sem, 0);
    qemu_thread_create(&mis->fault_thread, "postcopy/fault",
                       postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
    qemu_sem_wait(&mis->fault_thread_sem);
    qemu_sem_destroy(&mis->fault_thread_sem);
    mis->have_fault_thread = true;

    /* Mark so that we get notified of accesses to unwritten areas */
    if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) {
        return -1;
    }

    /*
     * Ballooning can mark pages as absent while we're postcopying;
     * that would cause false userfaults.
     */
    qemu_balloon_inhibit(true);

    trace_postcopy_ram_enable_notify();

    return 0;
}

/*
 * Place a host page (from) at (host) atomically
 * returns 0 on success
 */
int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from)
{
    struct uffdio_copy copy_struct;

    copy_struct.dst = (uint64_t)(uintptr_t)host;
    copy_struct.src = (uint64_t)(uintptr_t)from;
    copy_struct.len = getpagesize();
    copy_struct.mode = 0;

    /* copy also acks to the kernel waking the stalled thread up
     * TODO: We can inhibit that ack and only do it if it was requested
     * which would be slightly cheaper, but we'd have to be careful
     * of the order of updating our page state.
     */
    if (ioctl(mis->userfault_fd, UFFDIO_COPY, &copy_struct)) {
        int e = errno;
        error_report("%s: %s copy host: %p from: %p",
                     __func__, strerror(e), host, from);

        return -e;
    }

    trace_postcopy_place_page(host);
    return 0;
}

/*
 * Place a zero page at (host) atomically
 * returns 0 on success
 */
int postcopy_place_page_zero(MigrationIncomingState *mis, void *host)
{
    struct uffdio_zeropage zero_struct;

    zero_struct.range.start = (uint64_t)(uintptr_t)host;
    zero_struct.range.len = getpagesize();
    zero_struct.mode = 0;

    if (ioctl(mis->userfault_fd, UFFDIO_ZEROPAGE, &zero_struct)) {
        int e = errno;
        error_report("%s: %s zero host: %p",
                     __func__, strerror(e), host);

        return -e;
    }

    trace_postcopy_place_page_zero(host);
    return 0;
}

/*
 * Returns a target page of memory that can be mapped at a later point in time
 * using postcopy_place_page.
 * The same address is used repeatedly; postcopy_place_page just takes the
 * backing page away.
 * Returns: Pointer to allocated page
 *
 */
void *postcopy_get_tmp_page(MigrationIncomingState *mis)
{
    if (!mis->postcopy_tmp_page) {
        mis->postcopy_tmp_page = mmap(NULL, getpagesize(),
                             PROT_READ | PROT_WRITE, MAP_PRIVATE |
                             MAP_ANONYMOUS, -1, 0);
        if (mis->postcopy_tmp_page == MAP_FAILED) {
            mis->postcopy_tmp_page = NULL;
            error_report("%s: %s", __func__, strerror(errno));
            return NULL;
        }
    }

    return mis->postcopy_tmp_page;
}

#else
/* No target OS support, stubs just fail */
bool postcopy_ram_supported_by_host(void)
{
    error_report("%s: No OS support", __func__);
    return false;
}

int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
{
    error_report("postcopy_ram_incoming_init: No OS support");
    return -1;
}

int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
                               size_t length)
{
    assert(0);
    return -1;
}

int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_ram_enable_notify(MigrationIncomingState *mis)
{
    assert(0);
    return -1;
}

int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from)
{
    assert(0);
    return -1;
}

int postcopy_place_page_zero(MigrationIncomingState *mis, void *host)
{
    assert(0);
    return -1;
}

void *postcopy_get_tmp_page(MigrationIncomingState *mis)
{
    assert(0);
    return NULL;
}

#endif

/* ------------------------------------------------------------------------- */

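/*
 * The helpers below run on the source side; the expected sequence per
 * RAMBlock is postcopy_discard_send_init(), any number of calls to
 * postcopy_discard_send_range(), then postcopy_discard_send_finish().
 */
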
/**
 * postcopy_discard_send_init: Called at the start of each RAMBlock before
 *   asking to discard individual ranges.
 *
 * @ms: The current migration state.
 * @offset: the bitmap offset of the named RAMBlock in the migration
 *   bitmap.
 * @name: RAMBlock that discards will operate on.
 *
 * returns: a new PDS.
 */
PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
                                                 unsigned long offset,
                                                 const char *name)
{
    PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));

    if (res) {
        res->ramblock_name = name;
        res->offset = offset;
    }

    return res;
}

/**
 * postcopy_discard_send_range: Called by the bitmap code for each chunk to
 *   discard. May send a discard message, may just leave it queued to
 *   be sent later.
 *
 * @ms: Current migration state.
 * @pds: Structure initialised by postcopy_discard_send_init().
 * @start,@length: a range of pages in the migration bitmap in the
 *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
 */
void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
                                unsigned long start, unsigned long length)
{
    size_t tp_bits = qemu_target_page_bits();
    /* Convert to byte offsets within the RAM block */
    pds->start_list[pds->cur_entry] = (start - pds->offset) << tp_bits;
    pds->length_list[pds->cur_entry] = length << tp_bits;
    trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
    pds->cur_entry++;
    pds->nsentwords++;

    if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
        /* Full set, ship it! */
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds->ramblock_name,
                                              pds->cur_entry,
                                              pds->start_list,
                                              pds->length_list);
        pds->nsentcmds++;
        pds->cur_entry = 0;
    }
}

/**
 * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
 * bitmap code. Sends any outstanding discard messages, frees the PDS
 *
 * @ms: Current migration state.
 * @pds: Structure initialised by postcopy_discard_send_init().
 */
void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
{
    /* Anything unsent? */
    if (pds->cur_entry) {
        qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file,
                                              pds->ramblock_name,
                                              pds->cur_entry,
                                              pds->start_list,
                                              pds->length_list);
        pds->nsentcmds++;
    }

    trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords,
                                       pds->nsentcmds);

    g_free(pds);
}
