qemu/migration/migration.h
<<
>>
Prefs
   1/*
   2 * QEMU live migration
   3 *
   4 * Copyright IBM, Corp. 2008
   5 *
   6 * Authors:
   7 *  Anthony Liguori   <aliguori@us.ibm.com>
   8 *
   9 * This work is licensed under the terms of the GNU GPL, version 2.  See
  10 * the COPYING file in the top-level directory.
  11 *
  12 */
  13
  14#ifndef QEMU_MIGRATION_H
  15#define QEMU_MIGRATION_H
  16
  17#include "exec/cpu-common.h"
  18#include "hw/qdev-core.h"
  19#include "qapi/qapi-types-migration.h"
  20#include "qobject/json-writer.h"
  21#include "qemu/thread.h"
  22#include "qemu/coroutine.h"
  23#include "io/channel.h"
  24#include "io/channel-buffer.h"
  25#include "net/announce.h"
  26#include "qom/object.h"
  27#include "postcopy-ram.h"
  28#include "system/runstate.h"
  29#include "migration/misc.h"
  30
/*
 * Name strings for the migration-related threads, grouped by role
 * (generic, source side, destination side).  The two *_MULTIFD names
 * contain a "%d" conversion, so they are printf-style format strings that
 * take one integer argument (presumably the channel index -- confirm at
 * the call sites).
 */
   31#define  MIGRATION_THREAD_SNAPSHOT          "mig/snapshot"
   32#define  MIGRATION_THREAD_DIRTY_RATE        "mig/dirtyrate"
   33
   34#define  MIGRATION_THREAD_SRC_MAIN          "mig/src/main"
   35#define  MIGRATION_THREAD_SRC_MULTIFD       "mig/src/send_%d"
   36#define  MIGRATION_THREAD_SRC_RETURN        "mig/src/return"
   37#define  MIGRATION_THREAD_SRC_TLS           "mig/src/tls"
   38
   39#define  MIGRATION_THREAD_DST_COLO          "mig/dst/colo"
   40#define  MIGRATION_THREAD_DST_MULTIFD       "mig/dst/recv_%d"
   41#define  MIGRATION_THREAD_DST_FAULT         "mig/dst/fault"
   42#define  MIGRATION_THREAD_DST_LISTEN        "mig/dst/listen"
   43#define  MIGRATION_THREAD_DST_PREEMPT       "mig/dst/preempt"
  44
  45struct PostcopyBlocktimeContext;
  46typedef struct ThreadPool ThreadPool;
  47
  48#define  MIGRATION_RESUME_ACK_VALUE  (1)
  49
   50/*
   51 * 1<<6=64 pages -> 256K chunk when page size is 4K.  This gives us
   52 * the benefit that all the chunks are 64-page aligned, so the
   53 * bitmaps are always aligned to LONG.
   54 */
   55#define CLEAR_BITMAP_SHIFT_MIN             6
   56/*
   57 * 1<<18=256K pages -> 1G chunk when page size is 4K.  This is the
   58 * default value to use if none is specified.
   59 */
   60#define CLEAR_BITMAP_SHIFT_DEFAULT        18
   61/*
   62 * 1<<31=2G pages -> 8T chunk when page size is 4K.  This should be
   63 * big enough and make sure we won't overflow easily.
   64 */
   65#define CLEAR_BITMAP_SHIFT_MAX            31
  66
   67/* This is an abstraction of a "temp huge page" for postcopy's purposes */
   68typedef struct {
   69    /*
   70     * This points to a temporary huge page used as a buffer for UFFDIO_COPY.
   71     * It's mmap()ed and needs to be freed on cleanup.
   72     */
   73    void *tmp_huge_page;
   74    /*
   75     * This points to the host page we're going to install for this temp page.
   76     * It tells us where to put the page once we've received all of it.
   77     */
   78    void *host_addr;
   79    /* Number of small pages copied (in units of TARGET_PAGE_SIZE) */
   80    unsigned int target_pages;
   81    /* Whether this page contains all zeros */
   82    bool all_zero;
   83} PostcopyTmpPage;
  84
/*
 * Status of the postcopy preempt thread, stored in
 * MigrationIncomingState.preempt_thread_status.  NONE is explicitly zero,
 * so it is the value found in zero-initialized state.
 * NOTE(review): CREATED/QUIT presumably mean "thread has been started" and
 * "thread has been asked to exit" -- confirm against the users of the field.
 */
   85typedef enum {
   86    PREEMPT_THREAD_NONE = 0,
   87    PREEMPT_THREAD_CREATED,
   88    PREEMPT_THREAD_QUIT,
   89} PreemptThreadStatus;
  90
   91/* State for the incoming migration */
   92struct MigrationIncomingState {
   93    QEMUFile *from_src_file;
   94    /* Previously received RAM's RAMBlock pointer */
   95    RAMBlock *last_recv_block[RAM_CHANNEL_MAX];
   96    /* A hook to allow cleanup at the end of incoming migration */
   97    void *transport_data;
   98    void (*transport_cleanup)(void *data);
   99    /*
  100     * Used to sync thread creations.  Note that we can't create threads in
  101     * parallel with this event.
  102     */
  103    QemuEvent  thread_sync_event;
  104    /*
  105     * Free at the start of the main state load, set as the main thread finishes
  106     * loading state.
  107     */
  108    QemuEvent main_thread_load_event;
  109
  110    /* For network announces */
  111    AnnounceTimer  announce_timer;
  112
  113    size_t         largest_page_size;
  114    bool           have_fault_thread;
  115    QemuThread     fault_thread;
  116    /* Set this when we want the fault thread to quit */
  117    bool           fault_thread_quit;
  118
  119    bool           have_listen_thread;
  120    QemuThread     listen_thread;
  121
  122    /* For the kernel to send us notifications */
  123    int       userfault_fd;
  124    /* Used to wake the fault_thread, e.g., when it needs to quit */
  125    int       userfault_event_fd;
  126    QEMUFile *to_src_file;
  127    QemuMutex rp_mutex;    /* We send replies from multiple threads */
  128    /* RAMBlock of last request sent to source */
  129    RAMBlock *last_rb;
  130    /*
  131     * Number of postcopy channels including the default precopy channel, so
  132     * vanilla postcopy will only contain one channel which contains both
  133     * precopy and postcopy streams.
  134     *
  135     * This is calculated when the src requests to enable postcopy but before
  136     * it starts.  Its value can depend on e.g. whether postcopy preemption is
  137     * enabled.
  138     */
  139    unsigned int postcopy_channels;
  140    /* QEMUFile for postcopy only; it'll be handled by a separate thread */
  141    QEMUFile *postcopy_qemufile_dst;
  142    /*
  143     * When postcopy_qemufile_dst is properly set up, this sem is posted.
  144     * One can wait on this semaphore to wait until the preempt channel is
  145     * properly set up.
  146     */
  147    QemuSemaphore postcopy_qemufile_dst_done;
  148    /* Postcopy priority thread is used to receive postcopy requested pages */
  149    QemuThread postcopy_prio_thread;
  150    /*
  151     * Always set by the main vm load thread only, but can be read by the
  152     * postcopy preempt thread.  NOTE(review): "volatile" only forces real
  153     * loads/stores; it gives no cross-thread atomicity or ordering.
  154     */
  155    volatile PreemptThreadStatus preempt_thread_status;
  156    /*
  157     * Used to sync between the ram load main thread and the fast ram load
  158     * thread.  It protects postcopy_qemufile_dst, which is the postcopy
  159     * fast channel.
  160     *
  161     * The ram fast load thread will hold it for most of its lifecycle
  162     * because it needs to continuously read data from the channel, and
  163     * it'll only release this mutex if postcopy is interrupted, so that
  164     * the ram load main thread can take this mutex over and properly
  165     * release the broken channel.
  166     */
  167    QemuMutex postcopy_prio_thread_mutex;
  168    /*
  169     * An array of temp host huge pages to be used, one for each postcopy
  170     * channel.
  171     */
  172    PostcopyTmpPage *postcopy_tmp_pages;
  173    /* This is shared for all postcopy channels */
  174    void     *postcopy_tmp_zero_page;
  175    /* PostCopyFD's for external userfaultfds & handlers of shared memory */
  176    GArray   *postcopy_remote_fds;
  177
  178    MigrationStatus state;
  179
  180    /*
  181     * The incoming migration coroutine, non-NULL during qemu_loadvm_state().
  182     * Used to wake the migration incoming coroutine from rdma code.  Whether
  183     * that is actually safe is an open question.
  184     */
  185    Coroutine *loadvm_co;
  186
  187    /* The coroutine we should enter (back) after failover */
  188    Coroutine *colo_incoming_co;
  189    QemuEvent colo_incoming_event;
  190
  191    /* Optional load threads pool and its thread exit request flag */
  192    ThreadPool *load_threads;
  193    bool load_threads_abort;
  194
  195    /*
  196     * PostcopyBlocktimeContext to keep information for postcopy
  197     * live migration, to calculate vCPU block time
  198     */
  199    struct PostcopyBlocktimeContext *blocktime_ctx;
  200
  201    /* Notify PAUSED postcopy incoming migrations to try to continue */
  202    QemuSemaphore postcopy_pause_sem_dst;
  203    QemuSemaphore postcopy_pause_sem_fault;
  204    /*
  205     * This semaphore is used to allow the ram fast load thread (only when
  206     * postcopy preempt is enabled) to go to sleep when a network
  207     * interruption is detected.  When the recovery is done, the main load
  208     * thread will kick the fast ram load thread using this semaphore.
  209     */
  210    QemuSemaphore postcopy_pause_sem_fast_load;
  211
  212    /* List of listening socket addresses  */
  213    SocketAddressList *socket_address_list;
  214
  215    /* A tree of pages that we requested from the source VM */
  216    GTree *page_requested;
  217    /*
  218     * For postcopy only, count the number of requested page faults that
  219     * still haven't been resolved.
  220     */
  221    int page_requested_count;
  222    /*
  223     * The mutex helps to maintain the requested pages that we sent to the
  224     * source, IOW, to guarantee coherence between the page_requested tree and
  225     * the per-ramblock receivedmap.  Note! This does not guarantee consistency
  226     * of the real page copy procedures (using UFFDIO_[ZERO]COPY).  E.g., even
  227     * if one bit in receivedmap is cleared, UFFDIO_COPY could have happened
  228     * for that page already.  This is intentional, so that the mutex is not
  229     * serialized on or blocked by slow operations like UFFDIO_* ioctls.  However
  230     * this should be enough to make sure the page_requested tree always
  231     * contains valid information.
  232     */
  233    QemuMutex page_request_mutex;
  234    /*
  235     * If postcopy preempt is enabled, there is a chance that the main
  236     * thread finished loading its data before the preempt channel has
  237     * finished loading the urgent pages.  If that happens, the two threads
  238     * will use this condvar to synchronize, so the main thread will always
  239     * wait until all pages are received.
  240     */
  241    QemuCond page_request_cond;
  242
  243    /*
  244     * Number of devices that have yet to approve switchover. When this reaches
  245     * zero an ACK that it's OK to do switchover is sent to the source. No lock
  246     * is needed as this field is updated serially.
  247     */
  248    unsigned int switchover_ack_pending_num;
  249
  250    /* Do exit on incoming migration failure */
  251    bool exit_on_error;
  252};
 253
 254MigrationIncomingState *migration_incoming_get_current(void);
 255void migration_incoming_state_destroy(void);
 256void migration_incoming_transport_cleanup(MigrationIncomingState *mis);
 257/*
 258 * Functions to work with blocktime context
 259 */
 260void fill_destination_postcopy_migration_info(MigrationInfo *info);
 261
/*
 * QOM type name of the migration object, and the checker helpers declared
 * under the MIGRATION_OBJ name for MigrationState / MigrationClass.
 * MigrationClass adds no members of its own to its DeviceClass parent.
 */
  262#define TYPE_MIGRATION "migration"
  263
  264typedef struct MigrationClass MigrationClass;
  265DECLARE_OBJ_CHECKERS(MigrationState, MigrationClass,
  266                     MIGRATION_OBJ, TYPE_MIGRATION)
  267
  268struct MigrationClass {
  269    /*< private >*/
  270    DeviceClass parent_class;
  271};
 272
/*
 * Instance state of the TYPE_MIGRATION object (its parent is DeviceState):
 * migration parameters, capabilities, status and runtime bookkeeping.
 */
  273struct MigrationState {
  274    /*< private >*/
  275    DeviceState parent_obj;
  276
  277    /*< public >*/
  278    QemuThread thread;
  279    /* Protected by qemu_file_lock */
  280    QEMUFile *to_dst_file;
  281    /* Postcopy specific transfer channel */
  282    QEMUFile *postcopy_qemufile_src;
  283    /*
  284     * It is posted when the preempt channel is established.  Note: this is
  285     * used for both the start and the recovery of a postcopy migration.  We'll
  286     * post to this sem every time a new preempt channel is created in the
  287     * main thread, and we keep post() and wait() in pairs.
  288     */
  289    QemuSemaphore postcopy_qemufile_src_sem;
  290    QIOChannelBuffer *bioc;
  291    /*
  292     * Protects to_dst_file/from_dst_file pointers.  We need to make sure we
  293     * won't yield or hang during the critical section, since this lock will be
  294     * used in OOB command handler.
  295     */
  296    QemuMutex qemu_file_lock;
  297
  298    /*
  299     * Used to allow urgent requests to override rate limiting.
  300     */
  301    QemuSemaphore rate_limit_sem;
  302
  303    /* pages already sent at the beginning of current iteration */
  304    uint64_t iteration_initial_pages;
  305
  306    /* pages transferred per second */
  307    double pages_per_second;
  308
  309    /* bytes already sent at the beginning of current iteration */
  310    uint64_t iteration_initial_bytes;
  311    /* time at the start of current iteration */
  312    int64_t iteration_start_time;
  313    /*
  314     * The final stage happens when the remaining data is smaller than
  315     * this threshold; it's calculated from the requested downtime and
  316     * measured bandwidth, or avail-switchover-bandwidth if specified.
  317     */
  318    uint64_t threshold_size;
  319
  320    /* params from 'migrate-set-parameters' */
  321    MigrationParameters parameters;
  322
  323    MigrationStatus state;
  324
  325    /* State related to return path */
  326    struct {
  327        /* Protected by qemu_file_lock */
  328        QEMUFile     *from_dst_file;
  329        QemuThread    rp_thread;
  330        /*
  331         * We can also check non-zero of rp_thread, but there's no "official"
  332         * way to do this, so this bool makes it slightly more elegant.
  333         * Checking from_dst_file for this is racy because from_dst_file will
  334         * be cleared in the rp_thread!
  335         */
  336        bool          rp_thread_created;
  337        /*
  338         * Used to synchronize between migration main thread and return
  339         * path thread.  The migration thread can wait() on this sem, while
  340         * other threads (e.g., return path thread) can kick it using a
  341         * post().
  342         */
  343        QemuSemaphore rp_sem;
  344        /*
  345         * We post to this when we get a PONG from dest. So far it's an
  346         * easy way to know the main channel has been successfully established
  347         * on dest QEMU.
  348         */
  349        QemuSemaphore rp_pong_acks;
  350    } rp_state;
  351
  352    double mbps;
  353    /* Timestamp when the most recent migration started (ms) */
  354    int64_t start_time;
  355    /* Total time used by latest migration (ms) */
  356    int64_t total_time;
  357    /* Timestamp when VM is down (ms) to migrate the remaining state */
  358    int64_t downtime_start;
  359    int64_t downtime;
  360    int64_t expected_downtime;
  361    bool capabilities[MIGRATION_CAPABILITY__MAX];
  362    int64_t setup_time;
  363
  364    /*
  365     * State before stopping the vm by vm_stop_force_state().
  366     * If migration is interrupted for any reason, we need to continue
  367     * running the guest on source if it was running or restore its stopped
  368     * state.
  369     */
  370    RunState vm_old_state;
  371
  372    /* Flag set once the migration has been asked to enter postcopy */
  373    bool start_postcopy;
  374
  375    /* Flag set once the migration thread is running (and needs joining) */
  376    bool migration_thread_running;
  377
  378    /* Migration is waiting for guest to unplug device */
  379    QemuSemaphore wait_unplug_sem;
  380
  381    /* Migration is paused due to pause-before-switchover */
  382    QemuEvent pause_event;
  383
  384    /* The event is used to notify COLO thread that failover is finished */
  385    QemuEvent colo_exit_event;
  386
  387    /* The event is used to notify COLO thread to do checkpoint */
  388    QemuEvent colo_checkpoint_event;
  389    int64_t colo_checkpoint_time;
  390    QEMUTimer *colo_delay_timer;
  391
  392    /* The first error that has occurred.
  393       We use the mutex to be able to return the first error message */
  394    Error *error;
  395    /* mutex to protect error */
  396    QemuMutex error_mutex;
  397
  398    /*
  399     * Global switch on whether we need to store the global state
  400     * during migration.
  401     */
  402    bool store_global_state;
  403
  404    /* Whether we send QEMU_VM_CONFIGURATION during migration */
  405    bool send_configuration;
  406    /* Whether we send section footer during migration */
  407    bool send_section_footer;
  408    /* Whether we send switchover start notification during migration */
  409    bool send_switchover_start;
  410
  411    /* Needed by postcopy-pause state */
  412    QemuSemaphore postcopy_pause_sem;
  413    /*
  414     * This variable only affects behavior when postcopy preempt mode is
  415     * enabled.
  416     *
  417     * When set:
  418     *
  419     * - postcopy preempt src QEMU instance will generate an EOS message at
  420     *   the end of migration to shut the preempt channel on dest side.
  421     *
  422     * - postcopy preempt channel will be created at the setup phase on src
  423     *   QEMU.
  424     *
  425     * When clear:
  426     *
  427     * - postcopy preempt src QEMU instance will _not_ generate an EOS
  428     *   message at the end of migration, the dest qemu will shutdown the
  429     *   channel itself.
  430     *
  431     * - postcopy preempt channel will be created at the switching phase
  432     *   from precopy -> postcopy (to avoid race condition of misordered
  433     *   creation of channels).
  434     *
  435     * NOTE: See message-id <ZBoShWArKDPpX/D7@work-vm> on qemu-devel
  436     * mailing list for more information on the possible race.  Everyone
  437     * should probably just keep this value untouched after set by the
  438     * machine type (or the default).
  439     */
  440    bool preempt_pre_7_2;
  441
  442    /*
  443     * Flush every channel after each section is sent.
  444     *
  445     * This ensures that we can't mix pages from one iteration through
  446     * ram pages with pages for the following iteration.  We really
  447     * only need to do this flush after we have gone through all the
  448     * dirty pages.  For historical reasons, we do that after each
  449     * section.  This is suboptimal (we flush too many times).
  450     * Default value is false. (since 8.1)
  451     */
  452    bool multifd_flush_after_each_section;
  453
  454    /*
  455     * This variable only makes sense when set on the machine that is
  456     * the destination of a multifd migration with TLS enabled. It
  457     * affects the behavior of the last send->recv iteration with
  458     * regards to termination of the TLS session.
  459     *
  460     * When set:
  461     *
  462     * - the destination QEMU instance can expect to never get a
  463     *   GNUTLS_E_PREMATURE_TERMINATION error. Manifested as the error
  464     *   message: "The TLS connection was non-properly terminated".
  465     *
  466     * When clear:
  467     *
  468     * - the destination QEMU instance can expect to see a
  469     *   GNUTLS_E_PREMATURE_TERMINATION error in any multifd channel
  470     *   whenever the last recv() call of that channel happens after
  471     *   the source QEMU instance has already issued shutdown() on the
  472     *   channel.
  473     *
  474     *   Commit 637280aeb2 (since 9.1) introduced a side effect that
  475     *   causes the destination instance to not be affected by the
  476     *   premature termination, while commit 1d457daf86 (since 10.0)
  477     *   causes the premature termination condition to be once again
  478     *   reachable.
  479     *
  480     * NOTE: Regardless of the state of this option, a premature
  481     * termination of the TLS connection might happen due to error at
  482     * any moment prior to the last send->recv iteration.
  483     */
  484    bool multifd_clean_tls_termination;
  485
  486    /*
  487     * This decides the size of guest memory chunk that will be used
  488     * to track dirty bitmap clearing.  The size of memory chunk will
  489     * be GUEST_PAGE_SIZE << N.  Say, N=0 means we will clear dirty
  490     * bitmap for each page to send (1<<0=1); N=10 means we will clear
  491     * dirty bitmap only once for 1<<10=1K contiguous guest pages
  492     * (which is a 4M chunk when page size is 4K).
  493     */
  494    uint8_t clear_bitmap_shift;
  495
  496    /*
  497     * This saves the hostname when outgoing migration starts
  498     */
  499    char *hostname;
  500
  501    /* QEMU_VM_VMDESCRIPTION content filled for all non-iterable devices. */
  502    JSONWriter *vmdesc;
  503
  504    /*
  505     * Indicates whether an ACK from the destination that it's OK to do
  506     * switchover has been received.
  507     */
  508    bool switchover_acked;
  509    /* Is this an RDMA migration */
  510    bool rdma_migration;
  511
  512    GSource *hup_source;
  513};
 514
 515void migrate_set_state(MigrationStatus *state, MigrationStatus old_state,
 516                       MigrationStatus new_state);
 517
 518void migration_fd_process_incoming(QEMUFile *f);
 519void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp);
 520void migration_incoming_process(void);
 521
 522bool  migration_has_all_channels(void);
 523
 524void migrate_set_error(MigrationState *s, const Error *error);
 525bool migrate_has_error(MigrationState *s);
 526
 527void migration_connect(MigrationState *s, Error *error_in);
 528
 529int migration_call_notifiers(MigrationState *s, MigrationEventType type,
 530                             Error **errp);
 531
 532int migrate_init(MigrationState *s, Error **errp);
 533bool migration_is_blocked(Error **errp);
 534/* True if outgoing migration has entered postcopy phase */
 535bool migration_in_postcopy(void);
 536bool migration_postcopy_is_alive(MigrationStatus state);
 537MigrationState *migrate_get_current(void);
 538bool migration_has_failed(MigrationState *);
 539bool migrate_mode_is_cpr(MigrationState *);
 540
 541uint64_t ram_get_total_transferred_pages(void);
 542
 543/* Sending on the return path - generic and then for each message type */
 544void migrate_send_rp_shut(MigrationIncomingState *mis,
 545                          uint32_t value);
 546void migrate_send_rp_pong(MigrationIncomingState *mis,
 547                          uint32_t value);
 548int migrate_send_rp_req_pages(MigrationIncomingState *mis, RAMBlock *rb,
 549                              ram_addr_t start, uint64_t haddr, uint32_t tid);
 550int migrate_send_rp_message_req_pages(MigrationIncomingState *mis,
 551                                      RAMBlock *rb, ram_addr_t start);
 552void migrate_send_rp_recv_bitmap(MigrationIncomingState *mis,
 553                                 char *block_name);
 554void migrate_send_rp_resume_ack(MigrationIncomingState *mis, uint32_t value);
 555int migrate_send_rp_switchover_ack(MigrationIncomingState *mis);
 556
 557void dirty_bitmap_mig_before_vm_start(void);
 558void dirty_bitmap_mig_cancel_outgoing(void);
 559void dirty_bitmap_mig_cancel_incoming(void);
 560bool check_dirty_bitmap_mig_alias_map(const BitmapMigrationNodeAliasList *bbm,
 561                                      Error **errp);
 562
 563void migrate_add_address(SocketAddress *address);
 564int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque);
 565
/*
 * Poison qemu_ram_foreach_block for any code that includes this header.
 * In an object-like macro '#' is an ordinary token, so the name expands to
 * the tokens of a "#warning ..." line; that is not valid where the name is
 * used, so the build breaks and the expansion text itself points at
 * foreach_not_ignored_block() as the replacement.
 */
  566#define qemu_ram_foreach_block \
  567  #warning "Use foreach_not_ignored_block in migration code"
 568
 569void migration_make_urgent_request(void);
 570void migration_consume_urgent_request(void);
 571bool migration_rate_limit(void);
 572void migration_bh_schedule(QEMUBHFunc *cb, void *opaque);
 573void migration_cancel(void);
 574
 575void migration_populate_vfio_info(MigrationInfo *info);
 576void migration_reset_vfio_bytes_transferred(void);
 577void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page);
 578
 579/*
 580 * Migration thread waiting for return path thread.  Return non-zero if an
 581 * error is detected.
 582 */
 583int migration_rp_wait(MigrationState *s);
 584/*
 585 * Kick the migration thread waiting for return path messages.  NOTE: the
 586 * name can be slightly confusing (when read as "kick the rp thread"), just
 587 * to remember the target is always the migration thread.
 588 */
 589void migration_rp_kick(MigrationState *s);
 590
 591void migration_bitmap_sync_precopy(bool last_stage);
 592
 593/* migration/block-dirty-bitmap.c */
 594void dirty_bitmap_mig_init(void);
 595bool should_send_vmdesc(void);
 596
 597#endif
 598