qemu/migration.c
<<
>>
Prefs
   1/*
   2 * QEMU live migration
   3 *
   4 * Copyright IBM, Corp. 2008
   5 *
   6 * Authors:
   7 *  Anthony Liguori   <aliguori@us.ibm.com>
   8 *
   9 * This work is licensed under the terms of the GNU GPL, version 2.  See
  10 * the COPYING file in the top-level directory.
  11 *
  12 * Contributions after 2012-01-13 are licensed under the terms of the
  13 * GNU GPL, version 2 or (at your option) any later version.
  14 */
  15
  16#include "qemu-common.h"
  17#include "qemu/main-loop.h"
  18#include "migration/migration.h"
  19#include "monitor/monitor.h"
  20#include "migration/qemu-file.h"
  21#include "sysemu/sysemu.h"
  22#include "block/block.h"
  23#include "qemu/sockets.h"
  24#include "migration/block.h"
  25#include "qemu/thread.h"
  26#include "qmp-commands.h"
  27#include "trace.h"
  28
  29//#define DEBUG_MIGRATION
  30
  31#ifdef DEBUG_MIGRATION
  32#define DPRINTF(fmt, ...) \
  33    do { printf("migration: " fmt, ## __VA_ARGS__); } while (0)
  34#else
  35#define DPRINTF(fmt, ...) \
  36    do { } while (0)
  37#endif
  38
  39enum {
  40    MIG_STATE_ERROR = -1,
  41    MIG_STATE_NONE,
  42    MIG_STATE_SETUP,
  43    MIG_STATE_CANCELLED,
  44    MIG_STATE_ACTIVE,
  45    MIG_STATE_COMPLETED,
  46};
  47
  48#define MAX_THROTTLE  (32 << 20)      /* Migration speed throttling */
  49
  50/* Amount of time to allocate to each "chunk" of bandwidth-throttled
  51 * data. */
  52#define BUFFER_DELAY     100
  53#define XFER_LIMIT_RATIO (1000 / BUFFER_DELAY)
  54
  55/* Migration XBZRLE default cache size */
  56#define DEFAULT_MIGRATE_CACHE_SIZE (64 * 1024 * 1024)
  57
/* Notifiers invoked with the MigrationState whenever this file reports a
 * migration state change (see the notifier_list_notify() callers below). */
static NotifierList migration_state_notifiers =
    NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);
  60
  61/* When we add fault tolerance, we could have several
  62   migrations at once.  For now we don't need to add
  63   dynamic creation of migration */
  64
  65MigrationState *migrate_get_current(void)
  66{
  67    static MigrationState current_migration = {
  68        .state = MIG_STATE_NONE,
  69        .bandwidth_limit = MAX_THROTTLE,
  70        .xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE,
  71        .mbps = -1,
  72    };
  73
  74    return &current_migration;
  75}
  76
  77void qemu_start_incoming_migration(const char *uri, Error **errp)
  78{
  79    const char *p;
  80
  81    if (strstart(uri, "tcp:", &p))
  82        tcp_start_incoming_migration(p, errp);
  83#ifdef CONFIG_RDMA
  84    else if (strstart(uri, "x-rdma:", &p))
  85        rdma_start_incoming_migration(p, errp);
  86#endif
  87#if !defined(WIN32)
  88    else if (strstart(uri, "exec:", &p))
  89        exec_start_incoming_migration(p, errp);
  90    else if (strstart(uri, "unix:", &p))
  91        unix_start_incoming_migration(p, errp);
  92    else if (strstart(uri, "fd:", &p))
  93        fd_start_incoming_migration(p, errp);
  94#endif
  95    else {
  96        error_setg(errp, "unknown migration protocol: %s", uri);
  97    }
  98}
  99
 100static void process_incoming_migration_co(void *opaque)
 101{
 102    QEMUFile *f = opaque;
 103    int ret;
 104
 105    ret = qemu_loadvm_state(f);
 106    qemu_fclose(f);
 107    if (ret < 0) {
 108        fprintf(stderr, "load of migration failed\n");
 109        exit(EXIT_FAILURE);
 110    }
 111    qemu_announce_self();
 112    DPRINTF("successfully loaded vm state\n");
 113
 114    bdrv_clear_incoming_migration_all();
 115    /* Make sure all file formats flush their mutable metadata */
 116    bdrv_invalidate_cache_all();
 117
 118    if (autostart) {
 119        vm_start();
 120    } else {
 121        runstate_set(RUN_STATE_PAUSED);
 122    }
 123}
 124
 125void process_incoming_migration(QEMUFile *f)
 126{
 127    Coroutine *co = qemu_coroutine_create(process_incoming_migration_co);
 128    int fd = qemu_get_fd(f);
 129
 130    assert(fd != -1);
 131    qemu_set_nonblock(fd);
 132    qemu_coroutine_enter(co, f);
 133}
 134
 135/* amount of nanoseconds we are willing to wait for migration to be down.
 136 * the choice of nanoseconds is because it is the maximum resolution that
 137 * get_clock() can achieve. It is an internal measure. All user-visible
 138 * units must be in seconds */
 139static uint64_t max_downtime = 30000000;
 140
 141uint64_t migrate_max_downtime(void)
 142{
 143    return max_downtime;
 144}
 145
 146MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp)
 147{
 148    MigrationCapabilityStatusList *head = NULL;
 149    MigrationCapabilityStatusList *caps;
 150    MigrationState *s = migrate_get_current();
 151    int i;
 152
 153    caps = NULL; /* silence compiler warning */
 154    for (i = 0; i < MIGRATION_CAPABILITY_MAX; i++) {
 155        if (head == NULL) {
 156            head = g_malloc0(sizeof(*caps));
 157            caps = head;
 158        } else {
 159            caps->next = g_malloc0(sizeof(*caps));
 160            caps = caps->next;
 161        }
 162        caps->value =
 163            g_malloc(sizeof(*caps->value));
 164        caps->value->capability = i;
 165        caps->value->state = s->enabled_capabilities[i];
 166    }
 167
 168    return head;
 169}
 170
 171static void get_xbzrle_cache_stats(MigrationInfo *info)
 172{
 173    if (migrate_use_xbzrle()) {
 174        info->has_xbzrle_cache = true;
 175        info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache));
 176        info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size();
 177        info->xbzrle_cache->bytes = xbzrle_mig_bytes_transferred();
 178        info->xbzrle_cache->pages = xbzrle_mig_pages_transferred();
 179        info->xbzrle_cache->cache_miss = xbzrle_mig_pages_cache_miss();
 180        info->xbzrle_cache->overflow = xbzrle_mig_pages_overflow();
 181    }
 182}
 183
 184MigrationInfo *qmp_query_migrate(Error **errp)
 185{
 186    MigrationInfo *info = g_malloc0(sizeof(*info));
 187    MigrationState *s = migrate_get_current();
 188
 189    switch (s->state) {
 190    case MIG_STATE_NONE:
 191        /* no migration has happened ever */
 192        break;
 193    case MIG_STATE_SETUP:
 194        info->has_status = true;
 195        info->status = g_strdup("setup");
 196        info->has_total_time = false;
 197        break;
 198    case MIG_STATE_ACTIVE:
 199        info->has_status = true;
 200        info->status = g_strdup("active");
 201        info->has_total_time = true;
 202        info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME)
 203            - s->total_time;
 204        info->has_expected_downtime = true;
 205        info->expected_downtime = s->expected_downtime;
 206        info->has_setup_time = true;
 207        info->setup_time = s->setup_time;
 208
 209        info->has_ram = true;
 210        info->ram = g_malloc0(sizeof(*info->ram));
 211        info->ram->transferred = ram_bytes_transferred();
 212        info->ram->remaining = ram_bytes_remaining();
 213        info->ram->total = ram_bytes_total();
 214        info->ram->duplicate = dup_mig_pages_transferred();
 215        info->ram->skipped = skipped_mig_pages_transferred();
 216        info->ram->normal = norm_mig_pages_transferred();
 217        info->ram->normal_bytes = norm_mig_bytes_transferred();
 218        info->ram->dirty_pages_rate = s->dirty_pages_rate;
 219        info->ram->mbps = s->mbps;
 220
 221        if (blk_mig_active()) {
 222            info->has_disk = true;
 223            info->disk = g_malloc0(sizeof(*info->disk));
 224            info->disk->transferred = blk_mig_bytes_transferred();
 225            info->disk->remaining = blk_mig_bytes_remaining();
 226            info->disk->total = blk_mig_bytes_total();
 227        }
 228
 229        get_xbzrle_cache_stats(info);
 230        break;
 231    case MIG_STATE_COMPLETED:
 232        get_xbzrle_cache_stats(info);
 233
 234        info->has_status = true;
 235        info->status = g_strdup("completed");
 236        info->has_total_time = true;
 237        info->total_time = s->total_time;
 238        info->has_downtime = true;
 239        info->downtime = s->downtime;
 240        info->has_setup_time = true;
 241        info->setup_time = s->setup_time;
 242
 243        info->has_ram = true;
 244        info->ram = g_malloc0(sizeof(*info->ram));
 245        info->ram->transferred = ram_bytes_transferred();
 246        info->ram->remaining = 0;
 247        info->ram->total = ram_bytes_total();
 248        info->ram->duplicate = dup_mig_pages_transferred();
 249        info->ram->skipped = skipped_mig_pages_transferred();
 250        info->ram->normal = norm_mig_pages_transferred();
 251        info->ram->normal_bytes = norm_mig_bytes_transferred();
 252        info->ram->mbps = s->mbps;
 253        break;
 254    case MIG_STATE_ERROR:
 255        info->has_status = true;
 256        info->status = g_strdup("failed");
 257        break;
 258    case MIG_STATE_CANCELLED:
 259        info->has_status = true;
 260        info->status = g_strdup("cancelled");
 261        break;
 262    }
 263
 264    return info;
 265}
 266
 267void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
 268                                  Error **errp)
 269{
 270    MigrationState *s = migrate_get_current();
 271    MigrationCapabilityStatusList *cap;
 272
 273    if (s->state == MIG_STATE_ACTIVE || s->state == MIG_STATE_SETUP) {
 274        error_set(errp, QERR_MIGRATION_ACTIVE);
 275        return;
 276    }
 277
 278    for (cap = params; cap; cap = cap->next) {
 279        s->enabled_capabilities[cap->value->capability] = cap->value->state;
 280    }
 281}
 282
 283/* shared migration helpers */
 284
 285static void migrate_fd_cleanup(void *opaque)
 286{
 287    MigrationState *s = opaque;
 288
 289    qemu_bh_delete(s->cleanup_bh);
 290    s->cleanup_bh = NULL;
 291
 292    if (s->file) {
 293        DPRINTF("closing file\n");
 294        qemu_mutex_unlock_iothread();
 295        qemu_thread_join(&s->thread);
 296        qemu_mutex_lock_iothread();
 297
 298        qemu_fclose(s->file);
 299        s->file = NULL;
 300    }
 301
 302    assert(s->state != MIG_STATE_ACTIVE);
 303
 304    if (s->state != MIG_STATE_COMPLETED) {
 305        qemu_savevm_state_cancel();
 306    }
 307
 308    notifier_list_notify(&migration_state_notifiers, s);
 309}
 310
 311static void migrate_set_state(MigrationState *s, int old_state, int new_state)
 312{
 313    if (atomic_cmpxchg(&s->state, old_state, new_state) == new_state) {
 314        trace_migrate_set_state(new_state);
 315    }
 316}
 317
 318void migrate_fd_error(MigrationState *s)
 319{
 320    DPRINTF("setting error state\n");
 321    assert(s->file == NULL);
 322    s->state = MIG_STATE_ERROR;
 323    trace_migrate_set_state(MIG_STATE_ERROR);
 324    notifier_list_notify(&migration_state_notifiers, s);
 325}
 326
 327static void migrate_fd_cancel(MigrationState *s)
 328{
 329    DPRINTF("cancelling migration\n");
 330
 331    migrate_set_state(s, s->state, MIG_STATE_CANCELLED);
 332}
 333
 334void add_migration_state_change_notifier(Notifier *notify)
 335{
 336    notifier_list_add(&migration_state_notifiers, notify);
 337}
 338
 339void remove_migration_state_change_notifier(Notifier *notify)
 340{
 341    notifier_remove(notify);
 342}
 343
 344bool migration_in_setup(MigrationState *s)
 345{
 346    return s->state == MIG_STATE_SETUP;
 347}
 348
 349bool migration_has_finished(MigrationState *s)
 350{
 351    return s->state == MIG_STATE_COMPLETED;
 352}
 353
 354bool migration_has_failed(MigrationState *s)
 355{
 356    return (s->state == MIG_STATE_CANCELLED ||
 357            s->state == MIG_STATE_ERROR);
 358}
 359
 360static MigrationState *migrate_init(const MigrationParams *params)
 361{
 362    MigrationState *s = migrate_get_current();
 363    int64_t bandwidth_limit = s->bandwidth_limit;
 364    bool enabled_capabilities[MIGRATION_CAPABILITY_MAX];
 365    int64_t xbzrle_cache_size = s->xbzrle_cache_size;
 366
 367    memcpy(enabled_capabilities, s->enabled_capabilities,
 368           sizeof(enabled_capabilities));
 369
 370    memset(s, 0, sizeof(*s));
 371    s->params = *params;
 372    memcpy(s->enabled_capabilities, enabled_capabilities,
 373           sizeof(enabled_capabilities));
 374    s->xbzrle_cache_size = xbzrle_cache_size;
 375
 376    s->bandwidth_limit = bandwidth_limit;
 377    s->state = MIG_STATE_SETUP;
 378    trace_migrate_set_state(MIG_STATE_SETUP);
 379
 380    s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 381    return s;
 382}
 383
 384static GSList *migration_blockers;
 385
 386void migrate_add_blocker(Error *reason)
 387{
 388    migration_blockers = g_slist_prepend(migration_blockers, reason);
 389}
 390
 391void migrate_del_blocker(Error *reason)
 392{
 393    migration_blockers = g_slist_remove(migration_blockers, reason);
 394}
 395
 396void qmp_migrate(const char *uri, bool has_blk, bool blk,
 397                 bool has_inc, bool inc, bool has_detach, bool detach,
 398                 Error **errp)
 399{
 400    Error *local_err = NULL;
 401    MigrationState *s = migrate_get_current();
 402    MigrationParams params;
 403    const char *p;
 404
 405    params.blk = has_blk && blk;
 406    params.shared = has_inc && inc;
 407
 408    if (s->state == MIG_STATE_ACTIVE || s->state == MIG_STATE_SETUP) {
 409        error_set(errp, QERR_MIGRATION_ACTIVE);
 410        return;
 411    }
 412
 413    if (qemu_savevm_state_blocked(errp)) {
 414        return;
 415    }
 416
 417    if (migration_blockers) {
 418        *errp = error_copy(migration_blockers->data);
 419        return;
 420    }
 421
 422    s = migrate_init(&params);
 423
 424    if (strstart(uri, "tcp:", &p)) {
 425        tcp_start_outgoing_migration(s, p, &local_err);
 426#ifdef CONFIG_RDMA
 427    } else if (strstart(uri, "x-rdma:", &p)) {
 428        rdma_start_outgoing_migration(s, p, &local_err);
 429#endif
 430#if !defined(WIN32)
 431    } else if (strstart(uri, "exec:", &p)) {
 432        exec_start_outgoing_migration(s, p, &local_err);
 433    } else if (strstart(uri, "unix:", &p)) {
 434        unix_start_outgoing_migration(s, p, &local_err);
 435    } else if (strstart(uri, "fd:", &p)) {
 436        fd_start_outgoing_migration(s, p, &local_err);
 437#endif
 438    } else {
 439        error_set(errp, QERR_INVALID_PARAMETER_VALUE, "uri", "a valid migration protocol");
 440        s->state = MIG_STATE_ERROR;
 441        return;
 442    }
 443
 444    if (local_err) {
 445        migrate_fd_error(s);
 446        error_propagate(errp, local_err);
 447        return;
 448    }
 449}
 450
 451void qmp_migrate_cancel(Error **errp)
 452{
 453    migrate_fd_cancel(migrate_get_current());
 454}
 455
 456void qmp_migrate_set_cache_size(int64_t value, Error **errp)
 457{
 458    MigrationState *s = migrate_get_current();
 459
 460    /* Check for truncation */
 461    if (value != (size_t)value) {
 462        error_set(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 463                  "exceeding address space");
 464        return;
 465    }
 466
 467    s->xbzrle_cache_size = xbzrle_cache_resize(value);
 468}
 469
 470int64_t qmp_query_migrate_cache_size(Error **errp)
 471{
 472    return migrate_xbzrle_cache_size();
 473}
 474
 475void qmp_migrate_set_speed(int64_t value, Error **errp)
 476{
 477    MigrationState *s;
 478
 479    if (value < 0) {
 480        value = 0;
 481    }
 482    if (value > SIZE_MAX) {
 483        value = SIZE_MAX;
 484    }
 485
 486    s = migrate_get_current();
 487    s->bandwidth_limit = value;
 488    if (s->file) {
 489        qemu_file_set_rate_limit(s->file, s->bandwidth_limit / XFER_LIMIT_RATIO);
 490    }
 491}
 492
 493void qmp_migrate_set_downtime(double value, Error **errp)
 494{
 495    value *= 1e9;
 496    value = MAX(0, MIN(UINT64_MAX, value));
 497    max_downtime = (uint64_t)value;
 498}
 499
 500bool migrate_rdma_pin_all(void)
 501{
 502    MigrationState *s;
 503
 504    s = migrate_get_current();
 505
 506    return s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL];
 507}
 508
 509bool migrate_auto_converge(void)
 510{
 511    MigrationState *s;
 512
 513    s = migrate_get_current();
 514
 515    return s->enabled_capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE];
 516}
 517
 518bool migrate_zero_blocks(void)
 519{
 520    MigrationState *s;
 521
 522    s = migrate_get_current();
 523
 524    return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS];
 525}
 526
 527int migrate_use_xbzrle(void)
 528{
 529    MigrationState *s;
 530
 531    s = migrate_get_current();
 532
 533    return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE];
 534}
 535
 536int64_t migrate_xbzrle_cache_size(void)
 537{
 538    MigrationState *s;
 539
 540    s = migrate_get_current();
 541
 542    return s->xbzrle_cache_size;
 543}
 544
 545/* migration thread support */
 546
 547static void *migration_thread(void *opaque)
 548{
 549    MigrationState *s = opaque;
 550    int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 551    int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
 552    int64_t initial_bytes = 0;
 553    int64_t max_size = 0;
 554    int64_t start_time = initial_time;
 555    bool old_vm_running = false;
 556
 557    DPRINTF("beginning savevm\n");
 558    qemu_savevm_state_begin(s->file, &s->params);
 559
 560    s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
 561    migrate_set_state(s, MIG_STATE_SETUP, MIG_STATE_ACTIVE);
 562
 563    DPRINTF("setup complete\n");
 564
 565    while (s->state == MIG_STATE_ACTIVE) {
 566        int64_t current_time;
 567        uint64_t pending_size;
 568
 569        if (!qemu_file_rate_limit(s->file)) {
 570            DPRINTF("iterate\n");
 571            pending_size = qemu_savevm_state_pending(s->file, max_size);
 572            DPRINTF("pending size %" PRIu64 " max %" PRIu64 "\n",
 573                    pending_size, max_size);
 574            if (pending_size && pending_size >= max_size) {
 575                qemu_savevm_state_iterate(s->file);
 576            } else {
 577                int ret;
 578
 579                DPRINTF("done iterating\n");
 580                qemu_mutex_lock_iothread();
 581                start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 582                qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
 583                old_vm_running = runstate_is_running();
 584
 585                ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
 586                if (ret >= 0) {
 587                    qemu_file_set_rate_limit(s->file, INT64_MAX);
 588                    qemu_savevm_state_complete(s->file);
 589                }
 590                qemu_mutex_unlock_iothread();
 591
 592                if (ret < 0) {
 593                    migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR);
 594                    break;
 595                }
 596
 597                if (!qemu_file_get_error(s->file)) {
 598                    migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_COMPLETED);
 599                    break;
 600                }
 601            }
 602        }
 603
 604        if (qemu_file_get_error(s->file)) {
 605            migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR);
 606            break;
 607        }
 608        current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 609        if (current_time >= initial_time + BUFFER_DELAY) {
 610            uint64_t transferred_bytes = qemu_ftell(s->file) - initial_bytes;
 611            uint64_t time_spent = current_time - initial_time;
 612            double bandwidth = transferred_bytes / time_spent;
 613            max_size = bandwidth * migrate_max_downtime() / 1000000;
 614
 615            s->mbps = time_spent ? (((double) transferred_bytes * 8.0) /
 616                    ((double) time_spent / 1000.0)) / 1000.0 / 1000.0 : -1;
 617
 618            DPRINTF("transferred %" PRIu64 " time_spent %" PRIu64
 619                    " bandwidth %g max_size %" PRId64 "\n",
 620                    transferred_bytes, time_spent, bandwidth, max_size);
 621            /* if we haven't sent anything, we don't want to recalculate
 622               10000 is a small enough number for our purposes */
 623            if (s->dirty_bytes_rate && transferred_bytes > 10000) {
 624                s->expected_downtime = s->dirty_bytes_rate / bandwidth;
 625            }
 626
 627            qemu_file_reset_rate_limit(s->file);
 628            initial_time = current_time;
 629            initial_bytes = qemu_ftell(s->file);
 630        }
 631        if (qemu_file_rate_limit(s->file)) {
 632            /* usleep expects microseconds */
 633            g_usleep((initial_time + BUFFER_DELAY - current_time)*1000);
 634        }
 635    }
 636
 637    qemu_mutex_lock_iothread();
 638    if (s->state == MIG_STATE_COMPLETED) {
 639        int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 640        s->total_time = end_time - s->total_time;
 641        s->downtime = end_time - start_time;
 642        runstate_set(RUN_STATE_POSTMIGRATE);
 643    } else {
 644        if (old_vm_running) {
 645            vm_start();
 646        }
 647    }
 648    qemu_bh_schedule(s->cleanup_bh);
 649    qemu_mutex_unlock_iothread();
 650
 651    return NULL;
 652}
 653
 654void migrate_fd_connect(MigrationState *s)
 655{
 656    s->state = MIG_STATE_SETUP;
 657    trace_migrate_set_state(MIG_STATE_SETUP);
 658
 659    /* This is a best 1st approximation. ns to ms */
 660    s->expected_downtime = max_downtime/1000000;
 661    s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);
 662
 663    qemu_file_set_rate_limit(s->file,
 664                             s->bandwidth_limit / XFER_LIMIT_RATIO);
 665
 666    /* Notify before starting migration thread */
 667    notifier_list_notify(&migration_state_notifiers, s);
 668
 669    qemu_thread_create(&s->thread, migration_thread, s,
 670                       QEMU_THREAD_JOINABLE);
 671}
 672