qemu/migration.c
/*
 * QEMU live migration
 *
 * Copyright IBM, Corp. 2008
 *
 * Authors:
 *  Anthony Liguori   <aliguori@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu-common.h"
#include "migration/migration.h"
#include "monitor/monitor.h"
#include "migration/qemu-file.h"
#include "sysemu/sysemu.h"
#include "block/block.h"
#include "qemu/sockets.h"
#include "migration/block.h"
#include "qemu/thread.h"
#include "qmp-commands.h"
#include "trace.h"

//#define DEBUG_MIGRATION

#ifdef DEBUG_MIGRATION
#define DPRINTF(fmt, ...) \
    do { printf("migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

enum {
    MIG_STATE_ERROR = -1,
    MIG_STATE_NONE,
    MIG_STATE_SETUP,
    MIG_STATE_CANCELLED,
    MIG_STATE_ACTIVE,
    MIG_STATE_COMPLETED,
};

#define MAX_THROTTLE  (32 << 20)      /* Migration speed throttling */

/* Amount of time to allocate to each "chunk" of bandwidth-throttled
 * data. */
#define BUFFER_DELAY     100
#define XFER_LIMIT_RATIO (1000 / BUFFER_DELAY)
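/* For example: at the default MAX_THROTTLE of 32 MiB/s, the per-chunk
 * limit set by migrate_fd_connect() is 32 MiB / 10, i.e. at most about
 * 3.2 MiB may be sent in each 100 ms BUFFER_DELAY window. */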

/* Migration XBZRLE default cache size */
#define DEFAULT_MIGRATE_CACHE_SIZE (64 * 1024 * 1024)

static NotifierList migration_state_notifiers =
    NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);

/* When we add fault tolerance, we could have several migrations at once.
   For now we don't need more than one migration at a time, so a single
   statically allocated MigrationState suffices. */

MigrationState *migrate_get_current(void)
{
    static MigrationState current_migration = {
        .state = MIG_STATE_NONE,
        .bandwidth_limit = MAX_THROTTLE,
        .xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE,
        .mbps = -1,
    };

    return &current_migration;
}

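/* Dispatch an incoming migration to the transport that matches the URI
 * prefix (tcp:, x-rdma:, exec:, unix: or fd:). */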
void qemu_start_incoming_migration(const char *uri, Error **errp)
{
    const char *p;

    if (strstart(uri, "tcp:", &p)) {
        tcp_start_incoming_migration(p, errp);
#ifdef CONFIG_RDMA
    } else if (strstart(uri, "x-rdma:", &p)) {
        rdma_start_incoming_migration(p, errp);
#endif
#if !defined(WIN32)
    } else if (strstart(uri, "exec:", &p)) {
        exec_start_incoming_migration(p, errp);
    } else if (strstart(uri, "unix:", &p)) {
        unix_start_incoming_migration(p, errp);
    } else if (strstart(uri, "fd:", &p)) {
        fd_start_incoming_migration(p, errp);
#endif
    } else {
        error_setg(errp, "unknown migration protocol: %s", uri);
    }
}

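/* Coroutine that loads the incoming VM state from the QEMUFile passed in
 * @opaque, then either autostarts the guest or leaves it paused. */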
static void process_incoming_migration_co(void *opaque)
{
    QEMUFile *f = opaque;
    int ret;

    ret = qemu_loadvm_state(f);
    qemu_fclose(f);
    if (ret < 0) {
        fprintf(stderr, "load of migration failed\n");
        exit(EXIT_FAILURE);
    }
    qemu_announce_self();
    DPRINTF("successfully loaded vm state\n");

    bdrv_clear_incoming_migration_all();
    /* Make sure all file formats flush their mutable metadata */
    bdrv_invalidate_cache_all();

    if (autostart) {
        vm_start();
    } else {
        runstate_set(RUN_STATE_PAUSED);
    }
}

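/* Entry point for an incoming migration: make the fd non-blocking and run
 * the actual load in a coroutine so the main loop stays responsive. */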
void process_incoming_migration(QEMUFile *f)
{
    Coroutine *co = qemu_coroutine_create(process_incoming_migration_co);
    int fd = qemu_get_fd(f);

    assert(fd != -1);
    qemu_set_nonblock(fd);
    qemu_coroutine_enter(co, f);
}

/* Amount of nanoseconds we are willing to have the guest paused for
 * while migration completes.  Nanoseconds are used because that is the
 * maximum resolution that get_clock() can achieve; it is an internal
 * measure.  All user-visible units must be in seconds. */
static uint64_t max_downtime = 30000000;

uint64_t migrate_max_downtime(void)
{
    return max_downtime;
}

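/* Build the QMP list reporting every migration capability together with
 * its currently enabled/disabled state. */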
MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp)
{
    MigrationCapabilityStatusList *head = NULL;
    MigrationCapabilityStatusList *caps;
    MigrationState *s = migrate_get_current();
    int i;

    for (i = 0; i < MIGRATION_CAPABILITY_MAX; i++) {
        if (head == NULL) {
            head = g_malloc0(sizeof(*caps));
            caps = head;
        } else {
            caps->next = g_malloc0(sizeof(*caps));
            caps = caps->next;
        }
        caps->value = g_malloc(sizeof(*caps->value));
        caps->value->capability = i;
        caps->value->state = s->enabled_capabilities[i];
    }

    return head;
}

static void get_xbzrle_cache_stats(MigrationInfo *info)
{
    if (migrate_use_xbzrle()) {
        info->has_xbzrle_cache = true;
        info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache));
        info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size();
        info->xbzrle_cache->bytes = xbzrle_mig_bytes_transferred();
        info->xbzrle_cache->pages = xbzrle_mig_pages_transferred();
        info->xbzrle_cache->cache_miss = xbzrle_mig_pages_cache_miss();
        info->xbzrle_cache->overflow = xbzrle_mig_pages_overflow();
    }
}

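/* Implement the QMP "query-migrate" command: report status, timing and
 * RAM/disk/XBZRLE statistics appropriate to the current state. */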
MigrationInfo *qmp_query_migrate(Error **errp)
{
    MigrationInfo *info = g_malloc0(sizeof(*info));
    MigrationState *s = migrate_get_current();

    switch (s->state) {
    case MIG_STATE_NONE:
        /* no migration has happened ever */
        break;
    case MIG_STATE_SETUP:
        info->has_status = true;
        info->status = g_strdup("setup");
        info->has_total_time = false;
        break;
    case MIG_STATE_ACTIVE:
        info->has_status = true;
        info->status = g_strdup("active");
        info->has_total_time = true;
        info->total_time = qemu_get_clock_ms(rt_clock)
            - s->total_time;
        info->has_expected_downtime = true;
        info->expected_downtime = s->expected_downtime;
        info->has_setup_time = true;
        info->setup_time = s->setup_time;

        info->has_ram = true;
        info->ram = g_malloc0(sizeof(*info->ram));
        info->ram->transferred = ram_bytes_transferred();
        info->ram->remaining = ram_bytes_remaining();
        info->ram->total = ram_bytes_total();
        info->ram->duplicate = dup_mig_pages_transferred();
        info->ram->skipped = skipped_mig_pages_transferred();
        info->ram->normal = norm_mig_pages_transferred();
        info->ram->normal_bytes = norm_mig_bytes_transferred();
        info->ram->dirty_pages_rate = s->dirty_pages_rate;
        info->ram->mbps = s->mbps;

        if (blk_mig_active()) {
            info->has_disk = true;
            info->disk = g_malloc0(sizeof(*info->disk));
            info->disk->transferred = blk_mig_bytes_transferred();
            info->disk->remaining = blk_mig_bytes_remaining();
            info->disk->total = blk_mig_bytes_total();
        }

        get_xbzrle_cache_stats(info);
        break;
    case MIG_STATE_COMPLETED:
        get_xbzrle_cache_stats(info);

        info->has_status = true;
        info->status = g_strdup("completed");
        info->has_total_time = true;
        info->total_time = s->total_time;
        info->has_downtime = true;
        info->downtime = s->downtime;
        info->has_setup_time = true;
        info->setup_time = s->setup_time;

        info->has_ram = true;
        info->ram = g_malloc0(sizeof(*info->ram));
        info->ram->transferred = ram_bytes_transferred();
        info->ram->remaining = 0;
        info->ram->total = ram_bytes_total();
        info->ram->duplicate = dup_mig_pages_transferred();
        info->ram->skipped = skipped_mig_pages_transferred();
        info->ram->normal = norm_mig_pages_transferred();
        info->ram->normal_bytes = norm_mig_bytes_transferred();
        info->ram->mbps = s->mbps;
        break;
    case MIG_STATE_ERROR:
        info->has_status = true;
        info->status = g_strdup("failed");
        break;
    case MIG_STATE_CANCELLED:
        info->has_status = true;
        info->status = g_strdup("cancelled");
        break;
    }

    return info;
}

void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
                                  Error **errp)
{
    MigrationState *s = migrate_get_current();
    MigrationCapabilityStatusList *cap;

    if (s->state == MIG_STATE_ACTIVE || s->state == MIG_STATE_SETUP) {
        error_set(errp, QERR_MIGRATION_ACTIVE);
        return;
    }

    for (cap = params; cap; cap = cap->next) {
        s->enabled_capabilities[cap->value->capability] = cap->value->state;
    }
}

/* shared migration helpers */

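/* Bottom half run in the main loop once migration ends: join the
 * migration thread, close the file, and notify state-change listeners. */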
static void migrate_fd_cleanup(void *opaque)
{
    MigrationState *s = opaque;

    qemu_bh_delete(s->cleanup_bh);
    s->cleanup_bh = NULL;

    if (s->file) {
        DPRINTF("closing file\n");
        qemu_mutex_unlock_iothread();
        qemu_thread_join(&s->thread);
        qemu_mutex_lock_iothread();

        qemu_fclose(s->file);
        s->file = NULL;
    }

    assert(s->state != MIG_STATE_ACTIVE);

    if (s->state != MIG_STATE_COMPLETED) {
        qemu_savevm_state_cancel();
    }

    notifier_list_notify(&migration_state_notifiers, s);
}

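/* Atomically transition s->state from old_state to new_state; the trace
 * event fires only when the transition actually takes place. */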
static void migrate_set_state(MigrationState *s, int old_state, int new_state)
{
    /* atomic_cmpxchg() returns the previous value of s->state, so the
     * swap succeeded only if that value equals old_state. */
    if (atomic_cmpxchg(&s->state, old_state, new_state) == old_state) {
        trace_migrate_set_state(new_state);
    }
}

void migrate_fd_error(MigrationState *s)
{
    DPRINTF("setting error state\n");
    assert(s->file == NULL);
    s->state = MIG_STATE_ERROR;
    trace_migrate_set_state(MIG_STATE_ERROR);
    notifier_list_notify(&migration_state_notifiers, s);
}

static void migrate_fd_cancel(MigrationState *s)
{
    DPRINTF("cancelling migration\n");

    migrate_set_state(s, s->state, MIG_STATE_CANCELLED);
}

void add_migration_state_change_notifier(Notifier *notify)
{
    notifier_list_add(&migration_state_notifiers, notify);
}

void remove_migration_state_change_notifier(Notifier *notify)
{
    notifier_remove(notify);
}

bool migration_in_setup(MigrationState *s)
{
    return s->state == MIG_STATE_SETUP;
}

bool migration_has_finished(MigrationState *s)
{
    return s->state == MIG_STATE_COMPLETED;
}

bool migration_has_failed(MigrationState *s)
{
    return (s->state == MIG_STATE_CANCELLED ||
            s->state == MIG_STATE_ERROR);
}

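/* Reset the global MigrationState for a fresh outgoing migration while
 * preserving the user-tunable settings (capabilities, XBZRLE cache size
 * and bandwidth limit). */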
static MigrationState *migrate_init(const MigrationParams *params)
{
    MigrationState *s = migrate_get_current();
    int64_t bandwidth_limit = s->bandwidth_limit;
    bool enabled_capabilities[MIGRATION_CAPABILITY_MAX];
    int64_t xbzrle_cache_size = s->xbzrle_cache_size;

    memcpy(enabled_capabilities, s->enabled_capabilities,
           sizeof(enabled_capabilities));

    memset(s, 0, sizeof(*s));
    s->params = *params;
    memcpy(s->enabled_capabilities, enabled_capabilities,
           sizeof(enabled_capabilities));
    s->xbzrle_cache_size = xbzrle_cache_size;

    s->bandwidth_limit = bandwidth_limit;
    s->state = MIG_STATE_SETUP;
    trace_migrate_set_state(MIG_STATE_SETUP);

    s->total_time = qemu_get_clock_ms(rt_clock);
    return s;
}

static GSList *migration_blockers;

void migrate_add_blocker(Error *reason)
{
    migration_blockers = g_slist_prepend(migration_blockers, reason);
}

void migrate_del_blocker(Error *reason)
{
    migration_blockers = g_slist_remove(migration_blockers, reason);
}

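/* Implement the QMP "migrate" command: refuse to start if a migration is
 * already running or blocked, then hand off to the transport matching
 * the URI prefix. */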
void qmp_migrate(const char *uri, bool has_blk, bool blk,
                 bool has_inc, bool inc, bool has_detach, bool detach,
                 Error **errp)
{
    Error *local_err = NULL;
    MigrationState *s = migrate_get_current();
    MigrationParams params;
    const char *p;

    params.blk = has_blk && blk;
    params.shared = has_inc && inc;

    if (s->state == MIG_STATE_ACTIVE || s->state == MIG_STATE_SETUP) {
        error_set(errp, QERR_MIGRATION_ACTIVE);
        return;
    }

    if (qemu_savevm_state_blocked(errp)) {
        return;
    }

    if (migration_blockers) {
        *errp = error_copy(migration_blockers->data);
        return;
    }

    s = migrate_init(&params);

    if (strstart(uri, "tcp:", &p)) {
        tcp_start_outgoing_migration(s, p, &local_err);
#ifdef CONFIG_RDMA
    } else if (strstart(uri, "x-rdma:", &p)) {
        rdma_start_outgoing_migration(s, p, &local_err);
#endif
#if !defined(WIN32)
    } else if (strstart(uri, "exec:", &p)) {
        exec_start_outgoing_migration(s, p, &local_err);
    } else if (strstart(uri, "unix:", &p)) {
        unix_start_outgoing_migration(s, p, &local_err);
    } else if (strstart(uri, "fd:", &p)) {
        fd_start_outgoing_migration(s, p, &local_err);
#endif
    } else {
        error_set(errp, QERR_INVALID_PARAMETER_VALUE, "uri",
                  "a valid migration protocol");
        return;
    }

    if (local_err) {
        migrate_fd_error(s);
        error_propagate(errp, local_err);
        return;
    }
}

void qmp_migrate_cancel(Error **errp)
{
    migrate_fd_cancel(migrate_get_current());
}

void qmp_migrate_set_cache_size(int64_t value, Error **errp)
{
    MigrationState *s = migrate_get_current();

    /* Check for truncation */
    if (value != (size_t)value) {
        error_set(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                  "exceeding address space");
        return;
    }

    s->xbzrle_cache_size = xbzrle_cache_resize(value);
}

int64_t qmp_query_migrate_cache_size(Error **errp)
{
    return migrate_xbzrle_cache_size();
}

void qmp_migrate_set_speed(int64_t value, Error **errp)
{
    MigrationState *s;

    if (value < 0) {
        value = 0;
    }
    if (value > SIZE_MAX) {
        value = SIZE_MAX;
    }

    s = migrate_get_current();
    s->bandwidth_limit = value;
    if (s->file) {
        qemu_file_set_rate_limit(s->file,
                                 s->bandwidth_limit / XFER_LIMIT_RATIO);
    }
}

void qmp_migrate_set_downtime(double value, Error **errp)
{
    value *= 1e9;
    value = MAX(0, MIN(UINT64_MAX, value));
    max_downtime = (uint64_t)value;
}

bool migrate_rdma_pin_all(void)
{
    MigrationState *s;

    s = migrate_get_current();

    return s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL];
}

bool migrate_auto_converge(void)
{
    MigrationState *s;

    s = migrate_get_current();

    return s->enabled_capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE];
}

bool migrate_zero_blocks(void)
{
    MigrationState *s;

    s = migrate_get_current();

    return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS];
}

int migrate_use_xbzrle(void)
{
    MigrationState *s;

    s = migrate_get_current();

    return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE];
}

int64_t migrate_xbzrle_cache_size(void)
{
    MigrationState *s;

    s = migrate_get_current();

    return s->xbzrle_cache_size;
}

/* migration thread support */

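/* Outgoing migration thread: iterate the savevm handlers under the
 * bandwidth limit until the remaining dirty data is estimated to fit in
 * the allowed downtime, then stop the VM and send the final state. */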
static void *migration_thread(void *opaque)
{
    MigrationState *s = opaque;
    int64_t initial_time = qemu_get_clock_ms(rt_clock);
    int64_t setup_start = qemu_get_clock_ms(host_clock);
    int64_t initial_bytes = 0;
    int64_t max_size = 0;
    int64_t start_time = initial_time;
    bool old_vm_running = false;

    DPRINTF("beginning savevm\n");
    qemu_savevm_state_begin(s->file, &s->params);

    s->setup_time = qemu_get_clock_ms(host_clock) - setup_start;
    migrate_set_state(s, MIG_STATE_SETUP, MIG_STATE_ACTIVE);

    DPRINTF("setup complete\n");

    while (s->state == MIG_STATE_ACTIVE) {
        int64_t current_time;
        uint64_t pending_size;

        if (!qemu_file_rate_limit(s->file)) {
            DPRINTF("iterate\n");
            pending_size = qemu_savevm_state_pending(s->file, max_size);
            DPRINTF("pending size %" PRIu64 " max %" PRId64 "\n",
                    pending_size, max_size);
            if (pending_size && pending_size >= max_size) {
                qemu_savevm_state_iterate(s->file);
            } else {
                int ret;

                DPRINTF("done iterating\n");
                qemu_mutex_lock_iothread();
                start_time = qemu_get_clock_ms(rt_clock);
                qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
                old_vm_running = runstate_is_running();

                ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
                if (ret >= 0) {
                    qemu_file_set_rate_limit(s->file, INT_MAX);
                    qemu_savevm_state_complete(s->file);
                }
                qemu_mutex_unlock_iothread();

                if (ret < 0) {
                    migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR);
                    break;
                }

                if (!qemu_file_get_error(s->file)) {
                    migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_COMPLETED);
                    break;
                }
            }
        }

        if (qemu_file_get_error(s->file)) {
            migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR);
            break;
        }
        current_time = qemu_get_clock_ms(rt_clock);
        if (current_time >= initial_time + BUFFER_DELAY) {
            uint64_t transferred_bytes = qemu_ftell(s->file) - initial_bytes;
            uint64_t time_spent = current_time - initial_time;
            double bandwidth = (double)transferred_bytes / time_spent;
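            /* bandwidth is in bytes/ms and max_downtime in ns; dividing
             * by 1e6 converts the downtime to ms, so max_size is the
             * amount of data that can still be sent within the allowed
             * downtime. */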
            max_size = bandwidth * migrate_max_downtime() / 1000000;

            s->mbps = time_spent ? (((double) transferred_bytes * 8.0) /
                    ((double) time_spent / 1000.0)) / 1000.0 / 1000.0 : -1;

            DPRINTF("transferred %" PRIu64 " time_spent %" PRIu64
                    " bandwidth %g max_size %" PRId64 "\n",
                    transferred_bytes, time_spent, bandwidth, max_size);
            /* If we haven't sent anything, we don't want to recalculate
               the expected downtime; 10000 bytes is a small enough
               threshold for our purposes. */
            if (s->dirty_bytes_rate && transferred_bytes > 10000) {
                s->expected_downtime = s->dirty_bytes_rate / bandwidth;
            }

            qemu_file_reset_rate_limit(s->file);
            initial_time = current_time;
            initial_bytes = qemu_ftell(s->file);
        }
        if (qemu_file_rate_limit(s->file)) {
            /* usleep expects microseconds */
            g_usleep((initial_time + BUFFER_DELAY - current_time) * 1000);
        }
    }

    qemu_mutex_lock_iothread();
    if (s->state == MIG_STATE_COMPLETED) {
        int64_t end_time = qemu_get_clock_ms(rt_clock);
        s->total_time = end_time - s->total_time;
        s->downtime = end_time - start_time;
        runstate_set(RUN_STATE_POSTMIGRATE);
    } else {
        if (old_vm_running) {
            vm_start();
        }
    }
    qemu_bh_schedule(s->cleanup_bh);
    qemu_mutex_unlock_iothread();

    return NULL;
}

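/* Start an outgoing migration once the transport is connected: set the
 * initial rate limit and cleanup bottom half, notify listeners, and
 * spawn the migration thread. */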
void migrate_fd_connect(MigrationState *s)
{
    s->state = MIG_STATE_SETUP;
    trace_migrate_set_state(MIG_STATE_SETUP);

    /* This is a best first approximation; convert max_downtime from ns
     * to ms. */
    s->expected_downtime = max_downtime / 1000000;
    s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s);

    qemu_file_set_rate_limit(s->file,
                             s->bandwidth_limit / XFER_LIMIT_RATIO);

    /* Notify before starting migration thread */
    notifier_list_notify(&migration_state_notifiers, s);

    qemu_thread_create(&s->thread, migration_thread, s,
                       QEMU_THREAD_JOINABLE);
}