qemu/block-migration.c
/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu-common.h"
#include "block/block_int.h"
#include "hw/hw.h"
#include "qemu/queue.h"
#include "qemu/timer.h"
#include "migration/block.h"
#include "migration/migration.h"
#include "sysemu/blockdev.h"
#include <assert.h>

#define BLOCK_SIZE                       (1 << 20)
#define BDRV_SECTORS_PER_DIRTY_CHUNK     (BLOCK_SIZE >> BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04
#define BLK_MIG_FLAG_ZERO_BLOCK         0x08
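
/* Migrated data travels in chunks of BLOCK_SIZE (1 MiB); with 512-byte
 * sectors (BDRV_SECTOR_BITS is 9) that is 2048 sectors per dirty chunk.
 * Chunk headers carry (sector << BDRV_SECTOR_BITS) | flags, so the low
 * nine bits of each header word are free for the BLK_MIG_FLAG_* values
 * above (see blk_send() and block_load()).
 */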

#define MAX_IS_ALLOCATED_SEARCH 65536

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define DPRINTF(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

typedef struct BlkMigDevState {
    /* Written during setup phase.  Can be read without a lock.  */
    BlockDriverState *bs;
    int shared_base;
    int64_t total_sectors;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;

    /* Only used by migration thread.  Does not need a lock.  */
    int bulk_completed;
    int64_t cur_sector;
    int64_t cur_dirty;

    /* Protected by block migration lock.  */
    unsigned long *aio_bitmap;
    int64_t completed_sectors;
    BdrvDirtyBitmap *dirty_bitmap;
} BlkMigDevState;

typedef struct BlkMigBlock {
    /* Only used by migration thread.  */
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    int nr_sectors;
    struct iovec iov;
    QEMUIOVector qiov;
    BlockDriverAIOCB *aiocb;

    /* Protected by block migration lock.  */
    int ret;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

typedef struct BlkMigState {
    /* Written during setup phase.  Can be read without a lock.  */
    int blk_enable;
    int shared_base;
    QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list;
    int64_t total_sector_sum;
    bool zero_blocks;

    /* Protected by lock.  */
    QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list;
    int submitted;
    int read_done;

    /* Only used by migration thread.  Does not need a lock.  */
    int transferred;
    int prev_progress;
    int bulk_completed;

    /* Lock must be taken _inside_ the iothread lock.  */
    QemuMutex lock;
} BlkMigState;

static BlkMigState block_mig_state;

static void blk_mig_lock(void)
{
    qemu_mutex_lock(&block_mig_state.lock);
}

static void blk_mig_unlock(void)
{
    qemu_mutex_unlock(&block_mig_state.lock);
}

/* Must run outside of the iothread lock during the bulk phase,
 * or the VM will stall.
 */

static void blk_send(QEMUFile *f, BlkMigBlock * blk)
{
    int len;
    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;

    if (block_mig_state.zero_blocks &&
        buffer_is_zero(blk->buf, BLOCK_SIZE)) {
        flags |= BLK_MIG_FLAG_ZERO_BLOCK;
    }

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | flags);

    /* device name */
    len = strlen(blk->bmds->bs->device_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)blk->bmds->bs->device_name, len);

    /* if a block is zero we need to flush here since the network
     * bandwidth is now a lot higher than the storage device bandwidth.
     * thus if we queue zero blocks we slow down the migration */
    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
        qemu_fflush(f);
        return;
    }

    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}
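
/* A record written by blk_send() therefore looks like this on the wire
 * (block_load() below is the matching reader):
 *
 *   8 bytes   big-endian (sector << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_*
 *   1 byte    device name length
 *   n bytes   device name, not NUL-terminated
 *   1 MiB     block payload, omitted when BLK_MIG_FLAG_ZERO_BLOCK is set
 */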

int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    blk_mig_lock();
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    blk_mig_unlock();
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}


/* Called with migration lock held.  */

static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if ((sector << BDRV_SECTOR_BITS) < bdrv_getlength(bmds->bs)) {
        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

/* Called with migration lock held.  */

static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
                             int nb_sectors, int set)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bmds->aio_bitmap[idx];
        if (set) {
            val |= 1UL << bit;
        } else {
            val &= ~(1UL << bit);
        }
        bmds->aio_bitmap[idx] = val;
    }
}

static void alloc_aio_bitmap(BlkMigDevState *bmds)
{
    BlockDriverState *bs = bmds->bs;
    int64_t bitmap_size;

    /* One bit per dirty chunk, rounded up to whole bytes.  */
    bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
            BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

    bmds->aio_bitmap = g_malloc0(bitmap_size);
}

/* Never hold migration lock when yielding to the main loop!  */

static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk_mig_lock();
    blk->ret = ret;

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
    blk_mig_unlock();
}
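
/* Reads are submitted asynchronously by the migration thread (see
 * mig_save_device_bulk() and mig_save_device_dirty() below); the callback
 * above moves each completed block from "submitted" to "read_done" and onto
 * blk_list, from where flush_blks() writes it to the migration stream.
 */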

/* Called with no lock taken.  */

static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockDriverState *bs = bmds->bs;
    BlkMigBlock *blk;
    int nr_sectors;

    if (bmds->shared_base) {
        qemu_mutex_lock_iothread();
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
                                  &nr_sectors)) {
            cur_sector += nr_sectors;
        }
        qemu_mutex_unlock_iothread();
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = g_malloc(sizeof(BlkMigBlock));
    blk->buf = g_malloc(BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;
    blk->nr_sectors = nr_sectors;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    blk_mig_lock();
    block_mig_state.submitted++;
    blk_mig_unlock();

    qemu_mutex_lock_iothread();
    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);

    bdrv_reset_dirty(bs, cur_sector, nr_sectors);
    qemu_mutex_unlock_iothread();

    bmds->cur_sector = cur_sector + nr_sectors;
    return (bmds->cur_sector >= total_sectors);
}
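
/* The bulk phase calls the function above repeatedly, walking every device
 * one chunk at a time.  Writes the guest issues in the meantime are caught
 * by the per-device dirty bitmap (set up in set_dirty_tracking() below) and
 * resent later by mig_save_device_dirty().
 */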

/* Called with iothread lock taken.  */

static void set_dirty_tracking(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(bmds->bs, BLOCK_SIZE);
    }
}

static void unset_dirty_tracking(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap);
    }
}

static void init_blk_migration_it(void *opaque, BlockDriverState *bs)
{
    BlkMigDevState *bmds;
    int64_t sectors;

    if (!bdrv_is_read_only(bs)) {
        sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
        if (sectors <= 0) {
            return;
        }

        bmds = g_malloc0(sizeof(BlkMigDevState));
        bmds->bs = bs;
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
        bmds->completed_sectors = 0;
        bmds->shared_base = block_mig_state.shared_base;
        alloc_aio_bitmap(bmds);
        bdrv_set_in_use(bs, 1);
        bdrv_ref(bs);

        block_mig_state.total_sector_sum += sectors;

        if (bmds->shared_base) {
            DPRINTF("Start migration for %s with shared base image\n",
                    bs->device_name);
        } else {
            DPRINTF("Start full migration for %s\n", bs->device_name);
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }
}

static void init_blk_migration(QEMUFile *f)
{
    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.zero_blocks = migrate_zero_blocks();

    bdrv_iterate(init_blk_migration_it, NULL);
}

/* Called with no lock taken.  */

static int blk_mig_save_bulked_block(QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    if (block_mig_state.total_sector_sum != 0) {
        progress = completed_sector_sum * 100 /
                   block_mig_state.total_sector_sum;
    } else {
        progress = 100;
    }
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        DPRINTF("Completed %d %%\r", progress);
    }

    return ret;
}

static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

/* Called with iothread lock taken.  */

static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
                                 int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;
    int ret = -EIO;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        blk_mig_lock();
        if (bmds_aio_inflight(bmds, sector)) {
            blk_mig_unlock();
            bdrv_drain_all();
        } else {
            blk_mig_unlock();
        }
        if (bdrv_get_dirty(bmds->bs, bmds->dirty_bitmap, sector)) {

            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            blk = g_malloc(sizeof(BlkMigBlock));
            blk->buf = g_malloc(BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;
            blk->nr_sectors = nr_sectors;

            if (is_async) {
                blk->iov.iov_base = blk->buf;
                blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
                qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

                blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
                                            nr_sectors, blk_mig_read_cb, blk);

                blk_mig_lock();
                block_mig_state.submitted++;
                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
                blk_mig_unlock();
            } else {
                ret = bdrv_read(bmds->bs, sector, blk->buf, nr_sectors);
                if (ret < 0) {
                    goto error;
                }
                blk_send(f, blk);

                g_free(blk->buf);
                g_free(blk);
            }

            bdrv_reset_dirty(bmds->bs, sector, nr_sectors);
            break;
        }
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    DPRINTF("Error reading sector %" PRId64 "\n", sector);
    g_free(blk->buf);
    g_free(blk);
    return ret;
}

/* Called with iothread lock taken.
 *
 * return value:
 * 0: too much data for max_downtime
 * 1: little enough data for max_downtime
 */
static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 1;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        ret = mig_save_device_dirty(f, bmds, is_async);
        if (ret <= 0) {
            break;
        }
    }

    return ret;
}

/* Called with no locks taken.  */

static int flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk;
    int ret = 0;

    DPRINTF("%s Enter submitted %d read_done %d transferred %d\n",
            __FUNCTION__, block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            ret = blk->ret;
            break;
        }

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        blk_mig_unlock();
        blk_send(f, blk);
        blk_mig_lock();

        g_free(blk->buf);
        g_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }
    blk_mig_unlock();

    DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __FUNCTION__,
            block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);
    return ret;
}

/* Called with iothread lock taken.  */

static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *bmds;
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        dirty += bdrv_get_dirty_count(bmds->bs, bmds->dirty_bitmap);
    }

    return dirty << BDRV_SECTOR_BITS;
}

/* Called with iothread lock taken.  */

static void blk_mig_cleanup(void)
{
    BlkMigDevState *bmds;
    BlkMigBlock *blk;

    bdrv_drain_all();

    unset_dirty_tracking();

    blk_mig_lock();
    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        bdrv_set_in_use(bmds->bs, 0);
        bdrv_unref(bmds->bs);
        g_free(bmds->aio_bitmap);
        g_free(bmds);
    }

    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        g_free(blk->buf);
        g_free(blk);
    }
    blk_mig_unlock();
}

static void block_migration_cancel(void *opaque)
{
    blk_mig_cleanup();
}

static int block_save_setup(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live setup submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    qemu_mutex_lock_iothread();
    init_blk_migration(f);

    /* start tracking dirty blocks */
    set_dirty_tracking();
    qemu_mutex_unlock_iothread();

    ret = flush_blks(f);
    blk_mig_reset_dirty_cursor();
    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ret;
}

static int block_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int64_t last_ftell = qemu_ftell(f);

    DPRINTF("Enter save live iterate submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* control the rate of transfer */
    blk_mig_lock();
    while ((block_mig_state.submitted +
            block_mig_state.read_done) * BLOCK_SIZE <
           qemu_file_get_rate_limit(f)) {
        blk_mig_unlock();
        if (block_mig_state.bulk_completed == 0) {
            /* first finish the bulk phase */
            if (blk_mig_save_bulked_block(f) == 0) {
                /* finished saving bulk on all devices */
                block_mig_state.bulk_completed = 1;
            }
            ret = 0;
        } else {
            /* Always called with the iothread lock taken for
             * simplicity; block_save_complete also calls it.
             */
            qemu_mutex_lock_iothread();
            ret = blk_mig_save_dirty_block(f, 1);
            qemu_mutex_unlock_iothread();
        }
        if (ret < 0) {
            return ret;
        }
        blk_mig_lock();
        if (ret != 0) {
            /* no more dirty blocks */
            break;
        }
    }
    blk_mig_unlock();

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    return qemu_ftell(f) - last_ftell;
}

/* Called with iothread lock taken.  */

static int block_save_complete(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live complete submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* we know for sure that save bulk is completed and
       all async reads have completed */
    blk_mig_lock();
    assert(block_mig_state.submitted == 0);
    blk_mig_unlock();

    do {
        ret = blk_mig_save_dirty_block(f, 0);
        if (ret < 0) {
            return ret;
        }
    } while (ret == 0);

    /* report completion */
    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

    DPRINTF("Block migration completed\n");

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    blk_mig_cleanup();
    return 0;
}

static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
{
    /* Estimate pending number of bytes to send */
    uint64_t pending;

    qemu_mutex_lock_iothread();
    blk_mig_lock();
    pending = get_remaining_dirty() +
                       block_mig_state.submitted * BLOCK_SIZE +
                       block_mig_state.read_done * BLOCK_SIZE;

    /* Report at least one block pending during bulk phase */
    if (pending == 0 && !block_mig_state.bulk_completed) {
        pending = BLOCK_SIZE;
    }
    blk_mig_unlock();
    qemu_mutex_unlock_iothread();

    DPRINTF("Enter save live pending  %" PRIu64 "\n", pending);
    return pending;
}
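
/* The migration core compares the value returned here against its current
 * downtime/bandwidth budget to decide whether to run another
 * block_save_iterate() pass or to finish with block_save_complete() while
 * the guest is stopped.  Reporting at least one block during the bulk phase
 * keeps it from declaring convergence before every device has been walked.
 */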

static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockDriverState *bs, *bs_prev = NULL;
    uint8_t *buf;
    int64_t total_sectors = 0;
    int nr_sectors;
    int ret;

    do {
        addr = qemu_get_be64(f);

        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            bs = bdrv_find(device_name);
            if (!bs) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            if (bs != bs_prev) {
                bs_prev = bs;
                total_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
                if (total_sectors <= 0) {
                    error_report("Error getting length of block device %s",
                                 device_name);
                    return -EINVAL;
                }
            }

            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - addr;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }

            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
                ret = bdrv_write_zeroes(bs, addr, nr_sectors,
                                        BDRV_REQ_MAY_UNMAP);
            } else {
                buf = g_malloc(BLOCK_SIZE);
                qemu_get_buffer(f, buf, BLOCK_SIZE);
                ret = bdrv_write(bs, addr, buf, nr_sectors);
                g_free(buf);
            }

            if (ret < 0) {
                return ret;
            }
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown block migration flags: %#x\n", flags);
            return -EINVAL;
        }
        ret = qemu_file_get_error(f);
        if (ret != 0) {
            return ret;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static void block_set_params(const MigrationParams *params, void *opaque)
{
    block_mig_state.blk_enable = params->blk;
    block_mig_state.shared_base = params->shared;

    /* shared base means that blk_enable = 1 */
    block_mig_state.blk_enable |= params->shared;
}

static bool block_is_active(void *opaque)
{
    return block_mig_state.blk_enable == 1;
}

SaveVMHandlers savevm_block_handlers = {
    .set_params = block_set_params,
    .save_live_setup = block_save_setup,
    .save_live_iterate = block_save_iterate,
    .save_live_complete = block_save_complete,
    .save_live_pending = block_save_pending,
    .load_state = block_load,
    .cancel = block_migration_cancel,
    .is_active = block_is_active,
};
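
/* For an outgoing migration these handlers run roughly in the order
 * set_params -> save_live_setup -> (save_live_pending and save_live_iterate
 * in a loop) -> save_live_complete, with cancel invoked instead if the
 * migration is aborted; is_active gates all of them on whether block
 * migration was requested.  The destination side only uses load_state.
 */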

void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);
    qemu_mutex_init(&block_mig_state.lock);

    register_savevm_live(NULL, "block", 0, 1, &savevm_block_handlers,
                         &block_mig_state);
}