qemu/migration/block.c
<<
>>
Prefs
   1/*
   2 * QEMU live block migration
   3 *
   4 * Copyright IBM, Corp. 2009
   5 *
   6 * Authors:
   7 *  Liran Schour   <lirans@il.ibm.com>
   8 *
   9 * This work is licensed under the terms of the GNU GPL, version 2.  See
  10 * the COPYING file in the top-level directory.
  11 *
  12 * Contributions after 2012-01-13 are licensed under the terms of the
  13 * GNU GPL, version 2 or (at your option) any later version.
  14 */
  15
  16#include "qemu-common.h"
  17#include "block/block.h"
  18#include "qemu/error-report.h"
  19#include "qemu/main-loop.h"
  20#include "hw/hw.h"
  21#include "qemu/queue.h"
  22#include "qemu/timer.h"
  23#include "migration/block.h"
  24#include "migration/migration.h"
  25#include "sysemu/blockdev.h"
  26#include <assert.h>
  27
  28#define BLOCK_SIZE                       (1 << 20)
  29#define BDRV_SECTORS_PER_DIRTY_CHUNK     (BLOCK_SIZE >> BDRV_SECTOR_BITS)
  30
  31#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
  32#define BLK_MIG_FLAG_EOS                0x02
  33#define BLK_MIG_FLAG_PROGRESS           0x04
  34#define BLK_MIG_FLAG_ZERO_BLOCK         0x08
  35
  36#define MAX_IS_ALLOCATED_SEARCH 65536
  37
  38//#define DEBUG_BLK_MIGRATION
  39
  40#ifdef DEBUG_BLK_MIGRATION
  41#define DPRINTF(fmt, ...) \
  42    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
  43#else
  44#define DPRINTF(fmt, ...) \
  45    do { } while (0)
  46#endif
  47
  48typedef struct BlkMigDevState {
  49    /* Written during setup phase.  Can be read without a lock.  */
  50    BlockDriverState *bs;
  51    int shared_base;
  52    int64_t total_sectors;
  53    QSIMPLEQ_ENTRY(BlkMigDevState) entry;
  54
  55    /* Only used by migration thread.  Does not need a lock.  */
  56    int bulk_completed;
  57    int64_t cur_sector;
  58    int64_t cur_dirty;
  59
  60    /* Protected by block migration lock.  */
  61    unsigned long *aio_bitmap;
  62    int64_t completed_sectors;
  63    BdrvDirtyBitmap *dirty_bitmap;
  64    Error *blocker;
  65} BlkMigDevState;
  66
  67typedef struct BlkMigBlock {
  68    /* Only used by migration thread.  */
  69    uint8_t *buf;
  70    BlkMigDevState *bmds;
  71    int64_t sector;
  72    int nr_sectors;
  73    struct iovec iov;
  74    QEMUIOVector qiov;
  75    BlockAIOCB *aiocb;
  76
  77    /* Protected by block migration lock.  */
  78    int ret;
  79    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
  80} BlkMigBlock;
  81
  82typedef struct BlkMigState {
  83    /* Written during setup phase.  Can be read without a lock.  */
  84    int blk_enable;
  85    int shared_base;
  86    QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list;
  87    int64_t total_sector_sum;
  88    bool zero_blocks;
  89
  90    /* Protected by lock.  */
  91    QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list;
  92    int submitted;
  93    int read_done;
  94
  95    /* Only used by migration thread.  Does not need a lock.  */
  96    int transferred;
  97    int prev_progress;
  98    int bulk_completed;
  99
 100    /* Lock must be taken _inside_ the iothread lock.  */
 101    QemuMutex lock;
 102} BlkMigState;
 103
 104static BlkMigState block_mig_state;
 105
 106static void blk_mig_lock(void)
 107{
 108    qemu_mutex_lock(&block_mig_state.lock);
 109}
 110
 111static void blk_mig_unlock(void)
 112{
 113    qemu_mutex_unlock(&block_mig_state.lock);
 114}
 115
 116/* Must run outside of the iothread lock during the bulk phase,
 117 * or the VM will stall.
 118 */
 119
 120static void blk_send(QEMUFile *f, BlkMigBlock * blk)
 121{
 122    int len;
 123    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;
 124
 125    if (block_mig_state.zero_blocks &&
 126        buffer_is_zero(blk->buf, BLOCK_SIZE)) {
 127        flags |= BLK_MIG_FLAG_ZERO_BLOCK;
 128    }
 129
 130    /* sector number and flags */
 131    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
 132                     | flags);
 133
 134    /* device name */
 135    len = strlen(bdrv_get_device_name(blk->bmds->bs));
 136    qemu_put_byte(f, len);
 137    qemu_put_buffer(f, (uint8_t *)bdrv_get_device_name(blk->bmds->bs), len);
 138
 139    /* if a block is zero we need to flush here since the network
 140     * bandwidth is now a lot higher than the storage device bandwidth.
 141     * thus if we queue zero blocks we slow down the migration */
 142    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
 143        qemu_fflush(f);
 144        return;
 145    }
 146
 147    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
 148}
 149
 150int blk_mig_active(void)
 151{
 152    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
 153}
 154
 155uint64_t blk_mig_bytes_transferred(void)
 156{
 157    BlkMigDevState *bmds;
 158    uint64_t sum = 0;
 159
 160    blk_mig_lock();
 161    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
 162        sum += bmds->completed_sectors;
 163    }
 164    blk_mig_unlock();
 165    return sum << BDRV_SECTOR_BITS;
 166}
 167
 168uint64_t blk_mig_bytes_remaining(void)
 169{
 170    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
 171}
 172
 173uint64_t blk_mig_bytes_total(void)
 174{
 175    BlkMigDevState *bmds;
 176    uint64_t sum = 0;
 177
 178    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
 179        sum += bmds->total_sectors;
 180    }
 181    return sum << BDRV_SECTOR_BITS;
 182}
 183
 184
 185/* Called with migration lock held.  */
 186
 187static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
 188{
 189    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
 190
 191    if (sector < bdrv_nb_sectors(bmds->bs)) {
 192        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
 193            (1UL << (chunk % (sizeof(unsigned long) * 8))));
 194    } else {
 195        return 0;
 196    }
 197}
 198
 199/* Called with migration lock held.  */
 200
 201static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
 202                             int nb_sectors, int set)
 203{
 204    int64_t start, end;
 205    unsigned long val, idx, bit;
 206
 207    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
 208    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
 209
 210    for (; start <= end; start++) {
 211        idx = start / (sizeof(unsigned long) * 8);
 212        bit = start % (sizeof(unsigned long) * 8);
 213        val = bmds->aio_bitmap[idx];
 214        if (set) {
 215            val |= 1UL << bit;
 216        } else {
 217            val &= ~(1UL << bit);
 218        }
 219        bmds->aio_bitmap[idx] = val;
 220    }
 221}
 222
 223static void alloc_aio_bitmap(BlkMigDevState *bmds)
 224{
 225    BlockDriverState *bs = bmds->bs;
 226    int64_t bitmap_size;
 227
 228    bitmap_size = bdrv_nb_sectors(bs) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
 229    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
 230
 231    bmds->aio_bitmap = g_malloc0(bitmap_size);
 232}
 233
 234/* Never hold migration lock when yielding to the main loop!  */
 235
 236static void blk_mig_read_cb(void *opaque, int ret)
 237{
 238    BlkMigBlock *blk = opaque;
 239
 240    blk_mig_lock();
 241    blk->ret = ret;
 242
 243    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
 244    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);
 245
 246    block_mig_state.submitted--;
 247    block_mig_state.read_done++;
 248    assert(block_mig_state.submitted >= 0);
 249    blk_mig_unlock();
 250}
 251
 252/* Called with no lock taken.  */
 253
 254static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
 255{
 256    int64_t total_sectors = bmds->total_sectors;
 257    int64_t cur_sector = bmds->cur_sector;
 258    BlockDriverState *bs = bmds->bs;
 259    BlkMigBlock *blk;
 260    int nr_sectors;
 261
 262    if (bmds->shared_base) {
 263        qemu_mutex_lock_iothread();
 264        while (cur_sector < total_sectors &&
 265               !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
 266                                  &nr_sectors)) {
 267            cur_sector += nr_sectors;
 268        }
 269        qemu_mutex_unlock_iothread();
 270    }
 271
 272    if (cur_sector >= total_sectors) {
 273        bmds->cur_sector = bmds->completed_sectors = total_sectors;
 274        return 1;
 275    }
 276
 277    bmds->completed_sectors = cur_sector;
 278
 279    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);
 280
 281    /* we are going to transfer a full block even if it is not allocated */
 282    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
 283
 284    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
 285        nr_sectors = total_sectors - cur_sector;
 286    }
 287
 288    blk = g_new(BlkMigBlock, 1);
 289    blk->buf = g_malloc(BLOCK_SIZE);
 290    blk->bmds = bmds;
 291    blk->sector = cur_sector;
 292    blk->nr_sectors = nr_sectors;
 293
 294    blk->iov.iov_base = blk->buf;
 295    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
 296    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
 297
 298    blk_mig_lock();
 299    block_mig_state.submitted++;
 300    blk_mig_unlock();
 301
 302    qemu_mutex_lock_iothread();
 303    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
 304                                nr_sectors, blk_mig_read_cb, blk);
 305
 306    bdrv_reset_dirty(bs, cur_sector, nr_sectors);
 307    qemu_mutex_unlock_iothread();
 308
 309    bmds->cur_sector = cur_sector + nr_sectors;
 310    return (bmds->cur_sector >= total_sectors);
 311}
 312
 313/* Called with iothread lock taken.  */
 314
 315static int set_dirty_tracking(void)
 316{
 317    BlkMigDevState *bmds;
 318    int ret;
 319
 320    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
 321        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(bmds->bs, BLOCK_SIZE,
 322                                                      NULL);
 323        if (!bmds->dirty_bitmap) {
 324            ret = -errno;
 325            goto fail;
 326        }
 327    }
 328    return 0;
 329
 330fail:
 331    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
 332        if (bmds->dirty_bitmap) {
 333            bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap);
 334        }
 335    }
 336    return ret;
 337}
 338
 339static void unset_dirty_tracking(void)
 340{
 341    BlkMigDevState *bmds;
 342
 343    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
 344        bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap);
 345    }
 346}
 347
 348static void init_blk_migration(QEMUFile *f)
 349{
 350    BlockDriverState *bs;
 351    BlkMigDevState *bmds;
 352    int64_t sectors;
 353
 354    block_mig_state.submitted = 0;
 355    block_mig_state.read_done = 0;
 356    block_mig_state.transferred = 0;
 357    block_mig_state.total_sector_sum = 0;
 358    block_mig_state.prev_progress = -1;
 359    block_mig_state.bulk_completed = 0;
 360    block_mig_state.zero_blocks = migrate_zero_blocks();
 361
 362    for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
 363        if (bdrv_is_read_only(bs)) {
 364            continue;
 365        }
 366
 367        sectors = bdrv_nb_sectors(bs);
 368        if (sectors <= 0) {
 369            return;
 370        }
 371
 372        bmds = g_new0(BlkMigDevState, 1);
 373        bmds->bs = bs;
 374        bmds->bulk_completed = 0;
 375        bmds->total_sectors = sectors;
 376        bmds->completed_sectors = 0;
 377        bmds->shared_base = block_mig_state.shared_base;
 378        alloc_aio_bitmap(bmds);
 379        error_setg(&bmds->blocker, "block device is in use by migration");
 380        bdrv_op_block_all(bs, bmds->blocker);
 381        bdrv_ref(bs);
 382
 383        block_mig_state.total_sector_sum += sectors;
 384
 385        if (bmds->shared_base) {
 386            DPRINTF("Start migration for %s with shared base image\n",
 387                    bdrv_get_device_name(bs));
 388        } else {
 389            DPRINTF("Start full migration for %s\n", bdrv_get_device_name(bs));
 390        }
 391
 392        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
 393    }
 394}
 395
 396/* Called with no lock taken.  */
 397
 398static int blk_mig_save_bulked_block(QEMUFile *f)
 399{
 400    int64_t completed_sector_sum = 0;
 401    BlkMigDevState *bmds;
 402    int progress;
 403    int ret = 0;
 404
 405    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
 406        if (bmds->bulk_completed == 0) {
 407            if (mig_save_device_bulk(f, bmds) == 1) {
 408                /* completed bulk section for this device */
 409                bmds->bulk_completed = 1;
 410            }
 411            completed_sector_sum += bmds->completed_sectors;
 412            ret = 1;
 413            break;
 414        } else {
 415            completed_sector_sum += bmds->completed_sectors;
 416        }
 417    }
 418
 419    if (block_mig_state.total_sector_sum != 0) {
 420        progress = completed_sector_sum * 100 /
 421                   block_mig_state.total_sector_sum;
 422    } else {
 423        progress = 100;
 424    }
 425    if (progress != block_mig_state.prev_progress) {
 426        block_mig_state.prev_progress = progress;
 427        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
 428                         | BLK_MIG_FLAG_PROGRESS);
 429        DPRINTF("Completed %d %%\r", progress);
 430    }
 431
 432    return ret;
 433}
 434
 435static void blk_mig_reset_dirty_cursor(void)
 436{
 437    BlkMigDevState *bmds;
 438
 439    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
 440        bmds->cur_dirty = 0;
 441    }
 442}
 443
 444/* Called with iothread lock taken.  */
 445
 446static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
 447                                 int is_async)
 448{
 449    BlkMigBlock *blk;
 450    int64_t total_sectors = bmds->total_sectors;
 451    int64_t sector;
 452    int nr_sectors;
 453    int ret = -EIO;
 454
 455    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
 456        blk_mig_lock();
 457        if (bmds_aio_inflight(bmds, sector)) {
 458            blk_mig_unlock();
 459            bdrv_drain_all();
 460        } else {
 461            blk_mig_unlock();
 462        }
 463        if (bdrv_get_dirty(bmds->bs, bmds->dirty_bitmap, sector)) {
 464
 465            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
 466                nr_sectors = total_sectors - sector;
 467            } else {
 468                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
 469            }
 470            blk = g_new(BlkMigBlock, 1);
 471            blk->buf = g_malloc(BLOCK_SIZE);
 472            blk->bmds = bmds;
 473            blk->sector = sector;
 474            blk->nr_sectors = nr_sectors;
 475
 476            if (is_async) {
 477                blk->iov.iov_base = blk->buf;
 478                blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
 479                qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
 480
 481                blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
 482                                            nr_sectors, blk_mig_read_cb, blk);
 483
 484                blk_mig_lock();
 485                block_mig_state.submitted++;
 486                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
 487                blk_mig_unlock();
 488            } else {
 489                ret = bdrv_read(bmds->bs, sector, blk->buf, nr_sectors);
 490                if (ret < 0) {
 491                    goto error;
 492                }
 493                blk_send(f, blk);
 494
 495                g_free(blk->buf);
 496                g_free(blk);
 497            }
 498
 499            bdrv_reset_dirty(bmds->bs, sector, nr_sectors);
 500            break;
 501        }
 502        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
 503        bmds->cur_dirty = sector;
 504    }
 505
 506    return (bmds->cur_dirty >= bmds->total_sectors);
 507
 508error:
 509    DPRINTF("Error reading sector %" PRId64 "\n", sector);
 510    g_free(blk->buf);
 511    g_free(blk);
 512    return ret;
 513}
 514
 515/* Called with iothread lock taken.
 516 *
 517 * return value:
 518 * 0: too much data for max_downtime
 519 * 1: few enough data for max_downtime
 520*/
 521static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
 522{
 523    BlkMigDevState *bmds;
 524    int ret = 1;
 525
 526    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
 527        ret = mig_save_device_dirty(f, bmds, is_async);
 528        if (ret <= 0) {
 529            break;
 530        }
 531    }
 532
 533    return ret;
 534}
 535
 536/* Called with no locks taken.  */
 537
 538static int flush_blks(QEMUFile *f)
 539{
 540    BlkMigBlock *blk;
 541    int ret = 0;
 542
 543    DPRINTF("%s Enter submitted %d read_done %d transferred %d\n",
 544            __FUNCTION__, block_mig_state.submitted, block_mig_state.read_done,
 545            block_mig_state.transferred);
 546
 547    blk_mig_lock();
 548    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
 549        if (qemu_file_rate_limit(f)) {
 550            break;
 551        }
 552        if (blk->ret < 0) {
 553            ret = blk->ret;
 554            break;
 555        }
 556
 557        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
 558        blk_mig_unlock();
 559        blk_send(f, blk);
 560        blk_mig_lock();
 561
 562        g_free(blk->buf);
 563        g_free(blk);
 564
 565        block_mig_state.read_done--;
 566        block_mig_state.transferred++;
 567        assert(block_mig_state.read_done >= 0);
 568    }
 569    blk_mig_unlock();
 570
 571    DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __FUNCTION__,
 572            block_mig_state.submitted, block_mig_state.read_done,
 573            block_mig_state.transferred);
 574    return ret;
 575}
 576
 577/* Called with iothread lock taken.  */
 578
 579static int64_t get_remaining_dirty(void)
 580{
 581    BlkMigDevState *bmds;
 582    int64_t dirty = 0;
 583
 584    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
 585        dirty += bdrv_get_dirty_count(bmds->bs, bmds->dirty_bitmap);
 586    }
 587
 588    return dirty << BDRV_SECTOR_BITS;
 589}
 590
 591/* Called with iothread lock taken.  */
 592
 593static void blk_mig_cleanup(void)
 594{
 595    BlkMigDevState *bmds;
 596    BlkMigBlock *blk;
 597
 598    bdrv_drain_all();
 599
 600    unset_dirty_tracking();
 601
 602    blk_mig_lock();
 603    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
 604        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
 605        bdrv_op_unblock_all(bmds->bs, bmds->blocker);
 606        error_free(bmds->blocker);
 607        bdrv_unref(bmds->bs);
 608        g_free(bmds->aio_bitmap);
 609        g_free(bmds);
 610    }
 611
 612    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
 613        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
 614        g_free(blk->buf);
 615        g_free(blk);
 616    }
 617    blk_mig_unlock();
 618}
 619
 620static void block_migration_cancel(void *opaque)
 621{
 622    blk_mig_cleanup();
 623}
 624
 625static int block_save_setup(QEMUFile *f, void *opaque)
 626{
 627    int ret;
 628
 629    DPRINTF("Enter save live setup submitted %d transferred %d\n",
 630            block_mig_state.submitted, block_mig_state.transferred);
 631
 632    qemu_mutex_lock_iothread();
 633    init_blk_migration(f);
 634
 635    /* start track dirty blocks */
 636    ret = set_dirty_tracking();
 637
 638    if (ret) {
 639        qemu_mutex_unlock_iothread();
 640        return ret;
 641    }
 642
 643    qemu_mutex_unlock_iothread();
 644
 645    ret = flush_blks(f);
 646    blk_mig_reset_dirty_cursor();
 647    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
 648
 649    return ret;
 650}
 651
 652static int block_save_iterate(QEMUFile *f, void *opaque)
 653{
 654    int ret;
 655    int64_t last_ftell = qemu_ftell(f);
 656    int64_t delta_ftell;
 657
 658    DPRINTF("Enter save live iterate submitted %d transferred %d\n",
 659            block_mig_state.submitted, block_mig_state.transferred);
 660
 661    ret = flush_blks(f);
 662    if (ret) {
 663        return ret;
 664    }
 665
 666    blk_mig_reset_dirty_cursor();
 667
 668    /* control the rate of transfer */
 669    blk_mig_lock();
 670    while ((block_mig_state.submitted +
 671            block_mig_state.read_done) * BLOCK_SIZE <
 672           qemu_file_get_rate_limit(f)) {
 673        blk_mig_unlock();
 674        if (block_mig_state.bulk_completed == 0) {
 675            /* first finish the bulk phase */
 676            if (blk_mig_save_bulked_block(f) == 0) {
 677                /* finished saving bulk on all devices */
 678                block_mig_state.bulk_completed = 1;
 679            }
 680            ret = 0;
 681        } else {
 682            /* Always called with iothread lock taken for
 683             * simplicity, block_save_complete also calls it.
 684             */
 685            qemu_mutex_lock_iothread();
 686            ret = blk_mig_save_dirty_block(f, 1);
 687            qemu_mutex_unlock_iothread();
 688        }
 689        if (ret < 0) {
 690            return ret;
 691        }
 692        blk_mig_lock();
 693        if (ret != 0) {
 694            /* no more dirty blocks */
 695            break;
 696        }
 697    }
 698    blk_mig_unlock();
 699
 700    ret = flush_blks(f);
 701    if (ret) {
 702        return ret;
 703    }
 704
 705    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
 706    delta_ftell = qemu_ftell(f) - last_ftell;
 707    if (delta_ftell > 0) {
 708        return 1;
 709    } else if (delta_ftell < 0) {
 710        return -1;
 711    } else {
 712        return 0;
 713    }
 714}
 715
 716/* Called with iothread lock taken.  */
 717
 718static int block_save_complete(QEMUFile *f, void *opaque)
 719{
 720    int ret;
 721
 722    DPRINTF("Enter save live complete submitted %d transferred %d\n",
 723            block_mig_state.submitted, block_mig_state.transferred);
 724
 725    ret = flush_blks(f);
 726    if (ret) {
 727        return ret;
 728    }
 729
 730    blk_mig_reset_dirty_cursor();
 731
 732    /* we know for sure that save bulk is completed and
 733       all async read completed */
 734    blk_mig_lock();
 735    assert(block_mig_state.submitted == 0);
 736    blk_mig_unlock();
 737
 738    do {
 739        ret = blk_mig_save_dirty_block(f, 0);
 740        if (ret < 0) {
 741            return ret;
 742        }
 743    } while (ret == 0);
 744
 745    /* report completion */
 746    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);
 747
 748    DPRINTF("Block migration completed\n");
 749
 750    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
 751
 752    blk_mig_cleanup();
 753    return 0;
 754}
 755
 756static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
 757{
 758    /* Estimate pending number of bytes to send */
 759    uint64_t pending;
 760
 761    qemu_mutex_lock_iothread();
 762    blk_mig_lock();
 763    pending = get_remaining_dirty() +
 764                       block_mig_state.submitted * BLOCK_SIZE +
 765                       block_mig_state.read_done * BLOCK_SIZE;
 766
 767    /* Report at least one block pending during bulk phase */
 768    if (pending == 0 && !block_mig_state.bulk_completed) {
 769        pending = BLOCK_SIZE;
 770    }
 771    blk_mig_unlock();
 772    qemu_mutex_unlock_iothread();
 773
 774    DPRINTF("Enter save live pending  %" PRIu64 "\n", pending);
 775    return pending;
 776}
 777
 778static int block_load(QEMUFile *f, void *opaque, int version_id)
 779{
 780    static int banner_printed;
 781    int len, flags;
 782    char device_name[256];
 783    int64_t addr;
 784    BlockDriverState *bs, *bs_prev = NULL;
 785    uint8_t *buf;
 786    int64_t total_sectors = 0;
 787    int nr_sectors;
 788    int ret;
 789
 790    do {
 791        addr = qemu_get_be64(f);
 792
 793        flags = addr & ~BDRV_SECTOR_MASK;
 794        addr >>= BDRV_SECTOR_BITS;
 795
 796        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
 797            /* get device name */
 798            len = qemu_get_byte(f);
 799            qemu_get_buffer(f, (uint8_t *)device_name, len);
 800            device_name[len] = '\0';
 801
 802            bs = bdrv_find(device_name);
 803            if (!bs) {
 804                fprintf(stderr, "Error unknown block device %s\n",
 805                        device_name);
 806                return -EINVAL;
 807            }
 808
 809            if (bs != bs_prev) {
 810                bs_prev = bs;
 811                total_sectors = bdrv_nb_sectors(bs);
 812                if (total_sectors <= 0) {
 813                    error_report("Error getting length of block device %s",
 814                                 device_name);
 815                    return -EINVAL;
 816                }
 817            }
 818
 819            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
 820                nr_sectors = total_sectors - addr;
 821            } else {
 822                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
 823            }
 824
 825            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
 826                ret = bdrv_write_zeroes(bs, addr, nr_sectors,
 827                                        BDRV_REQ_MAY_UNMAP);
 828            } else {
 829                buf = g_malloc(BLOCK_SIZE);
 830                qemu_get_buffer(f, buf, BLOCK_SIZE);
 831                ret = bdrv_write(bs, addr, buf, nr_sectors);
 832                g_free(buf);
 833            }
 834
 835            if (ret < 0) {
 836                return ret;
 837            }
 838        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
 839            if (!banner_printed) {
 840                printf("Receiving block device images\n");
 841                banner_printed = 1;
 842            }
 843            printf("Completed %d %%%c", (int)addr,
 844                   (addr == 100) ? '\n' : '\r');
 845            fflush(stdout);
 846        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
 847            fprintf(stderr, "Unknown block migration flags: %#x\n", flags);
 848            return -EINVAL;
 849        }
 850        ret = qemu_file_get_error(f);
 851        if (ret != 0) {
 852            return ret;
 853        }
 854    } while (!(flags & BLK_MIG_FLAG_EOS));
 855
 856    return 0;
 857}
 858
 859static void block_set_params(const MigrationParams *params, void *opaque)
 860{
 861    block_mig_state.blk_enable = params->blk;
 862    block_mig_state.shared_base = params->shared;
 863
 864    /* shared base means that blk_enable = 1 */
 865    block_mig_state.blk_enable |= params->shared;
 866}
 867
 868static bool block_is_active(void *opaque)
 869{
 870    return block_mig_state.blk_enable == 1;
 871}
 872
 873static SaveVMHandlers savevm_block_handlers = {
 874    .set_params = block_set_params,
 875    .save_live_setup = block_save_setup,
 876    .save_live_iterate = block_save_iterate,
 877    .save_live_complete = block_save_complete,
 878    .save_live_pending = block_save_pending,
 879    .load_state = block_load,
 880    .cancel = block_migration_cancel,
 881    .is_active = block_is_active,
 882};
 883
 884void blk_mig_init(void)
 885{
 886    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
 887    QSIMPLEQ_INIT(&block_mig_state.blk_list);
 888    qemu_mutex_init(&block_mig_state.lock);
 889
 890    register_savevm_live(NULL, "block", 0, 1, &savevm_block_handlers,
 891                         &block_mig_state);
 892}
 893