/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu-common.h"
#include "block/block.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "hw/hw.h"
#include "qemu/cutils.h"
#include "qemu/queue.h"
#include "qemu/timer.h"
#include "migration/block.h"
#include "migration/migration.h"
#include "sysemu/blockdev.h"
#include "sysemu/block-backend.h"

#define BLOCK_SIZE                       (1 << 20)
#define BDRV_SECTORS_PER_DIRTY_CHUNK     (BLOCK_SIZE >> BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04
#define BLK_MIG_FLAG_ZERO_BLOCK         0x08
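/*
 * On-the-wire format, as produced by blk_send() and consumed by
 * block_load(): a be64 word holding (sector << BDRV_SECTOR_BITS) | flags,
 * followed for device blocks by a one-byte device name length, the device
 * name, and BLOCK_SIZE bytes of data unless BLK_MIG_FLAG_ZERO_BLOCK is
 * set.  Progress and EOS records consist of the be64 word alone.
 */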

#define MAX_IS_ALLOCATED_SEARCH 65536

#define MAX_INFLIGHT_IO 512

//#define DEBUG_BLK_MIGRATION

#ifdef DEBUG_BLK_MIGRATION
#define DPRINTF(fmt, ...) \
    do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif
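/* Per-device block migration state; one entry per migrated
 * BlockDriverState, linked into block_mig_state.bmds_list.  */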

typedef struct BlkMigDevState {
    /* Written during setup phase.  Can be read without a lock.  */
    BlockDriverState *bs;
    int shared_base;
    int64_t total_sectors;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;
    Error *blocker;

    /* Only used by migration thread.  Does not need a lock.  */
    int bulk_completed;
    int64_t cur_sector;
    int64_t cur_dirty;

    /* Data in the aio_bitmap is protected by block migration lock.
     * Allocation and free happen during setup and cleanup respectively.
     */
    unsigned long *aio_bitmap;

    /* Protected by block migration lock.  */
    int64_t completed_sectors;

    /* During migration this is protected by iothread lock / AioContext.
     * Allocation and free happen during setup and cleanup respectively.
     */
    BdrvDirtyBitmap *dirty_bitmap;
} BlkMigDevState;
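/* A single BLOCK_SIZE chunk that has been read (or is being read) from a
 * device and is waiting to be sent on the migration stream.  */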

typedef struct BlkMigBlock {
    /* Only used by migration thread.  */
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    int nr_sectors;
    struct iovec iov;
    QEMUIOVector qiov;
    BlockAIOCB *aiocb;

    /* Protected by block migration lock.  */
    int ret;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

typedef struct BlkMigState {
    /* Written during setup phase.  Can be read without a lock.  */
    int blk_enable;
    int shared_base;
    QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list;
    int64_t total_sector_sum;
    bool zero_blocks;

    /* Protected by lock.  */
    QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list;
    int submitted;
    int read_done;

    /* Only used by migration thread.  Does not need a lock.  */
    int transferred;
    int prev_progress;
    int bulk_completed;

    /* Lock must be taken _inside_ the iothread lock and any AioContexts.  */
    QemuMutex lock;
} BlkMigState;

static BlkMigState block_mig_state;

static void blk_mig_lock(void)
{
    qemu_mutex_lock(&block_mig_state.lock);
}

static void blk_mig_unlock(void)
{
    qemu_mutex_unlock(&block_mig_state.lock);
}

/* Must run outside of the iothread lock during the bulk phase,
 * or the VM will stall.
 */

static void blk_send(QEMUFile *f, BlkMigBlock * blk)
{
    int len;
    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;

    if (block_mig_state.zero_blocks &&
        buffer_is_zero(blk->buf, BLOCK_SIZE)) {
        flags |= BLK_MIG_FLAG_ZERO_BLOCK;
    }

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | flags);

    /* device name */
    len = strlen(bdrv_get_device_name(blk->bmds->bs));
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *)bdrv_get_device_name(blk->bmds->bs), len);

    /* If a block is zero we need to flush here since the network
     * bandwidth is now a lot higher than the storage device bandwidth.
     * Thus, if we queue zero blocks we slow down the migration.  */
    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
        qemu_fflush(f);
        return;
    }

    qemu_put_buffer(f, blk->buf, BLOCK_SIZE);
}

int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    blk_mig_lock();
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    blk_mig_unlock();
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}


/* Called with migration lock held.  */
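/* Returns non-zero if an AIO read is still in flight for the dirty-bitmap
 * chunk that contains @sector, according to bmds->aio_bitmap.  */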

static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (sector < bdrv_nb_sectors(bmds->bs)) {
        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

/* Called with migration lock held.  */

static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
                             int nb_sectors, int set)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bmds->aio_bitmap[idx];
        if (set) {
            val |= 1UL << bit;
        } else {
            val &= ~(1UL << bit);
        }
        bmds->aio_bitmap[idx] = val;
    }
}

static void alloc_aio_bitmap(BlkMigDevState *bmds)
{
    BlockDriverState *bs = bmds->bs;
    int64_t bitmap_size;

    bitmap_size = bdrv_nb_sectors(bs) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

    bmds->aio_bitmap = g_malloc0(bitmap_size);
}

/* Never hold migration lock when yielding to the main loop!  */
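/* AIO completion callback for bulk and dirty reads: records the result,
 * queues the block on blk_list for sending and clears its in-flight bit.  */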

static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk_mig_lock();
    blk->ret = ret;

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
    blk_mig_unlock();
}

/* Called with no lock taken.  */
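/* Submits an asynchronous read for the next bulk chunk of @bmds, skipping
 * sectors that are unallocated on top of a shared base image, and advances
 * the bulk cursor.  Returns 1 once the whole device has been covered.  */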

static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockDriverState *bs = bmds->bs;
    BlkMigBlock *blk;
    int nr_sectors;

    if (bmds->shared_base) {
        qemu_mutex_lock_iothread();
        aio_context_acquire(bdrv_get_aio_context(bs));
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH,
                                  &nr_sectors)) {
            cur_sector += nr_sectors;
        }
        aio_context_release(bdrv_get_aio_context(bs));
        qemu_mutex_unlock_iothread();
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = g_new(BlkMigBlock, 1);
    blk->buf = g_malloc(BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;
    blk->nr_sectors = nr_sectors;

    blk->iov.iov_base = blk->buf;
    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

    blk_mig_lock();
    block_mig_state.submitted++;
    blk_mig_unlock();

    /* We do not know if bs is under the main thread (and thus does
     * not acquire the AioContext when doing AIO) or rather under
     * dataplane.  Thus acquire both the iothread mutex and the
     * AioContext.
     *
     * This is ugly and will disappear when we make bdrv_* thread-safe,
     * without the need to acquire the AioContext.
     */
    qemu_mutex_lock_iothread();
    aio_context_acquire(bdrv_get_aio_context(bmds->bs));
    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                nr_sectors, blk_mig_read_cb, blk);

    bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector, nr_sectors);
    aio_context_release(bdrv_get_aio_context(bmds->bs));
    qemu_mutex_unlock_iothread();

    bmds->cur_sector = cur_sector + nr_sectors;
    return (bmds->cur_sector >= total_sectors);
}

/* Called with iothread lock taken.  */
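/* Creates a BLOCK_SIZE-granularity dirty bitmap for every device on
 * bmds_list; on failure, releases the bitmaps created so far.  */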

static int set_dirty_tracking(void)
{
    BlkMigDevState *bmds;
    int ret;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        aio_context_acquire(bdrv_get_aio_context(bmds->bs));
        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(bmds->bs, BLOCK_SIZE,
                                                      NULL, NULL);
        aio_context_release(bdrv_get_aio_context(bmds->bs));
        if (!bmds->dirty_bitmap) {
            ret = -errno;
            goto fail;
        }
    }
    return 0;

fail:
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->dirty_bitmap) {
            aio_context_acquire(bdrv_get_aio_context(bmds->bs));
            bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap);
            aio_context_release(bdrv_get_aio_context(bmds->bs));
        }
    }
    return ret;
}

/* Called with iothread lock taken.  */

static void unset_dirty_tracking(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        aio_context_acquire(bdrv_get_aio_context(bmds->bs));
        bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap);
        aio_context_release(bdrv_get_aio_context(bmds->bs));
    }
}
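/* Builds bmds_list: allocates a BlkMigDevState for every writable block
 * device, references it and blocks conflicting operations on it for the
 * duration of the migration.  */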

static void init_blk_migration(QEMUFile *f)
{
    BlockDriverState *bs;
    BlkMigDevState *bmds;
    int64_t sectors;

    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.zero_blocks = migrate_zero_blocks();

    for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
        if (bdrv_is_read_only(bs)) {
            continue;
        }

        sectors = bdrv_nb_sectors(bs);
        if (sectors <= 0) {
            return;
        }

        bmds = g_new0(BlkMigDevState, 1);
        bmds->bs = bs;
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
        bmds->completed_sectors = 0;
        bmds->shared_base = block_mig_state.shared_base;
        alloc_aio_bitmap(bmds);
        error_setg(&bmds->blocker, "block device is in use by migration");
        bdrv_op_block_all(bs, bmds->blocker);
        bdrv_ref(bs);

        block_mig_state.total_sector_sum += sectors;

        if (bmds->shared_base) {
            DPRINTF("Start migration for %s with shared base image\n",
                    bdrv_get_device_name(bs));
        } else {
            DPRINTF("Start full migration for %s\n", bdrv_get_device_name(bs));
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }
}

/* Called with no lock taken.  */
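/* Sends one bulk chunk from the first device whose bulk phase is not yet
 * complete and emits a progress record whenever the percentage changes.
 * Returns 0 once every device has finished its bulk phase.  */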

static int blk_mig_save_bulked_block(QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    if (block_mig_state.total_sector_sum != 0) {
        progress = completed_sector_sum * 100 /
                   block_mig_state.total_sector_sum;
    } else {
        progress = 100;
    }
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        DPRINTF("Completed %d %%\r", progress);
    }

    return ret;
}

static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

/* Called with iothread lock and AioContext taken.  */
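/* Looks for the next dirty chunk of @bmds starting at cur_dirty and sends
 * it, asynchronously if @is_async is set and synchronously otherwise.
 * Returns 1 if no dirty chunk remains up to the end of the device, 0 after
 * sending a chunk, or a negative value on a read error.  */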

static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
                                 int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;
    int ret = -EIO;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        blk_mig_lock();
        if (bmds_aio_inflight(bmds, sector)) {
            blk_mig_unlock();
            bdrv_drain(bmds->bs);
        } else {
            blk_mig_unlock();
        }
        if (bdrv_get_dirty(bmds->bs, bmds->dirty_bitmap, sector)) {

            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            blk = g_new(BlkMigBlock, 1);
            blk->buf = g_malloc(BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;
            blk->nr_sectors = nr_sectors;

            if (is_async) {
                blk->iov.iov_base = blk->buf;
                blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
                qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);

                blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
                                            nr_sectors, blk_mig_read_cb, blk);

                blk_mig_lock();
                block_mig_state.submitted++;
                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
                blk_mig_unlock();
            } else {
                ret = bdrv_read(bmds->bs, sector, blk->buf, nr_sectors);
                if (ret < 0) {
                    goto error;
                }
                blk_send(f, blk);

                g_free(blk->buf);
                g_free(blk);
            }

            bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, sector, nr_sectors);
            break;
        }
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    DPRINTF("Error reading sector %" PRId64 "\n", sector);
    g_free(blk->buf);
    g_free(blk);
    return ret;
}

/* Called with iothread lock taken.
 *
 * return value:
 * 0: too much data for max_downtime
 * 1: little enough data for max_downtime
 */
static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 1;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        aio_context_acquire(bdrv_get_aio_context(bmds->bs));
        ret = mig_save_device_dirty(f, bmds, is_async);
        aio_context_release(bdrv_get_aio_context(bmds->bs));
        if (ret <= 0) {
            break;
        }
    }

    return ret;
}

/* Called with no locks taken.  */
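/* Drains the list of completed reads, sending each block to @f until the
 * rate limit is reached or an I/O error is found.  Returns that error, if
 * any.  */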

static int flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk;
    int ret = 0;

    DPRINTF("%s Enter submitted %d read_done %d transferred %d\n",
            __func__, block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            ret = blk->ret;
            break;
        }

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        blk_mig_unlock();
        blk_send(f, blk);
        blk_mig_lock();

        g_free(blk->buf);
        g_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }
    blk_mig_unlock();

    DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __func__,
            block_mig_state.submitted, block_mig_state.read_done,
            block_mig_state.transferred);
    return ret;
}

/* Called with iothread lock taken.  */

static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *bmds;
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        aio_context_acquire(bdrv_get_aio_context(bmds->bs));
        dirty += bdrv_get_dirty_count(bmds->dirty_bitmap);
        aio_context_release(bdrv_get_aio_context(bmds->bs));
    }

    return dirty << BDRV_SECTOR_BITS;
}

/* Called with iothread lock taken.  */
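/* Tears down block migration state: dirty bitmaps, operation blockers,
 * device references, the aio_bitmaps, and any blocks still queued for
 * sending.  */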

static void block_migration_cleanup(void *opaque)
{
    BlkMigDevState *bmds;
    BlkMigBlock *blk;
    AioContext *ctx;

    bdrv_drain_all();

    unset_dirty_tracking();

    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        bdrv_op_unblock_all(bmds->bs, bmds->blocker);
        error_free(bmds->blocker);

        /* Save ctx, because bmds->bs can disappear during bdrv_unref.  */
        ctx = bdrv_get_aio_context(bmds->bs);
        aio_context_acquire(ctx);
        bdrv_unref(bmds->bs);
        aio_context_release(ctx);

        g_free(bmds->aio_bitmap);
        g_free(bmds);
    }

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        g_free(blk->buf);
        g_free(blk);
    }
    blk_mig_unlock();
}

static int block_save_setup(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live setup submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    qemu_mutex_lock_iothread();
    init_blk_migration(f);

    /* start tracking dirty blocks */
    ret = set_dirty_tracking();

    qemu_mutex_unlock_iothread();

    if (ret) {
        return ret;
    }

    ret = flush_blks(f);
    blk_mig_reset_dirty_cursor();
    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ret;
}
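/* One iteration of the live phase: flush completed reads, then keep the
 * read pipeline filled up to the rate limit and MAX_INFLIGHT_IO, first
 * with bulk chunks and then with dirty chunks.  Returns 1 if data was
 * written, 0 if there was nothing to send, or a negative value on error.  */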

static int block_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int64_t last_ftell = qemu_ftell(f);
    int64_t delta_ftell;

    DPRINTF("Enter save live iterate submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* control the rate of transfer */
    blk_mig_lock();
    while ((block_mig_state.submitted +
            block_mig_state.read_done) * BLOCK_SIZE <
           qemu_file_get_rate_limit(f) &&
           (block_mig_state.submitted +
            block_mig_state.read_done) <
           MAX_INFLIGHT_IO) {
        blk_mig_unlock();
        if (block_mig_state.bulk_completed == 0) {
            /* first finish the bulk phase */
            if (blk_mig_save_bulked_block(f) == 0) {
                /* finished saving bulk on all devices */
                block_mig_state.bulk_completed = 1;
            }
            ret = 0;
        } else {
            /* Always called with iothread lock taken for
             * simplicity; block_save_complete also calls it.
             */
            qemu_mutex_lock_iothread();
            ret = blk_mig_save_dirty_block(f, 1);
            qemu_mutex_unlock_iothread();
        }
        if (ret < 0) {
            return ret;
        }
        blk_mig_lock();
        if (ret != 0) {
            /* no more dirty blocks */
            break;
        }
    }
    blk_mig_unlock();

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    delta_ftell = qemu_ftell(f) - last_ftell;
    if (delta_ftell > 0) {
        return 1;
    } else if (delta_ftell < 0) {
        return -1;
    } else {
        return 0;
    }
}

/* Called with iothread lock taken.  */

static int block_save_complete(QEMUFile *f, void *opaque)
{
    int ret;

    DPRINTF("Enter save live complete submitted %d transferred %d\n",
            block_mig_state.submitted, block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* We know for sure that the bulk phase is completed and
     * all async reads have completed.  */
    blk_mig_lock();
    assert(block_mig_state.submitted == 0);
    blk_mig_unlock();

    do {
        ret = blk_mig_save_dirty_block(f, 0);
        if (ret < 0) {
            return ret;
        }
    } while (ret == 0);

    /* report completion */
    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

    DPRINTF("Block migration completed\n");

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return 0;
}
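/* Estimates the number of bytes still to be sent: the remaining dirty
 * data plus everything submitted or read but not yet transferred.  During
 * the bulk phase at least one extra block is reported so that migration
 * does not complete prematurely.  */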

static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                               uint64_t *non_postcopiable_pending,
                               uint64_t *postcopiable_pending)
{
    /* Estimate pending number of bytes to send */
    uint64_t pending;

    qemu_mutex_lock_iothread();
    pending = get_remaining_dirty();
    qemu_mutex_unlock_iothread();

    blk_mig_lock();
    pending += block_mig_state.submitted * BLOCK_SIZE +
               block_mig_state.read_done * BLOCK_SIZE;
    blk_mig_unlock();

    /* Report at least one block pending during bulk phase */
    if (pending <= max_size && !block_mig_state.bulk_completed) {
        pending = max_size + BLOCK_SIZE;
    }

    DPRINTF("Enter save live pending  %" PRIu64 "\n", pending);
    /* We don't do postcopy */
    *non_postcopiable_pending += pending;
}
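/* Destination side: parses the stream produced above, writing each
 * received chunk (or zeroes, for BLK_MIG_FLAG_ZERO_BLOCK) to the named
 * block device until an EOS record is seen.  */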

static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockDriverState *bs, *bs_prev = NULL;
    BlockBackend *blk;
    Error *local_err = NULL;
    uint8_t *buf;
    int64_t total_sectors = 0;
    int nr_sectors;
    int ret;

    do {
        addr = qemu_get_be64(f);

        flags = addr & ~BDRV_SECTOR_MASK;
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            blk = blk_by_name(device_name);
            if (!blk) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }
            bs = blk_bs(blk);
            if (!bs) {
                fprintf(stderr, "Block device %s has no medium\n",
                        device_name);
                return -EINVAL;
            }

            if (bs != bs_prev) {
                bs_prev = bs;
                total_sectors = bdrv_nb_sectors(bs);
                if (total_sectors <= 0) {
                    error_report("Error getting length of block device %s",
                                 device_name);
                    return -EINVAL;
                }

                bdrv_invalidate_cache(bs, &local_err);
                if (local_err) {
                    error_report_err(local_err);
                    return -EINVAL;
                }
            }

            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - addr;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }

            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
                ret = bdrv_write_zeroes(bs, addr, nr_sectors,
                                        BDRV_REQ_MAY_UNMAP);
            } else {
                buf = g_malloc(BLOCK_SIZE);
                qemu_get_buffer(f, buf, BLOCK_SIZE);
                ret = bdrv_write(bs, addr, buf, nr_sectors);
                g_free(buf);
            }

            if (ret < 0) {
                return ret;
            }
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown block migration flags: %#x\n", flags);
            return -EINVAL;
        }
        ret = qemu_file_get_error(f);
        if (ret != 0) {
            return ret;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static void block_set_params(const MigrationParams *params, void *opaque)
{
    block_mig_state.blk_enable = params->blk;
    block_mig_state.shared_base = params->shared;

    /* shared base means that blk_enable = 1 */
    block_mig_state.blk_enable |= params->shared;
}

static bool block_is_active(void *opaque)
{
    return block_mig_state.blk_enable == 1;
}

static SaveVMHandlers savevm_block_handlers = {
    .set_params = block_set_params,
    .save_live_setup = block_save_setup,
    .save_live_iterate = block_save_iterate,
    .save_live_complete_precopy = block_save_complete,
    .save_live_pending = block_save_pending,
    .load_state = block_load,
    .cleanup = block_migration_cleanup,
    .is_active = block_is_active,
};

void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);
    qemu_mutex_init(&block_mig_state.lock);

    register_savevm_live(NULL, "block", 0, 1, &savevm_block_handlers,
                         &block_mig_state);
}