qemu/migration/block.c
/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/cutils.h"
#include "qemu/queue.h"
#include "block.h"
#include "block/dirty-bitmap.h"
#include "migration/misc.h"
#include "migration.h"
#include "migration-stats.h"
#include "migration/register.h"
#include "qemu-file.h"
#include "migration/vmstate.h"
#include "sysemu/block-backend.h"
#include "trace.h"
#include "options.h"

#define BLK_MIG_BLOCK_SIZE           (1ULL << 20)
#define BDRV_SECTORS_PER_DIRTY_CHUNK (BLK_MIG_BLOCK_SIZE >> BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04
#define BLK_MIG_FLAG_ZERO_BLOCK         0x08
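
/*
 * On-the-wire format (see blk_send() and block_load() below): each chunk
 * begins with a big-endian 64-bit word whose low BDRV_SECTOR_BITS bits
 * carry the flags above and whose remaining bits carry the sector number.
 * A DEVICE_BLOCK chunk is followed by a one-byte device-name length, the
 * device name itself and, unless ZERO_BLOCK is set, BLK_MIG_BLOCK_SIZE
 * bytes of data.
 */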

#define MAX_IS_ALLOCATED_SEARCH (65536 * BDRV_SECTOR_SIZE)

#define MAX_IO_BUFFERS 512
#define MAX_PARALLEL_IO 16

typedef struct BlkMigDevState {
    /* Written during setup phase.  Can be read without a lock.  */
    BlockBackend *blk;
    char *blk_name;
    int shared_base;
    int64_t total_sectors;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;
    Error *blocker;

    /* Only used by migration thread.  Does not need a lock.  */
    int bulk_completed;
    int64_t cur_sector;
    int64_t cur_dirty;

    /* Data in the aio_bitmap is protected by block migration lock.
     * Allocation and free happen during setup and cleanup respectively.
     */
    unsigned long *aio_bitmap;

    /* Protected by block migration lock.  */
    int64_t completed_sectors;

    /* During migration this is protected by iothread lock / AioContext.
     * Allocation and free happen during setup and cleanup respectively.
     */
    BdrvDirtyBitmap *dirty_bitmap;
} BlkMigDevState;

typedef struct BlkMigBlock {
    /* Only used by migration thread.  */
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    int nr_sectors;
    QEMUIOVector qiov;
    BlockAIOCB *aiocb;

    /* Protected by block migration lock.  */
    int ret;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;

typedef struct BlkMigState {
    QSIMPLEQ_HEAD(, BlkMigDevState) bmds_list;
    int64_t total_sector_sum;
    bool zero_blocks;

    /* Protected by lock.  */
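    /* submitted counts AIO reads still in flight; read_done counts reads
     * that have completed and sit on blk_list waiting to be sent.
     */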
    QSIMPLEQ_HEAD(, BlkMigBlock) blk_list;
    int submitted;
    int read_done;

    /* Only used by migration thread.  Does not need a lock.  */
    int transferred;
    int prev_progress;
    int bulk_completed;

    /* Lock must be taken _inside_ the iothread lock and any AioContexts.  */
    QemuMutex lock;
} BlkMigState;

static BlkMigState block_mig_state;

static void blk_mig_lock(void)
{
    qemu_mutex_lock(&block_mig_state.lock);
}

static void blk_mig_unlock(void)
{
    qemu_mutex_unlock(&block_mig_state.lock);
}

/* Must run outside of the iothread lock during the bulk phase,
 * or the VM will stall.
 */

static void blk_send(QEMUFile *f, BlkMigBlock * blk)
{
    int len;
    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;

    if (block_mig_state.zero_blocks &&
        buffer_is_zero(blk->buf, BLK_MIG_BLOCK_SIZE)) {
        flags |= BLK_MIG_FLAG_ZERO_BLOCK;
    }

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | flags);

    /* device name */
    len = strlen(blk->bmds->blk_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *) blk->bmds->blk_name, len);

    /* If a block is zero we need to flush here, since the network
     * bandwidth is now a lot higher than the storage device bandwidth.
     * Thus, if we queue zero blocks we slow down the migration. */
    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
        qemu_fflush(f);
        return;
    }

    qemu_put_buffer(f, blk->buf, BLK_MIG_BLOCK_SIZE);
}

int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

int blk_mig_bulk_active(void)
{
    return blk_mig_active() && !block_mig_state.bulk_completed;
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    blk_mig_lock();
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    blk_mig_unlock();
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}


/* Called with migration lock held.  */

static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (sector < bmds->total_sectors) {
        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

/* Called with migration lock held.  */

static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
                             int nb_sectors, int set)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bmds->aio_bitmap[idx];
        if (set) {
            val |= 1UL << bit;
        } else {
            val &= ~(1UL << bit);
        }
        bmds->aio_bitmap[idx] = val;
    }
}

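/* One bit per BDRV_SECTORS_PER_DIRTY_CHUNK sectors (i.e. per 1 MiB chunk),
 * rounded up to whole bytes.
 */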
static void alloc_aio_bitmap(BlkMigDevState *bmds)
{
    int64_t bitmap_size;

    bitmap_size = bmds->total_sectors + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

    bmds->aio_bitmap = g_malloc0(bitmap_size);
}

/* Never hold migration lock when yielding to the main loop!  */

static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk_mig_lock();
    blk->ret = ret;

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
    blk_mig_unlock();
}

/* Called with no lock taken.  */

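/* Reads the next bulk chunk of @bmds asynchronously.  Returns 1 once the
 * whole device has been submitted for the bulk phase, 0 otherwise.
 */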
static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockBackend *bb = bmds->blk;
    BlkMigBlock *blk;
    int nr_sectors;
    int64_t count;

    if (bmds->shared_base) {
        qemu_mutex_lock_iothread();
        aio_context_acquire(blk_get_aio_context(bb));
        /* Skip unallocated sectors; intentionally treats failure or
         * partial sector as an allocated sector */
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(blk_bs(bb), cur_sector * BDRV_SECTOR_SIZE,
                                  MAX_IS_ALLOCATED_SEARCH, &count)) {
            if (count < BDRV_SECTOR_SIZE) {
                break;
            }
            cur_sector += count >> BDRV_SECTOR_BITS;
        }
        aio_context_release(blk_get_aio_context(bb));
        qemu_mutex_unlock_iothread();
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = g_new(BlkMigBlock, 1);
    blk->buf = g_malloc(BLK_MIG_BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;
    blk->nr_sectors = nr_sectors;

    qemu_iovec_init_buf(&blk->qiov, blk->buf, nr_sectors * BDRV_SECTOR_SIZE);

    blk_mig_lock();
    block_mig_state.submitted++;
    blk_mig_unlock();

    /* We do not know if bs is under the main thread (and thus does
     * not acquire the AioContext when doing AIO) or rather under
     * dataplane.  Thus acquire both the iothread mutex and the
     * AioContext.
     *
     * This is ugly and will disappear when we make bdrv_* thread-safe,
     * without the need to acquire the AioContext.
     */
    qemu_mutex_lock_iothread();
    aio_context_acquire(blk_get_aio_context(bmds->blk));
    bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector * BDRV_SECTOR_SIZE,
                            nr_sectors * BDRV_SECTOR_SIZE);
    blk->aiocb = blk_aio_preadv(bb, cur_sector * BDRV_SECTOR_SIZE, &blk->qiov,
                                0, blk_mig_read_cb, blk);
    aio_context_release(blk_get_aio_context(bmds->blk));
    qemu_mutex_unlock_iothread();

    bmds->cur_sector = cur_sector + nr_sectors;
    return (bmds->cur_sector >= total_sectors);
}

/* Called with iothread lock taken.  */

static int set_dirty_tracking(void)
{
    BlkMigDevState *bmds;
    int ret;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(blk_bs(bmds->blk),
                                                      BLK_MIG_BLOCK_SIZE,
                                                      NULL, NULL);
        if (!bmds->dirty_bitmap) {
            ret = -errno;
            goto fail;
        }
    }
    return 0;

fail:
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->dirty_bitmap) {
            bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
        }
    }
    return ret;
}

/* Called with iothread lock taken.  */

static void unset_dirty_tracking(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->dirty_bitmap) {
            bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
        }
    }
}

static int init_blk_migration(QEMUFile *f)
{
    BlockDriverState *bs;
    BlkMigDevState *bmds;
    int64_t sectors;
    BdrvNextIterator it;
    int i, num_bs = 0;
    struct {
        BlkMigDevState *bmds;
        BlockDriverState *bs;
    } *bmds_bs;
    Error *local_err = NULL;
    int ret;

    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.zero_blocks = migrate_zero_blocks();

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        num_bs++;
    }
    bmds_bs = g_malloc0(num_bs * sizeof(*bmds_bs));

    for (i = 0, bs = bdrv_first(&it); bs; bs = bdrv_next(&it), i++) {
        if (bdrv_is_read_only(bs)) {
            continue;
        }

        sectors = bdrv_nb_sectors(bs);
        if (sectors <= 0) {
            ret = sectors;
            bdrv_next_cleanup(&it);
            goto out;
        }

        bmds = g_new0(BlkMigDevState, 1);
        bmds->blk = blk_new(qemu_get_aio_context(),
                            BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
        bmds->blk_name = g_strdup(bdrv_get_device_name(bs));
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
        bmds->completed_sectors = 0;
        bmds->shared_base = migrate_block_incremental();

        assert(i < num_bs);
        bmds_bs[i].bmds = bmds;
        bmds_bs[i].bs = bs;

        block_mig_state.total_sector_sum += sectors;

        if (bmds->shared_base) {
            trace_migration_block_init_shared(bdrv_get_device_name(bs));
        } else {
            trace_migration_block_init_full(bdrv_get_device_name(bs));
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }

    /* Can only insert new BDSes now because doing so while iterating block
     * devices may end up in a deadlock (iterating the new BDSes, too). */
    for (i = 0; i < num_bs; i++) {
        BlkMigDevState *bmds = bmds_bs[i].bmds;
        BlockDriverState *bs = bmds_bs[i].bs;

        if (bmds) {
            ret = blk_insert_bs(bmds->blk, bs, &local_err);
            if (ret < 0) {
                error_report_err(local_err);
                goto out;
            }

            alloc_aio_bitmap(bmds);
            error_setg(&bmds->blocker, "block device is in use by migration");
            bdrv_op_block_all(bs, bmds->blocker);
        }
    }

    ret = 0;
out:
    g_free(bmds_bs);
    return ret;
}

/* Called with no lock taken.  */

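/* Returns 1 while at least one device still has bulk data to send, 0 once
 * the bulk phase has finished on every device.  Also emits a PROGRESS
 * chunk whenever the overall percentage changes.
 */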
static int blk_mig_save_bulked_block(QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    if (block_mig_state.total_sector_sum != 0) {
        progress = completed_sector_sum * 100 /
                   block_mig_state.total_sector_sum;
    } else {
        progress = 100;
    }
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        trace_migration_block_progression(progress);
    }

    return ret;
}

static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

/* Called with iothread lock and AioContext taken.  */

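/* Scans from bmds->cur_dirty for the next dirty chunk and transfers it,
 * asynchronously if is_async is set.  Returns 1 when the scan has reached
 * the end of the device, 0 if more chunks may remain, negative on error.
 */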
static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
                                 int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;
    int ret = -EIO;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        blk_mig_lock();
        if (bmds_aio_inflight(bmds, sector)) {
            blk_mig_unlock();
            blk_drain(bmds->blk);
        } else {
            blk_mig_unlock();
        }
        bdrv_dirty_bitmap_lock(bmds->dirty_bitmap);
        if (bdrv_dirty_bitmap_get_locked(bmds->dirty_bitmap,
                                         sector * BDRV_SECTOR_SIZE)) {
            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            bdrv_reset_dirty_bitmap_locked(bmds->dirty_bitmap,
                                           sector * BDRV_SECTOR_SIZE,
                                           nr_sectors * BDRV_SECTOR_SIZE);
            bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap);

            blk = g_new(BlkMigBlock, 1);
            blk->buf = g_malloc(BLK_MIG_BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;
            blk->nr_sectors = nr_sectors;

            if (is_async) {
                qemu_iovec_init_buf(&blk->qiov, blk->buf,
                                    nr_sectors * BDRV_SECTOR_SIZE);

                blk->aiocb = blk_aio_preadv(bmds->blk,
                                            sector * BDRV_SECTOR_SIZE,
                                            &blk->qiov, 0, blk_mig_read_cb,
                                            blk);

                blk_mig_lock();
                block_mig_state.submitted++;
                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
                blk_mig_unlock();
            } else {
                ret = blk_pread(bmds->blk, sector * BDRV_SECTOR_SIZE,
                                nr_sectors * BDRV_SECTOR_SIZE, blk->buf, 0);
                if (ret < 0) {
                    goto error;
                }
                blk_send(f, blk);

                g_free(blk->buf);
                g_free(blk);
            }

            sector += nr_sectors;
            bmds->cur_dirty = sector;
            break;
        }

        bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap);
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    trace_migration_block_save_device_dirty(sector);
    g_free(blk->buf);
    g_free(blk);
    return ret;
}

/* Called with iothread lock taken.
 *
 * Return value:
 * 0: too much data for max_downtime
 * 1: little enough data for max_downtime
 */
static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 1;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        aio_context_acquire(blk_get_aio_context(bmds->blk));
        ret = mig_save_device_dirty(f, bmds, is_async);
        aio_context_release(blk_get_aio_context(bmds->blk));
        if (ret <= 0) {
            break;
        }
    }

    return ret;
}

/* Called with no locks taken.  */

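/* Drains completed reads from blk_list into the migration stream until the
 * rate limit is hit or the list is empty.  Returns the first read error
 * seen, if any, else 0.
 */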
static int flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk;
    int ret = 0;

    trace_migration_block_flush_blks("Enter", block_mig_state.submitted,
                                     block_mig_state.read_done,
                                     block_mig_state.transferred);

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (migration_rate_exceeded(f)) {
            break;
        }
        if (blk->ret < 0) {
            ret = blk->ret;
            break;
        }

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        blk_mig_unlock();
        blk_send(f, blk);
        blk_mig_lock();

        g_free(blk->buf);
        g_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }
    blk_mig_unlock();

    trace_migration_block_flush_blks("Exit", block_mig_state.submitted,
                                     block_mig_state.read_done,
                                     block_mig_state.transferred);
    return ret;
}

/* Called with iothread lock taken.  */

static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *bmds;
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        aio_context_acquire(blk_get_aio_context(bmds->blk));
        dirty += bdrv_get_dirty_count(bmds->dirty_bitmap);
        aio_context_release(blk_get_aio_context(bmds->blk));
    }

    return dirty;
}



/* Called with iothread lock taken.  */
static void block_migration_cleanup_bmds(void)
{
    BlkMigDevState *bmds;
    BlockDriverState *bs;
    AioContext *ctx;

    unset_dirty_tracking();

    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);

        bs = blk_bs(bmds->blk);
        if (bs) {
            bdrv_op_unblock_all(bs, bmds->blocker);
        }
        error_free(bmds->blocker);

        /* Save ctx, because bmds->blk can disappear during blk_unref.  */
        ctx = blk_get_aio_context(bmds->blk);
        aio_context_acquire(ctx);
        blk_unref(bmds->blk);
        aio_context_release(ctx);

        g_free(bmds->blk_name);
        g_free(bmds->aio_bitmap);
        g_free(bmds);
    }
}

/* Called with iothread lock taken.  */
static void block_migration_cleanup(void *opaque)
{
    BlkMigBlock *blk;

    bdrv_drain_all();

    block_migration_cleanup_bmds();

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        g_free(blk->buf);
        g_free(blk);
    }
    blk_mig_unlock();
}

static int block_save_setup(QEMUFile *f, void *opaque)
{
    int ret;

    trace_migration_block_save("setup", block_mig_state.submitted,
                               block_mig_state.transferred);

    qemu_mutex_lock_iothread();
    ret = init_blk_migration(f);
    if (ret < 0) {
        qemu_mutex_unlock_iothread();
        return ret;
    }

    /* start tracking dirty blocks */
    ret = set_dirty_tracking();

    qemu_mutex_unlock_iothread();

    if (ret) {
        return ret;
    }

    ret = flush_blks(f);
    blk_mig_reset_dirty_cursor();
    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ret;
}

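/* Returns a negative value on error, otherwise whether any bytes were added
 * to the stream during this iteration (EOS marker included).
 */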
static int block_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    uint64_t last_bytes = qemu_file_transferred(f);

    trace_migration_block_save("iterate", block_mig_state.submitted,
                               block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* control the rate of transfer */
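    /* Keep reading while (a) the data already read but not yet sent stays
     * under the configured bandwidth limit, (b) no more than MAX_PARALLEL_IO
     * AIO reads are in flight, and (c) the total number of buffered blocks
     * stays under MAX_IO_BUFFERS.
     */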
    blk_mig_lock();
    while (block_mig_state.read_done * BLK_MIG_BLOCK_SIZE <
           migration_rate_get() &&
           block_mig_state.submitted < MAX_PARALLEL_IO &&
           (block_mig_state.submitted + block_mig_state.read_done) <
           MAX_IO_BUFFERS) {
        blk_mig_unlock();
        if (block_mig_state.bulk_completed == 0) {
            /* first finish the bulk phase */
            if (blk_mig_save_bulked_block(f) == 0) {
                /* finished saving bulk on all devices */
                block_mig_state.bulk_completed = 1;
            }
            ret = 0;
        } else {
            /* Always called with iothread lock taken for
             * simplicity, block_save_complete also calls it.
             */
            qemu_mutex_lock_iothread();
            ret = blk_mig_save_dirty_block(f, 1);
            qemu_mutex_unlock_iothread();
        }
        if (ret < 0) {
            return ret;
        }
        blk_mig_lock();
        if (ret != 0) {
            /* no more dirty blocks */
            break;
        }
    }
    blk_mig_unlock();

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    uint64_t delta_bytes = qemu_file_transferred(f) - last_bytes;
    return (delta_bytes > 0);
}

/* Called with iothread lock taken.  */

static int block_save_complete(QEMUFile *f, void *opaque)
{
    int ret;

    trace_migration_block_save("complete", block_mig_state.submitted,
                               block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* We know for sure that the bulk phase is complete and all
     * async reads have completed. */
    blk_mig_lock();
    assert(block_mig_state.submitted == 0);
    blk_mig_unlock();

    do {
        ret = blk_mig_save_dirty_block(f, 0);
        if (ret < 0) {
            return ret;
        }
    } while (ret == 0);

    /* report completion */
    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

    trace_migration_block_save_complete();

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    /* Make sure that our BlockBackends are gone, so that the block driver
     * nodes can be inactivated. */
    block_migration_cleanup_bmds();

    return 0;
}

static void block_state_pending(void *opaque, uint64_t *must_precopy,
                                uint64_t *can_postcopy)
{
    /* Estimate pending number of bytes to send */
    uint64_t pending;

    qemu_mutex_lock_iothread();
    pending = get_remaining_dirty();
    qemu_mutex_unlock_iothread();

    blk_mig_lock();
    pending += block_mig_state.submitted * BLK_MIG_BLOCK_SIZE +
               block_mig_state.read_done * BLK_MIG_BLOCK_SIZE;
    blk_mig_unlock();

    /* Report at least one block pending during bulk phase */
    if (!pending && !block_mig_state.bulk_completed) {
        pending = BLK_MIG_BLOCK_SIZE;
    }

    trace_migration_block_state_pending(pending);
    /* We don't do postcopy */
    *must_precopy += pending;
}

static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockBackend *blk, *blk_prev = NULL;
    Error *local_err = NULL;
    uint8_t *buf;
    int64_t total_sectors = 0;
    int nr_sectors;
    int ret;
    BlockDriverInfo bdi;
    int cluster_size = BLK_MIG_BLOCK_SIZE;

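    /* Each iteration consumes one chunk: a 64-bit header with the flags in
     * its low bits and the sector number in the rest, then, for device
     * blocks, the device name and payload.  Loop until an EOS chunk is seen.
     */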
    do {
        addr = qemu_get_be64(f);

        flags = addr & (BDRV_SECTOR_SIZE - 1);
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            blk = blk_by_name(device_name);
            if (!blk) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            if (blk != blk_prev) {
                blk_prev = blk;
                total_sectors = blk_nb_sectors(blk);
                if (total_sectors <= 0) {
                    error_report("Error getting length of block device %s",
                                 device_name);
                    return -EINVAL;
                }

                blk_activate(blk, &local_err);
                if (local_err) {
                    error_report_err(local_err);
                    return -EINVAL;
                }

                ret = bdrv_get_info(blk_bs(blk), &bdi);
                if (ret == 0 && bdi.cluster_size > 0 &&
                    bdi.cluster_size <= BLK_MIG_BLOCK_SIZE &&
                    BLK_MIG_BLOCK_SIZE % bdi.cluster_size == 0) {
                    cluster_size = bdi.cluster_size;
                } else {
                    cluster_size = BLK_MIG_BLOCK_SIZE;
                }
            }

            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - addr;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }

            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
                ret = blk_pwrite_zeroes(blk, addr * BDRV_SECTOR_SIZE,
                                        nr_sectors * BDRV_SECTOR_SIZE,
                                        BDRV_REQ_MAY_UNMAP);
            } else {
                int i;
                int64_t cur_addr;
                uint8_t *cur_buf;

                buf = g_malloc(BLK_MIG_BLOCK_SIZE);
                qemu_get_buffer(f, buf, BLK_MIG_BLOCK_SIZE);
                for (i = 0; i < BLK_MIG_BLOCK_SIZE / cluster_size; i++) {
                    cur_addr = addr * BDRV_SECTOR_SIZE + i * cluster_size;
                    cur_buf = buf + i * cluster_size;

                    if ((!block_mig_state.zero_blocks ||
                        cluster_size < BLK_MIG_BLOCK_SIZE) &&
                        buffer_is_zero(cur_buf, cluster_size)) {
                        ret = blk_pwrite_zeroes(blk, cur_addr,
                                                cluster_size,
                                                BDRV_REQ_MAY_UNMAP);
                    } else {
                        ret = blk_pwrite(blk, cur_addr, cluster_size, cur_buf,
                                         0);
                    }
                    if (ret < 0) {
                        break;
                    }
                }
                g_free(buf);
            }

            if (ret < 0) {
                return ret;
            }
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown block migration flags: 0x%x\n", flags);
            return -EINVAL;
        }
        ret = qemu_file_get_error(f);
        if (ret != 0) {
            return ret;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static bool block_is_active(void *opaque)
{
    return migrate_block();
}

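/* The same callback serves both the exact and the estimated pending query,
 * and is_active gates the whole section on whether block migration was
 * requested (migrate_block()).
 */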
static SaveVMHandlers savevm_block_handlers = {
    .save_setup = block_save_setup,
    .save_live_iterate = block_save_iterate,
    .save_live_complete_precopy = block_save_complete,
    .state_pending_exact = block_state_pending,
    .state_pending_estimate = block_state_pending,
    .load_state = block_load,
    .save_cleanup = block_migration_cleanup,
    .is_active = block_is_active,
};

void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);
    qemu_mutex_init(&block_mig_state.lock);

    register_savevm_live("block", 0, 1, &savevm_block_handlers,
                         &block_mig_state);
}