qemu/migration/block.c
/*
 * QEMU live block migration
 *
 * Copyright IBM, Corp. 2009
 *
 * Authors:
 *  Liran Schour   <lirans@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 * Contributions after 2012-01-13 are licensed under the terms of the
 * GNU GPL, version 2 or (at your option) any later version.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/cutils.h"
#include "qemu/queue.h"
#include "block.h"
#include "block/dirty-bitmap.h"
#include "migration/misc.h"
#include "migration.h"
#include "migration/register.h"
#include "qemu-file.h"
#include "migration/vmstate.h"
#include "sysemu/block-backend.h"
#include "trace.h"

#define BLK_MIG_BLOCK_SIZE           (1ULL << 20)
#define BDRV_SECTORS_PER_DIRTY_CHUNK (BLK_MIG_BLOCK_SIZE >> BDRV_SECTOR_BITS)

#define BLK_MIG_FLAG_DEVICE_BLOCK       0x01
#define BLK_MIG_FLAG_EOS                0x02
#define BLK_MIG_FLAG_PROGRESS           0x04
#define BLK_MIG_FLAG_ZERO_BLOCK         0x08
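
/* These flags travel in the low BDRV_SECTOR_BITS bits of the 64-bit
 * "sector number and flags" field written by blk_send() and decoded by
 * block_load(), so they must all stay below BDRV_SECTOR_SIZE.
 */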

#define MAX_IS_ALLOCATED_SEARCH (65536 * BDRV_SECTOR_SIZE)

#define MAX_IO_BUFFERS 512
#define MAX_PARALLEL_IO 16
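
/* block_save_iterate() keeps at most MAX_PARALLEL_IO asynchronous reads in
 * flight and at most MAX_IO_BUFFERS chunks (in flight plus read but not yet
 * sent) buffered at any one time.
 */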

typedef struct BlkMigDevState {
    /* Written during setup phase.  Can be read without a lock.  */
    BlockBackend *blk;
    char *blk_name;
    int shared_base;
    int64_t total_sectors;
    QSIMPLEQ_ENTRY(BlkMigDevState) entry;
    Error *blocker;

    /* Only used by migration thread.  Does not need a lock.  */
    int bulk_completed;
    int64_t cur_sector;
    int64_t cur_dirty;

    /* Data in the aio_bitmap is protected by block migration lock.
     * Allocation and free happen during setup and cleanup respectively.
     */
    unsigned long *aio_bitmap;

    /* Protected by block migration lock.  */
    int64_t completed_sectors;

    /* During migration this is protected by iothread lock / AioContext.
     * Allocation and free happen during setup and cleanup respectively.
     */
    BdrvDirtyBitmap *dirty_bitmap;
} BlkMigDevState;

typedef struct BlkMigBlock {
    /* Only used by migration thread.  */
    uint8_t *buf;
    BlkMigDevState *bmds;
    int64_t sector;
    int nr_sectors;
    QEMUIOVector qiov;
    BlockAIOCB *aiocb;

    /* Protected by block migration lock.  */
    int ret;
    QSIMPLEQ_ENTRY(BlkMigBlock) entry;
} BlkMigBlock;
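
/* A BlkMigBlock describes a single chunk of up to BLK_MIG_BLOCK_SIZE while it
 * is being read from the device and queued for transmission.
 */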

typedef struct BlkMigState {
    QSIMPLEQ_HEAD(, BlkMigDevState) bmds_list;
    int64_t total_sector_sum;
    bool zero_blocks;

    /* Protected by lock.  */
    QSIMPLEQ_HEAD(, BlkMigBlock) blk_list;
    int submitted;
    int read_done;

    /* Only used by migration thread.  Does not need a lock.  */
    int transferred;
    int prev_progress;
    int bulk_completed;

    /* Lock must be taken _inside_ the iothread lock and any AioContexts.  */
    QemuMutex lock;
} BlkMigState;

static BlkMigState block_mig_state;

static void blk_mig_lock(void)
{
    qemu_mutex_lock(&block_mig_state.lock);
}

static void blk_mig_unlock(void)
{
    qemu_mutex_unlock(&block_mig_state.lock);
}

/* Must run outside of the iothread lock during the bulk phase,
 * or the VM will stall.
 */

static void blk_send(QEMUFile *f, BlkMigBlock * blk)
{
    int len;
    uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK;

    if (block_mig_state.zero_blocks &&
        buffer_is_zero(blk->buf, BLK_MIG_BLOCK_SIZE)) {
        flags |= BLK_MIG_FLAG_ZERO_BLOCK;
    }

    /* sector number and flags */
    qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS)
                     | flags);

    /* device name */
    len = strlen(blk->bmds->blk_name);
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (uint8_t *) blk->bmds->blk_name, len);

    /* If a block is zero we need to flush here since the network
     * bandwidth is now a lot higher than the storage device bandwidth.
     * Thus if we queue zero blocks we slow down the migration. */
    if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
        qemu_fflush(f);
        return;
    }

    qemu_put_buffer(f, blk->buf, BLK_MIG_BLOCK_SIZE);
}

int blk_mig_active(void)
{
    return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list);
}

int blk_mig_bulk_active(void)
{
    return blk_mig_active() && !block_mig_state.bulk_completed;
}

uint64_t blk_mig_bytes_transferred(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    blk_mig_lock();
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->completed_sectors;
    }
    blk_mig_unlock();
    return sum << BDRV_SECTOR_BITS;
}

uint64_t blk_mig_bytes_remaining(void)
{
    return blk_mig_bytes_total() - blk_mig_bytes_transferred();
}

uint64_t blk_mig_bytes_total(void)
{
    BlkMigDevState *bmds;
    uint64_t sum = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        sum += bmds->total_sectors;
    }
    return sum << BDRV_SECTOR_BITS;
}


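/* The aio_bitmap holds one bit per BLK_MIG_BLOCK_SIZE chunk and marks the
 * chunks that currently have an asynchronous read in flight.  With 512-byte
 * sectors a chunk spans BDRV_SECTORS_PER_DIRTY_CHUNK == 2048 sectors, so
 * sector N belongs to chunk N / 2048.
 */
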
/* Called with migration lock held.  */

static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
{
    int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (sector < bmds->total_sectors) {
        return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] &
            (1UL << (chunk % (sizeof(unsigned long) * 8))));
    } else {
        return 0;
    }
}

/* Called with migration lock held.  */

static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num,
                             int nb_sectors, int set)
{
    int64_t start, end;
    unsigned long val, idx, bit;

    start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
    end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;

    for (; start <= end; start++) {
        idx = start / (sizeof(unsigned long) * 8);
        bit = start % (sizeof(unsigned long) * 8);
        val = bmds->aio_bitmap[idx];
        if (set) {
            val |= 1UL << bit;
        } else {
            val &= ~(1UL << bit);
        }
        bmds->aio_bitmap[idx] = val;
    }
}

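/* Size the bitmap at one bit per chunk, eight chunks per byte, rounded up to
 * a whole number of bytes.
 */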
static void alloc_aio_bitmap(BlkMigDevState *bmds)
{
    int64_t bitmap_size;

    bitmap_size = bmds->total_sectors + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
    bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;

    bmds->aio_bitmap = g_malloc0(bitmap_size);
}

/* Never hold migration lock when yielding to the main loop!  */

static void blk_mig_read_cb(void *opaque, int ret)
{
    BlkMigBlock *blk = opaque;

    blk_mig_lock();
    blk->ret = ret;

    QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
    bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0);

    block_mig_state.submitted--;
    block_mig_state.read_done++;
    assert(block_mig_state.submitted >= 0);
    blk_mig_unlock();
}

/* Called with no lock taken.  */

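/* Submit one asynchronous read for the next bulk chunk of @bmds.  Returns 1
 * once the bulk phase of this device is complete, 0 while chunks remain.
 */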
static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds)
{
    int64_t total_sectors = bmds->total_sectors;
    int64_t cur_sector = bmds->cur_sector;
    BlockBackend *bb = bmds->blk;
    BlkMigBlock *blk;
    int nr_sectors;
    int64_t count;

    if (bmds->shared_base) {
        qemu_mutex_lock_iothread();
        aio_context_acquire(blk_get_aio_context(bb));
        /* Skip unallocated sectors; intentionally treats failure or
         * partial sector as an allocated sector */
        while (cur_sector < total_sectors &&
               !bdrv_is_allocated(blk_bs(bb), cur_sector * BDRV_SECTOR_SIZE,
                                  MAX_IS_ALLOCATED_SEARCH, &count)) {
            if (count < BDRV_SECTOR_SIZE) {
                break;
            }
            cur_sector += count >> BDRV_SECTOR_BITS;
        }
        aio_context_release(blk_get_aio_context(bb));
        qemu_mutex_unlock_iothread();
    }

    if (cur_sector >= total_sectors) {
        bmds->cur_sector = bmds->completed_sectors = total_sectors;
        return 1;
    }

    bmds->completed_sectors = cur_sector;

    cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);

    /* we are going to transfer a full block even if it is not allocated */
    nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;

    if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
        nr_sectors = total_sectors - cur_sector;
    }

    blk = g_new(BlkMigBlock, 1);
    blk->buf = g_malloc(BLK_MIG_BLOCK_SIZE);
    blk->bmds = bmds;
    blk->sector = cur_sector;
    blk->nr_sectors = nr_sectors;

    qemu_iovec_init_buf(&blk->qiov, blk->buf, nr_sectors * BDRV_SECTOR_SIZE);

    blk_mig_lock();
    block_mig_state.submitted++;
    blk_mig_unlock();

    /* We do not know if bs is under the main thread (and thus does
     * not acquire the AioContext when doing AIO) or rather under
     * dataplane.  Thus acquire both the iothread mutex and the
     * AioContext.
     *
     * This is ugly and will disappear when we make bdrv_* thread-safe,
     * without the need to acquire the AioContext.
     */
    qemu_mutex_lock_iothread();
    aio_context_acquire(blk_get_aio_context(bmds->blk));
    bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector * BDRV_SECTOR_SIZE,
                            nr_sectors * BDRV_SECTOR_SIZE);
    blk->aiocb = blk_aio_preadv(bb, cur_sector * BDRV_SECTOR_SIZE, &blk->qiov,
                                0, blk_mig_read_cb, blk);
    aio_context_release(blk_get_aio_context(bmds->blk));
    qemu_mutex_unlock_iothread();

    bmds->cur_sector = cur_sector + nr_sectors;
    return (bmds->cur_sector >= total_sectors);
}

/* Called with iothread lock taken.  */

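/* Create one dirty bitmap per device with BLK_MIG_BLOCK_SIZE granularity, so
 * that each dirty bit corresponds to exactly one migration chunk.
 */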
static int set_dirty_tracking(void)
{
    BlkMigDevState *bmds;
    int ret;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->dirty_bitmap = bdrv_create_dirty_bitmap(blk_bs(bmds->blk),
                                                      BLK_MIG_BLOCK_SIZE,
                                                      NULL, NULL);
        if (!bmds->dirty_bitmap) {
            ret = -errno;
            goto fail;
        }
    }
    return 0;

fail:
    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->dirty_bitmap) {
            bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
        }
    }
    return ret;
}

/* Called with iothread lock taken.  */

static void unset_dirty_tracking(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bdrv_release_dirty_bitmap(bmds->dirty_bitmap);
    }
}

static int init_blk_migration(QEMUFile *f)
{
    BlockDriverState *bs;
    BlkMigDevState *bmds;
    int64_t sectors;
    BdrvNextIterator it;
    int i, num_bs = 0;
    struct {
        BlkMigDevState *bmds;
        BlockDriverState *bs;
    } *bmds_bs;
    Error *local_err = NULL;
    int ret;

    block_mig_state.submitted = 0;
    block_mig_state.read_done = 0;
    block_mig_state.transferred = 0;
    block_mig_state.total_sector_sum = 0;
    block_mig_state.prev_progress = -1;
    block_mig_state.bulk_completed = 0;
    block_mig_state.zero_blocks = migrate_zero_blocks();

    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
        num_bs++;
    }
    bmds_bs = g_malloc0(num_bs * sizeof(*bmds_bs));

    for (i = 0, bs = bdrv_first(&it); bs; bs = bdrv_next(&it), i++) {
        if (bdrv_is_read_only(bs)) {
            continue;
        }

        sectors = bdrv_nb_sectors(bs);
        if (sectors <= 0) {
            ret = sectors;
            bdrv_next_cleanup(&it);
            goto out;
        }

        bmds = g_new0(BlkMigDevState, 1);
        bmds->blk = blk_new(qemu_get_aio_context(),
                            BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
        bmds->blk_name = g_strdup(bdrv_get_device_name(bs));
        bmds->bulk_completed = 0;
        bmds->total_sectors = sectors;
        bmds->completed_sectors = 0;
        bmds->shared_base = migrate_use_block_incremental();

        assert(i < num_bs);
        bmds_bs[i].bmds = bmds;
        bmds_bs[i].bs = bs;

        block_mig_state.total_sector_sum += sectors;

        if (bmds->shared_base) {
            trace_migration_block_init_shared(bdrv_get_device_name(bs));
        } else {
            trace_migration_block_init_full(bdrv_get_device_name(bs));
        }

        QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry);
    }

    /* Can only insert new BDSes now because doing so while iterating block
     * devices may end up in a deadlock (iterating the new BDSes, too). */
    for (i = 0; i < num_bs; i++) {
        BlkMigDevState *bmds = bmds_bs[i].bmds;
        BlockDriverState *bs = bmds_bs[i].bs;

        if (bmds) {
            ret = blk_insert_bs(bmds->blk, bs, &local_err);
            if (ret < 0) {
                error_report_err(local_err);
                goto out;
            }

            alloc_aio_bitmap(bmds);
            error_setg(&bmds->blocker, "block device is in use by migration");
            bdrv_op_block_all(bs, bmds->blocker);
        }
    }

    ret = 0;
out:
    g_free(bmds_bs);
    return ret;
}

/* Called with no lock taken.  */

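/* Advance the bulk phase by one chunk on the first device that still has bulk
 * data left, and emit a progress marker when the percentage changes.  Returns
 * 0 once every device has finished its bulk phase, 1 otherwise.
 */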
static int blk_mig_save_bulked_block(QEMUFile *f)
{
    int64_t completed_sector_sum = 0;
    BlkMigDevState *bmds;
    int progress;
    int ret = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        if (bmds->bulk_completed == 0) {
            if (mig_save_device_bulk(f, bmds) == 1) {
                /* completed bulk section for this device */
                bmds->bulk_completed = 1;
            }
            completed_sector_sum += bmds->completed_sectors;
            ret = 1;
            break;
        } else {
            completed_sector_sum += bmds->completed_sectors;
        }
    }

    if (block_mig_state.total_sector_sum != 0) {
        progress = completed_sector_sum * 100 /
                   block_mig_state.total_sector_sum;
    } else {
        progress = 100;
    }
    if (progress != block_mig_state.prev_progress) {
        block_mig_state.prev_progress = progress;
        qemu_put_be64(f, (progress << BDRV_SECTOR_BITS)
                         | BLK_MIG_FLAG_PROGRESS);
        trace_migration_block_progression(progress);
    }

    return ret;
}

static void blk_mig_reset_dirty_cursor(void)
{
    BlkMigDevState *bmds;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        bmds->cur_dirty = 0;
    }
}

/* Called with iothread lock and AioContext taken.  */

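/* Transfer at most one dirty chunk of @bmds per call.  Returns 1 when the
 * dirty cursor has reached the end of the device, 0 if more dirty chunks may
 * remain, or a negative errno if a synchronous read fails.
 */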
static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds,
                                 int is_async)
{
    BlkMigBlock *blk;
    int64_t total_sectors = bmds->total_sectors;
    int64_t sector;
    int nr_sectors;
    int ret = -EIO;

    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
        blk_mig_lock();
        if (bmds_aio_inflight(bmds, sector)) {
            blk_mig_unlock();
            blk_drain(bmds->blk);
        } else {
            blk_mig_unlock();
        }
        bdrv_dirty_bitmap_lock(bmds->dirty_bitmap);
        if (bdrv_dirty_bitmap_get_locked(bmds->dirty_bitmap,
                                         sector * BDRV_SECTOR_SIZE)) {
            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - sector;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }
            bdrv_reset_dirty_bitmap_locked(bmds->dirty_bitmap,
                                           sector * BDRV_SECTOR_SIZE,
                                           nr_sectors * BDRV_SECTOR_SIZE);
            bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap);

            blk = g_new(BlkMigBlock, 1);
            blk->buf = g_malloc(BLK_MIG_BLOCK_SIZE);
            blk->bmds = bmds;
            blk->sector = sector;
            blk->nr_sectors = nr_sectors;

            if (is_async) {
                qemu_iovec_init_buf(&blk->qiov, blk->buf,
                                    nr_sectors * BDRV_SECTOR_SIZE);

                blk->aiocb = blk_aio_preadv(bmds->blk,
                                            sector * BDRV_SECTOR_SIZE,
                                            &blk->qiov, 0, blk_mig_read_cb,
                                            blk);

                blk_mig_lock();
                block_mig_state.submitted++;
                bmds_set_aio_inflight(bmds, sector, nr_sectors, 1);
                blk_mig_unlock();
            } else {
                ret = blk_pread(bmds->blk, sector * BDRV_SECTOR_SIZE,
                                nr_sectors * BDRV_SECTOR_SIZE, blk->buf, 0);
                if (ret < 0) {
                    goto error;
                }
                blk_send(f, blk);

                g_free(blk->buf);
                g_free(blk);
            }

            sector += nr_sectors;
            bmds->cur_dirty = sector;
            break;
        }

        bdrv_dirty_bitmap_unlock(bmds->dirty_bitmap);
        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
        bmds->cur_dirty = sector;
    }

    return (bmds->cur_dirty >= bmds->total_sectors);

error:
    trace_migration_block_save_device_dirty(sector);
    g_free(blk->buf);
    g_free(blk);
    return ret;
}

/* Called with iothread lock taken.
 *
 * return value:
 * 0: too much data for max_downtime
 * 1: little enough data for max_downtime
 */
static int blk_mig_save_dirty_block(QEMUFile *f, int is_async)
{
    BlkMigDevState *bmds;
    int ret = 1;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        aio_context_acquire(blk_get_aio_context(bmds->blk));
        ret = mig_save_device_dirty(f, bmds, is_async);
        aio_context_release(blk_get_aio_context(bmds->blk));
        if (ret <= 0) {
            break;
        }
    }

    return ret;
}

/* Called with no locks taken.  */

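/* Send every completed read that the rate limit allows and free its buffer.
 * Returns the first read error encountered, if any, and 0 otherwise.
 */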
static int flush_blks(QEMUFile *f)
{
    BlkMigBlock *blk;
    int ret = 0;

    trace_migration_block_flush_blks("Enter", block_mig_state.submitted,
                                     block_mig_state.read_done,
                                     block_mig_state.transferred);

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        if (qemu_file_rate_limit(f)) {
            break;
        }
        if (blk->ret < 0) {
            ret = blk->ret;
            break;
        }

        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        blk_mig_unlock();
        blk_send(f, blk);
        blk_mig_lock();

        g_free(blk->buf);
        g_free(blk);

        block_mig_state.read_done--;
        block_mig_state.transferred++;
        assert(block_mig_state.read_done >= 0);
    }
    blk_mig_unlock();

    trace_migration_block_flush_blks("Exit", block_mig_state.submitted,
                                     block_mig_state.read_done,
                                     block_mig_state.transferred);
    return ret;
}

/* Called with iothread lock taken.  */

static int64_t get_remaining_dirty(void)
{
    BlkMigDevState *bmds;
    int64_t dirty = 0;

    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
        aio_context_acquire(blk_get_aio_context(bmds->blk));
        dirty += bdrv_get_dirty_count(bmds->dirty_bitmap);
        aio_context_release(blk_get_aio_context(bmds->blk));
    }

    return dirty;
}



/* Called with iothread lock taken.  */
static void block_migration_cleanup_bmds(void)
{
    BlkMigDevState *bmds;
    AioContext *ctx;

    unset_dirty_tracking();

    while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry);
        bdrv_op_unblock_all(blk_bs(bmds->blk), bmds->blocker);
        error_free(bmds->blocker);

        /* Save ctx, because bmds->blk can disappear during blk_unref.  */
        ctx = blk_get_aio_context(bmds->blk);
        aio_context_acquire(ctx);
        blk_unref(bmds->blk);
        aio_context_release(ctx);

        g_free(bmds->blk_name);
        g_free(bmds->aio_bitmap);
        g_free(bmds);
    }
}

/* Called with iothread lock taken.  */
static void block_migration_cleanup(void *opaque)
{
    BlkMigBlock *blk;

    bdrv_drain_all();

    block_migration_cleanup_bmds();

    blk_mig_lock();
    while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) {
        QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry);
        g_free(blk->buf);
        g_free(blk);
    }
    blk_mig_unlock();
}

static int block_save_setup(QEMUFile *f, void *opaque)
{
    int ret;

    trace_migration_block_save("setup", block_mig_state.submitted,
                               block_mig_state.transferred);

    qemu_mutex_lock_iothread();
    ret = init_blk_migration(f);
    if (ret < 0) {
        qemu_mutex_unlock_iothread();
        return ret;
    }

    /* start tracking dirty blocks */
    ret = set_dirty_tracking();

    qemu_mutex_unlock_iothread();

    if (ret) {
        return ret;
    }

    ret = flush_blks(f);
    blk_mig_reset_dirty_cursor();
    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    return ret;
}

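/* Returns 1 if this iteration transferred data, 0 if it did not, or a
 * negative value on error.
 */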
static int block_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int64_t last_bytes = qemu_file_total_transferred(f);
    int64_t delta_bytes;

    trace_migration_block_save("iterate", block_mig_state.submitted,
                               block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* control the rate of transfer */
    blk_mig_lock();
    while (block_mig_state.read_done * BLK_MIG_BLOCK_SIZE <
           qemu_file_get_rate_limit(f) &&
           block_mig_state.submitted < MAX_PARALLEL_IO &&
           (block_mig_state.submitted + block_mig_state.read_done) <
           MAX_IO_BUFFERS) {
        blk_mig_unlock();
        if (block_mig_state.bulk_completed == 0) {
            /* first finish the bulk phase */
            if (blk_mig_save_bulked_block(f) == 0) {
                /* finished saving bulk on all devices */
                block_mig_state.bulk_completed = 1;
            }
            ret = 0;
        } else {
            /* Always called with iothread lock taken for
             * simplicity, block_save_complete also calls it.
             */
            qemu_mutex_lock_iothread();
            ret = blk_mig_save_dirty_block(f, 1);
            qemu_mutex_unlock_iothread();
        }
        if (ret < 0) {
            return ret;
        }
        blk_mig_lock();
        if (ret != 0) {
            /* no more dirty blocks */
            break;
        }
    }
    blk_mig_unlock();

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);
    delta_bytes = qemu_file_total_transferred(f) - last_bytes;
    if (delta_bytes > 0) {
        return 1;
    } else if (delta_bytes < 0) {
        return -1;
    } else {
        return 0;
    }
}

/* Called with iothread lock taken.  */

static int block_save_complete(QEMUFile *f, void *opaque)
{
    int ret;

    trace_migration_block_save("complete", block_mig_state.submitted,
                               block_mig_state.transferred);

    ret = flush_blks(f);
    if (ret) {
        return ret;
    }

    blk_mig_reset_dirty_cursor();

    /* We know for sure that the bulk save has completed and that
       all asynchronous reads have completed. */
    blk_mig_lock();
    assert(block_mig_state.submitted == 0);
    blk_mig_unlock();

    do {
        ret = blk_mig_save_dirty_block(f, 0);
        if (ret < 0) {
            return ret;
        }
    } while (ret == 0);

    /* report completion */
    qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS);

    trace_migration_block_save_complete();

    qemu_put_be64(f, BLK_MIG_FLAG_EOS);

    /* Make sure that our BlockBackends are gone, so that the block driver
     * nodes can be inactivated. */
    block_migration_cleanup_bmds();

    return 0;
}

static void block_state_pending(void *opaque, uint64_t *must_precopy,
                                uint64_t *can_postcopy)
{
    /* Estimate pending number of bytes to send */
    uint64_t pending;

    qemu_mutex_lock_iothread();
    pending = get_remaining_dirty();
    qemu_mutex_unlock_iothread();

    blk_mig_lock();
    pending += block_mig_state.submitted * BLK_MIG_BLOCK_SIZE +
               block_mig_state.read_done * BLK_MIG_BLOCK_SIZE;
    blk_mig_unlock();

    /* Report at least one block pending during bulk phase */
    if (!pending && !block_mig_state.bulk_completed) {
        pending = BLK_MIG_BLOCK_SIZE;
    }

    trace_migration_block_state_pending(pending);
    /* We don't do postcopy */
    *must_precopy += pending;
}

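/* Incoming side: each record starts with a 64-bit field packing the sector
 * number in its upper bits and the BLK_MIG_FLAG_* bits in its low
 * BDRV_SECTOR_BITS bits, mirroring what blk_send() writes on the source.
 */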
static int block_load(QEMUFile *f, void *opaque, int version_id)
{
    static int banner_printed;
    int len, flags;
    char device_name[256];
    int64_t addr;
    BlockBackend *blk, *blk_prev = NULL;
    Error *local_err = NULL;
    uint8_t *buf;
    int64_t total_sectors = 0;
    int nr_sectors;
    int ret;
    BlockDriverInfo bdi;
    int cluster_size = BLK_MIG_BLOCK_SIZE;

    do {
        addr = qemu_get_be64(f);

        flags = addr & (BDRV_SECTOR_SIZE - 1);
        addr >>= BDRV_SECTOR_BITS;

        if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) {
            /* get device name */
            len = qemu_get_byte(f);
            qemu_get_buffer(f, (uint8_t *)device_name, len);
            device_name[len] = '\0';

            blk = blk_by_name(device_name);
            if (!blk) {
                fprintf(stderr, "Error unknown block device %s\n",
                        device_name);
                return -EINVAL;
            }

            if (blk != blk_prev) {
                blk_prev = blk;
                total_sectors = blk_nb_sectors(blk);
                if (total_sectors <= 0) {
                    error_report("Error getting length of block device %s",
                                 device_name);
                    return -EINVAL;
                }

                blk_activate(blk, &local_err);
                if (local_err) {
                    error_report_err(local_err);
                    return -EINVAL;
                }

                ret = bdrv_get_info(blk_bs(blk), &bdi);
                if (ret == 0 && bdi.cluster_size > 0 &&
                    bdi.cluster_size <= BLK_MIG_BLOCK_SIZE &&
                    BLK_MIG_BLOCK_SIZE % bdi.cluster_size == 0) {
                    cluster_size = bdi.cluster_size;
                } else {
                    cluster_size = BLK_MIG_BLOCK_SIZE;
                }
            }

            if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
                nr_sectors = total_sectors - addr;
            } else {
                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
            }

            if (flags & BLK_MIG_FLAG_ZERO_BLOCK) {
                ret = blk_pwrite_zeroes(blk, addr * BDRV_SECTOR_SIZE,
                                        nr_sectors * BDRV_SECTOR_SIZE,
                                        BDRV_REQ_MAY_UNMAP);
            } else {
                int i;
                int64_t cur_addr;
                uint8_t *cur_buf;

                buf = g_malloc(BLK_MIG_BLOCK_SIZE);
                qemu_get_buffer(f, buf, BLK_MIG_BLOCK_SIZE);
                for (i = 0; i < BLK_MIG_BLOCK_SIZE / cluster_size; i++) {
                    cur_addr = addr * BDRV_SECTOR_SIZE + i * cluster_size;
                    cur_buf = buf + i * cluster_size;

                    if ((!block_mig_state.zero_blocks ||
                        cluster_size < BLK_MIG_BLOCK_SIZE) &&
                        buffer_is_zero(cur_buf, cluster_size)) {
                        ret = blk_pwrite_zeroes(blk, cur_addr,
                                                cluster_size,
                                                BDRV_REQ_MAY_UNMAP);
                    } else {
                        ret = blk_pwrite(blk, cur_addr, cluster_size, cur_buf,
                                         0);
                    }
                    if (ret < 0) {
                        break;
                    }
                }
                g_free(buf);
            }

            if (ret < 0) {
                return ret;
            }
        } else if (flags & BLK_MIG_FLAG_PROGRESS) {
            if (!banner_printed) {
                printf("Receiving block device images\n");
                banner_printed = 1;
            }
            printf("Completed %d %%%c", (int)addr,
                   (addr == 100) ? '\n' : '\r');
            fflush(stdout);
        } else if (!(flags & BLK_MIG_FLAG_EOS)) {
            fprintf(stderr, "Unknown block migration flags: 0x%x\n", flags);
            return -EINVAL;
        }
        ret = qemu_file_get_error(f);
        if (ret != 0) {
            return ret;
        }
    } while (!(flags & BLK_MIG_FLAG_EOS));

    return 0;
}

static bool block_is_active(void *opaque)
{
    return migrate_use_block();
}

static SaveVMHandlers savevm_block_handlers = {
    .save_setup = block_save_setup,
    .save_live_iterate = block_save_iterate,
    .save_live_complete_precopy = block_save_complete,
    .state_pending_exact = block_state_pending,
    .state_pending_estimate = block_state_pending,
    .load_state = block_load,
    .save_cleanup = block_migration_cleanup,
    .is_active = block_is_active,
};

void blk_mig_init(void)
{
    QSIMPLEQ_INIT(&block_mig_state.bmds_list);
    QSIMPLEQ_INIT(&block_mig_state.blk_list);
    qemu_mutex_init(&block_mig_state.lock);

    register_savevm_live("block", 0, 1, &savevm_block_handlers,
                         &block_mig_state);
}