/* qemu/block/commit.c */
   1/*
   2 * Live block commit
   3 *
   4 * Copyright Red Hat, Inc. 2012
   5 *
   6 * Authors:
   7 *  Jeff Cody   <jcody@redhat.com>
   8 *  Based on stream.c by Stefan Hajnoczi
   9 *
  10 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
  11 * See the COPYING.LIB file in the top-level directory.
  12 *
  13 */
  14
  15#include "qemu/osdep.h"
  16#include "qemu/cutils.h"
  17#include "trace.h"
  18#include "block/block_int.h"
  19#include "block/blockjob_int.h"
  20#include "qapi/error.h"
  21#include "qapi/qmp/qerror.h"
  22#include "qemu/ratelimit.h"
  23#include "sysemu/block-backend.h"
  24
enum {
    /*
     * Size of data buffer for populating the image file.  This should be large
     * enough to process multiple clusters in a single call, so that populating
     * contiguous regions of the image is efficient.
     */
    COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
};
  33
/* Length of one rate-limiting time slice used by the job's RateLimit */
#define SLICE_TIME 100000000ULL /* ns */
  35
/* State of one live commit job (see commit_start()) */
typedef struct CommitBlockJob {
    BlockJob common;
    RateLimit limit;                 /* throttles copying according to job speed */
    BlockDriverState *active;        /* node the job was started on */
    BlockDriverState *commit_top_bs; /* filter node inserted above 'top' */
    BlockBackend *top;               /* source: data is read from here */
    BlockBackend *base;              /* target: data is written here */
    BlockdevOnError on_error;
    int base_flags;                  /* base's original open flags, restored on completion */
    int orig_overlay_flags;          /* overlay's original open flags, restored on completion */
    char *backing_file_str;          /* backing file string passed to bdrv_drop_intermediate() */
} CommitBlockJob;
  48
  49static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base,
  50                                        int64_t offset, uint64_t bytes,
  51                                        void *buf)
  52{
  53    int ret = 0;
  54    QEMUIOVector qiov;
  55    struct iovec iov = {
  56        .iov_base = buf,
  57        .iov_len = bytes,
  58    };
  59
  60    assert(bytes < SIZE_MAX);
  61    qemu_iovec_init_external(&qiov, &iov, 1);
  62
  63    ret = blk_co_preadv(bs, offset, qiov.size, &qiov, 0);
  64    if (ret < 0) {
  65        return ret;
  66    }
  67
  68    ret = blk_co_pwritev(base, offset, qiov.size, &qiov, 0);
  69    if (ret < 0) {
  70        return ret;
  71    }
  72
  73    return 0;
  74}
  75
/* Carries the coroutine's return value to the main-loop completion handler */
typedef struct {
    int ret;
} CommitCompleteData;
  79
/* Main-loop completion handler for the commit job: on success drops the
 * committed intermediate nodes from the chain, restores the original open
 * flags on base and overlay, and releases all of the job's references.
 * Runs via block_job_defer_to_main_loop() from commit_run(). */
static void commit_complete(BlockJob *job, void *opaque)
{
    CommitBlockJob *s = container_of(job, CommitBlockJob, common);
    CommitCompleteData *data = opaque;
    BlockDriverState *active = s->active;
    BlockDriverState *top = blk_bs(s->top);
    BlockDriverState *base = blk_bs(s->base);
    BlockDriverState *overlay_bs = bdrv_find_overlay(active, s->commit_top_bs);
    int ret = data->ret;
    bool remove_commit_top_bs = false;

    /* Make sure overlay_bs and top stay around until bdrv_set_backing_hd() */
    bdrv_ref(top);
    if (overlay_bs) {
        bdrv_ref(overlay_bs);
    }

    /* Remove base node parent that still uses BLK_PERM_WRITE/RESIZE before
     * the normal backing chain can be restored. */
    blk_unref(s->base);

    if (!block_job_is_cancelled(&s->common) && ret == 0) {
        /* success */
        ret = bdrv_drop_intermediate(active, s->commit_top_bs, base,
                                     s->backing_file_str);
    } else if (overlay_bs) {
        /* XXX Can (or should) we somehow keep 'consistent read' blocked even
         * after the failed/cancelled commit job is gone? If we already wrote
         * something to base, the intermediate images aren't valid any more. */
        remove_commit_top_bs = true;
    }

    /* restore base open flags here if appropriate (e.g., change the base back
     * to r/o). These reopens do not need to be atomic, since we won't abort
     * even on failure here */
    if (s->base_flags != bdrv_get_flags(base)) {
        bdrv_reopen(base, s->base_flags, NULL);
    }
    if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
    }
    g_free(s->backing_file_str);
    blk_unref(s->top);

    /* If there is more than one reference to the job (e.g. if called from
     * block_job_finish_sync()), block_job_completed() won't free it and
     * therefore the blockers on the intermediate nodes remain. This would
     * cause bdrv_set_backing_hd() to fail. */
    block_job_remove_all_bdrv(job);

    block_job_completed(&s->common, ret);
    g_free(data);

    /* If bdrv_drop_intermediate() didn't already do that, remove the commit
     * filter driver from the backing chain. Do this as the final step so that
     * the 'consistent read' permission can be granted.  */
    if (remove_commit_top_bs) {
        bdrv_set_backing_hd(overlay_bs, top, &error_abort);
    }

    /* Drop the temporary references taken at the top of this function */
    bdrv_unref(overlay_bs);
    bdrv_unref(top);
}
 143
 144static void coroutine_fn commit_run(void *opaque)
 145{
 146    CommitBlockJob *s = opaque;
 147    CommitCompleteData *data;
 148    int64_t offset;
 149    uint64_t delay_ns = 0;
 150    int ret = 0;
 151    int64_t n = 0; /* bytes */
 152    void *buf = NULL;
 153    int bytes_written = 0;
 154    int64_t base_len;
 155
 156    ret = s->common.len = blk_getlength(s->top);
 157
 158    if (s->common.len < 0) {
 159        goto out;
 160    }
 161
 162    ret = base_len = blk_getlength(s->base);
 163    if (base_len < 0) {
 164        goto out;
 165    }
 166
 167    if (base_len < s->common.len) {
 168        ret = blk_truncate(s->base, s->common.len, PREALLOC_MODE_OFF, NULL);
 169        if (ret) {
 170            goto out;
 171        }
 172    }
 173
 174    buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE);
 175
 176    for (offset = 0; offset < s->common.len; offset += n) {
 177        bool copy;
 178
 179        /* Note that even when no rate limit is applied we need to yield
 180         * with no pending I/O here so that bdrv_drain_all() returns.
 181         */
 182        block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
 183        if (block_job_is_cancelled(&s->common)) {
 184            break;
 185        }
 186        /* Copy if allocated above the base */
 187        ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base),
 188                                      offset, COMMIT_BUFFER_SIZE, &n);
 189        copy = (ret == 1);
 190        trace_commit_one_iteration(s, offset, n, ret);
 191        if (copy) {
 192            ret = commit_populate(s->top, s->base, offset, n, buf);
 193            bytes_written += n;
 194        }
 195        if (ret < 0) {
 196            BlockErrorAction action =
 197                block_job_error_action(&s->common, false, s->on_error, -ret);
 198            if (action == BLOCK_ERROR_ACTION_REPORT) {
 199                goto out;
 200            } else {
 201                n = 0;
 202                continue;
 203            }
 204        }
 205        /* Publish progress */
 206        s->common.offset += n;
 207
 208        if (copy && s->common.speed) {
 209            delay_ns = ratelimit_calculate_delay(&s->limit, n);
 210        }
 211    }
 212
 213    ret = 0;
 214
 215out:
 216    qemu_vfree(buf);
 217
 218    data = g_malloc(sizeof(*data));
 219    data->ret = ret;
 220    block_job_defer_to_main_loop(&s->common, commit_complete, data);
 221}
 222
 223static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
 224{
 225    CommitBlockJob *s = container_of(job, CommitBlockJob, common);
 226
 227    if (speed < 0) {
 228        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
 229        return;
 230    }
 231    ratelimit_set_speed(&s->limit, speed, SLICE_TIME);
 232}
 233
/* Job driver for the live commit job created by commit_start() */
static const BlockJobDriver commit_job_driver = {
    .instance_size = sizeof(CommitBlockJob),
    .job_type      = BLOCK_JOB_TYPE_COMMIT,
    .set_speed     = commit_set_speed,
    .start         = commit_run,
};
 240
 241static int coroutine_fn bdrv_commit_top_preadv(BlockDriverState *bs,
 242    uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
 243{
 244    return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);
 245}
 246
 247static int64_t coroutine_fn bdrv_commit_top_get_block_status(
 248    BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum,
 249    BlockDriverState **file)
 250{
 251    *pnum = nb_sectors;
 252    *file = bs->backing->bs;
 253    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
 254           (sector_num << BDRV_SECTOR_BITS);
 255}
 256
 257static void bdrv_commit_top_refresh_filename(BlockDriverState *bs, QDict *opts)
 258{
 259    bdrv_refresh_filename(bs->backing->bs);
 260    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
 261            bs->backing->bs->filename);
 262}
 263
/* The commit_top filter owns no resources of its own; closing is a no-op */
static void bdrv_commit_top_close(BlockDriverState *bs)
{
}
 267
/* commit_top child-permission callback: the filter takes no permissions of
 * its own and shares everything, so the commit job can keep writing to the
 * backing chain below it. */
static void bdrv_commit_top_child_perm(BlockDriverState *bs, BdrvChild *c,
                                       const BdrvChildRole *role,
                                       uint64_t perm, uint64_t shared,
                                       uint64_t *nperm, uint64_t *nshared)
{
    *nperm = 0;
    *nshared = BLK_PERM_ALL;
}
 276
/* Dummy node that provides consistent read to its users without requiring it
 * from its backing file and that allows writes on the backing file chain.
 * Inserted above 'top' by commit_start() and above the backing file by
 * bdrv_commit(). */
static BlockDriver bdrv_commit_top = {
    .format_name                = "commit_top",
    .bdrv_co_preadv             = bdrv_commit_top_preadv,
    .bdrv_co_get_block_status   = bdrv_commit_top_get_block_status,
    .bdrv_refresh_filename      = bdrv_commit_top_refresh_filename,
    .bdrv_close                 = bdrv_commit_top_close,
    .bdrv_child_perm            = bdrv_commit_top_child_perm,
};
 287
/* Create and start a live commit block job.
 *
 * @bs:    the node the job is attached to (the active layer)
 * @base:  node that the committed data is merged into
 * @top:   topmost node of the subchain [top, base) to be committed; must be
 *         part of @base's overlay chain and must not equal @bs or @base
 * @backing_file_str: backing file string recorded when dropping the
 *         intermediate nodes, may be NULL
 * @filter_node_name: node name for the inserted commit_top filter; NULL makes
 *         the filter node implicit (autogenerated name)
 *
 * On failure *errp is set and no job is left running.  Base and the overlay
 * of @top are temporarily reopened read-write if needed; their original
 * flags are restored by commit_complete().
 */
void commit_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *base, BlockDriverState *top, int64_t speed,
                  BlockdevOnError on_error, const char *backing_file_str,
                  const char *filter_node_name, Error **errp)
{
    CommitBlockJob *s;
    BlockReopenQueue *reopen_queue = NULL;
    int orig_overlay_flags;
    int orig_base_flags;
    BlockDriverState *iter;
    BlockDriverState *overlay_bs;
    BlockDriverState *commit_top_bs = NULL;
    Error *local_err = NULL;
    int ret;

    assert(top != bs);
    if (top == base) {
        error_setg(errp, "Invalid files for merge: top and base are the same");
        return;
    }

    overlay_bs = bdrv_find_overlay(bs, top);

    if (overlay_bs == NULL) {
        error_setg(errp, "Could not find overlay image for %s:", top->filename);
        return;
    }

    s = block_job_create(job_id, &commit_job_driver, bs, 0, BLK_PERM_ALL,
                         speed, BLOCK_JOB_DEFAULT, NULL, NULL, errp);
    if (!s) {
        return;
    }

    orig_base_flags    = bdrv_get_flags(base);
    orig_overlay_flags = bdrv_get_flags(overlay_bs);

    /* convert base & overlay_bs to r/w, if necessary */
    if (!(orig_base_flags & BDRV_O_RDWR)) {
        reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL,
                                         orig_base_flags | BDRV_O_RDWR);
    }
    if (!(orig_overlay_flags & BDRV_O_RDWR)) {
        reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs, NULL,
                                         orig_overlay_flags | BDRV_O_RDWR);
    }
    if (reopen_queue) {
        bdrv_reopen_multiple(bdrv_get_aio_context(bs), reopen_queue, &local_err);
        if (local_err != NULL) {
            error_propagate(errp, local_err);
            goto fail;
        }
    }

    /* Insert commit_top block node above top, so we can block consistent read
     * on the backing chain below it */
    commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, filter_node_name, 0,
                                         errp);
    if (commit_top_bs == NULL) {
        goto fail;
    }
    if (!filter_node_name) {
        commit_top_bs->implicit = true;
    }
    commit_top_bs->total_sectors = top->total_sectors;
    bdrv_set_aio_context(commit_top_bs, bdrv_get_aio_context(top));

    bdrv_set_backing_hd(commit_top_bs, top, &local_err);
    if (local_err) {
        bdrv_unref(commit_top_bs);
        commit_top_bs = NULL;
        error_propagate(errp, local_err);
        goto fail;
    }
    bdrv_set_backing_hd(overlay_bs, commit_top_bs, &local_err);
    if (local_err) {
        bdrv_unref(commit_top_bs);
        commit_top_bs = NULL;
        error_propagate(errp, local_err);
        goto fail;
    }

    /* The overlay now holds a reference; drop ours */
    s->commit_top_bs = commit_top_bs;
    bdrv_unref(commit_top_bs);

    /* Block all nodes between top and base, because they will
     * disappear from the chain after this operation. */
    assert(bdrv_chain_contains(top, base));
    for (iter = top; iter != base; iter = backing_bs(iter)) {
        /* XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves
         * at s->base (if writes are blocked for a node, they are also blocked
         * for its backing file). The other options would be a second filter
         * driver above s->base. */
        ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
                                 BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE,
                                 errp);
        if (ret < 0) {
            goto fail;
        }
    }

    ret = block_job_add_bdrv(&s->common, "base", base, 0, BLK_PERM_ALL, errp);
    if (ret < 0) {
        goto fail;
    }

    /* overlay_bs must be blocked because it needs to be modified to
     * update the backing image string. */
    ret = block_job_add_bdrv(&s->common, "overlay of top", overlay_bs,
                             BLK_PERM_GRAPH_MOD, BLK_PERM_ALL, errp);
    if (ret < 0) {
        goto fail;
    }

    /* Writable BlockBackend on top of base for commit_populate() */
    s->base = blk_new(BLK_PERM_CONSISTENT_READ
                      | BLK_PERM_WRITE
                      | BLK_PERM_RESIZE,
                      BLK_PERM_CONSISTENT_READ
                      | BLK_PERM_GRAPH_MOD
                      | BLK_PERM_WRITE_UNCHANGED);
    ret = blk_insert_bs(s->base, base, errp);
    if (ret < 0) {
        goto fail;
    }

    /* Required permissions are already taken with block_job_add_bdrv() */
    s->top = blk_new(0, BLK_PERM_ALL);
    ret = blk_insert_bs(s->top, top, errp);
    if (ret < 0) {
        goto fail;
    }

    s->active = bs;

    s->base_flags          = orig_base_flags;
    s->orig_overlay_flags  = orig_overlay_flags;

    s->backing_file_str = g_strdup(backing_file_str);

    s->on_error = on_error;

    trace_commit_start(bs, base, top, s);
    block_job_start(&s->common);
    return;

fail:
    if (s->base) {
        blk_unref(s->base);
    }
    if (s->top) {
        blk_unref(s->top);
    }
    /* Undo the filter insertion if it already happened */
    if (commit_top_bs) {
        bdrv_set_backing_hd(overlay_bs, top, &error_abort);
    }
    block_job_early_fail(&s->common);
}
 445
 446
/* Bounce-buffer size for the synchronous bdrv_commit() below, in bytes */
#define COMMIT_BUF_SIZE (2048 * BDRV_SECTOR_SIZE)
 448
 449/* commit COW file into the raw image */
 450int bdrv_commit(BlockDriverState *bs)
 451{
 452    BlockBackend *src, *backing;
 453    BlockDriverState *backing_file_bs = NULL;
 454    BlockDriverState *commit_top_bs = NULL;
 455    BlockDriver *drv = bs->drv;
 456    int64_t offset, length, backing_length;
 457    int ro, open_flags;
 458    int64_t n;
 459    int ret = 0;
 460    uint8_t *buf = NULL;
 461    Error *local_err = NULL;
 462
 463    if (!drv)
 464        return -ENOMEDIUM;
 465
 466    if (!bs->backing) {
 467        return -ENOTSUP;
 468    }
 469
 470    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
 471        bdrv_op_is_blocked(bs->backing->bs, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) {
 472        return -EBUSY;
 473    }
 474
 475    ro = bs->backing->bs->read_only;
 476    open_flags =  bs->backing->bs->open_flags;
 477
 478    if (ro) {
 479        if (bdrv_reopen(bs->backing->bs, open_flags | BDRV_O_RDWR, NULL)) {
 480            return -EACCES;
 481        }
 482    }
 483
 484    src = blk_new(BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
 485    backing = blk_new(BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);
 486
 487    ret = blk_insert_bs(src, bs, &local_err);
 488    if (ret < 0) {
 489        error_report_err(local_err);
 490        goto ro_cleanup;
 491    }
 492
 493    /* Insert commit_top block node above backing, so we can write to it */
 494    backing_file_bs = backing_bs(bs);
 495
 496    commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, NULL, BDRV_O_RDWR,
 497                                         &local_err);
 498    if (commit_top_bs == NULL) {
 499        error_report_err(local_err);
 500        goto ro_cleanup;
 501    }
 502    bdrv_set_aio_context(commit_top_bs, bdrv_get_aio_context(backing_file_bs));
 503
 504    bdrv_set_backing_hd(commit_top_bs, backing_file_bs, &error_abort);
 505    bdrv_set_backing_hd(bs, commit_top_bs, &error_abort);
 506
 507    ret = blk_insert_bs(backing, backing_file_bs, &local_err);
 508    if (ret < 0) {
 509        error_report_err(local_err);
 510        goto ro_cleanup;
 511    }
 512
 513    length = blk_getlength(src);
 514    if (length < 0) {
 515        ret = length;
 516        goto ro_cleanup;
 517    }
 518
 519    backing_length = blk_getlength(backing);
 520    if (backing_length < 0) {
 521        ret = backing_length;
 522        goto ro_cleanup;
 523    }
 524
 525    /* If our top snapshot is larger than the backing file image,
 526     * grow the backing file image if possible.  If not possible,
 527     * we must return an error */
 528    if (length > backing_length) {
 529        ret = blk_truncate(backing, length, PREALLOC_MODE_OFF, &local_err);
 530        if (ret < 0) {
 531            error_report_err(local_err);
 532            goto ro_cleanup;
 533        }
 534    }
 535
 536    /* blk_try_blockalign() for src will choose an alignment that works for
 537     * backing as well, so no need to compare the alignment manually. */
 538    buf = blk_try_blockalign(src, COMMIT_BUF_SIZE);
 539    if (buf == NULL) {
 540        ret = -ENOMEM;
 541        goto ro_cleanup;
 542    }
 543
 544    for (offset = 0; offset < length; offset += n) {
 545        ret = bdrv_is_allocated(bs, offset, COMMIT_BUF_SIZE, &n);
 546        if (ret < 0) {
 547            goto ro_cleanup;
 548        }
 549        if (ret) {
 550            ret = blk_pread(src, offset, buf, n);
 551            if (ret < 0) {
 552                goto ro_cleanup;
 553            }
 554
 555            ret = blk_pwrite(backing, offset, buf, n, 0);
 556            if (ret < 0) {
 557                goto ro_cleanup;
 558            }
 559        }
 560    }
 561
 562    if (drv->bdrv_make_empty) {
 563        ret = drv->bdrv_make_empty(bs);
 564        if (ret < 0) {
 565            goto ro_cleanup;
 566        }
 567        blk_flush(src);
 568    }
 569
 570    /*
 571     * Make sure all data we wrote to the backing device is actually
 572     * stable on disk.
 573     */
 574    blk_flush(backing);
 575
 576    ret = 0;
 577ro_cleanup:
 578    qemu_vfree(buf);
 579
 580    blk_unref(backing);
 581    if (backing_file_bs) {
 582        bdrv_set_backing_hd(bs, backing_file_bs, &error_abort);
 583    }
 584    bdrv_unref(commit_top_bs);
 585    blk_unref(src);
 586
 587    if (ro) {
 588        /* ignoring error return here */
 589        bdrv_reopen(bs->backing->bs, open_flags & ~BDRV_O_RDWR, NULL);
 590    }
 591
 592    return ret;
 593}
 594