qemu/block/commit.c
/*
 * Live block commit
 *
 * Copyright Red Hat, Inc. 2012
 *
 * Authors:
 *  Jeff Cody   <jcody@redhat.com>
 *  Based on stream.c by Stefan Hajnoczi
 *
 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "trace.h"
#include "block/block_int.h"
#include "block/blockjob_int.h"
#include "qapi/error.h"
#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"
#include "qemu/memalign.h"
#include "sysemu/block-backend.h"

enum {
    /*
     * Size of data buffer for populating the image file.  This should be large
     * enough to process multiple clusters in a single call, so that populating
     * contiguous regions of the image is efficient.
     */
    COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
};

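/*
 * State of a live block commit job.  @top and @base are the BlockBackends
 * used for the actual copying I/O, while @commit_top_bs is the filter node
 * that is kept above the top image for the lifetime of the job.
 */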
typedef struct CommitBlockJob {
    BlockJob common;
    BlockDriverState *commit_top_bs;
    BlockBackend *top;
    BlockBackend *base;
    BlockDriverState *base_bs;
    BlockDriverState *base_overlay;
    BlockdevOnError on_error;
    bool base_read_only;
    bool chain_frozen;
    char *backing_file_str;
} CommitBlockJob;

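/*
 * On successful completion: unfreeze the chain, drop the parent reference
 * that still holds BLK_PERM_WRITE/RESIZE on base, and remove the
 * now-redundant intermediate images (including the commit_top filter) from
 * the backing chain.
 */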
static int commit_prepare(Job *job)
{
    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);

    bdrv_unfreeze_backing_chain(s->commit_top_bs, s->base_bs);
    s->chain_frozen = false;

    /* Remove base node parent that still uses BLK_PERM_WRITE/RESIZE before
     * the normal backing chain can be restored. */
    blk_unref(s->base);
    s->base = NULL;

    /* FIXME: bdrv_drop_intermediate treats total failures and partial failures
     * identically. Further work is needed to disambiguate these cases. */
    return bdrv_drop_intermediate(s->commit_top_bs, s->base_bs,
                                  s->backing_file_str);
}

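/*
 * On failure or cancellation: take the commit_top filter back out of the
 * graph and drop all of the job's references on the involved nodes.
 */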
static void commit_abort(Job *job)
{
    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
    BlockDriverState *top_bs = blk_bs(s->top);

    if (s->chain_frozen) {
        bdrv_unfreeze_backing_chain(s->commit_top_bs, s->base_bs);
    }

    /* Make sure commit_top_bs and top stay around until bdrv_replace_node() */
    bdrv_ref(top_bs);
    bdrv_ref(s->commit_top_bs);

    if (s->base) {
        blk_unref(s->base);
    }

    /* free the blockers on the intermediate nodes so that bdrv_replace_node()
     * can succeed */
    block_job_remove_all_bdrv(&s->common);

    /* If bdrv_drop_intermediate() failed (or was not invoked), remove the
     * commit filter driver from the backing chain now. Do this as the final
     * step so that the 'consistent read' permission can be granted.
     *
     * XXX Can (or should) we somehow keep 'consistent read' blocked even
     * after the failed/cancelled commit job is gone? If we already wrote
     * something to base, the intermediate images aren't valid any more. */
    bdrv_replace_node(s->commit_top_bs, s->commit_top_bs->backing->bs,
                      &error_abort);

    bdrv_unref(s->commit_top_bs);
    bdrv_unref(top_bs);
}

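/* Final cleanup; runs after either commit_prepare() or commit_abort(). */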
static void commit_clean(Job *job)
{
    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);

    /* restore base open flags here if appropriate (e.g., change the base back
     * to r/o). These reopens do not need to be atomic, since we won't abort
     * even on failure here */
    if (s->base_read_only) {
        bdrv_reopen_set_read_only(s->base_bs, true, NULL);
    }

    g_free(s->backing_file_str);
    blk_unref(s->top);
}

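/*
 * Main job coroutine: walk the top image in COMMIT_BUFFER_SIZE chunks and
 * copy every range that is allocated above base down into the base image,
 * honouring the job's rate limit and the configured on-error policy.
 */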
static int coroutine_fn commit_run(Job *job, Error **errp)
{
    CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
    int64_t offset;
    uint64_t delay_ns = 0;
    int ret = 0;
    int64_t n = 0; /* bytes */
    QEMU_AUTO_VFREE void *buf = NULL;
    int64_t len, base_len;

    len = blk_getlength(s->top);
    if (len < 0) {
        return len;
    }
    job_progress_set_remaining(&s->common.job, len);

    base_len = blk_getlength(s->base);
    if (base_len < 0) {
        return base_len;
    }

    if (base_len < len) {
        ret = blk_truncate(s->base, len, false, PREALLOC_MODE_OFF, 0, NULL);
        if (ret) {
            return ret;
        }
    }

    buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE);

    for (offset = 0; offset < len; offset += n) {
        bool copy;
        bool error_in_source = true;

        /* Note that even when no rate limit is applied we need to yield
         * with no pending I/O here so that bdrv_drain_all() returns.
         */
        job_sleep_ns(&s->common.job, delay_ns);
        if (job_is_cancelled(&s->common.job)) {
            break;
        }
        /* Copy if allocated above the base */
        ret = bdrv_is_allocated_above(blk_bs(s->top), s->base_overlay, true,
                                      offset, COMMIT_BUFFER_SIZE, &n);
        copy = (ret > 0);
        trace_commit_one_iteration(s, offset, n, ret);
        if (copy) {
            assert(n < SIZE_MAX);

            ret = blk_co_pread(s->top, offset, n, buf, 0);
            if (ret >= 0) {
                ret = blk_co_pwrite(s->base, offset, n, buf, 0);
                if (ret < 0) {
                    error_in_source = false;
                }
            }
        }
        if (ret < 0) {
            BlockErrorAction action =
                block_job_error_action(&s->common, s->on_error,
                                       error_in_source, -ret);
            if (action == BLOCK_ERROR_ACTION_REPORT) {
                return ret;
            } else {
                n = 0;
                continue;
            }
        }
        /* Publish progress */
        job_progress_update(&s->common.job, n);

        if (copy) {
            delay_ns = block_job_ratelimit_get_delay(&s->common, n);
        } else {
            delay_ns = 0;
        }
    }

    return 0;
}

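/*
 * commit_run() does the copying; depending on its outcome, either
 * commit_prepare() or commit_abort() settles the node graph, and
 * commit_clean() releases the remaining resources in both cases.
 */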
static const BlockJobDriver commit_job_driver = {
    .job_driver = {
        .instance_size = sizeof(CommitBlockJob),
        .job_type      = JOB_TYPE_COMMIT,
        .free          = block_job_free,
        .user_resume   = block_job_user_resume,
        .run           = commit_run,
        .prepare       = commit_prepare,
        .abort         = commit_abort,
        .clean         = commit_clean
    },
};

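/* Reads are passed straight through to the backing (filtered) child */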
static int coroutine_fn bdrv_commit_top_preadv(BlockDriverState *bs,
    int64_t offset, int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    return bdrv_co_preadv(bs->backing, offset, bytes, qiov, flags);
}

static void bdrv_commit_top_refresh_filename(BlockDriverState *bs)
{
    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
            bs->backing->bs->filename);
}

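/*
 * The filter takes no permissions on its child and shares everything, so it
 * adds no permission constraints of its own to the chain.
 */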
static void bdrv_commit_top_child_perm(BlockDriverState *bs, BdrvChild *c,
                                       BdrvChildRole role,
                                       BlockReopenQueue *reopen_queue,
                                       uint64_t perm, uint64_t shared,
                                       uint64_t *nperm, uint64_t *nshared)
{
    *nperm = 0;
    *nshared = BLK_PERM_ALL;
}

/* Dummy node that provides consistent read to its users without requiring it
 * from its backing file and that allows writes on the backing file chain. */
static BlockDriver bdrv_commit_top = {
    .format_name                = "commit_top",
    .bdrv_co_preadv             = bdrv_commit_top_preadv,
    .bdrv_refresh_filename      = bdrv_commit_top_refresh_filename,
    .bdrv_child_perm            = bdrv_commit_top_child_perm,

    .is_filter                  = true,
};

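/*
 * Create and start a live commit job that copies the data between @top
 * (inclusive) and @base (exclusive) down into @base.  A commit_top filter
 * node is inserted above @top for the duration of the job; if
 * @filter_node_name is NULL, the filter is marked as implicit.
 * @backing_file_str is passed on to bdrv_drop_intermediate() when the
 * intermediate images are dropped after a successful commit.  Errors are
 * returned through @errp.
 */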
void commit_start(const char *job_id, BlockDriverState *bs,
                  BlockDriverState *base, BlockDriverState *top,
                  int creation_flags, int64_t speed,
                  BlockdevOnError on_error, const char *backing_file_str,
                  const char *filter_node_name, Error **errp)
{
    CommitBlockJob *s;
    BlockDriverState *iter;
    BlockDriverState *commit_top_bs = NULL;
    BlockDriverState *filtered_base;
    int64_t base_size, top_size;
    uint64_t base_perms, iter_shared_perms;
    int ret;

    GLOBAL_STATE_CODE();

    assert(top != bs);
    if (bdrv_skip_filters(top) == bdrv_skip_filters(base)) {
        error_setg(errp, "Invalid files for merge: top and base are the same");
        return;
    }

    base_size = bdrv_getlength(base);
    if (base_size < 0) {
        error_setg_errno(errp, -base_size, "Could not inquire base image size");
        return;
    }

    top_size = bdrv_getlength(top);
    if (top_size < 0) {
        error_setg_errno(errp, -top_size, "Could not inquire top image size");
        return;
    }

    base_perms = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
    if (base_size < top_size) {
        base_perms |= BLK_PERM_RESIZE;
    }

    s = block_job_create(job_id, &commit_job_driver, NULL, bs, 0, BLK_PERM_ALL,
                         speed, creation_flags, NULL, NULL, errp);
    if (!s) {
        return;
    }

    /* convert base to r/w, if necessary */
    s->base_read_only = bdrv_is_read_only(base);
    if (s->base_read_only) {
        if (bdrv_reopen_set_read_only(base, false, errp) != 0) {
            goto fail;
        }
    }

    /* Insert commit_top block node above top, so we can block consistent read
     * on the backing chain below it */
    commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, filter_node_name, 0,
                                         errp);
    if (commit_top_bs == NULL) {
        goto fail;
    }
    if (!filter_node_name) {
        commit_top_bs->implicit = true;
    }

    /* So that we can always drop this node */
    commit_top_bs->never_freeze = true;

    commit_top_bs->total_sectors = top->total_sectors;

    ret = bdrv_append(commit_top_bs, top, errp);
    bdrv_unref(commit_top_bs); /* referenced by new parents or failed */
    if (ret < 0) {
        commit_top_bs = NULL;
        goto fail;
    }

    s->commit_top_bs = commit_top_bs;

    /*
     * Block all nodes between top and base, because they will
     * disappear from the chain after this operation.
     * Note that this assumes that the user is fine with removing all
     * nodes (including R/W filters) between top and base.  Assuring
     * this is the responsibility of the interface (i.e. whoever calls
     * commit_start()).
     */
    s->base_overlay = bdrv_find_overlay(top, base);
    assert(s->base_overlay);

    /*
     * The topmost node with
     * bdrv_skip_filters(filtered_base) == bdrv_skip_filters(base)
     */
    filtered_base = bdrv_cow_bs(s->base_overlay);
    assert(bdrv_skip_filters(filtered_base) == bdrv_skip_filters(base));

    /*
     * XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves
     * at s->base (if writes are blocked for a node, they are also blocked
     * for its backing file). The other option would be a second filter
     * driver above s->base.
     */
    iter_shared_perms = BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE;

    for (iter = top; iter != base; iter = bdrv_filter_or_cow_bs(iter)) {
        if (iter == filtered_base) {
            /*
             * From here on, all nodes are filters on the base.  This
             * allows us to share BLK_PERM_CONSISTENT_READ.
             */
            iter_shared_perms |= BLK_PERM_CONSISTENT_READ;
        }

        ret = block_job_add_bdrv(&s->common, "intermediate node", iter, 0,
                                 iter_shared_perms, errp);
        if (ret < 0) {
            goto fail;
        }
    }

    if (bdrv_freeze_backing_chain(commit_top_bs, base, errp) < 0) {
        goto fail;
    }
    s->chain_frozen = true;

    ret = block_job_add_bdrv(&s->common, "base", base, 0, BLK_PERM_ALL, errp);
    if (ret < 0) {
        goto fail;
    }

    s->base = blk_new(s->common.job.aio_context,
                      base_perms,
                      BLK_PERM_CONSISTENT_READ
                      | BLK_PERM_WRITE_UNCHANGED);
    ret = blk_insert_bs(s->base, base, errp);
    if (ret < 0) {
        goto fail;
    }
    blk_set_disable_request_queuing(s->base, true);
    s->base_bs = base;

    /* Required permissions are already taken with block_job_add_bdrv() */
    s->top = blk_new(s->common.job.aio_context, 0, BLK_PERM_ALL);
    ret = blk_insert_bs(s->top, top, errp);
    if (ret < 0) {
        goto fail;
    }
    blk_set_disable_request_queuing(s->top, true);

    s->backing_file_str = g_strdup(backing_file_str);
    s->on_error = on_error;

    trace_commit_start(bs, base, top, s);
    job_start(&s->common.job);
    return;

fail:
    if (s->chain_frozen) {
        bdrv_unfreeze_backing_chain(commit_top_bs, base);
    }
    if (s->base) {
        blk_unref(s->base);
    }
    if (s->top) {
        blk_unref(s->top);
    }
    if (s->base_read_only) {
        bdrv_reopen_set_read_only(base, true, NULL);
    }
    job_early_fail(&s->common.job);
    /* commit_top_bs has to be replaced after deleting the block job,
     * otherwise this would fail because of lack of permissions. */
    if (commit_top_bs) {
        bdrv_replace_node(commit_top_bs, top, &error_abort);
    }
}


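/*
 * What follows is the synchronous commit path: bdrv_commit() copies all data
 * allocated in @bs into its backing file in one blocking loop and then
 * empties @bs, temporarily reopening the backing file read-write if needed.
 */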
#define COMMIT_BUF_SIZE (2048 * BDRV_SECTOR_SIZE)

/* commit COW file into the raw image */
int bdrv_commit(BlockDriverState *bs)
{
    BlockBackend *src, *backing;
    BlockDriverState *backing_file_bs = NULL;
    BlockDriverState *commit_top_bs = NULL;
    BlockDriver *drv = bs->drv;
    AioContext *ctx;
    int64_t offset, length, backing_length;
    int ro;
    int64_t n;
    int ret = 0;
    QEMU_AUTO_VFREE uint8_t *buf = NULL;
    Error *local_err = NULL;

    GLOBAL_STATE_CODE();

    if (!drv) {
        return -ENOMEDIUM;
    }

    backing_file_bs = bdrv_cow_bs(bs);

    if (!backing_file_bs) {
        return -ENOTSUP;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
        bdrv_op_is_blocked(backing_file_bs, BLOCK_OP_TYPE_COMMIT_TARGET, NULL))
    {
        return -EBUSY;
    }

    ro = bdrv_is_read_only(backing_file_bs);

    if (ro) {
        if (bdrv_reopen_set_read_only(backing_file_bs, false, NULL)) {
            return -EACCES;
        }
    }

    ctx = bdrv_get_aio_context(bs);
    /* WRITE_UNCHANGED is required for bdrv_make_empty() */
    src = blk_new(ctx, BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED,
                  BLK_PERM_ALL);
    backing = blk_new(ctx, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL);

    ret = blk_insert_bs(src, bs, &local_err);
    if (ret < 0) {
        error_report_err(local_err);
        goto ro_cleanup;
    }

    /* Insert commit_top block node above backing, so we can write to it */
    commit_top_bs = bdrv_new_open_driver(&bdrv_commit_top, NULL, BDRV_O_RDWR,
                                         &local_err);
    if (commit_top_bs == NULL) {
        error_report_err(local_err);
        goto ro_cleanup;
    }

    bdrv_set_backing_hd(commit_top_bs, backing_file_bs, &error_abort);
    bdrv_set_backing_hd(bs, commit_top_bs, &error_abort);

    ret = blk_insert_bs(backing, backing_file_bs, &local_err);
    if (ret < 0) {
        error_report_err(local_err);
        goto ro_cleanup;
    }

    length = blk_getlength(src);
    if (length < 0) {
        ret = length;
        goto ro_cleanup;
    }

    backing_length = blk_getlength(backing);
    if (backing_length < 0) {
        ret = backing_length;
        goto ro_cleanup;
    }

    /* If our top snapshot is larger than the backing file image,
     * grow the backing file image if possible.  If not possible,
     * we must return an error */
    if (length > backing_length) {
        ret = blk_truncate(backing, length, false, PREALLOC_MODE_OFF, 0,
                           &local_err);
        if (ret < 0) {
            error_report_err(local_err);
            goto ro_cleanup;
        }
    }

    /* blk_try_blockalign() for src will choose an alignment that works for
     * backing as well, so no need to compare the alignment manually. */
    buf = blk_try_blockalign(src, COMMIT_BUF_SIZE);
    if (buf == NULL) {
        ret = -ENOMEM;
        goto ro_cleanup;
    }

    for (offset = 0; offset < length; offset += n) {
        ret = bdrv_is_allocated(bs, offset, COMMIT_BUF_SIZE, &n);
        if (ret < 0) {
            goto ro_cleanup;
        }
        if (ret) {
            ret = blk_pread(src, offset, buf, n);
            if (ret < 0) {
                goto ro_cleanup;
            }

            ret = blk_pwrite(backing, offset, buf, n, 0);
            if (ret < 0) {
                goto ro_cleanup;
            }
        }
    }

    ret = blk_make_empty(src, NULL);
    /* Ignore -ENOTSUP */
    if (ret < 0 && ret != -ENOTSUP) {
        goto ro_cleanup;
    }

    blk_flush(src);

    /*
     * Make sure all data we wrote to the backing device is actually
     * stable on disk.
     */
    blk_flush(backing);

    ret = 0;
ro_cleanup:
    blk_unref(backing);
    if (bdrv_cow_bs(bs) != backing_file_bs) {
        bdrv_set_backing_hd(bs, backing_file_bs, &error_abort);
    }
    bdrv_unref(commit_top_bs);
    blk_unref(src);

    if (ro) {
        /* ignoring error return here */
        bdrv_reopen_set_read_only(backing_file_bs, true, NULL);
    }

    return ret;
}