qemu/block/commit.c
<<
>>
Prefs
   1/*
   2 * Live block commit
   3 *
   4 * Copyright Red Hat, Inc. 2012
   5 *
   6 * Authors:
   7 *  Jeff Cody   <jcody@redhat.com>
   8 *  Based on stream.c by Stefan Hajnoczi
   9 *
  10 * This work is licensed under the terms of the GNU LGPL, version 2 or later.
  11 * See the COPYING.LIB file in the top-level directory.
  12 *
  13 */
  14
  15#include "qemu/osdep.h"
  16#include "trace.h"
  17#include "block/block_int.h"
  18#include "block/blockjob.h"
  19#include "qapi/error.h"
  20#include "qapi/qmp/qerror.h"
  21#include "qemu/ratelimit.h"
  22#include "sysemu/block-backend.h"
  23
enum {
    /*
     * Size of data buffer for populating the image file.  This should be large
     * enough to process multiple clusters in a single call, so that populating
     * contiguous regions of the image is efficient.
     *
     * commit_run() allocates one bounce buffer of this size and also passes
     * it, converted to sectors, as the number of sectors to examine in each
     * bdrv_is_allocated_above() query.
     */
    COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
};
  32
/*
 * Passed as the last argument of ratelimit_set_speed() in commit_set_speed().
 * Presumably the length of one rate-limiting slice (100 ms) -- confirm
 * against qemu/ratelimit.h.
 */
#define SLICE_TIME 100000000ULL /* ns */
  34
/*
 * State of one live commit job.  Created by commit_start(), driven by the
 * commit_run() coroutine and torn down in commit_complete().
 */
typedef struct CommitBlockJob {
    BlockJob common;            /* generic job state; container_of() anchor */
    RateLimit limit;            /* configured by commit_set_speed() */
    BlockDriverState *active;   /* the @bs argument of commit_start() */
    BlockBackend *top;          /* backend attached to top; data is read here */
    BlockBackend *base;         /* backend attached to base; data is written here */
    BlockdevOnError on_error;   /* policy given to block_job_error_action() */
    int base_flags;             /* flags of base before commit_start() reopened it */
    int orig_overlay_flags;     /* flags of the overlay before it was reopened */
    char *backing_file_str;     /* g_strdup() copy; freed in commit_complete() */
} CommitBlockJob;
  46
  47static int coroutine_fn commit_populate(BlockBackend *bs, BlockBackend *base,
  48                                        int64_t sector_num, int nb_sectors,
  49                                        void *buf)
  50{
  51    int ret = 0;
  52    QEMUIOVector qiov;
  53    struct iovec iov = {
  54        .iov_base = buf,
  55        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
  56    };
  57
  58    qemu_iovec_init_external(&qiov, &iov, 1);
  59
  60    ret = blk_co_preadv(bs, sector_num * BDRV_SECTOR_SIZE,
  61                        qiov.size, &qiov, 0);
  62    if (ret < 0) {
  63        return ret;
  64    }
  65
  66    ret = blk_co_pwritev(base, sector_num * BDRV_SECTOR_SIZE,
  67                         qiov.size, &qiov, 0);
  68    if (ret < 0) {
  69        return ret;
  70    }
  71
  72    return 0;
  73}
  74
/*
 * Argument handed from commit_run() to commit_complete() through
 * block_job_defer_to_main_loop().  Allocated with g_malloc() in commit_run()
 * and released with g_free() in commit_complete().
 */
typedef struct {
    int ret;    /* final status computed by commit_run() */
} CommitCompleteData;
  78
  79static void commit_complete(BlockJob *job, void *opaque)
  80{
  81    CommitBlockJob *s = container_of(job, CommitBlockJob, common);
  82    CommitCompleteData *data = opaque;
  83    BlockDriverState *active = s->active;
  84    BlockDriverState *top = blk_bs(s->top);
  85    BlockDriverState *base = blk_bs(s->base);
  86    BlockDriverState *overlay_bs;
  87    int ret = data->ret;
  88
  89    if (!block_job_is_cancelled(&s->common) && ret == 0) {
  90        /* success */
  91        ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str);
  92    }
  93
  94    /* restore base open flags here if appropriate (e.g., change the base back
  95     * to r/o). These reopens do not need to be atomic, since we won't abort
  96     * even on failure here */
  97    if (s->base_flags != bdrv_get_flags(base)) {
  98        bdrv_reopen(base, s->base_flags, NULL);
  99    }
 100    overlay_bs = bdrv_find_overlay(active, top);
 101    if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
 102        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
 103    }
 104    g_free(s->backing_file_str);
 105    blk_unref(s->top);
 106    blk_unref(s->base);
 107    block_job_completed(&s->common, ret);
 108    g_free(data);
 109}
 110
 111static void coroutine_fn commit_run(void *opaque)
 112{
 113    CommitBlockJob *s = opaque;
 114    CommitCompleteData *data;
 115    int64_t sector_num, end;
 116    uint64_t delay_ns = 0;
 117    int ret = 0;
 118    int n = 0;
 119    void *buf = NULL;
 120    int bytes_written = 0;
 121    int64_t base_len;
 122
 123    ret = s->common.len = blk_getlength(s->top);
 124
 125
 126    if (s->common.len < 0) {
 127        goto out;
 128    }
 129
 130    ret = base_len = blk_getlength(s->base);
 131    if (base_len < 0) {
 132        goto out;
 133    }
 134
 135    if (base_len < s->common.len) {
 136        ret = blk_truncate(s->base, s->common.len);
 137        if (ret) {
 138            goto out;
 139        }
 140    }
 141
 142    end = s->common.len >> BDRV_SECTOR_BITS;
 143    buf = blk_blockalign(s->top, COMMIT_BUFFER_SIZE);
 144
 145    for (sector_num = 0; sector_num < end; sector_num += n) {
 146        bool copy;
 147
 148        /* Note that even when no rate limit is applied we need to yield
 149         * with no pending I/O here so that bdrv_drain_all() returns.
 150         */
 151        block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns);
 152        if (block_job_is_cancelled(&s->common)) {
 153            break;
 154        }
 155        /* Copy if allocated above the base */
 156        ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base),
 157                                      sector_num,
 158                                      COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE,
 159                                      &n);
 160        copy = (ret == 1);
 161        trace_commit_one_iteration(s, sector_num, n, ret);
 162        if (copy) {
 163            ret = commit_populate(s->top, s->base, sector_num, n, buf);
 164            bytes_written += n * BDRV_SECTOR_SIZE;
 165        }
 166        if (ret < 0) {
 167            BlockErrorAction action =
 168                block_job_error_action(&s->common, false, s->on_error, -ret);
 169            if (action == BLOCK_ERROR_ACTION_REPORT) {
 170                goto out;
 171            } else {
 172                n = 0;
 173                continue;
 174            }
 175        }
 176        /* Publish progress */
 177        s->common.offset += n * BDRV_SECTOR_SIZE;
 178
 179        if (copy && s->common.speed) {
 180            delay_ns = ratelimit_calculate_delay(&s->limit, n);
 181        }
 182    }
 183
 184    ret = 0;
 185
 186out:
 187    qemu_vfree(buf);
 188
 189    data = g_malloc(sizeof(*data));
 190    data->ret = ret;
 191    block_job_defer_to_main_loop(&s->common, commit_complete, data);
 192}
 193
 194static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp)
 195{
 196    CommitBlockJob *s = container_of(job, CommitBlockJob, common);
 197
 198    if (speed < 0) {
 199        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
 200        return;
 201    }
 202    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
 203}
 204
/*
 * Driver description passed to block_job_create() in commit_start().
 * instance_size is sizeof(CommitBlockJob), and commit_start() treats the
 * returned job as a CommitBlockJob.
 */
static const BlockJobDriver commit_job_driver = {
    .instance_size = sizeof(CommitBlockJob),
    .job_type      = BLOCK_JOB_TYPE_COMMIT,
    .set_speed     = commit_set_speed,
};
 210
 211void commit_start(const char *job_id, BlockDriverState *bs,
 212                  BlockDriverState *base, BlockDriverState *top, int64_t speed,
 213                  BlockdevOnError on_error, BlockCompletionFunc *cb,
 214                  void *opaque, const char *backing_file_str, Error **errp)
 215{
 216    CommitBlockJob *s;
 217    BlockReopenQueue *reopen_queue = NULL;
 218    int orig_overlay_flags;
 219    int orig_base_flags;
 220    BlockDriverState *overlay_bs;
 221    Error *local_err = NULL;
 222
 223    assert(top != bs);
 224    if (top == base) {
 225        error_setg(errp, "Invalid files for merge: top and base are the same");
 226        return;
 227    }
 228
 229    overlay_bs = bdrv_find_overlay(bs, top);
 230
 231    if (overlay_bs == NULL) {
 232        error_setg(errp, "Could not find overlay image for %s:", top->filename);
 233        return;
 234    }
 235
 236    s = block_job_create(job_id, &commit_job_driver, bs, speed,
 237                         cb, opaque, errp);
 238    if (!s) {
 239        return;
 240    }
 241
 242    orig_base_flags    = bdrv_get_flags(base);
 243    orig_overlay_flags = bdrv_get_flags(overlay_bs);
 244
 245    /* convert base & overlay_bs to r/w, if necessary */
 246    if (!(orig_overlay_flags & BDRV_O_RDWR)) {
 247        reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs, NULL,
 248                                         orig_overlay_flags | BDRV_O_RDWR);
 249    }
 250    if (!(orig_base_flags & BDRV_O_RDWR)) {
 251        reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL,
 252                                         orig_base_flags | BDRV_O_RDWR);
 253    }
 254    if (reopen_queue) {
 255        bdrv_reopen_multiple(reopen_queue, &local_err);
 256        if (local_err != NULL) {
 257            error_propagate(errp, local_err);
 258            block_job_unref(&s->common);
 259            return;
 260        }
 261    }
 262
 263
 264    s->base = blk_new();
 265    blk_insert_bs(s->base, base);
 266
 267    s->top = blk_new();
 268    blk_insert_bs(s->top, top);
 269
 270    s->active = bs;
 271
 272    s->base_flags          = orig_base_flags;
 273    s->orig_overlay_flags  = orig_overlay_flags;
 274
 275    s->backing_file_str = g_strdup(backing_file_str);
 276
 277    s->on_error = on_error;
 278    s->common.co = qemu_coroutine_create(commit_run, s);
 279
 280    trace_commit_start(bs, base, top, s, s->common.co, opaque);
 281    qemu_coroutine_enter(s->common.co);
 282}
 283
 284
 285#define COMMIT_BUF_SECTORS 2048
 286
 287/* commit COW file into the raw image */
 288int bdrv_commit(BlockDriverState *bs)
 289{
 290    BlockBackend *src, *backing;
 291    BlockDriver *drv = bs->drv;
 292    int64_t sector, total_sectors, length, backing_length;
 293    int n, ro, open_flags;
 294    int ret = 0;
 295    uint8_t *buf = NULL;
 296
 297    if (!drv)
 298        return -ENOMEDIUM;
 299
 300    if (!bs->backing) {
 301        return -ENOTSUP;
 302    }
 303
 304    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_COMMIT_SOURCE, NULL) ||
 305        bdrv_op_is_blocked(bs->backing->bs, BLOCK_OP_TYPE_COMMIT_TARGET, NULL)) {
 306        return -EBUSY;
 307    }
 308
 309    ro = bs->backing->bs->read_only;
 310    open_flags =  bs->backing->bs->open_flags;
 311
 312    if (ro) {
 313        if (bdrv_reopen(bs->backing->bs, open_flags | BDRV_O_RDWR, NULL)) {
 314            return -EACCES;
 315        }
 316    }
 317
 318    src = blk_new();
 319    blk_insert_bs(src, bs);
 320
 321    backing = blk_new();
 322    blk_insert_bs(backing, bs->backing->bs);
 323
 324    length = blk_getlength(src);
 325    if (length < 0) {
 326        ret = length;
 327        goto ro_cleanup;
 328    }
 329
 330    backing_length = blk_getlength(backing);
 331    if (backing_length < 0) {
 332        ret = backing_length;
 333        goto ro_cleanup;
 334    }
 335
 336    /* If our top snapshot is larger than the backing file image,
 337     * grow the backing file image if possible.  If not possible,
 338     * we must return an error */
 339    if (length > backing_length) {
 340        ret = blk_truncate(backing, length);
 341        if (ret < 0) {
 342            goto ro_cleanup;
 343        }
 344    }
 345
 346    total_sectors = length >> BDRV_SECTOR_BITS;
 347
 348    /* blk_try_blockalign() for src will choose an alignment that works for
 349     * backing as well, so no need to compare the alignment manually. */
 350    buf = blk_try_blockalign(src, COMMIT_BUF_SECTORS * BDRV_SECTOR_SIZE);
 351    if (buf == NULL) {
 352        ret = -ENOMEM;
 353        goto ro_cleanup;
 354    }
 355
 356    for (sector = 0; sector < total_sectors; sector += n) {
 357        ret = bdrv_is_allocated(bs, sector, COMMIT_BUF_SECTORS, &n);
 358        if (ret < 0) {
 359            goto ro_cleanup;
 360        }
 361        if (ret) {
 362            ret = blk_pread(src, sector * BDRV_SECTOR_SIZE, buf,
 363                            n * BDRV_SECTOR_SIZE);
 364            if (ret < 0) {
 365                goto ro_cleanup;
 366            }
 367
 368            ret = blk_pwrite(backing, sector * BDRV_SECTOR_SIZE, buf,
 369                             n * BDRV_SECTOR_SIZE, 0);
 370            if (ret < 0) {
 371                goto ro_cleanup;
 372            }
 373        }
 374    }
 375
 376    if (drv->bdrv_make_empty) {
 377        ret = drv->bdrv_make_empty(bs);
 378        if (ret < 0) {
 379            goto ro_cleanup;
 380        }
 381        blk_flush(src);
 382    }
 383
 384    /*
 385     * Make sure all data we wrote to the backing device is actually
 386     * stable on disk.
 387     */
 388    blk_flush(backing);
 389
 390    ret = 0;
 391ro_cleanup:
 392    qemu_vfree(buf);
 393
 394    blk_unref(src);
 395    blk_unref(backing);
 396
 397    if (ro) {
 398        /* ignoring error return here */
 399        bdrv_reopen(bs->backing->bs, open_flags & ~BDRV_O_RDWR, NULL);
 400    }
 401
 402    return ret;
 403}
 404