qemu/block/copy-before-write.c
<<
>>
Prefs
   1/*
   2 * copy-before-write filter driver
   3 *
   4 * The driver performs Copy-Before-Write (CBW) operation: it is injected above
   5 * some node, and before each write it copies _old_ data to the target node.
   6 *
   7 * Copyright (c) 2018-2021 Virtuozzo International GmbH.
   8 *
   9 * Author:
  10 *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
  11 *
  12 * This program is free software; you can redistribute it and/or modify
  13 * it under the terms of the GNU General Public License as published by
  14 * the Free Software Foundation; either version 2 of the License, or
  15 * (at your option) any later version.
  16 *
  17 * This program is distributed in the hope that it will be useful,
  18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 * GNU General Public License for more details.
  21 *
  22 * You should have received a copy of the GNU General Public License
  23 * along with this program. If not, see <http://www.gnu.org/licenses/>.
  24 */
  25
  26#include "qemu/osdep.h"
  27
  28#include "sysemu/block-backend.h"
  29#include "qemu/cutils.h"
  30#include "qapi/error.h"
  31#include "block/block_int.h"
  32#include "block/qdict.h"
  33#include "block/block-copy.h"
  34
  35#include "block/copy-before-write.h"
  36#include "block/reqlist.h"
  37
  38#include "qapi/qapi-visit-block-core.h"
  39
  40typedef struct BDRVCopyBeforeWriteState {
  41    BlockCopyState *bcs;
  42    BdrvChild *target;
  43
  44    /*
  45     * @lock: protects access to @access_bitmap, @done_bitmap and
  46     * @frozen_read_reqs
  47     */
  48    CoMutex lock;
  49
  50    /*
  51     * @access_bitmap: represents areas allowed for reading by fleecing user.
  52     * Reading from non-dirty areas leads to -EACCES.
  53     */
  54    BdrvDirtyBitmap *access_bitmap;
  55
  56    /*
  57     * @done_bitmap: represents areas that was successfully copied to @target by
  58     * copy-before-write operations.
  59     */
  60    BdrvDirtyBitmap *done_bitmap;
  61
  62    /*
  63     * @frozen_read_reqs: current read requests for fleecing user in bs->file
  64     * node. These areas must not be rewritten by guest.
  65     */
  66    BlockReqList frozen_read_reqs;
  67} BDRVCopyBeforeWriteState;
  68
  69static coroutine_fn int cbw_co_preadv(
  70        BlockDriverState *bs, int64_t offset, int64_t bytes,
  71        QEMUIOVector *qiov, BdrvRequestFlags flags)
  72{
  73    return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
  74}
  75
  76/*
  77 * Do copy-before-write operation.
  78 *
  79 * On failure guest request must be failed too.
  80 *
  81 * On success, we also wait for all in-flight fleecing read requests in source
  82 * node, and it's guaranteed that after cbw_do_copy_before_write() successful
  83 * return there are no such requests and they will never appear.
  84 */
  85static coroutine_fn int cbw_do_copy_before_write(BlockDriverState *bs,
  86        uint64_t offset, uint64_t bytes, BdrvRequestFlags flags)
  87{
  88    BDRVCopyBeforeWriteState *s = bs->opaque;
  89    int ret;
  90    uint64_t off, end;
  91    int64_t cluster_size = block_copy_cluster_size(s->bcs);
  92
  93    if (flags & BDRV_REQ_WRITE_UNCHANGED) {
  94        return 0;
  95    }
  96
  97    off = QEMU_ALIGN_DOWN(offset, cluster_size);
  98    end = QEMU_ALIGN_UP(offset + bytes, cluster_size);
  99
 100    ret = block_copy(s->bcs, off, end - off, true);
 101    if (ret < 0) {
 102        return ret;
 103    }
 104
 105    WITH_QEMU_LOCK_GUARD(&s->lock) {
 106        bdrv_set_dirty_bitmap(s->done_bitmap, off, end - off);
 107        reqlist_wait_all(&s->frozen_read_reqs, off, end - off, &s->lock);
 108    }
 109
 110    return 0;
 111}
 112
 113static int coroutine_fn cbw_co_pdiscard(BlockDriverState *bs,
 114                                        int64_t offset, int64_t bytes)
 115{
 116    int ret = cbw_do_copy_before_write(bs, offset, bytes, 0);
 117    if (ret < 0) {
 118        return ret;
 119    }
 120
 121    return bdrv_co_pdiscard(bs->file, offset, bytes);
 122}
 123
 124static int coroutine_fn cbw_co_pwrite_zeroes(BlockDriverState *bs,
 125        int64_t offset, int64_t bytes, BdrvRequestFlags flags)
 126{
 127    int ret = cbw_do_copy_before_write(bs, offset, bytes, flags);
 128    if (ret < 0) {
 129        return ret;
 130    }
 131
 132    return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
 133}
 134
 135static coroutine_fn int cbw_co_pwritev(BlockDriverState *bs,
 136                                       int64_t offset,
 137                                       int64_t bytes,
 138                                       QEMUIOVector *qiov,
 139                                       BdrvRequestFlags flags)
 140{
 141    int ret = cbw_do_copy_before_write(bs, offset, bytes, flags);
 142    if (ret < 0) {
 143        return ret;
 144    }
 145
 146    return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
 147}
 148
 149static int coroutine_fn cbw_co_flush(BlockDriverState *bs)
 150{
 151    if (!bs->file) {
 152        return 0;
 153    }
 154
 155    return bdrv_co_flush(bs->file->bs);
 156}
 157
 158/*
 159 * If @offset not accessible - return NULL.
 160 *
 161 * Otherwise, set @pnum to some bytes that accessible from @file (@file is set
 162 * to bs->file or to s->target). Return newly allocated BlockReq object that
 163 * should be than passed to cbw_snapshot_read_unlock().
 164 *
 165 * It's guaranteed that guest writes will not interact in the region until
 166 * cbw_snapshot_read_unlock() called.
 167 */
 168static BlockReq *cbw_snapshot_read_lock(BlockDriverState *bs,
 169                                        int64_t offset, int64_t bytes,
 170                                        int64_t *pnum, BdrvChild **file)
 171{
 172    BDRVCopyBeforeWriteState *s = bs->opaque;
 173    BlockReq *req = g_new(BlockReq, 1);
 174    bool done;
 175
 176    QEMU_LOCK_GUARD(&s->lock);
 177
 178    if (bdrv_dirty_bitmap_next_zero(s->access_bitmap, offset, bytes) != -1) {
 179        g_free(req);
 180        return NULL;
 181    }
 182
 183    done = bdrv_dirty_bitmap_status(s->done_bitmap, offset, bytes, pnum);
 184    if (done) {
 185        /*
 186         * Special invalid BlockReq, that is handled in
 187         * cbw_snapshot_read_unlock(). We don't need to lock something to read
 188         * from s->target.
 189         */
 190        *req = (BlockReq) {.offset = -1, .bytes = -1};
 191        *file = s->target;
 192    } else {
 193        reqlist_init_req(&s->frozen_read_reqs, req, offset, bytes);
 194        *file = bs->file;
 195    }
 196
 197    return req;
 198}
 199
 200static void cbw_snapshot_read_unlock(BlockDriverState *bs, BlockReq *req)
 201{
 202    BDRVCopyBeforeWriteState *s = bs->opaque;
 203
 204    if (req->offset == -1 && req->bytes == -1) {
 205        g_free(req);
 206        return;
 207    }
 208
 209    QEMU_LOCK_GUARD(&s->lock);
 210
 211    reqlist_remove_req(req);
 212    g_free(req);
 213}
 214
 215static coroutine_fn int
 216cbw_co_preadv_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes,
 217                       QEMUIOVector *qiov, size_t qiov_offset)
 218{
 219    BlockReq *req;
 220    BdrvChild *file;
 221    int ret;
 222
 223    /* TODO: upgrade to async loop using AioTask */
 224    while (bytes) {
 225        int64_t cur_bytes;
 226
 227        req = cbw_snapshot_read_lock(bs, offset, bytes, &cur_bytes, &file);
 228        if (!req) {
 229            return -EACCES;
 230        }
 231
 232        ret = bdrv_co_preadv_part(file, offset, cur_bytes,
 233                                  qiov, qiov_offset, 0);
 234        cbw_snapshot_read_unlock(bs, req);
 235        if (ret < 0) {
 236            return ret;
 237        }
 238
 239        bytes -= cur_bytes;
 240        offset += cur_bytes;
 241        qiov_offset += cur_bytes;
 242    }
 243
 244    return 0;
 245}
 246
 247static int coroutine_fn
 248cbw_co_snapshot_block_status(BlockDriverState *bs,
 249                             bool want_zero, int64_t offset, int64_t bytes,
 250                             int64_t *pnum, int64_t *map,
 251                             BlockDriverState **file)
 252{
 253    BDRVCopyBeforeWriteState *s = bs->opaque;
 254    BlockReq *req;
 255    int ret;
 256    int64_t cur_bytes;
 257    BdrvChild *child;
 258
 259    req = cbw_snapshot_read_lock(bs, offset, bytes, &cur_bytes, &child);
 260    if (!req) {
 261        return -EACCES;
 262    }
 263
 264    ret = bdrv_block_status(child->bs, offset, cur_bytes, pnum, map, file);
 265    if (child == s->target) {
 266        /*
 267         * We refer to s->target only for areas that we've written to it.
 268         * And we can not report unallocated blocks in s->target: this will
 269         * break generic block-status-above logic, that will go to
 270         * copy-before-write filtered child in this case.
 271         */
 272        assert(ret & BDRV_BLOCK_ALLOCATED);
 273    }
 274
 275    cbw_snapshot_read_unlock(bs, req);
 276
 277    return ret;
 278}
 279
 280static int coroutine_fn cbw_co_pdiscard_snapshot(BlockDriverState *bs,
 281                                                 int64_t offset, int64_t bytes)
 282{
 283    BDRVCopyBeforeWriteState *s = bs->opaque;
 284
 285    WITH_QEMU_LOCK_GUARD(&s->lock) {
 286        bdrv_reset_dirty_bitmap(s->access_bitmap, offset, bytes);
 287    }
 288
 289    block_copy_reset(s->bcs, offset, bytes);
 290
 291    return bdrv_co_pdiscard(s->target, offset, bytes);
 292}
 293
 294static void cbw_refresh_filename(BlockDriverState *bs)
 295{
 296    pstrcpy(bs->exact_filename, sizeof(bs->exact_filename),
 297            bs->file->bs->filename);
 298}
 299
 300static void cbw_child_perm(BlockDriverState *bs, BdrvChild *c,
 301                           BdrvChildRole role,
 302                           BlockReopenQueue *reopen_queue,
 303                           uint64_t perm, uint64_t shared,
 304                           uint64_t *nperm, uint64_t *nshared)
 305{
 306    if (!(role & BDRV_CHILD_FILTERED)) {
 307        /*
 308         * Target child
 309         *
 310         * Share write to target (child_file), to not interfere
 311         * with guest writes to its disk which may be in target backing chain.
 312         * Can't resize during a backup block job because we check the size
 313         * only upfront.
 314         */
 315        *nshared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
 316        *nperm = BLK_PERM_WRITE;
 317    } else {
 318        /* Source child */
 319        bdrv_default_perms(bs, c, role, reopen_queue,
 320                           perm, shared, nperm, nshared);
 321
 322        if (!QLIST_EMPTY(&bs->parents)) {
 323            if (perm & BLK_PERM_WRITE) {
 324                *nperm = *nperm | BLK_PERM_CONSISTENT_READ;
 325            }
 326            *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
 327        }
 328    }
 329}
 330
 331static bool cbw_parse_bitmap_option(QDict *options, BdrvDirtyBitmap **bitmap,
 332                                    Error **errp)
 333{
 334    QDict *bitmap_qdict = NULL;
 335    BlockDirtyBitmap *bmp_param = NULL;
 336    Visitor *v = NULL;
 337    bool ret = false;
 338
 339    *bitmap = NULL;
 340
 341    qdict_extract_subqdict(options, &bitmap_qdict, "bitmap.");
 342    if (!qdict_size(bitmap_qdict)) {
 343        ret = true;
 344        goto out;
 345    }
 346
 347    v = qobject_input_visitor_new_flat_confused(bitmap_qdict, errp);
 348    if (!v) {
 349        goto out;
 350    }
 351
 352    visit_type_BlockDirtyBitmap(v, NULL, &bmp_param, errp);
 353    if (!bmp_param) {
 354        goto out;
 355    }
 356
 357    *bitmap = block_dirty_bitmap_lookup(bmp_param->node, bmp_param->name, NULL,
 358                                        errp);
 359    if (!*bitmap) {
 360        goto out;
 361    }
 362
 363    ret = true;
 364
 365out:
 366    qapi_free_BlockDirtyBitmap(bmp_param);
 367    visit_free(v);
 368    qobject_unref(bitmap_qdict);
 369
 370    return ret;
 371}
 372
 373static int cbw_open(BlockDriverState *bs, QDict *options, int flags,
 374                    Error **errp)
 375{
 376    BDRVCopyBeforeWriteState *s = bs->opaque;
 377    BdrvDirtyBitmap *bitmap = NULL;
 378    int64_t cluster_size;
 379
 380    bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
 381                               BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
 382                               false, errp);
 383    if (!bs->file) {
 384        return -EINVAL;
 385    }
 386
 387    s->target = bdrv_open_child(NULL, options, "target", bs, &child_of_bds,
 388                                BDRV_CHILD_DATA, false, errp);
 389    if (!s->target) {
 390        return -EINVAL;
 391    }
 392
 393    if (!cbw_parse_bitmap_option(options, &bitmap, errp)) {
 394        return -EINVAL;
 395    }
 396
 397    bs->total_sectors = bs->file->bs->total_sectors;
 398    bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
 399            (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
 400    bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
 401            ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
 402             bs->file->bs->supported_zero_flags);
 403
 404    s->bcs = block_copy_state_new(bs->file, s->target, bitmap, errp);
 405    if (!s->bcs) {
 406        error_prepend(errp, "Cannot create block-copy-state: ");
 407        return -EINVAL;
 408    }
 409
 410    cluster_size = block_copy_cluster_size(s->bcs);
 411
 412    s->done_bitmap = bdrv_create_dirty_bitmap(bs, cluster_size, NULL, errp);
 413    if (!s->done_bitmap) {
 414        return -EINVAL;
 415    }
 416    bdrv_disable_dirty_bitmap(s->done_bitmap);
 417
 418    /* s->access_bitmap starts equal to bcs bitmap */
 419    s->access_bitmap = bdrv_create_dirty_bitmap(bs, cluster_size, NULL, errp);
 420    if (!s->access_bitmap) {
 421        return -EINVAL;
 422    }
 423    bdrv_disable_dirty_bitmap(s->access_bitmap);
 424    bdrv_dirty_bitmap_merge_internal(s->access_bitmap,
 425                                     block_copy_dirty_bitmap(s->bcs), NULL,
 426                                     true);
 427
 428    qemu_co_mutex_init(&s->lock);
 429    QLIST_INIT(&s->frozen_read_reqs);
 430
 431    return 0;
 432}
 433
 434static void cbw_close(BlockDriverState *bs)
 435{
 436    BDRVCopyBeforeWriteState *s = bs->opaque;
 437
 438    bdrv_release_dirty_bitmap(s->access_bitmap);
 439    bdrv_release_dirty_bitmap(s->done_bitmap);
 440
 441    block_copy_state_free(s->bcs);
 442    s->bcs = NULL;
 443}
 444
 445BlockDriver bdrv_cbw_filter = {
 446    .format_name = "copy-before-write",
 447    .instance_size = sizeof(BDRVCopyBeforeWriteState),
 448
 449    .bdrv_open                  = cbw_open,
 450    .bdrv_close                 = cbw_close,
 451
 452    .bdrv_co_preadv             = cbw_co_preadv,
 453    .bdrv_co_pwritev            = cbw_co_pwritev,
 454    .bdrv_co_pwrite_zeroes      = cbw_co_pwrite_zeroes,
 455    .bdrv_co_pdiscard           = cbw_co_pdiscard,
 456    .bdrv_co_flush              = cbw_co_flush,
 457
 458    .bdrv_co_preadv_snapshot       = cbw_co_preadv_snapshot,
 459    .bdrv_co_pdiscard_snapshot     = cbw_co_pdiscard_snapshot,
 460    .bdrv_co_snapshot_block_status = cbw_co_snapshot_block_status,
 461
 462    .bdrv_refresh_filename      = cbw_refresh_filename,
 463
 464    .bdrv_child_perm            = cbw_child_perm,
 465
 466    .is_filter = true,
 467};
 468
 469BlockDriverState *bdrv_cbw_append(BlockDriverState *source,
 470                                  BlockDriverState *target,
 471                                  const char *filter_node_name,
 472                                  BlockCopyState **bcs,
 473                                  Error **errp)
 474{
 475    ERRP_GUARD();
 476    BDRVCopyBeforeWriteState *state;
 477    BlockDriverState *top;
 478    QDict *opts;
 479
 480    assert(source->total_sectors == target->total_sectors);
 481    GLOBAL_STATE_CODE();
 482
 483    opts = qdict_new();
 484    qdict_put_str(opts, "driver", "copy-before-write");
 485    if (filter_node_name) {
 486        qdict_put_str(opts, "node-name", filter_node_name);
 487    }
 488    qdict_put_str(opts, "file", bdrv_get_node_name(source));
 489    qdict_put_str(opts, "target", bdrv_get_node_name(target));
 490
 491    top = bdrv_insert_node(source, opts, BDRV_O_RDWR, errp);
 492    if (!top) {
 493        return NULL;
 494    }
 495
 496    state = top->opaque;
 497    *bcs = state->bcs;
 498
 499    return top;
 500}
 501
 502void bdrv_cbw_drop(BlockDriverState *bs)
 503{
 504    GLOBAL_STATE_CODE();
 505    bdrv_drop_filter(bs, &error_abort);
 506    bdrv_unref(bs);
 507}
 508
 509static void cbw_init(void)
 510{
 511    bdrv_register(&bdrv_cbw_filter);
 512}
 513
 514block_init(cbw_init);
 515