qemu/block/block-copy.c
/*
 * block_copy API
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 * Copyright (c) 2019 Virtuozzo International GmbH.
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "qapi/error.h"
#include "block/block-copy.h"
#include "sysemu/block-backend.h"
#include "qemu/units.h"

#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
#define BLOCK_COPY_MAX_MEM (128 * MiB)

typedef struct BlockCopyInFlightReq {
    int64_t offset;
    int64_t bytes;
    QLIST_ENTRY(BlockCopyInFlightReq) list;
    CoQueue wait_queue; /* coroutines blocked on this request */
} BlockCopyInFlightReq;

typedef struct BlockCopyState {
    /*
     * BdrvChild objects are not owned or managed by block-copy. They are
     * provided by the block-copy user, who is responsible for ensuring
     * appropriate permissions on these children.
     */
    BdrvChild *source;
    BdrvChild *target;
    BdrvDirtyBitmap *copy_bitmap;
    int64_t in_flight_bytes;
    int64_t cluster_size;
    bool use_copy_range;
    int64_t copy_size;
    uint64_t len;
    QLIST_HEAD(, BlockCopyInFlightReq) inflight_reqs;

    BdrvRequestFlags write_flags;

    /*
     * skip_unallocated:
     *
     * Used by sync=top jobs, which first scan the source node for unallocated
     * areas and clear them in the copy_bitmap.  During this process, the bitmap
     * is thus not fully initialized: It may still have bits set for areas that
     * are unallocated and should actually not be copied.
     *
     * This is indicated by skip_unallocated.
     *
     * In this case, block_copy() will query the source's allocation status,
     * skip unallocated regions, clear them in the copy_bitmap, and invoke
     * block_copy_reset_unallocated() every time it does.
     */
    bool skip_unallocated;

    ProgressMeter *progress;
    /* progress_bytes_callback: called when some copying progress is done. */
    ProgressBytesCallbackFunc progress_bytes_callback;
    void *progress_opaque;

    SharedResource *mem;
} BlockCopyState;

static BlockCopyInFlightReq *find_conflicting_inflight_req(BlockCopyState *s,
                                                           int64_t offset,
                                                           int64_t bytes)
{
    BlockCopyInFlightReq *req;

    QLIST_FOREACH(req, &s->inflight_reqs, list) {
        if (offset + bytes > req->offset && offset < req->offset + req->bytes) {
            return req;
        }
    }

    return NULL;
}

/*
 * If there are no intersecting requests, return false. Otherwise, wait for the
 * first intersecting request found to finish, then return true.
 */
static bool coroutine_fn block_copy_wait_one(BlockCopyState *s, int64_t offset,
                                             int64_t bytes)
{
    BlockCopyInFlightReq *req = find_conflicting_inflight_req(s, offset, bytes);

    if (!req) {
        return false;
    }

    qemu_co_queue_wait(&req->wait_queue, NULL);

    return true;
}

/* Called only on a full-dirty region */
static void block_copy_inflight_req_begin(BlockCopyState *s,
                                          BlockCopyInFlightReq *req,
                                          int64_t offset, int64_t bytes)
{
    assert(!find_conflicting_inflight_req(s, offset, bytes));

    bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
    s->in_flight_bytes += bytes;

    req->offset = offset;
    req->bytes = bytes;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&s->inflight_reqs, req, list);
}

/*
 * block_copy_inflight_req_shrink
 *
 * Drop the tail of the request, to be handled later. Set the corresponding
 * dirty bits back and wake up all requests waiting for us (some of them may
 * no longer intersect the shrunk request).
 */
static void coroutine_fn block_copy_inflight_req_shrink(BlockCopyState *s,
        BlockCopyInFlightReq *req, int64_t new_bytes)
{
    if (new_bytes == req->bytes) {
        return;
    }

    assert(new_bytes > 0 && new_bytes < req->bytes);

    s->in_flight_bytes -= req->bytes - new_bytes;
    bdrv_set_dirty_bitmap(s->copy_bitmap,
                          req->offset + new_bytes, req->bytes - new_bytes);

    req->bytes = new_bytes;
    qemu_co_queue_restart_all(&req->wait_queue);
}

static void coroutine_fn block_copy_inflight_req_end(BlockCopyState *s,
                                                     BlockCopyInFlightReq *req,
                                                     int ret)
{
    s->in_flight_bytes -= req->bytes;
    if (ret < 0) {
        bdrv_set_dirty_bitmap(s->copy_bitmap, req->offset, req->bytes);
    }
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}
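
/*
 * The three helpers above form the serialization protocol used by
 * block_copy_dirty_clusters() below.  A minimal sketch of the intended call
 * pattern (illustrative only; "narrowed_bytes" is a made-up name and error
 * handling is omitted):
 *
 *     BlockCopyInFlightReq req;
 *
 *     block_copy_inflight_req_begin(s, &req, offset, bytes);
 *     // possibly narrow the request after querying block status
 *     block_copy_inflight_req_shrink(s, &req, narrowed_bytes);
 *     ret = block_copy_do_copy(s, offset, narrowed_bytes, zeroes,
 *                              error_is_read);
 *     block_copy_inflight_req_end(s, &req, ret);
 *
 * Concurrent block_copy() calls that overlap [offset, offset + bytes) wait in
 * req.wait_queue via block_copy_wait_one() until _shrink() or _end() wakes
 * them up.
 */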

void block_copy_state_free(BlockCopyState *s)
{
    if (!s) {
        return;
    }

    bdrv_release_dirty_bitmap(s->copy_bitmap);
    shres_destroy(s->mem);
    g_free(s);
}

static uint32_t block_copy_max_transfer(BdrvChild *source, BdrvChild *target)
{
    return MIN_NON_ZERO(INT_MAX,
                        MIN_NON_ZERO(source->bs->bl.max_transfer,
                                     target->bs->bl.max_transfer));
}

BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
                                     int64_t cluster_size,
                                     BdrvRequestFlags write_flags, Error **errp)
{
    BlockCopyState *s;
    BdrvDirtyBitmap *copy_bitmap;

    copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
                                           errp);
    if (!copy_bitmap) {
        return NULL;
    }
    bdrv_disable_dirty_bitmap(copy_bitmap);

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
        .source = source,
        .target = target,
        .copy_bitmap = copy_bitmap,
        .cluster_size = cluster_size,
        .len = bdrv_dirty_bitmap_size(copy_bitmap),
        .write_flags = write_flags,
        .mem = shres_create(BLOCK_COPY_MAX_MEM),
    };

    if (block_copy_max_transfer(source, target) < cluster_size) {
        /*
         * copy_range does not respect max_transfer. We don't want to bother
         * with requests smaller than the block-copy cluster size, so fall back
         * to buffered copying (read and write respect max_transfer on their
         * own).
         */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
        /* Compression supports only cluster-size writes and no copy-range. */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else {
        /*
         * We enable copy-range, but keep a small copy_size until the first
         * successful copy_range (see block_copy_do_copy).
         */
        s->use_copy_range = true;
        s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
    }
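
    /*
     * Worked example (hypothetical numbers): with cluster_size = 64 KiB and
     * source/target max_transfer = 4 MiB, the copy-range branch above starts
     * with copy_size = MAX(64 KiB, BLOCK_COPY_MAX_BUFFER) = 1 MiB.  After the
     * first successful copy_range, block_copy_do_copy() raises it to
     * MIN(MAX(64 KiB, BLOCK_COPY_MAX_COPY_RANGE),
     *     QEMU_ALIGN_DOWN(4 MiB, 64 KiB)) = 4 MiB.
     */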

    QLIST_INIT(&s->inflight_reqs);

    return s;
}
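
/*
 * A minimal creation sketch for a block-copy user (hypothetical caller, not
 * part of this file; the BdrvChild variables, cluster size and write flags
 * are assumptions -- a real user such as the backup job derives them from its
 * target node and sync mode):
 *
 *     BlockCopyState *bcs;
 *     Error *local_err = NULL;
 *
 *     bcs = block_copy_state_new(source_child, target_child,
 *                                64 * KiB,
 *                                BDRV_REQ_SERIALISING, &local_err);
 *     if (!bcs) {
 *         error_report_err(local_err);
 *         return -EINVAL;
 *     }
 *     ...
 *     block_copy_state_free(bcs);
 *
 * The caller remains responsible for the BdrvChild permissions, as noted in
 * the BlockCopyState definition above.
 */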

void block_copy_set_progress_callback(
        BlockCopyState *s,
        ProgressBytesCallbackFunc progress_bytes_callback,
        void *progress_opaque)
{
    s->progress_bytes_callback = progress_bytes_callback;
    s->progress_opaque = progress_opaque;
}

void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm)
{
    s->progress = pm;
}
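
/*
 * Progress wiring sketch (hypothetical job code; "bcs", "job" and
 * my_progress_callback are made-up names).  A job typically points block-copy
 * at its own ProgressMeter and receives per-chunk byte counts via the
 * callback:
 *
 *     static void my_progress_callback(int64_t bytes, void *opaque)
 *     {
 *         // e.g. rate-limit accounting for the job passed as opaque
 *     }
 *
 *     block_copy_set_progress_meter(bcs, &job->progress);
 *     block_copy_set_progress_callback(bcs, my_progress_callback, job);
 */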

/*
 * block_copy_do_copy
 *
 * Copy a cluster-aligned chunk. The requested region is allowed to exceed
 * s->len only to cover the last cluster when s->len is not aligned to the
 * cluster size.
 *
 * No synchronization here: neither bitmap nor intersecting-request handling,
 * just the copy itself.
 *
 * Returns 0 on success.
 */
static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
                                           int64_t offset, int64_t bytes,
                                           bool zeroes, bool *error_is_read)
{
    int ret;
    int64_t nbytes = MIN(offset + bytes, s->len) - offset;
    void *bounce_buffer = NULL;

    assert(offset >= 0 && bytes > 0 && INT64_MAX - offset >= bytes);
    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));
    assert(offset < s->len);
    assert(offset + bytes <= s->len ||
           offset + bytes == QEMU_ALIGN_UP(s->len, s->cluster_size));
    assert(nbytes < INT_MAX);

    if (zeroes) {
        ret = bdrv_co_pwrite_zeroes(s->target, offset, nbytes, s->write_flags &
                                    ~BDRV_REQ_WRITE_COMPRESSED);
        if (ret < 0) {
            trace_block_copy_write_zeroes_fail(s, offset, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
        }
        return ret;
    }

    if (s->use_copy_range) {
        ret = bdrv_co_copy_range(s->source, offset, s->target, offset, nbytes,
                                 0, s->write_flags);
        if (ret < 0) {
            trace_block_copy_copy_range_fail(s, offset, ret);
            s->use_copy_range = false;
            s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
            /* Fall back to read+write with an allocated buffer */
        } else {
            if (s->use_copy_range) {
                /*
                 * Successful copy-range. Now increase copy_size.  copy_range
                 * does not respect max_transfer (it's a TODO), so we factor
                 * that in here.
                 *
                 * Note: we double-check s->use_copy_range for the case when a
                 * parallel block-copy request unset it during the previous
                 * bdrv_co_copy_range call.
                 */
                s->copy_size =
                        MIN(MAX(s->cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
                            QEMU_ALIGN_DOWN(block_copy_max_transfer(s->source,
                                                                    s->target),
                                            s->cluster_size));
            }
            goto out;
        }
    }

    /*
     * If the copy_range request above failed, we may proceed with a buffered
     * request larger than BLOCK_COPY_MAX_BUFFER. Further requests will still
     * be properly limited, so this is not a big concern. Moreover, the most
     * likely case (copy_range is unsupported for the configuration, so the
     * very first copy_range request fails) is handled by enlarging copy_size
     * only after the first successful copy_range.
     */

    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);

    ret = bdrv_co_pread(s->source, offset, nbytes, bounce_buffer, 0);
    if (ret < 0) {
        trace_block_copy_read_fail(s, offset, ret);
        if (error_is_read) {
            *error_is_read = true;
        }
        goto out;
    }

    ret = bdrv_co_pwrite(s->target, offset, nbytes, bounce_buffer,
                         s->write_flags);
    if (ret < 0) {
        trace_block_copy_write_fail(s, offset, ret);
        if (error_is_read) {
            *error_is_read = false;
        }
        goto out;
    }

out:
    qemu_vfree(bounce_buffer);

    return ret;
}

static int block_copy_block_status(BlockCopyState *s, int64_t offset,
                                   int64_t bytes, int64_t *pnum)
{
    int64_t num;
    BlockDriverState *base;
    int ret;

    if (s->skip_unallocated && s->source->bs->backing) {
        base = s->source->bs->backing->bs;
    } else {
        base = NULL;
    }

    ret = bdrv_block_status_above(s->source->bs, base, offset, bytes, &num,
                                  NULL, NULL);
    if (ret < 0 || num < s->cluster_size) {
        /*
         * On error, or if we failed to obtain a large enough chunk, just fall
         * back to copying one cluster.
         */
        num = s->cluster_size;
        ret = BDRV_BLOCK_ALLOCATED | BDRV_BLOCK_DATA;
    } else if (offset + num == s->len) {
        num = QEMU_ALIGN_UP(num, s->cluster_size);
    } else {
        num = QEMU_ALIGN_DOWN(num, s->cluster_size);
    }

    *pnum = num;
    return ret;
}

/*
 * Check if the cluster starting at offset is allocated or not.
 * Return via @pnum the number of contiguous clusters sharing this allocation.
 */
static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
                                           int64_t *pnum)
{
    BlockDriverState *bs = s->source->bs;
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;
    int ret;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

    while (true) {
        ret = bdrv_is_allocated(bs, offset, bytes, &count);
        if (ret < 0) {
            return ret;
        }

        total_count += count;

        if (ret || count == 0) {
            /*
             * If ret: partial segment(s) are considered allocated.
             * Otherwise: the unallocated tail is treated as an entire segment.
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
            return ret;
        }

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            *pnum = total_count / s->cluster_size;
            return 0;
        }

        offset += count;
        bytes -= count;
    }
}

/*
 * Reset bits in copy_bitmap starting at offset if they represent unallocated
 * data in the image. May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and a negative error code on error.
 */
int64_t block_copy_reset_unallocated(BlockCopyState *s,
                                     int64_t offset, int64_t *count)
{
    int ret;
    int64_t clusters, bytes;

    ret = block_copy_is_cluster_allocated(s, offset, &clusters);
    if (ret < 0) {
        return ret;
    }

    bytes = clusters * s->cluster_size;

    if (!ret) {
        bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
        progress_set_remaining(s->progress,
                               bdrv_get_dirty_count(s->copy_bitmap) +
                               s->in_flight_bytes);
    }

    *count = bytes;
    return ret;
}
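
/*
 * Usage sketch for the skip_unallocated workflow described in BlockCopyState
 * (hypothetical sync=top-style caller; "bcs", "bs" and the surrounding error
 * handling are assumptions, modelled on how the backup job primes its bitmap
 * before copying):
 *
 *     int64_t offset, count, ret;
 *     int64_t len = bdrv_getlength(bs);
 *
 *     block_copy_set_skip_unallocated(bcs, true);
 *     for (offset = 0; offset < len; offset += count) {
 *         ret = block_copy_reset_unallocated(bcs, offset, &count);
 *         if (ret < 0) {
 *             return ret;
 *         }
 *     }
 *     block_copy_set_skip_unallocated(bcs, false);
 *
 * Once the scan completes, the copy_bitmap is fully initialized and the flag
 * can be cleared again; while it is still set, block_copy() itself also skips
 * and clears unallocated areas, as described above.
 */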

/*
 * block_copy_dirty_clusters
 *
 * Copy dirty clusters in the @offset/@bytes range.
 * Returns 1 if dirty clusters were found and successfully copied, 0 if no
 * dirty clusters were found, and a negative errno on failure.
 */
static int coroutine_fn block_copy_dirty_clusters(BlockCopyState *s,
                                                  int64_t offset, int64_t bytes,
                                                  bool *error_is_read)
{
    int ret = 0;
    bool found_dirty = false;

    /*
     * The block_copy() user is responsible for keeping the source and target
     * in the same AioContext.
     */
    assert(bdrv_get_aio_context(s->source->bs) ==
           bdrv_get_aio_context(s->target->bs));

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
    assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));

    while (bytes) {
        BlockCopyInFlightReq req;
        int64_t next_zero, cur_bytes, status_bytes;

        if (!bdrv_dirty_bitmap_get(s->copy_bitmap, offset)) {
            trace_block_copy_skip(s, offset);
            offset += s->cluster_size;
            bytes -= s->cluster_size;
            continue; /* already copied */
        }

        found_dirty = true;

        cur_bytes = MIN(bytes, s->copy_size);

        next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, offset,
                                                cur_bytes);
        if (next_zero >= 0) {
            assert(next_zero > offset); /* offset is dirty */
            assert(next_zero < offset + cur_bytes); /* no need to do MIN() */
            cur_bytes = next_zero - offset;
        }
        block_copy_inflight_req_begin(s, &req, offset, cur_bytes);

        ret = block_copy_block_status(s, offset, cur_bytes, &status_bytes);
        assert(ret >= 0); /* never fail */
        cur_bytes = MIN(cur_bytes, status_bytes);
        block_copy_inflight_req_shrink(s, &req, cur_bytes);
        if (s->skip_unallocated && !(ret & BDRV_BLOCK_ALLOCATED)) {
            block_copy_inflight_req_end(s, &req, 0);
            progress_set_remaining(s->progress,
                                   bdrv_get_dirty_count(s->copy_bitmap) +
                                   s->in_flight_bytes);
            trace_block_copy_skip_range(s, offset, status_bytes);
            offset += status_bytes;
            bytes -= status_bytes;
            continue;
        }

        trace_block_copy_process(s, offset);

        co_get_from_shres(s->mem, cur_bytes);
        ret = block_copy_do_copy(s, offset, cur_bytes, ret & BDRV_BLOCK_ZERO,
                                 error_is_read);
        co_put_to_shres(s->mem, cur_bytes);
        block_copy_inflight_req_end(s, &req, ret);
        if (ret < 0) {
            return ret;
        }

        progress_work_done(s->progress, cur_bytes);
        s->progress_bytes_callback(cur_bytes, s->progress_opaque);
        offset += cur_bytes;
        bytes -= cur_bytes;
    }

    return found_dirty;
}

/*
 * block_copy
 *
 * Copy the requested region according to the dirty bitmap.
 * Collaborate with parallel block_copy requests: if they succeed, that helps
 * us. If they fail, we retry the not-yet-copied regions. So, if we return an
 * error, it means that some I/O operation failed in the context of _this_
 * block_copy call, not in some parallel operation.
 */
int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
                            bool *error_is_read)
{
    int ret;

    do {
        ret = block_copy_dirty_clusters(s, offset, bytes, error_is_read);

        if (ret == 0) {
            ret = block_copy_wait_one(s, offset, bytes);
        }

        /*
         * We retry in two cases:
         * 1. Some progress was made.
         *    Something was copied, which means that there were yield points
         *    and some new dirty bits may have appeared (due to failed parallel
         *    block-copy requests).
         * 2. We have waited for some intersecting block-copy request.
         *    It may have failed and produced new dirty bits.
         */
    } while (ret > 0);

    return ret;
}
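
/*
 * A minimal caller sketch (hypothetical coroutine, not part of this file;
 * my_copy_all is a made-up name, and the chunking assumes that
 * BLOCK_COPY_MAX_BUFFER is a multiple of the cluster size and that len has
 * already been aligned up to it).  The caller picks the region; block_copy()
 * consults the dirty bitmap and copies only what is still dirty:
 *
 *     static int coroutine_fn my_copy_all(BlockCopyState *bcs, int64_t len)
 *     {
 *         int64_t offset;
 *         bool error_is_read;
 *
 *         for (offset = 0; offset < len; offset += BLOCK_COPY_MAX_BUFFER) {
 *             int64_t bytes = MIN(BLOCK_COPY_MAX_BUFFER, len - offset);
 *             int ret = block_copy(bcs, offset, bytes, &error_is_read);
 *
 *             if (ret < 0) {
 *                 return ret;
 *             }
 *         }
 *         return 0;
 *     }
 *
 * Both offset and bytes must be cluster-aligned, since
 * block_copy_dirty_clusters() asserts this.
 */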

BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s)
{
    return s->copy_bitmap;
}

void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip)
{
    s->skip_unallocated = skip;
}