qemu/block/block-copy.c
/*
 * block_copy API
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 * Copyright (c) 2019 Virtuozzo International GmbH.
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */

#include "qemu/osdep.h"

#include "trace.h"
#include "qapi/error.h"
#include "block/block-copy.h"
#include "sysemu/block-backend.h"
#include "qemu/units.h"

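/*
 * BLOCK_COPY_MAX_COPY_RANGE is the preferred chunk size when copy_range is
 * used, BLOCK_COPY_MAX_BUFFER the preferred chunk size for buffered
 * (read + write) copying, and BLOCK_COPY_MAX_MEM bounds the total amount of
 * data in flight at any moment (enforced through the shres accounting in
 * block_copy()).
 */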
#define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
#define BLOCK_COPY_MAX_BUFFER (1 * MiB)
#define BLOCK_COPY_MAX_MEM (128 * MiB)

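/*
 * Block until no in-flight request intersects [start, end).  The list is
 * re-scanned from the beginning after every wake-up, since new overlapping
 * requests may have been registered while we were waiting.
 */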
static void coroutine_fn block_copy_wait_inflight_reqs(BlockCopyState *s,
                                                       int64_t start,
                                                       int64_t end)
{
    BlockCopyInFlightReq *req;
    bool waited;

    do {
        waited = false;
        QLIST_FOREACH(req, &s->inflight_reqs, list) {
            if (end > req->start_byte && start < req->end_byte) {
                qemu_co_queue_wait(&req->wait_queue, NULL);
                waited = true;
                break;
            }
        }
    } while (waited);
}

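/*
 * Register @req as in flight for [start, end).  Must be paired with
 * block_copy_inflight_req_end().
 */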
static void block_copy_inflight_req_begin(BlockCopyState *s,
                                          BlockCopyInFlightReq *req,
                                          int64_t start, int64_t end)
{
    req->start_byte = start;
    req->end_byte = end;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&s->inflight_reqs, req, list);
}

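/* Drop @req from the in-flight list and wake everybody waiting on it. */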
static void coroutine_fn block_copy_inflight_req_end(BlockCopyInFlightReq *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

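/*
 * Free @s together with its dirty bitmap and shared-resource accounting.
 * Passing NULL is a no-op.
 */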
void block_copy_state_free(BlockCopyState *s)
{
    if (!s) {
        return;
    }

    bdrv_release_dirty_bitmap(s->copy_bitmap);
    shres_destroy(s->mem);
    g_free(s);
}

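/*
 * Create a BlockCopyState for copying from @source to @target in
 * @cluster_size chunks.  A disabled dirty bitmap with @cluster_size
 * granularity is created on the source to track what still has to be copied.
 * Depending on @write_flags and the devices' max_transfer, either copy_range
 * or buffered copying is selected.  Returns NULL and sets @errp on failure.
 */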
BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
                                     int64_t cluster_size,
                                     BdrvRequestFlags write_flags, Error **errp)
{
    BlockCopyState *s;
    BdrvDirtyBitmap *copy_bitmap;
    uint32_t max_transfer =
            MIN_NON_ZERO(INT_MAX,
                         MIN_NON_ZERO(source->bs->bl.max_transfer,
                                      target->bs->bl.max_transfer));

    copy_bitmap = bdrv_create_dirty_bitmap(source->bs, cluster_size, NULL,
                                           errp);
    if (!copy_bitmap) {
        return NULL;
    }
    bdrv_disable_dirty_bitmap(copy_bitmap);

    s = g_new(BlockCopyState, 1);
    *s = (BlockCopyState) {
        .source = source,
        .target = target,
        .copy_bitmap = copy_bitmap,
        .cluster_size = cluster_size,
        .len = bdrv_dirty_bitmap_size(copy_bitmap),
        .write_flags = write_flags,
        .mem = shres_create(BLOCK_COPY_MAX_MEM),
    };

    if (max_transfer < cluster_size) {
        /*
         * copy_range does not respect max_transfer. We don't want to bother
         * with requests smaller than the block-copy cluster size, so fall
         * back to buffered copying (read and write respect max_transfer on
         * their own).
         */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else if (write_flags & BDRV_REQ_WRITE_COMPRESSED) {
        /* Compression supports only cluster-size writes and no copy-range. */
        s->use_copy_range = false;
        s->copy_size = cluster_size;
    } else {
        /*
         * copy_range does not respect max_transfer (it's a TODO), so we factor
         * that in here.
         */
        s->use_copy_range = true;
        s->copy_size = MIN(MAX(cluster_size, BLOCK_COPY_MAX_COPY_RANGE),
                           QEMU_ALIGN_DOWN(max_transfer, cluster_size));
    }

    QLIST_INIT(&s->inflight_reqs);

    return s;
}

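/*
 * Register progress callbacks.  block_copy() and
 * block_copy_reset_unallocated() invoke them without NULL checks, so this
 * must be called before any copying is started.
 */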
void block_copy_set_callbacks(
        BlockCopyState *s,
        ProgressBytesCallbackFunc progress_bytes_callback,
        ProgressResetCallbackFunc progress_reset_callback,
        void *progress_opaque)
{
    s->progress_bytes_callback = progress_bytes_callback;
    s->progress_reset_callback = progress_reset_callback;
    s->progress_opaque = progress_opaque;
}

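/*
 * Typical call sequence, sketched with a hypothetical caller (error handling,
 * callback bodies and the setup of the copy bitmap are omitted; block/backup.c
 * is a real user).  block_copy() is a coroutine_fn, so the copy itself runs in
 * coroutine context:
 *
 *     BlockCopyState *bcs;
 *     bool error_is_read;
 *     int ret;
 *
 *     bcs = block_copy_state_new(source_child, target_child, cluster_size,
 *                                write_flags, errp);
 *     block_copy_set_callbacks(bcs, my_progress_bytes, my_progress_reset,
 *                              my_opaque);
 *     ... mark the desired region dirty in the copy bitmap ...
 *     ret = block_copy(bcs, 0, QEMU_ALIGN_UP(len, cluster_size),
 *                      &error_is_read);
 *     ...
 *     block_copy_state_free(bcs);
 */
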
/*
 * block_copy_do_copy
 *
 * Copy one cluster-aligned chunk. @end is allowed to exceed s->len only to
 * cover the last cluster when s->len is not aligned to clusters.
 *
 * No synchronization here: neither the bitmap nor intersecting requests are
 * handled, only the copy itself.
 *
 * Returns 0 on success.
 */
static int coroutine_fn block_copy_do_copy(BlockCopyState *s,
                                           int64_t start, int64_t end,
                                           bool *error_is_read)
{
    int ret;
    int nbytes = MIN(end, s->len) - start;
    void *bounce_buffer = NULL;

    assert(QEMU_IS_ALIGNED(start, s->cluster_size));
    assert(QEMU_IS_ALIGNED(end, s->cluster_size));
    assert(end < s->len || end == QEMU_ALIGN_UP(s->len, s->cluster_size));

    if (s->use_copy_range) {
        ret = bdrv_co_copy_range(s->source, start, s->target, start, nbytes,
                                 0, s->write_flags);
        if (ret < 0) {
            trace_block_copy_copy_range_fail(s, start, ret);
            s->use_copy_range = false;
            s->copy_size = MAX(s->cluster_size, BLOCK_COPY_MAX_BUFFER);
            /* Fall back to read + write with an allocated buffer */
        } else {
            goto out;
        }
    }

    /*
     * In case of a failed copy_range request above, we may proceed with a
     * buffered request larger than BLOCK_COPY_MAX_BUFFER. Still, further
     * requests will be properly limited, so this single oversized request is
     * tolerable.
     */

    bounce_buffer = qemu_blockalign(s->source->bs, nbytes);

    ret = bdrv_co_pread(s->source, start, nbytes, bounce_buffer, 0);
    if (ret < 0) {
        trace_block_copy_read_fail(s, start, ret);
        if (error_is_read) {
            *error_is_read = true;
        }
        goto out;
    }

    ret = bdrv_co_pwrite(s->target, start, nbytes, bounce_buffer,
                         s->write_flags);
    if (ret < 0) {
        trace_block_copy_write_fail(s, start, ret);
        if (error_is_read) {
            *error_is_read = false;
        }
        goto out;
    }

out:
    qemu_vfree(bounce_buffer);

    return ret;
}

/*
 * Check if the cluster starting at @offset is allocated or not.
 * Return via @pnum the number of contiguous clusters sharing this allocation.
 */
static int block_copy_is_cluster_allocated(BlockCopyState *s, int64_t offset,
                                           int64_t *pnum)
{
    BlockDriverState *bs = s->source->bs;
    int64_t count, total_count = 0;
    int64_t bytes = s->len - offset;
    int ret;

    assert(QEMU_IS_ALIGNED(offset, s->cluster_size));

    while (true) {
        ret = bdrv_is_allocated(bs, offset, bytes, &count);
        if (ret < 0) {
            return ret;
        }

        total_count += count;

        if (ret || count == 0) {
            /*
             * ret: partial segment(s) are considered allocated.
             * otherwise: unallocated tail is treated as an entire segment.
             */
            *pnum = DIV_ROUND_UP(total_count, s->cluster_size);
            return ret;
        }

        /* Unallocated segment(s) with uncertain following segment(s) */
        if (total_count >= s->cluster_size) {
            *pnum = total_count / s->cluster_size;
            return 0;
        }

        offset += count;
        bytes -= count;
    }
}

/*
 * Reset bits in copy_bitmap starting at offset if they represent unallocated
 * data in the image. May reset subsequent contiguous bits.
 * @return 0 when the cluster at @offset was unallocated,
 *         1 otherwise, and a negative errno on error.
 */
int64_t block_copy_reset_unallocated(BlockCopyState *s,
                                     int64_t offset, int64_t *count)
{
    int ret;
    int64_t clusters, bytes;

    ret = block_copy_is_cluster_allocated(s, offset, &clusters);
    if (ret < 0) {
        return ret;
    }

    bytes = clusters * s->cluster_size;

    if (!ret) {
        bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
        s->progress_reset_callback(s->progress_opaque);
    }

    *count = bytes;
    return ret;
}
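
/*
 * A caller that wants to pre-filter unallocated areas could loop over the
 * whole image before copying, roughly like this (hypothetical sketch with a
 * caller-provided image_len, error handling omitted):
 *
 *     for (offset = 0; offset < image_len; offset += count) {
 *         if (block_copy_reset_unallocated(s, offset, &count) < 0) {
 *             break;
 *         }
 *     }
 */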
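/*
 * block_copy
 *
 * Copy the dirty part of [start, start + bytes): wait for intersecting
 * in-flight requests, register our own, then copy cluster-aligned chunks,
 * skipping clean and (if s->skip_unallocated is set) unallocated areas.
 * On failure the corresponding dirty bits are set again so the area can be
 * retried, and *error_is_read reports whether the read or the write side
 * failed.
 */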
int coroutine_fn block_copy(BlockCopyState *s,
                            int64_t start, uint64_t bytes,
                            bool *error_is_read)
{
    int ret = 0;
    int64_t end = bytes + start; /* bytes */
    int64_t status_bytes;
    BlockCopyInFlightReq req;

    /*
     * The block_copy() caller is responsible for keeping source and target in
     * the same aio context.
     */
    assert(bdrv_get_aio_context(s->source->bs) ==
           bdrv_get_aio_context(s->target->bs));

    assert(QEMU_IS_ALIGNED(start, s->cluster_size));
    assert(QEMU_IS_ALIGNED(end, s->cluster_size));

    block_copy_wait_inflight_reqs(s, start, end);
    block_copy_inflight_req_begin(s, &req, start, end);

    while (start < end) {
        int64_t next_zero, chunk_end;

        if (!bdrv_dirty_bitmap_get(s->copy_bitmap, start)) {
            trace_block_copy_skip(s, start);
            start += s->cluster_size;
            continue; /* already copied */
        }

        chunk_end = MIN(end, start + s->copy_size);

        next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, start,
                                                chunk_end - start);
        if (next_zero >= 0) {
            assert(next_zero > start); /* start is dirty */
            assert(next_zero < chunk_end); /* no need to do MIN() */
            chunk_end = next_zero;
        }

        if (s->skip_unallocated) {
            ret = block_copy_reset_unallocated(s, start, &status_bytes);
            if (ret == 0) {
                trace_block_copy_skip_range(s, start, status_bytes);
                start += status_bytes;
                continue;
            }
            /* Clamp to known allocated region */
            chunk_end = MIN(chunk_end, start + status_bytes);
        }

        trace_block_copy_process(s, start);

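        /*
         * Clear the dirty bits for this chunk up front; on failure they are
         * set back below so the area can be retried.
         */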
        bdrv_reset_dirty_bitmap(s->copy_bitmap, start, chunk_end - start);

        co_get_from_shres(s->mem, chunk_end - start);
        ret = block_copy_do_copy(s, start, chunk_end, error_is_read);
        co_put_to_shres(s->mem, chunk_end - start);
        if (ret < 0) {
            bdrv_set_dirty_bitmap(s->copy_bitmap, start, chunk_end - start);
            break;
        }

        s->progress_bytes_callback(chunk_end - start, s->progress_opaque);
        start = chunk_end;
        ret = 0;
    }

    block_copy_inflight_req_end(&req);

    return ret;
}