qemu/block/io.c
   1/*
   2 * Block layer I/O functions
   3 *
   4 * Copyright (c) 2003 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "trace.h"
  27#include "sysemu/block-backend.h"
  28#include "block/aio-wait.h"
  29#include "block/blockjob.h"
  30#include "block/blockjob_int.h"
  31#include "block/block_int.h"
  32#include "qemu/cutils.h"
  33#include "qapi/error.h"
  34#include "qemu/error-report.h"
  35#include "qemu/main-loop.h"
  36#include "sysemu/replay.h"
  37
  38#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
  39
  40/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
  41#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
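/* (With BDRV_SECTOR_BITS == 9, the value above works out to 16 MiB.) */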
  42
  43static void bdrv_parent_cb_resize(BlockDriverState *bs);
  44static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
  45    int64_t offset, int bytes, BdrvRequestFlags flags);
  46
  47static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
  48                                      bool ignore_bds_parents)
  49{
  50    BdrvChild *c, *next;
  51
  52    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
  53        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
  54            continue;
  55        }
  56        bdrv_parent_drained_begin_single(c, false);
  57    }
  58}
  59
  60static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c,
  61                                                   int *drained_end_counter)
  62{
  63    assert(c->parent_quiesce_counter > 0);
  64    c->parent_quiesce_counter--;
  65    if (c->role->drained_end) {
  66        c->role->drained_end(c, drained_end_counter);
  67    }
  68}
  69
  70void bdrv_parent_drained_end_single(BdrvChild *c)
  71{
  72    int drained_end_counter = 0;
  73    bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter);
  74    BDRV_POLL_WHILE(c->bs, atomic_read(&drained_end_counter) > 0);
  75}
  76
  77static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
  78                                    bool ignore_bds_parents,
  79                                    int *drained_end_counter)
  80{
  81    BdrvChild *c;
  82
  83    QLIST_FOREACH(c, &bs->parents, next_parent) {
  84        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
  85            continue;
  86        }
  87        bdrv_parent_drained_end_single_no_poll(c, drained_end_counter);
  88    }
  89}
  90
  91static bool bdrv_parent_drained_poll_single(BdrvChild *c)
  92{
  93    if (c->role->drained_poll) {
  94        return c->role->drained_poll(c);
  95    }
  96    return false;
  97}
  98
  99static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
 100                                     bool ignore_bds_parents)
 101{
 102    BdrvChild *c, *next;
 103    bool busy = false;
 104
 105    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
 106        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
 107            continue;
 108        }
 109        busy |= bdrv_parent_drained_poll_single(c);
 110    }
 111
 112    return busy;
 113}
 114
 115void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
 116{
 117    c->parent_quiesce_counter++;
 118    if (c->role->drained_begin) {
 119        c->role->drained_begin(c);
 120    }
 121    if (poll) {
 122        BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
 123    }
 124}
 125
 126static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
 127{
 128    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
 129    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
 130    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
 131                                 src->opt_mem_alignment);
 132    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
 133                                 src->min_mem_alignment);
 134    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
 135}
 136
 137void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
 138{
 139    BlockDriver *drv = bs->drv;
 140    Error *local_err = NULL;
 141
 142    memset(&bs->bl, 0, sizeof(bs->bl));
 143
 144    if (!drv) {
 145        return;
 146    }
 147
 148    /* Default alignment based on whether driver has byte interface */
 149    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
 150                                drv->bdrv_aio_preadv ||
 151                                drv->bdrv_co_preadv_part) ? 1 : 512;
 152
 153    /* Take some limits from the children as a default */
 154    if (bs->file) {
 155        bdrv_refresh_limits(bs->file->bs, &local_err);
 156        if (local_err) {
 157            error_propagate(errp, local_err);
 158            return;
 159        }
 160        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
 161    } else {
 162        bs->bl.min_mem_alignment = 512;
 163        bs->bl.opt_mem_alignment = qemu_real_host_page_size;
 164
 165        /* Safe default since most protocols use readv()/writev()/etc */
 166        bs->bl.max_iov = IOV_MAX;
 167    }
 168
 169    if (bs->backing) {
 170        bdrv_refresh_limits(bs->backing->bs, &local_err);
 171        if (local_err) {
 172            error_propagate(errp, local_err);
 173            return;
 174        }
 175        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
 176    }
 177
 178    /* Then let the driver override it */
 179    if (drv->bdrv_refresh_limits) {
 180        drv->bdrv_refresh_limits(bs, errp);
 181    }
 182}
 183
 184/**
 185 * The copy-on-read flag is actually a reference count so multiple users may
 186 * use the feature without worrying about clobbering its previous state.
  187 * Copy-on-read stays enabled until all users have disabled it again.
 188 */
 189void bdrv_enable_copy_on_read(BlockDriverState *bs)
 190{
 191    atomic_inc(&bs->copy_on_read);
 192}
 193
 194void bdrv_disable_copy_on_read(BlockDriverState *bs)
 195{
 196    int old = atomic_fetch_dec(&bs->copy_on_read);
 197    assert(old >= 1);
 198}
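/*
 * Illustrative sketch (not part of the original file): because the flag is a
 * reference count, independent users nest cleanly:
 *
 *     bdrv_enable_copy_on_read(bs);     // user A
 *     bdrv_enable_copy_on_read(bs);     // user B
 *     bdrv_disable_copy_on_read(bs);    // A is done, COR stays on for B
 *     bdrv_disable_copy_on_read(bs);    // last user gone, COR is off again
 */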
 199
 200typedef struct {
 201    Coroutine *co;
 202    BlockDriverState *bs;
 203    bool done;
 204    bool begin;
 205    bool recursive;
 206    bool poll;
 207    BdrvChild *parent;
 208    bool ignore_bds_parents;
 209    int *drained_end_counter;
 210} BdrvCoDrainData;
 211
 212static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
 213{
 214    BdrvCoDrainData *data = opaque;
 215    BlockDriverState *bs = data->bs;
 216
 217    if (data->begin) {
 218        bs->drv->bdrv_co_drain_begin(bs);
 219    } else {
 220        bs->drv->bdrv_co_drain_end(bs);
 221    }
 222
 223    /* Set data->done and decrement drained_end_counter before bdrv_wakeup() */
 224    atomic_mb_set(&data->done, true);
 225    if (!data->begin) {
 226        atomic_dec(data->drained_end_counter);
 227    }
 228    bdrv_dec_in_flight(bs);
 229
 230    g_free(data);
 231}
 232
 233/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
 234static void bdrv_drain_invoke(BlockDriverState *bs, bool begin,
 235                              int *drained_end_counter)
 236{
 237    BdrvCoDrainData *data;
 238
 239    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
 240            (!begin && !bs->drv->bdrv_co_drain_end)) {
 241        return;
 242    }
 243
 244    data = g_new(BdrvCoDrainData, 1);
 245    *data = (BdrvCoDrainData) {
 246        .bs = bs,
 247        .done = false,
 248        .begin = begin,
 249        .drained_end_counter = drained_end_counter,
 250    };
 251
 252    if (!begin) {
 253        atomic_inc(drained_end_counter);
 254    }
 255
 256    /* Make sure the driver callback completes during the polling phase for
 257     * drain_begin. */
 258    bdrv_inc_in_flight(bs);
 259    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
 260    aio_co_schedule(bdrv_get_aio_context(bs), data->co);
 261}
 262
 263/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
 264bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
 265                     BdrvChild *ignore_parent, bool ignore_bds_parents)
 266{
 267    BdrvChild *child, *next;
 268
 269    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
 270        return true;
 271    }
 272
 273    if (atomic_read(&bs->in_flight)) {
 274        return true;
 275    }
 276
 277    if (recursive) {
 278        assert(!ignore_bds_parents);
 279        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
 280            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
 281                return true;
 282            }
 283        }
 284    }
 285
 286    return false;
 287}
 288
 289static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
 290                                      BdrvChild *ignore_parent)
 291{
 292    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
 293}
 294
 295static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
 296                                  BdrvChild *parent, bool ignore_bds_parents,
 297                                  bool poll);
 298static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
 299                                BdrvChild *parent, bool ignore_bds_parents,
 300                                int *drained_end_counter);
 301
 302static void bdrv_co_drain_bh_cb(void *opaque)
 303{
 304    BdrvCoDrainData *data = opaque;
 305    Coroutine *co = data->co;
 306    BlockDriverState *bs = data->bs;
 307
 308    if (bs) {
 309        AioContext *ctx = bdrv_get_aio_context(bs);
 310        AioContext *co_ctx = qemu_coroutine_get_aio_context(co);
 311
 312        /*
 313         * When the coroutine yielded, the lock for its home context was
 314         * released, so we need to re-acquire it here. If it explicitly
 315         * acquired a different context, the lock is still held and we don't
 316         * want to lock it a second time (or AIO_WAIT_WHILE() would hang).
 317         */
 318        if (ctx == co_ctx) {
 319            aio_context_acquire(ctx);
 320        }
 321        bdrv_dec_in_flight(bs);
 322        if (data->begin) {
 323            assert(!data->drained_end_counter);
 324            bdrv_do_drained_begin(bs, data->recursive, data->parent,
 325                                  data->ignore_bds_parents, data->poll);
 326        } else {
 327            assert(!data->poll);
 328            bdrv_do_drained_end(bs, data->recursive, data->parent,
 329                                data->ignore_bds_parents,
 330                                data->drained_end_counter);
 331        }
 332        if (ctx == co_ctx) {
 333            aio_context_release(ctx);
 334        }
 335    } else {
 336        assert(data->begin);
 337        bdrv_drain_all_begin();
 338    }
 339
 340    data->done = true;
 341    aio_co_wake(co);
 342}
 343
 344static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
 345                                                bool begin, bool recursive,
 346                                                BdrvChild *parent,
 347                                                bool ignore_bds_parents,
 348                                                bool poll,
 349                                                int *drained_end_counter)
 350{
 351    BdrvCoDrainData data;
 352
 353    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
 354     * other coroutines run if they were queued by aio_co_enter(). */
 355
 356    assert(qemu_in_coroutine());
 357    data = (BdrvCoDrainData) {
 358        .co = qemu_coroutine_self(),
 359        .bs = bs,
 360        .done = false,
 361        .begin = begin,
 362        .recursive = recursive,
 363        .parent = parent,
 364        .ignore_bds_parents = ignore_bds_parents,
 365        .poll = poll,
 366        .drained_end_counter = drained_end_counter,
 367    };
 368
 369    if (bs) {
 370        bdrv_inc_in_flight(bs);
 371    }
 372    replay_bh_schedule_oneshot_event(bdrv_get_aio_context(bs),
 373                                     bdrv_co_drain_bh_cb, &data);
 374
 375    qemu_coroutine_yield();
 376    /* If we are resumed from some other event (such as an aio completion or a
 377     * timer callback), it is a bug in the caller that should be fixed. */
 378    assert(data.done);
 379}
 380
 381void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
 382                                   BdrvChild *parent, bool ignore_bds_parents)
 383{
 384    assert(!qemu_in_coroutine());
 385
 386    /* Stop things in parent-to-child order */
 387    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
 388        aio_disable_external(bdrv_get_aio_context(bs));
 389    }
 390
 391    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
 392    bdrv_drain_invoke(bs, true, NULL);
 393}
 394
 395static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
 396                                  BdrvChild *parent, bool ignore_bds_parents,
 397                                  bool poll)
 398{
 399    BdrvChild *child, *next;
 400
 401    if (qemu_in_coroutine()) {
 402        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
 403                               poll, NULL);
 404        return;
 405    }
 406
 407    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);
 408
 409    if (recursive) {
 410        assert(!ignore_bds_parents);
 411        bs->recursive_quiesce_counter++;
 412        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
 413            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
 414                                  false);
 415        }
 416    }
 417
 418    /*
 419     * Wait for drained requests to finish.
 420     *
 421     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
 422     * call is needed so things in this AioContext can make progress even
 423     * though we don't return to the main AioContext loop - this automatically
 424     * includes other nodes in the same AioContext and therefore all child
 425     * nodes.
 426     */
 427    if (poll) {
 428        assert(!ignore_bds_parents);
 429        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
 430    }
 431}
 432
 433void bdrv_drained_begin(BlockDriverState *bs)
 434{
 435    bdrv_do_drained_begin(bs, false, NULL, false, true);
 436}
 437
 438void bdrv_subtree_drained_begin(BlockDriverState *bs)
 439{
 440    bdrv_do_drained_begin(bs, true, NULL, false, true);
 441}
 442
 443/**
 444 * This function does not poll, nor must any of its recursively called
 445 * functions.  The *drained_end_counter pointee will be incremented
 446 * once for every background operation scheduled, and decremented once
 447 * the operation settles.  Therefore, the pointer must remain valid
 448 * until the pointee reaches 0.  That implies that whoever sets up the
 449 * pointee has to poll until it is 0.
 450 *
 451 * We use atomic operations to access *drained_end_counter, because
 452 * (1) when called from bdrv_set_aio_context_ignore(), the subgraph of
 453 *     @bs may contain nodes in different AioContexts,
 454 * (2) bdrv_drain_all_end() uses the same counter for all nodes,
 455 *     regardless of which AioContext they are in.
 456 */
 457static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
 458                                BdrvChild *parent, bool ignore_bds_parents,
 459                                int *drained_end_counter)
 460{
 461    BdrvChild *child;
 462    int old_quiesce_counter;
 463
 464    assert(drained_end_counter != NULL);
 465
 466    if (qemu_in_coroutine()) {
 467        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
 468                               false, drained_end_counter);
 469        return;
 470    }
 471    assert(bs->quiesce_counter > 0);
 472
 473    /* Re-enable things in child-to-parent order */
 474    bdrv_drain_invoke(bs, false, drained_end_counter);
 475    bdrv_parent_drained_end(bs, parent, ignore_bds_parents,
 476                            drained_end_counter);
 477
 478    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
 479    if (old_quiesce_counter == 1) {
 480        aio_enable_external(bdrv_get_aio_context(bs));
 481    }
 482
 483    if (recursive) {
 484        assert(!ignore_bds_parents);
 485        bs->recursive_quiesce_counter--;
 486        QLIST_FOREACH(child, &bs->children, next) {
 487            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents,
 488                                drained_end_counter);
 489        }
 490    }
 491}
 492
 493void bdrv_drained_end(BlockDriverState *bs)
 494{
 495    int drained_end_counter = 0;
 496    bdrv_do_drained_end(bs, false, NULL, false, &drained_end_counter);
 497    BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0);
 498}
 499
 500void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter)
 501{
 502    bdrv_do_drained_end(bs, false, NULL, false, drained_end_counter);
 503}
 504
 505void bdrv_subtree_drained_end(BlockDriverState *bs)
 506{
 507    int drained_end_counter = 0;
 508    bdrv_do_drained_end(bs, true, NULL, false, &drained_end_counter);
 509    BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0);
 510}
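/*
 * Note added for clarity: bdrv_drained_end() and bdrv_subtree_drained_end()
 * above follow the "whoever sets up the pointee has to poll" rule described
 * before bdrv_do_drained_end(): each owns a local counter, passes it down and
 * uses BDRV_POLL_WHILE() until it reaches 0, while bdrv_drained_end_no_poll()
 * leaves that polling to its caller.
 */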
 511
 512void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
 513{
 514    int i;
 515
 516    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
 517        bdrv_do_drained_begin(child->bs, true, child, false, true);
 518    }
 519}
 520
 521void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
 522{
 523    int drained_end_counter = 0;
 524    int i;
 525
 526    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
 527        bdrv_do_drained_end(child->bs, true, child, false,
 528                            &drained_end_counter);
 529    }
 530
 531    BDRV_POLL_WHILE(child->bs, atomic_read(&drained_end_counter) > 0);
 532}
 533
 534/*
 535 * Wait for pending requests to complete on a single BlockDriverState subtree,
  536 * and suspend the block driver's internal I/O until the next request arrives.
 537 *
 538 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 539 * AioContext.
 540 */
 541void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
 542{
 543    assert(qemu_in_coroutine());
 544    bdrv_drained_begin(bs);
 545    bdrv_drained_end(bs);
 546}
 547
 548void bdrv_drain(BlockDriverState *bs)
 549{
 550    bdrv_drained_begin(bs);
 551    bdrv_drained_end(bs);
 552}
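/*
 * Illustrative usage sketch (not part of the original file): callers that need
 * a quiescent node bracket their work with the begin/end pair, e.g.
 *
 *     bdrv_drained_begin(bs);
 *     ... no new requests are accepted and in-flight ones have completed ...
 *     bdrv_drained_end(bs);
 *
 * bdrv_drain() above is exactly this pair with nothing in between.
 */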
 553
 554static void bdrv_drain_assert_idle(BlockDriverState *bs)
 555{
 556    BdrvChild *child, *next;
 557
 558    assert(atomic_read(&bs->in_flight) == 0);
 559    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
 560        bdrv_drain_assert_idle(child->bs);
 561    }
 562}
 563
 564unsigned int bdrv_drain_all_count = 0;
 565
 566static bool bdrv_drain_all_poll(void)
 567{
 568    BlockDriverState *bs = NULL;
 569    bool result = false;
 570
 571    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
 572     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
 573    while ((bs = bdrv_next_all_states(bs))) {
 574        AioContext *aio_context = bdrv_get_aio_context(bs);
 575        aio_context_acquire(aio_context);
 576        result |= bdrv_drain_poll(bs, false, NULL, true);
 577        aio_context_release(aio_context);
 578    }
 579
 580    return result;
 581}
 582
 583/*
 584 * Wait for pending requests to complete across all BlockDriverStates
 585 *
 586 * This function does not flush data to disk, use bdrv_flush_all() for that
 587 * after calling this function.
 588 *
 589 * This pauses all block jobs and disables external clients. It must
 590 * be paired with bdrv_drain_all_end().
 591 *
 592 * NOTE: no new block jobs or BlockDriverStates can be created between
 593 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 594 */
 595void bdrv_drain_all_begin(void)
 596{
 597    BlockDriverState *bs = NULL;
 598
 599    if (qemu_in_coroutine()) {
 600        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true, NULL);
 601        return;
 602    }
 603
 604    /*
  605     * The bdrv queue is managed by record/replay;
  606     * waiting for the in-flight I/O requests to finish
  607     * could block forever.
 608     */
 609    if (replay_events_enabled()) {
 610        return;
 611    }
 612
 613    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
 614     * loop AioContext, so make sure we're in the main context. */
 615    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 616    assert(bdrv_drain_all_count < INT_MAX);
 617    bdrv_drain_all_count++;
 618
 619    /* Quiesce all nodes, without polling in-flight requests yet. The graph
 620     * cannot change during this loop. */
 621    while ((bs = bdrv_next_all_states(bs))) {
 622        AioContext *aio_context = bdrv_get_aio_context(bs);
 623
 624        aio_context_acquire(aio_context);
 625        bdrv_do_drained_begin(bs, false, NULL, true, false);
 626        aio_context_release(aio_context);
 627    }
 628
 629    /* Now poll the in-flight requests */
 630    AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());
 631
 632    while ((bs = bdrv_next_all_states(bs))) {
 633        bdrv_drain_assert_idle(bs);
 634    }
 635}
 636
 637void bdrv_drain_all_end(void)
 638{
 639    BlockDriverState *bs = NULL;
 640    int drained_end_counter = 0;
 641
 642    /*
  643     * The bdrv queue is managed by record/replay;
  644     * waiting for the in-flight I/O requests to finish
  645     * could block forever.
 646     */
 647    if (replay_events_enabled()) {
 648        return;
 649    }
 650
 651    while ((bs = bdrv_next_all_states(bs))) {
 652        AioContext *aio_context = bdrv_get_aio_context(bs);
 653
 654        aio_context_acquire(aio_context);
 655        bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
 656        aio_context_release(aio_context);
 657    }
 658
 659    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 660    AIO_WAIT_WHILE(NULL, atomic_read(&drained_end_counter) > 0);
 661
 662    assert(bdrv_drain_all_count > 0);
 663    bdrv_drain_all_count--;
 664}
 665
 666void bdrv_drain_all(void)
 667{
 668    bdrv_drain_all_begin();
 669    bdrv_drain_all_end();
 670}
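/*
 * Illustrative sketch (not part of the original file), following the comment
 * above bdrv_drain_all_begin():
 *
 *     bdrv_drain_all_begin();
 *     bdrv_flush_all();              // only if data must also reach the disk
 *     ... safely reconfigure block jobs or the graph ...
 *     bdrv_drain_all_end();
 */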
 671
 672/**
 673 * Remove an active request from the tracked requests list
 674 *
 675 * This function should be called when a tracked request is completing.
 676 */
 677static void tracked_request_end(BdrvTrackedRequest *req)
 678{
 679    if (req->serialising) {
 680        atomic_dec(&req->bs->serialising_in_flight);
 681    }
 682
 683    qemu_co_mutex_lock(&req->bs->reqs_lock);
 684    QLIST_REMOVE(req, list);
 685    qemu_co_queue_restart_all(&req->wait_queue);
 686    qemu_co_mutex_unlock(&req->bs->reqs_lock);
 687}
 688
 689/**
 690 * Add an active request to the tracked requests list
 691 */
 692static void tracked_request_begin(BdrvTrackedRequest *req,
 693                                  BlockDriverState *bs,
 694                                  int64_t offset,
 695                                  uint64_t bytes,
 696                                  enum BdrvTrackedRequestType type)
 697{
 698    assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes);
 699
 700    *req = (BdrvTrackedRequest){
 701        .bs = bs,
 702        .offset         = offset,
 703        .bytes          = bytes,
 704        .type           = type,
 705        .co             = qemu_coroutine_self(),
 706        .serialising    = false,
 707        .overlap_offset = offset,
 708        .overlap_bytes  = bytes,
 709    };
 710
 711    qemu_co_queue_init(&req->wait_queue);
 712
 713    qemu_co_mutex_lock(&bs->reqs_lock);
 714    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
 715    qemu_co_mutex_unlock(&bs->reqs_lock);
 716}
 717
 718void bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
 719{
 720    int64_t overlap_offset = req->offset & ~(align - 1);
 721    uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
 722                               - overlap_offset;
 723
 724    if (!req->serialising) {
 725        atomic_inc(&req->bs->serialising_in_flight);
 726        req->serialising = true;
 727    }
 728
 729    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
 730    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
 731}
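/*
 * Worked example (illustrative): with align = 4096, a request at offset = 5000
 * with bytes = 1000 gets
 *
 *     overlap_offset = 5000 & ~4095                        = 4096
 *     overlap_bytes  = ROUND_UP(5000 + 1000, 4096) - 4096  = 4096
 *
 * i.e. the serialising window grows to the enclosing aligned region (the
 * MIN/MAX against the previous values leaves the same result here).
 */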
 732
 733static bool is_request_serialising_and_aligned(BdrvTrackedRequest *req)
 734{
 735    /*
 736     * If the request is serialising, overlap_offset and overlap_bytes are set,
 737     * so we can check if the request is aligned. Otherwise, don't care and
 738     * return false.
 739     */
 740
 741    return req->serialising && (req->offset == req->overlap_offset) &&
 742           (req->bytes == req->overlap_bytes);
 743}
 744
 745/**
 746 * Return the tracked request on @bs for the current coroutine, or
 747 * NULL if there is none.
 748 */
 749BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
 750{
 751    BdrvTrackedRequest *req;
 752    Coroutine *self = qemu_coroutine_self();
 753
 754    QLIST_FOREACH(req, &bs->tracked_requests, list) {
 755        if (req->co == self) {
 756            return req;
 757        }
 758    }
 759
 760    return NULL;
 761}
 762
 763/**
 764 * Round a region to cluster boundaries
 765 */
 766void bdrv_round_to_clusters(BlockDriverState *bs,
 767                            int64_t offset, int64_t bytes,
 768                            int64_t *cluster_offset,
 769                            int64_t *cluster_bytes)
 770{
 771    BlockDriverInfo bdi;
 772
 773    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
 774        *cluster_offset = offset;
 775        *cluster_bytes = bytes;
 776    } else {
 777        int64_t c = bdi.cluster_size;
 778        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
 779        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
 780    }
 781}
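/*
 * Worked example (illustrative): with a 64 KiB cluster size, offset = 70000
 * and bytes = 1000 become cluster_offset = 65536 and
 * cluster_bytes = QEMU_ALIGN_UP(70000 - 65536 + 1000, 65536) = 65536.
 */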
 782
 783static int bdrv_get_cluster_size(BlockDriverState *bs)
 784{
 785    BlockDriverInfo bdi;
 786    int ret;
 787
 788    ret = bdrv_get_info(bs, &bdi);
 789    if (ret < 0 || bdi.cluster_size == 0) {
 790        return bs->bl.request_alignment;
 791    } else {
 792        return bdi.cluster_size;
 793    }
 794}
 795
 796static bool tracked_request_overlaps(BdrvTrackedRequest *req,
 797                                     int64_t offset, uint64_t bytes)
 798{
 799    /*        aaaa   bbbb */
 800    if (offset >= req->overlap_offset + req->overlap_bytes) {
 801        return false;
 802    }
 803    /* bbbb   aaaa        */
 804    if (req->overlap_offset >= offset + bytes) {
 805        return false;
 806    }
 807    return true;
 808}
 809
 810void bdrv_inc_in_flight(BlockDriverState *bs)
 811{
 812    atomic_inc(&bs->in_flight);
 813}
 814
 815void bdrv_wakeup(BlockDriverState *bs)
 816{
 817    aio_wait_kick();
 818}
 819
 820void bdrv_dec_in_flight(BlockDriverState *bs)
 821{
 822    atomic_dec(&bs->in_flight);
 823    bdrv_wakeup(bs);
 824}
 825
 826bool coroutine_fn bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
 827{
 828    BlockDriverState *bs = self->bs;
 829    BdrvTrackedRequest *req;
 830    bool retry;
 831    bool waited = false;
 832
 833    if (!atomic_read(&bs->serialising_in_flight)) {
 834        return false;
 835    }
 836
 837    do {
 838        retry = false;
 839        qemu_co_mutex_lock(&bs->reqs_lock);
 840        QLIST_FOREACH(req, &bs->tracked_requests, list) {
 841            if (req == self || (!req->serialising && !self->serialising)) {
 842                continue;
 843            }
 844            if (tracked_request_overlaps(req, self->overlap_offset,
 845                                         self->overlap_bytes))
 846            {
 847                /* Hitting this means there was a reentrant request, for
 848                 * example, a block driver issuing nested requests.  This must
 849                 * never happen since it means deadlock.
 850                 */
 851                assert(qemu_coroutine_self() != req->co);
 852
 853                /* If the request is already (indirectly) waiting for us, or
 854                 * will wait for us as soon as it wakes up, then just go on
 855                 * (instead of producing a deadlock in the former case). */
 856                if (!req->waiting_for) {
 857                    self->waiting_for = req;
 858                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
 859                    self->waiting_for = NULL;
 860                    retry = true;
 861                    waited = true;
 862                    break;
 863                }
 864            }
 865        }
 866        qemu_co_mutex_unlock(&bs->reqs_lock);
 867    } while (retry);
 868
 869    return waited;
 870}
 871
 872static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
 873                                   size_t size)
 874{
 875    if (size > BDRV_REQUEST_MAX_BYTES) {
 876        return -EIO;
 877    }
 878
 879    if (!bdrv_is_inserted(bs)) {
 880        return -ENOMEDIUM;
 881    }
 882
 883    if (offset < 0) {
 884        return -EIO;
 885    }
 886
 887    return 0;
 888}
 889
 890typedef struct RwCo {
 891    BdrvChild *child;
 892    int64_t offset;
 893    QEMUIOVector *qiov;
 894    bool is_write;
 895    int ret;
 896    BdrvRequestFlags flags;
 897} RwCo;
 898
 899static void coroutine_fn bdrv_rw_co_entry(void *opaque)
 900{
 901    RwCo *rwco = opaque;
 902
 903    if (!rwco->is_write) {
 904        rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
 905                                   rwco->qiov->size, rwco->qiov,
 906                                   rwco->flags);
 907    } else {
 908        rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
 909                                    rwco->qiov->size, rwco->qiov,
 910                                    rwco->flags);
 911    }
 912    aio_wait_kick();
 913}
 914
 915/*
 916 * Process a vectored synchronous request using coroutines
 917 */
 918static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
 919                        QEMUIOVector *qiov, bool is_write,
 920                        BdrvRequestFlags flags)
 921{
 922    Coroutine *co;
 923    RwCo rwco = {
 924        .child = child,
 925        .offset = offset,
 926        .qiov = qiov,
 927        .is_write = is_write,
 928        .ret = NOT_DONE,
 929        .flags = flags,
 930    };
 931
 932    if (qemu_in_coroutine()) {
 933        /* Fast-path if already in coroutine context */
 934        bdrv_rw_co_entry(&rwco);
 935    } else {
 936        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
 937        bdrv_coroutine_enter(child->bs, co);
 938        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
 939    }
 940    return rwco.ret;
 941}
 942
 943int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
 944                       int bytes, BdrvRequestFlags flags)
 945{
 946    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);
 947
 948    return bdrv_prwv_co(child, offset, &qiov, true,
 949                        BDRV_REQ_ZERO_WRITE | flags);
 950}
 951
 952/*
 953 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 954 * The operation is sped up by checking the block status and only writing
  955 * zeroes to regions that do not already read back as zeroes. Optional
 956 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 957 * BDRV_REQ_FUA).
 958 *
  959 * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite().
 960 */
 961int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
 962{
 963    int ret;
 964    int64_t target_size, bytes, offset = 0;
 965    BlockDriverState *bs = child->bs;
 966
 967    target_size = bdrv_getlength(bs);
 968    if (target_size < 0) {
 969        return target_size;
 970    }
 971
 972    for (;;) {
 973        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
 974        if (bytes <= 0) {
 975            return 0;
 976        }
 977        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
 978        if (ret < 0) {
 979            return ret;
 980        }
 981        if (ret & BDRV_BLOCK_ZERO) {
 982            offset += bytes;
 983            continue;
 984        }
 985        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
 986        if (ret < 0) {
 987            return ret;
 988        }
 989        offset += bytes;
 990    }
 991}
 992
 993int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
 994{
 995    int ret;
 996
 997    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
 998    if (ret < 0) {
 999        return ret;
1000    }
1001
1002    return qiov->size;
1003}
1004
1005/* See bdrv_pwrite() for the return codes */
1006int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
1007{
1008    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
1009
1010    if (bytes < 0) {
1011        return -EINVAL;
1012    }
1013
1014    return bdrv_preadv(child, offset, &qiov);
1015}
1016
1017int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
1018{
1019    int ret;
1020
1021    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
1022    if (ret < 0) {
1023        return ret;
1024    }
1025
1026    return qiov->size;
1027}
1028
1029/* Return no. of bytes on success or < 0 on error. Important errors are:
1030  -EIO         generic I/O error (may happen for all errors)
1031  -ENOMEDIUM   No media inserted.
1032  -EINVAL      Invalid offset or number of bytes
 1033  -EACCES      Trying to write to a read-only device
1034*/
1035int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
1036{
1037    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
1038
1039    if (bytes < 0) {
1040        return -EINVAL;
1041    }
1042
1043    return bdrv_pwritev(child, offset, &qiov);
1044}
1045
1046/*
1047 * Writes to the file and ensures that no writes are reordered across this
1048 * request (acts as a barrier)
1049 *
1050 * Returns 0 on success, -errno in error cases.
1051 */
1052int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
1053                     const void *buf, int count)
1054{
1055    int ret;
1056
1057    ret = bdrv_pwrite(child, offset, buf, count);
1058    if (ret < 0) {
1059        return ret;
1060    }
1061
1062    ret = bdrv_flush(child->bs);
1063    if (ret < 0) {
1064        return ret;
1065    }
1066
1067    return 0;
1068}
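/*
 * Illustrative sketch (not part of the original file; hdr_off and hdr are
 * hypothetical names): a metadata update that must not be reordered with later
 * writes can use the flushing variant:
 *
 *     ret = bdrv_pwrite_sync(child, hdr_off, &hdr, sizeof(hdr));
 *     if (ret < 0) {
 *         return ret;
 *     }
 *     ... from here on, the header and all earlier writes are stable on disk ...
 */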
1069
1070typedef struct CoroutineIOCompletion {
1071    Coroutine *coroutine;
1072    int ret;
1073} CoroutineIOCompletion;
1074
1075static void bdrv_co_io_em_complete(void *opaque, int ret)
1076{
1077    CoroutineIOCompletion *co = opaque;
1078
1079    co->ret = ret;
1080    aio_co_wake(co->coroutine);
1081}
1082
1083static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
1084                                           uint64_t offset, uint64_t bytes,
1085                                           QEMUIOVector *qiov,
1086                                           size_t qiov_offset, int flags)
1087{
1088    BlockDriver *drv = bs->drv;
1089    int64_t sector_num;
1090    unsigned int nb_sectors;
1091    QEMUIOVector local_qiov;
1092    int ret;
1093
1094    assert(!(flags & ~BDRV_REQ_MASK));
1095    assert(!(flags & BDRV_REQ_NO_FALLBACK));
1096
1097    if (!drv) {
1098        return -ENOMEDIUM;
1099    }
1100
1101    if (drv->bdrv_co_preadv_part) {
1102        return drv->bdrv_co_preadv_part(bs, offset, bytes, qiov, qiov_offset,
1103                                        flags);
1104    }
1105
1106    if (qiov_offset > 0 || bytes != qiov->size) {
1107        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
1108        qiov = &local_qiov;
1109    }
1110
1111    if (drv->bdrv_co_preadv) {
1112        ret = drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
1113        goto out;
1114    }
1115
1116    if (drv->bdrv_aio_preadv) {
1117        BlockAIOCB *acb;
1118        CoroutineIOCompletion co = {
1119            .coroutine = qemu_coroutine_self(),
1120        };
1121
1122        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
1123                                   bdrv_co_io_em_complete, &co);
1124        if (acb == NULL) {
1125            ret = -EIO;
1126            goto out;
1127        } else {
1128            qemu_coroutine_yield();
1129            ret = co.ret;
1130            goto out;
1131        }
1132    }
1133
1134    sector_num = offset >> BDRV_SECTOR_BITS;
1135    nb_sectors = bytes >> BDRV_SECTOR_BITS;
1136
1137    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
1138    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
1139    assert(bytes <= BDRV_REQUEST_MAX_BYTES);
1140    assert(drv->bdrv_co_readv);
1141
1142    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1143
1144out:
1145    if (qiov == &local_qiov) {
1146        qemu_iovec_destroy(&local_qiov);
1147    }
1148
1149    return ret;
1150}
1151
1152static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
1153                                            uint64_t offset, uint64_t bytes,
1154                                            QEMUIOVector *qiov,
1155                                            size_t qiov_offset, int flags)
1156{
1157    BlockDriver *drv = bs->drv;
1158    int64_t sector_num;
1159    unsigned int nb_sectors;
1160    QEMUIOVector local_qiov;
1161    int ret;
1162
1163    assert(!(flags & ~BDRV_REQ_MASK));
1164    assert(!(flags & BDRV_REQ_NO_FALLBACK));
1165
1166    if (!drv) {
1167        return -ENOMEDIUM;
1168    }
1169
1170    if (drv->bdrv_co_pwritev_part) {
1171        ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset,
1172                                        flags & bs->supported_write_flags);
1173        flags &= ~bs->supported_write_flags;
1174        goto emulate_flags;
1175    }
1176
1177    if (qiov_offset > 0 || bytes != qiov->size) {
1178        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
1179        qiov = &local_qiov;
1180    }
1181
1182    if (drv->bdrv_co_pwritev) {
1183        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
1184                                   flags & bs->supported_write_flags);
1185        flags &= ~bs->supported_write_flags;
1186        goto emulate_flags;
1187    }
1188
1189    if (drv->bdrv_aio_pwritev) {
1190        BlockAIOCB *acb;
1191        CoroutineIOCompletion co = {
1192            .coroutine = qemu_coroutine_self(),
1193        };
1194
1195        acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
1196                                    flags & bs->supported_write_flags,
1197                                    bdrv_co_io_em_complete, &co);
1198        flags &= ~bs->supported_write_flags;
1199        if (acb == NULL) {
1200            ret = -EIO;
1201        } else {
1202            qemu_coroutine_yield();
1203            ret = co.ret;
1204        }
1205        goto emulate_flags;
1206    }
1207
1208    sector_num = offset >> BDRV_SECTOR_BITS;
1209    nb_sectors = bytes >> BDRV_SECTOR_BITS;
1210
1211    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
1212    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
1213    assert(bytes <= BDRV_REQUEST_MAX_BYTES);
1214
1215    assert(drv->bdrv_co_writev);
1216    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
1217                              flags & bs->supported_write_flags);
1218    flags &= ~bs->supported_write_flags;
1219
1220emulate_flags:
1221    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
1222        ret = bdrv_co_flush(bs);
1223    }
1224
1225    if (qiov == &local_qiov) {
1226        qemu_iovec_destroy(&local_qiov);
1227    }
1228
1229    return ret;
1230}
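/*
 * Note added for clarity: in every branch above, the flags the driver supports
 * are passed down and cleared from @flags; whatever remains (in practice only
 * BDRV_REQ_FUA) is emulated at emulate_flags with an explicit flush.
 */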
1231
1232static int coroutine_fn
1233bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
1234                               uint64_t bytes, QEMUIOVector *qiov,
1235                               size_t qiov_offset)
1236{
1237    BlockDriver *drv = bs->drv;
1238    QEMUIOVector local_qiov;
1239    int ret;
1240
1241    if (!drv) {
1242        return -ENOMEDIUM;
1243    }
1244
1245    if (!block_driver_can_compress(drv)) {
1246        return -ENOTSUP;
1247    }
1248
1249    if (drv->bdrv_co_pwritev_compressed_part) {
1250        return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes,
1251                                                    qiov, qiov_offset);
1252    }
1253
1254    if (qiov_offset == 0) {
1255        return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
1256    }
1257
1258    qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
1259    ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov);
1260    qemu_iovec_destroy(&local_qiov);
1261
1262    return ret;
1263}
1264
1265static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
1266        int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1267        size_t qiov_offset, int flags)
1268{
1269    BlockDriverState *bs = child->bs;
1270
1271    /* Perform I/O through a temporary buffer so that users who scribble over
1272     * their read buffer while the operation is in progress do not end up
1273     * modifying the image file.  This is critical for zero-copy guest I/O
1274     * where anything might happen inside guest memory.
1275     */
1276    void *bounce_buffer = NULL;
1277
1278    BlockDriver *drv = bs->drv;
1279    int64_t cluster_offset;
1280    int64_t cluster_bytes;
1281    size_t skip_bytes;
1282    int ret;
1283    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
1284                                    BDRV_REQUEST_MAX_BYTES);
1285    unsigned int progress = 0;
1286    bool skip_write;
1287
1288    if (!drv) {
1289        return -ENOMEDIUM;
1290    }
1291
1292    /*
1293     * Do not write anything when the BDS is inactive.  That is not
1294     * allowed, and it would not help.
1295     */
1296    skip_write = (bs->open_flags & BDRV_O_INACTIVE);
1297
1298    /* FIXME We cannot require callers to have write permissions when all they
1299     * are doing is a read request. If we did things right, write permissions
1300     * would be obtained anyway, but internally by the copy-on-read code. As
1301     * long as it is implemented here rather than in a separate filter driver,
1302     * the copy-on-read code doesn't have its own BdrvChild, however, for which
1303     * it could request permissions. Therefore we have to bypass the permission
1304     * system for the moment. */
1305    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
1306
 1307    /* Cover the entire cluster so no additional backing file I/O is required when
 1308     * allocating a cluster in the image file.  Note that this value may exceed
1309     * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
1310     * is one reason we loop rather than doing it all at once.
1311     */
1312    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
1313    skip_bytes = offset - cluster_offset;
1314
1315    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
1316                                   cluster_offset, cluster_bytes);
1317
1318    while (cluster_bytes) {
1319        int64_t pnum;
1320
1321        if (skip_write) {
1322            ret = 1; /* "already allocated", so nothing will be copied */
1323            pnum = MIN(cluster_bytes, max_transfer);
1324        } else {
1325            ret = bdrv_is_allocated(bs, cluster_offset,
1326                                    MIN(cluster_bytes, max_transfer), &pnum);
1327            if (ret < 0) {
1328                /*
1329                 * Safe to treat errors in querying allocation as if
1330                 * unallocated; we'll probably fail again soon on the
1331                 * read, but at least that will set a decent errno.
1332                 */
1333                pnum = MIN(cluster_bytes, max_transfer);
1334            }
1335
1336            /* Stop at EOF if the image ends in the middle of the cluster */
1337            if (ret == 0 && pnum == 0) {
1338                assert(progress >= bytes);
1339                break;
1340            }
1341
1342            assert(skip_bytes < pnum);
1343        }
1344
1345        if (ret <= 0) {
1346            QEMUIOVector local_qiov;
1347
1348            /* Must copy-on-read; use the bounce buffer */
1349            pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
1350            if (!bounce_buffer) {
1351                int64_t max_we_need = MAX(pnum, cluster_bytes - pnum);
1352                int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER);
1353                int64_t bounce_buffer_len = MIN(max_we_need, max_allowed);
1354
1355                bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len);
1356                if (!bounce_buffer) {
1357                    ret = -ENOMEM;
1358                    goto err;
1359                }
1360            }
1361            qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);
1362
1363            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
1364                                     &local_qiov, 0, 0);
1365            if (ret < 0) {
1366                goto err;
1367            }
1368
1369            bdrv_debug_event(bs, BLKDBG_COR_WRITE);
1370            if (drv->bdrv_co_pwrite_zeroes &&
1371                buffer_is_zero(bounce_buffer, pnum)) {
1372                /* FIXME: Should we (perhaps conditionally) be setting
1373                 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
1374                 * that still correctly reads as zero? */
1375                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
1376                                               BDRV_REQ_WRITE_UNCHANGED);
1377            } else {
1378                /* This does not change the data on the disk, it is not
1379                 * necessary to flush even in cache=writethrough mode.
1380                 */
1381                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
1382                                          &local_qiov, 0,
1383                                          BDRV_REQ_WRITE_UNCHANGED);
1384            }
1385
1386            if (ret < 0) {
1387                /* It might be okay to ignore write errors for guest
1388                 * requests.  If this is a deliberate copy-on-read
1389                 * then we don't want to ignore the error.  Simply
1390                 * report it in all cases.
1391                 */
1392                goto err;
1393            }
1394
1395            if (!(flags & BDRV_REQ_PREFETCH)) {
1396                qemu_iovec_from_buf(qiov, qiov_offset + progress,
1397                                    bounce_buffer + skip_bytes,
1398                                    MIN(pnum - skip_bytes, bytes - progress));
1399            }
1400        } else if (!(flags & BDRV_REQ_PREFETCH)) {
1401            /* Read directly into the destination */
1402            ret = bdrv_driver_preadv(bs, offset + progress,
1403                                     MIN(pnum - skip_bytes, bytes - progress),
1404                                     qiov, qiov_offset + progress, 0);
1405            if (ret < 0) {
1406                goto err;
1407            }
1408        }
1409
1410        cluster_offset += pnum;
1411        cluster_bytes -= pnum;
1412        progress += pnum - skip_bytes;
1413        skip_bytes = 0;
1414    }
1415    ret = 0;
1416
1417err:
1418    qemu_vfree(bounce_buffer);
1419    return ret;
1420}
1421
1422/*
1423 * Forwards an already correctly aligned request to the BlockDriver. This
1424 * handles copy on read, zeroing after EOF, and fragmentation of large
1425 * reads; any other features must be implemented by the caller.
1426 */
1427static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
1428    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1429    int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags)
1430{
1431    BlockDriverState *bs = child->bs;
1432    int64_t total_bytes, max_bytes;
1433    int ret = 0;
1434    uint64_t bytes_remaining = bytes;
1435    int max_transfer;
1436
1437    assert(is_power_of_2(align));
1438    assert((offset & (align - 1)) == 0);
1439    assert((bytes & (align - 1)) == 0);
1440    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1441    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1442                                   align);
1443
1444    /* TODO: We would need a per-BDS .supported_read_flags and
1445     * potential fallback support, if we ever implement any read flags
1446     * to pass through to drivers.  For now, there aren't any
1447     * passthrough flags.  */
1448    assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ |
1449                       BDRV_REQ_PREFETCH)));
1450
1451    /* Handle Copy on Read and associated serialisation */
1452    if (flags & BDRV_REQ_COPY_ON_READ) {
1453        /* If we touch the same cluster it counts as an overlap.  This
1454         * guarantees that allocating writes will be serialized and not race
1455         * with each other for the same cluster.  For example, in copy-on-read
1456         * it ensures that the CoR read and write operations are atomic and
1457         * guest writes cannot interleave between them. */
1458        bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
1459    }
1460
1461    /* BDRV_REQ_SERIALISING is only for write operation */
1462    assert(!(flags & BDRV_REQ_SERIALISING));
1463
1464    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
1465        bdrv_wait_serialising_requests(req);
1466    }
1467
1468    if (flags & BDRV_REQ_COPY_ON_READ) {
1469        int64_t pnum;
1470
1471        ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
1472        if (ret < 0) {
1473            goto out;
1474        }
1475
1476        if (!ret || pnum != bytes) {
1477            ret = bdrv_co_do_copy_on_readv(child, offset, bytes,
1478                                           qiov, qiov_offset, flags);
1479            goto out;
1480        } else if (flags & BDRV_REQ_PREFETCH) {
1481            goto out;
1482        }
1483    }
1484
1485    /* Forward the request to the BlockDriver, possibly fragmenting it */
1486    total_bytes = bdrv_getlength(bs);
1487    if (total_bytes < 0) {
1488        ret = total_bytes;
1489        goto out;
1490    }
1491
1492    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
1493    if (bytes <= max_bytes && bytes <= max_transfer) {
1494        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, 0);
1495        goto out;
1496    }
1497
1498    while (bytes_remaining) {
1499        int num;
1500
1501        if (max_bytes) {
1502            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
1503            assert(num);
1504
1505            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
1506                                     num, qiov, bytes - bytes_remaining, 0);
1507            max_bytes -= num;
1508        } else {
1509            num = bytes_remaining;
1510            ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
1511                                    bytes_remaining);
1512        }
1513        if (ret < 0) {
1514            goto out;
1515        }
1516        bytes_remaining -= num;
1517    }
1518
1519out:
1520    return ret < 0 ? ret : 0;
1521}
1522
1523/*
1524 * Request padding
1525 *
1526 *  |<---- align ----->|                     |<----- align ---->|
1527 *  |<- head ->|<------------- bytes ------------->|<-- tail -->|
1528 *  |          |       |                     |     |            |
1529 * -*----------$-------*-------- ... --------*-----$------------*---
1530 *  |          |       |                     |     |            |
1531 *  |          offset  |                     |     end          |
1532 *  ALIGN_DOWN(offset) ALIGN_UP(offset)      ALIGN_DOWN(end)   ALIGN_UP(end)
1533 *  [buf   ... )                             [tail_buf          )
1534 *
1535 * @buf is an aligned allocation needed to store @head and @tail paddings. @head
1536 * is placed at the beginning of @buf and @tail at the @end.
1537 *
1538 * @tail_buf is a pointer to sub-buffer, corresponding to align-sized chunk
1539 * around tail, if tail exists.
1540 *
 1541 * @merge_reads is true for small requests, i.e.
1542 * if @buf_len == @head + bytes + @tail. In this case it is possible that both
1543 * head and tail exist but @buf_len == align and @tail_buf == @buf.
1544 */
1545typedef struct BdrvRequestPadding {
1546    uint8_t *buf;
1547    size_t buf_len;
1548    uint8_t *tail_buf;
1549    size_t head;
1550    size_t tail;
1551    bool merge_reads;
1552    QEMUIOVector local_qiov;
1553} BdrvRequestPadding;
1554
1555static bool bdrv_init_padding(BlockDriverState *bs,
1556                              int64_t offset, int64_t bytes,
1557                              BdrvRequestPadding *pad)
1558{
1559    uint64_t align = bs->bl.request_alignment;
1560    size_t sum;
1561
1562    memset(pad, 0, sizeof(*pad));
1563
1564    pad->head = offset & (align - 1);
1565    pad->tail = ((offset + bytes) & (align - 1));
1566    if (pad->tail) {
1567        pad->tail = align - pad->tail;
1568    }
1569
1570    if (!pad->head && !pad->tail) {
1571        return false;
1572    }
1573
1574    assert(bytes); /* Nothing good in aligning zero-length requests */
1575
1576    sum = pad->head + bytes + pad->tail;
1577    pad->buf_len = (sum > align && pad->head && pad->tail) ? 2 * align : align;
1578    pad->buf = qemu_blockalign(bs, pad->buf_len);
1579    pad->merge_reads = sum == pad->buf_len;
1580    if (pad->tail) {
1581        pad->tail_buf = pad->buf + pad->buf_len - align;
1582    }
1583
1584    return true;
1585}
1586
1587static int bdrv_padding_rmw_read(BdrvChild *child,
1588                                 BdrvTrackedRequest *req,
1589                                 BdrvRequestPadding *pad,
1590                                 bool zero_middle)
1591{
1592    QEMUIOVector local_qiov;
1593    BlockDriverState *bs = child->bs;
1594    uint64_t align = bs->bl.request_alignment;
1595    int ret;
1596
1597    assert(req->serialising && pad->buf);
1598
1599    if (pad->head || pad->merge_reads) {
1600        uint64_t bytes = pad->merge_reads ? pad->buf_len : align;
1601
1602        qemu_iovec_init_buf(&local_qiov, pad->buf, bytes);
1603
1604        if (pad->head) {
1605            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1606        }
1607        if (pad->merge_reads && pad->tail) {
1608            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1609        }
1610        ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes,
1611                                  align, &local_qiov, 0, 0);
1612        if (ret < 0) {
1613            return ret;
1614        }
1615        if (pad->head) {
1616            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1617        }
1618        if (pad->merge_reads && pad->tail) {
1619            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1620        }
1621
1622        if (pad->merge_reads) {
1623            goto zero_mem;
1624        }
1625    }
1626
1627    if (pad->tail) {
1628        qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align);
1629
1630        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1631        ret = bdrv_aligned_preadv(
1632                child, req,
1633                req->overlap_offset + req->overlap_bytes - align,
1634                align, align, &local_qiov, 0, 0);
1635        if (ret < 0) {
1636            return ret;
1637        }
1638        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1639    }
1640
1641zero_mem:
1642    if (zero_middle) {
1643        memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail);
1644    }
1645
1646    return 0;
1647}
1648
1649static void bdrv_padding_destroy(BdrvRequestPadding *pad)
1650{
1651    if (pad->buf) {
1652        qemu_vfree(pad->buf);
1653        qemu_iovec_destroy(&pad->local_qiov);
1654    }
1655}
1656
1657/*
1658 * bdrv_pad_request
1659 *
1660 * Exchange request parameters with a padded request if needed. The RMW read of
1661 * the padding is not performed here; call bdrv_padding_rmw_read() separately if
1662 * needed.
1663 *
1664 * All parameters except @bs are in-out: they describe the original request on
1665 * entry and the padded request (if padding is needed) on return.
1666 *
1667 * This function always succeeds.
1668 */
1669static bool bdrv_pad_request(BlockDriverState *bs,
1670                             QEMUIOVector **qiov, size_t *qiov_offset,
1671                             int64_t *offset, unsigned int *bytes,
1672                             BdrvRequestPadding *pad)
1673{
1674    if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
1675        return false;
1676    }
1677
1678    qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
1679                             *qiov, *qiov_offset, *bytes,
1680                             pad->buf + pad->buf_len - pad->tail, pad->tail);
1681    *bytes += pad->head + pad->tail;
1682    *offset -= pad->head;
1683    *qiov = &pad->local_qiov;
1684    *qiov_offset = 0;
1685
1686    return true;
1687}
1688
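/*
 * Continuing the worked example above (align == 512, offset == 100,
 * bytes == 1000): after bdrv_pad_request() the caller sees
 *
 *     *offset      = 0              (100 - head)
 *     *bytes       = 1536           (head + 1000 + tail)
 *     *qiov        = &pad->local_qiov, covering [buf, buf + 100), the
 *                    original 1000 bytes, then [buf + 588, buf + 1024)
 *     *qiov_offset = 0
 *
 * i.e. the request is now aligned to request_alignment on both ends.
 */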
1689int coroutine_fn bdrv_co_preadv(BdrvChild *child,
1690    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1691    BdrvRequestFlags flags)
1692{
1693    return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags);
1694}
1695
1696int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
1697    int64_t offset, unsigned int bytes,
1698    QEMUIOVector *qiov, size_t qiov_offset,
1699    BdrvRequestFlags flags)
1700{
1701    BlockDriverState *bs = child->bs;
1702    BdrvTrackedRequest req;
1703    BdrvRequestPadding pad;
1704    int ret;
1705
1706    trace_bdrv_co_preadv(bs, offset, bytes, flags);
1707
1708    ret = bdrv_check_byte_request(bs, offset, bytes);
1709    if (ret < 0) {
1710        return ret;
1711    }
1712
1713    if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
1714        /*
1715          * Aligning a zero-length request makes no sense: even if the driver
1716          * assigns special meaning to zero-length requests (like
1717          * qcow2_co_pwritev_compressed_part), we can't pass one to the driver
1718          * because of request_alignment.
1719          *
1720          * Still, do not fail occasional unaligned zero-length reads.
1721         */
1722        return 0;
1723    }
1724
1725    bdrv_inc_in_flight(bs);
1726
1727    /* Don't do copy-on-read if we read data before a write operation */
1728    if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
1729        flags |= BDRV_REQ_COPY_ON_READ;
1730    }
1731
1732    bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad);
1733
1734    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
1735    ret = bdrv_aligned_preadv(child, &req, offset, bytes,
1736                              bs->bl.request_alignment,
1737                              qiov, qiov_offset, flags);
1738    tracked_request_end(&req);
1739    bdrv_dec_in_flight(bs);
1740
1741    bdrv_padding_destroy(&pad);
1742
1743    return ret;
1744}
1745
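/*
 * Minimal usage sketch for the read path above (illustrative; must run in
 * coroutine context, and the 4096-byte length is an arbitrary example):
 *
 *     void *buf = qemu_blockalign(child->bs, 4096);
 *     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, 4096);
 *     int ret = bdrv_co_preadv(child, 0, 4096, &qiov, 0);
 *     ...
 *     qemu_vfree(buf);
 */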
1746static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
1747    int64_t offset, int bytes, BdrvRequestFlags flags)
1748{
1749    BlockDriver *drv = bs->drv;
1750    QEMUIOVector qiov;
1751    void *buf = NULL;
1752    int ret = 0;
1753    bool need_flush = false;
1754    int head = 0;
1755    int tail = 0;
1756
1757    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
1758    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
1759                        bs->bl.request_alignment);
1760    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);
1761
1762    if (!drv) {
1763        return -ENOMEDIUM;
1764    }
1765
1766    if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) {
1767        return -ENOTSUP;
1768    }
1769
1770    assert(alignment % bs->bl.request_alignment == 0);
1771    head = offset % alignment;
1772    tail = (offset + bytes) % alignment;
1773    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
1774    assert(max_write_zeroes >= bs->bl.request_alignment);
1775
1776    while (bytes > 0 && !ret) {
1777        int num = bytes;
1778
1779        /* Align request.  Block drivers can expect the "bulk" of the request
1780         * to be aligned, and that unaligned requests do not cross cluster
1781         * boundaries.
1782         */
1783        if (head) {
1784            /* Make a small request up to the first aligned sector. For
1785             * convenience, limit this request to max_transfer even if
1786             * we don't need to fall back to writes.  */
1787            num = MIN(MIN(bytes, max_transfer), alignment - head);
1788            head = (head + num) % alignment;
1789            assert(num < max_write_zeroes);
1790        } else if (tail && num > alignment) {
1791            /* Shorten the request to the last aligned sector.  */
1792            num -= tail;
1793        }
1794
1795        /* limit request size */
1796        if (num > max_write_zeroes) {
1797            num = max_write_zeroes;
1798        }
1799
1800        ret = -ENOTSUP;
1801        /* First try the efficient write zeroes operation */
1802        if (drv->bdrv_co_pwrite_zeroes) {
1803            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1804                                             flags & bs->supported_zero_flags);
1805            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1806                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1807                need_flush = true;
1808            }
1809        } else {
1810            assert(!bs->supported_zero_flags);
1811        }
1812
1813        if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) {
1814            /* Fall back to bounce buffer if write zeroes is unsupported */
1815            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1816
1817            if ((flags & BDRV_REQ_FUA) &&
1818                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1819                /* No need for bdrv_driver_pwritev() to do a fallback
1820                 * flush on each chunk; use just one at the end */
1821                write_flags &= ~BDRV_REQ_FUA;
1822                need_flush = true;
1823            }
1824            num = MIN(num, max_transfer);
1825            if (buf == NULL) {
1826                buf = qemu_try_blockalign0(bs, num);
1827                if (buf == NULL) {
1828                    ret = -ENOMEM;
1829                    goto fail;
1830                }
1831            }
1832            qemu_iovec_init_buf(&qiov, buf, num);
1833
1834            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags);
1835
1836            /* Keep the bounce buffer around if it is big enough for all
1837             * future requests.
1838             */
1839            if (num < max_transfer) {
1840                qemu_vfree(buf);
1841                buf = NULL;
1842            }
1843        }
1844
1845        offset += num;
1846        bytes -= num;
1847    }
1848
1849fail:
1850    if (ret == 0 && need_flush) {
1851        ret = bdrv_co_flush(bs);
1852    }
1853    qemu_vfree(buf);
1854    return ret;
1855}
1856
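/*
 * Illustrative walk-through of the fragmentation loop above (example numbers
 * only): with alignment == 4096, offset == 1000 and bytes == 100000,
 * head == 1000 and tail == 2696, so the function issues
 *
 *   1. an unaligned 3096-byte chunk up to the first alignment boundary
 *      (possibly further limited by max_transfer),
 *   2. aligned chunks of up to max_write_zeroes for the middle, and
 *   3. a final unaligned 2696-byte chunk for the tail.
 */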
1857static inline int coroutine_fn
1858bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes,
1859                          BdrvTrackedRequest *req, int flags)
1860{
1861    BlockDriverState *bs = child->bs;
1862    bool waited;
1863    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1864
1865    if (bs->read_only) {
1866        return -EPERM;
1867    }
1868
1869    /* BDRV_REQ_NO_SERIALISING is only for read operation */
1870    assert(!(flags & BDRV_REQ_NO_SERIALISING));
1871    assert(!(bs->open_flags & BDRV_O_INACTIVE));
1872    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1873    assert(!(flags & ~BDRV_REQ_MASK));
1874
1875    if (flags & BDRV_REQ_SERIALISING) {
1876        bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
1877    }
1878
1879    waited = bdrv_wait_serialising_requests(req);
1880
1881    assert(!waited || !req->serialising ||
1882           is_request_serialising_and_aligned(req));
1883    assert(req->overlap_offset <= offset);
1884    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1885    assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
1886
1887    switch (req->type) {
1888    case BDRV_TRACKED_WRITE:
1889    case BDRV_TRACKED_DISCARD:
1890        if (flags & BDRV_REQ_WRITE_UNCHANGED) {
1891            assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
1892        } else {
1893            assert(child->perm & BLK_PERM_WRITE);
1894        }
1895        return notifier_with_return_list_notify(&bs->before_write_notifiers,
1896                                                req);
1897    case BDRV_TRACKED_TRUNCATE:
1898        assert(child->perm & BLK_PERM_RESIZE);
1899        return 0;
1900    default:
1901        abort();
1902    }
1903}
1904
1905static inline void coroutine_fn
1906bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes,
1907                         BdrvTrackedRequest *req, int ret)
1908{
1909    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1910    BlockDriverState *bs = child->bs;
1911
1912    atomic_inc(&bs->write_gen);
1913
1914    /*
1915     * Discard cannot extend the image, but in error handling cases, such as
1916     * when reverting a qcow2 cluster allocation, the discarded range can pass
1917     * the end of the image file, so we cannot assert about BDRV_TRACKED_DISCARD
1918     * here. Instead, just skip it, since semantically a discard request
1919     * beyond EOF cannot expand the image anyway.
1920     */
1921    if (ret == 0 &&
1922        (req->type == BDRV_TRACKED_TRUNCATE ||
1923         end_sector > bs->total_sectors) &&
1924        req->type != BDRV_TRACKED_DISCARD) {
1925        bs->total_sectors = end_sector;
1926        bdrv_parent_cb_resize(bs);
1927        bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS);
1928    }
1929    if (req->bytes) {
1930        switch (req->type) {
1931        case BDRV_TRACKED_WRITE:
1932            stat64_max(&bs->wr_highest_offset, offset + bytes);
1933            /* fall through, to set dirty bits */
1934        case BDRV_TRACKED_DISCARD:
1935            bdrv_set_dirty(bs, offset, bytes);
1936            break;
1937        default:
1938            break;
1939        }
1940    }
1941}
1942
1943/*
1944 * Forwards an already correctly aligned write request to the BlockDriver,
1945 * after possibly fragmenting it.
1946 */
1947static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
1948    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1949    int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags)
1950{
1951    BlockDriverState *bs = child->bs;
1952    BlockDriver *drv = bs->drv;
1953    int ret;
1954
1955    uint64_t bytes_remaining = bytes;
1956    int max_transfer;
1957
1958    if (!drv) {
1959        return -ENOMEDIUM;
1960    }
1961
1962    if (bdrv_has_readonly_bitmaps(bs)) {
1963        return -EPERM;
1964    }
1965
1966    assert(is_power_of_2(align));
1967    assert((offset & (align - 1)) == 0);
1968    assert((bytes & (align - 1)) == 0);
1969    assert(!qiov || qiov_offset + bytes <= qiov->size);
1970    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1971                                   align);
1972
1973    ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags);
1974
1975    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1976        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
1977        qemu_iovec_is_zero(qiov, qiov_offset, bytes)) {
1978        flags |= BDRV_REQ_ZERO_WRITE;
1979        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
1980            flags |= BDRV_REQ_MAY_UNMAP;
1981        }
1982    }
1983
1984    if (ret < 0) {
1985        /* Do nothing, write notifier decided to fail this request */
1986    } else if (flags & BDRV_REQ_ZERO_WRITE) {
1987        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
1988        ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
1989    } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
1990        ret = bdrv_driver_pwritev_compressed(bs, offset, bytes,
1991                                             qiov, qiov_offset);
1992    } else if (bytes <= max_transfer) {
1993        bdrv_debug_event(bs, BLKDBG_PWRITEV);
1994        ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags);
1995    } else {
1996        bdrv_debug_event(bs, BLKDBG_PWRITEV);
1997        while (bytes_remaining) {
1998            int num = MIN(bytes_remaining, max_transfer);
1999            int local_flags = flags;
2000
2001            assert(num);
2002            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
2003                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
2004                /* If FUA is going to be emulated by flush, we only
2005                 * need to flush on the last iteration */
2006                local_flags &= ~BDRV_REQ_FUA;
2007            }
2008
2009            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
2010                                      num, qiov, bytes - bytes_remaining,
2011                                      local_flags);
2012            if (ret < 0) {
2013                break;
2014            }
2015            bytes_remaining -= num;
2016        }
2017    }
2018    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
2019
2020    if (ret >= 0) {
2021        ret = 0;
2022    }
2023    bdrv_co_write_req_finish(child, offset, bytes, req, ret);
2024
2025    return ret;
2026}
2027
2028static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
2029                                                int64_t offset,
2030                                                unsigned int bytes,
2031                                                BdrvRequestFlags flags,
2032                                                BdrvTrackedRequest *req)
2033{
2034    BlockDriverState *bs = child->bs;
2035    QEMUIOVector local_qiov;
2036    uint64_t align = bs->bl.request_alignment;
2037    int ret = 0;
2038    bool padding;
2039    BdrvRequestPadding pad;
2040
2041    padding = bdrv_init_padding(bs, offset, bytes, &pad);
2042    if (padding) {
2043        bdrv_mark_request_serialising(req, align);
2044        bdrv_wait_serialising_requests(req);
2045
2046        bdrv_padding_rmw_read(child, req, &pad, true);
2047
2048        if (pad.head || pad.merge_reads) {
2049            int64_t aligned_offset = offset & ~(align - 1);
2050            int64_t write_bytes = pad.merge_reads ? pad.buf_len : align;
2051
2052            qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes);
2053            ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes,
2054                                       align, &local_qiov, 0,
2055                                       flags & ~BDRV_REQ_ZERO_WRITE);
2056            if (ret < 0 || pad.merge_reads) {
2057                /* Error or all work is done */
2058                goto out;
2059            }
2060            offset += write_bytes - pad.head;
2061            bytes -= write_bytes - pad.head;
2062        }
2063    }
2064
2065    assert(!bytes || (offset & (align - 1)) == 0);
2066    if (bytes >= align) {
2067        /* Write the aligned part in the middle. */
2068        uint64_t aligned_bytes = bytes & ~(align - 1);
2069        ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
2070                                   NULL, 0, flags);
2071        if (ret < 0) {
2072            goto out;
2073        }
2074        bytes -= aligned_bytes;
2075        offset += aligned_bytes;
2076    }
2077
2078    assert(!bytes || (offset & (align - 1)) == 0);
2079    if (bytes) {
2080        assert(align == pad.tail + bytes);
2081
2082        qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align);
2083        ret = bdrv_aligned_pwritev(child, req, offset, align, align,
2084                                   &local_qiov, 0,
2085                                   flags & ~BDRV_REQ_ZERO_WRITE);
2086    }
2087
2088out:
2089    bdrv_padding_destroy(&pad);
2090
2091    return ret;
2092}
2093
2094/*
2095 * Handle a write request in coroutine context
2096 */
2097int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
2098    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
2099    BdrvRequestFlags flags)
2100{
2101    return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags);
2102}
2103
2104int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
2105    int64_t offset, unsigned int bytes, QEMUIOVector *qiov, size_t qiov_offset,
2106    BdrvRequestFlags flags)
2107{
2108    BlockDriverState *bs = child->bs;
2109    BdrvTrackedRequest req;
2110    uint64_t align = bs->bl.request_alignment;
2111    BdrvRequestPadding pad;
2112    int ret;
2113
2114    trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);
2115
2116    if (!bs->drv) {
2117        return -ENOMEDIUM;
2118    }
2119
2120    ret = bdrv_check_byte_request(bs, offset, bytes);
2121    if (ret < 0) {
2122        return ret;
2123    }
2124
2125    /* If the request is misaligned then we can't make it efficient */
2126    if ((flags & BDRV_REQ_NO_FALLBACK) &&
2127        !QEMU_IS_ALIGNED(offset | bytes, align))
2128    {
2129        return -ENOTSUP;
2130    }
2131
2132    if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
2133        /*
2134          * Aligning a zero-length request makes no sense: even if the driver
2135          * assigns special meaning to zero-length requests (like
2136          * qcow2_co_pwritev_compressed_part), we can't pass one to the driver
2137          * because of request_alignment.
2138          *
2139          * Still, do not fail occasional unaligned zero-length writes.
2140         */
2141        return 0;
2142    }
2143
2144    bdrv_inc_in_flight(bs);
2145    /*
2146     * Align write if necessary by performing a read-modify-write cycle.
2147     * Pad qiov with the read parts and be sure to have a tracked request not
2148     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
2149     */
2150    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
2151
2152    if (flags & BDRV_REQ_ZERO_WRITE) {
2153        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
2154        goto out;
2155    }
2156
2157    if (bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad)) {
2158        bdrv_mark_request_serialising(&req, align);
2159        bdrv_wait_serialising_requests(&req);
2160        bdrv_padding_rmw_read(child, &req, &pad, false);
2161    }
2162
2163    ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
2164                               qiov, qiov_offset, flags);
2165
2166    bdrv_padding_destroy(&pad);
2167
2168out:
2169    tracked_request_end(&req);
2170    bdrv_dec_in_flight(bs);
2171
2172    return ret;
2173}
2174
2175int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
2176                                       int bytes, BdrvRequestFlags flags)
2177{
2178    trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
2179
2180    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
2181        flags &= ~BDRV_REQ_MAY_UNMAP;
2182    }
2183
2184    return bdrv_co_pwritev(child, offset, bytes, NULL,
2185                           BDRV_REQ_ZERO_WRITE | flags);
2186}
2187
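/*
 * Usage sketch (illustrative; coroutine context, arbitrary example values):
 * zero a 1 MiB region and allow the driver to unmap it if it can:
 *
 *     ret = bdrv_co_pwrite_zeroes(child, 0, 1024 * 1024, BDRV_REQ_MAY_UNMAP);
 *
 * The flag is dropped above if the node was not opened with BDRV_O_UNMAP.
 */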
2188/*
2189 * Flush ALL BDSes regardless of whether they are reachable via a BlockBackend.
2190 */
2191int bdrv_flush_all(void)
2192{
2193    BdrvNextIterator it;
2194    BlockDriverState *bs = NULL;
2195    int result = 0;
2196
2197    /*
2198     * The bdrv queue is managed by record/replay;
2199     * creating a new flush request while stopping
2200     * the VM may break determinism.
2201     */
2202    if (replay_events_enabled()) {
2203        return result;
2204    }
2205
2206    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
2207        AioContext *aio_context = bdrv_get_aio_context(bs);
2208        int ret;
2209
2210        aio_context_acquire(aio_context);
2211        ret = bdrv_flush(bs);
2212        if (ret < 0 && !result) {
2213            result = ret;
2214        }
2215        aio_context_release(aio_context);
2216    }
2217
2218    return result;
2219}
2220
2221
2222typedef struct BdrvCoBlockStatusData {
2223    BlockDriverState *bs;
2224    BlockDriverState *base;
2225    bool want_zero;
2226    int64_t offset;
2227    int64_t bytes;
2228    int64_t *pnum;
2229    int64_t *map;
2230    BlockDriverState **file;
2231    int ret;
2232    bool done;
2233} BdrvCoBlockStatusData;
2234
2235int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
2236                                                bool want_zero,
2237                                                int64_t offset,
2238                                                int64_t bytes,
2239                                                int64_t *pnum,
2240                                                int64_t *map,
2241                                                BlockDriverState **file)
2242{
2243    assert(bs->file && bs->file->bs);
2244    *pnum = bytes;
2245    *map = offset;
2246    *file = bs->file->bs;
2247    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
2248}
2249
2250int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
2251                                                   bool want_zero,
2252                                                   int64_t offset,
2253                                                   int64_t bytes,
2254                                                   int64_t *pnum,
2255                                                   int64_t *map,
2256                                                   BlockDriverState **file)
2257{
2258    assert(bs->backing && bs->backing->bs);
2259    *pnum = bytes;
2260    *map = offset;
2261    *file = bs->backing->bs;
2262    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
2263}
2264
2265/*
2266 * Returns the allocation status of the specified sectors.
2267 * Drivers not implementing the functionality are assumed to not support
2268 * backing files, hence all their sectors are reported as allocated.
2269 *
2270 * If 'want_zero' is true, the caller is querying for mapping
2271 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
2272 * _ZERO where possible; otherwise, the result favors larger 'pnum',
2273 * with a focus on accurate BDRV_BLOCK_ALLOCATED.
2274 *
2275 * If 'offset' is beyond the end of the disk image the return value is
2276 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
2277 *
2278 * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
2279 * beyond the end of the disk image it will be clamped; if 'pnum' is set to
2280 * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
2281 *
2282 * 'pnum' is set to the number of bytes (including and immediately
2283 * following the specified offset) that are easily known to be in the
2284 * same allocated/unallocated state.  Note that a second call starting
2285 * at the original offset plus returned pnum may have the same status.
2286 * The returned value is non-zero on success except at end-of-file.
2287 *
2288 * Returns negative errno on failure.  Otherwise, if the
2289 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
2290 * set to the host mapping and BDS corresponding to the guest offset.
2291 */
2292static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
2293                                             bool want_zero,
2294                                             int64_t offset, int64_t bytes,
2295                                             int64_t *pnum, int64_t *map,
2296                                             BlockDriverState **file)
2297{
2298    int64_t total_size;
2299    int64_t n; /* bytes */
2300    int ret;
2301    int64_t local_map = 0;
2302    BlockDriverState *local_file = NULL;
2303    int64_t aligned_offset, aligned_bytes;
2304    uint32_t align;
2305
2306    assert(pnum);
2307    *pnum = 0;
2308    total_size = bdrv_getlength(bs);
2309    if (total_size < 0) {
2310        ret = total_size;
2311        goto early_out;
2312    }
2313
2314    if (offset >= total_size) {
2315        ret = BDRV_BLOCK_EOF;
2316        goto early_out;
2317    }
2318    if (!bytes) {
2319        ret = 0;
2320        goto early_out;
2321    }
2322
2323    n = total_size - offset;
2324    if (n < bytes) {
2325        bytes = n;
2326    }
2327
2328    /* Must be non-NULL or bdrv_getlength() would have failed */
2329    assert(bs->drv);
2330    if (!bs->drv->bdrv_co_block_status) {
2331        *pnum = bytes;
2332        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
2333        if (offset + bytes == total_size) {
2334            ret |= BDRV_BLOCK_EOF;
2335        }
2336        if (bs->drv->protocol_name) {
2337            ret |= BDRV_BLOCK_OFFSET_VALID;
2338            local_map = offset;
2339            local_file = bs;
2340        }
2341        goto early_out;
2342    }
2343
2344    bdrv_inc_in_flight(bs);
2345
2346    /* Round out to request_alignment boundaries */
2347    align = bs->bl.request_alignment;
2348    aligned_offset = QEMU_ALIGN_DOWN(offset, align);
2349    aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;
2350
2351    ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
2352                                        aligned_bytes, pnum, &local_map,
2353                                        &local_file);
2354    if (ret < 0) {
2355        *pnum = 0;
2356        goto out;
2357    }
2358
2359    /*
2360     * The driver's result must be a non-zero multiple of request_alignment.
2361     * Clamp pnum and adjust map to original request.
2362     */
2363    assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
2364           align > offset - aligned_offset);
2365    if (ret & BDRV_BLOCK_RECURSE) {
2366        assert(ret & BDRV_BLOCK_DATA);
2367        assert(ret & BDRV_BLOCK_OFFSET_VALID);
2368        assert(!(ret & BDRV_BLOCK_ZERO));
2369    }
2370
2371    *pnum -= offset - aligned_offset;
2372    if (*pnum > bytes) {
2373        *pnum = bytes;
2374    }
2375    if (ret & BDRV_BLOCK_OFFSET_VALID) {
2376        local_map += offset - aligned_offset;
2377    }
2378
2379    if (ret & BDRV_BLOCK_RAW) {
2380        assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
2381        ret = bdrv_co_block_status(local_file, want_zero, local_map,
2382                                   *pnum, pnum, &local_map, &local_file);
2383        goto out;
2384    }
2385
2386    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
2387        ret |= BDRV_BLOCK_ALLOCATED;
2388    } else if (want_zero) {
2389        if (bdrv_unallocated_blocks_are_zero(bs)) {
2390            ret |= BDRV_BLOCK_ZERO;
2391        } else if (bs->backing) {
2392            BlockDriverState *bs2 = bs->backing->bs;
2393            int64_t size2 = bdrv_getlength(bs2);
2394
2395            if (size2 >= 0 && offset >= size2) {
2396                ret |= BDRV_BLOCK_ZERO;
2397            }
2398        }
2399    }
2400
2401    if (want_zero && ret & BDRV_BLOCK_RECURSE &&
2402        local_file && local_file != bs &&
2403        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
2404        (ret & BDRV_BLOCK_OFFSET_VALID)) {
2405        int64_t file_pnum;
2406        int ret2;
2407
2408        ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
2409                                    *pnum, &file_pnum, NULL, NULL);
2410        if (ret2 >= 0) {
2411            /* Ignore errors.  This is just providing extra information; it
2412             * is useful but not necessary.
2413             */
2414            if (ret2 & BDRV_BLOCK_EOF &&
2415                (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
2416                /*
2417                 * It is valid for the format block driver to read
2418                 * beyond the end of the underlying file's current
2419                 * size; such areas read as zero.
2420                 */
2421                ret |= BDRV_BLOCK_ZERO;
2422            } else {
2423                /* Limit request to the range reported by the protocol driver */
2424                *pnum = file_pnum;
2425                ret |= (ret2 & BDRV_BLOCK_ZERO);
2426            }
2427        }
2428    }
2429
2430out:
2431    bdrv_dec_in_flight(bs);
2432    if (ret >= 0 && offset + *pnum == total_size) {
2433        ret |= BDRV_BLOCK_EOF;
2434    }
2435early_out:
2436    if (file) {
2437        *file = local_file;
2438    }
2439    if (map) {
2440        *map = local_map;
2441    }
2442    return ret;
2443}
2444
2445static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
2446                                                   BlockDriverState *base,
2447                                                   bool want_zero,
2448                                                   int64_t offset,
2449                                                   int64_t bytes,
2450                                                   int64_t *pnum,
2451                                                   int64_t *map,
2452                                                   BlockDriverState **file)
2453{
2454    BlockDriverState *p;
2455    int ret = 0;
2456    bool first = true;
2457
2458    assert(bs != base);
2459    for (p = bs; p != base; p = backing_bs(p)) {
2460        ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
2461                                   file);
2462        if (ret < 0) {
2463            break;
2464        }
2465        if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
2466            /*
2467             * Reading beyond the end of the file continues to read
2468             * zeroes, but we can only widen the result to the
2469             * unallocated length we learned from an earlier
2470             * iteration.
2471             */
2472            *pnum = bytes;
2473        }
2474        if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
2475            break;
2476        }
2477        /* [offset, offset + *pnum) is unallocated on this layer, which could
2478         * be only the first part of [offset, offset + bytes).  */
2479        bytes = MIN(bytes, *pnum);
2480        first = false;
2481    }
2482    return ret;
2483}
2484
2485/* Coroutine wrapper for bdrv_block_status_above() */
2486static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque)
2487{
2488    BdrvCoBlockStatusData *data = opaque;
2489
2490    data->ret = bdrv_co_block_status_above(data->bs, data->base,
2491                                           data->want_zero,
2492                                           data->offset, data->bytes,
2493                                           data->pnum, data->map, data->file);
2494    data->done = true;
2495    aio_wait_kick();
2496}
2497
2498/*
2499 * Synchronous wrapper around bdrv_co_block_status_above().
2500 *
2501 * See bdrv_co_block_status_above() for details.
2502 */
2503static int bdrv_common_block_status_above(BlockDriverState *bs,
2504                                          BlockDriverState *base,
2505                                          bool want_zero, int64_t offset,
2506                                          int64_t bytes, int64_t *pnum,
2507                                          int64_t *map,
2508                                          BlockDriverState **file)
2509{
2510    Coroutine *co;
2511    BdrvCoBlockStatusData data = {
2512        .bs = bs,
2513        .base = base,
2514        .want_zero = want_zero,
2515        .offset = offset,
2516        .bytes = bytes,
2517        .pnum = pnum,
2518        .map = map,
2519        .file = file,
2520        .done = false,
2521    };
2522
2523    if (qemu_in_coroutine()) {
2524        /* Fast-path if already in coroutine context */
2525        bdrv_block_status_above_co_entry(&data);
2526    } else {
2527        co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data);
2528        bdrv_coroutine_enter(bs, co);
2529        BDRV_POLL_WHILE(bs, !data.done);
2530    }
2531    return data.ret;
2532}
2533
2534int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
2535                            int64_t offset, int64_t bytes, int64_t *pnum,
2536                            int64_t *map, BlockDriverState **file)
2537{
2538    return bdrv_common_block_status_above(bs, base, true, offset, bytes,
2539                                          pnum, map, file);
2540}
2541
2542int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
2543                      int64_t *pnum, int64_t *map, BlockDriverState **file)
2544{
2545    return bdrv_block_status_above(bs, backing_bs(bs),
2546                                   offset, bytes, pnum, map, file);
2547}
2548
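/*
 * Illustrative sketch (variable names are examples): walking all extents of
 * an image with bdrv_block_status():
 *
 *     int64_t offset = 0, pnum, map;
 *     BlockDriverState *file;
 *     int64_t total = bdrv_getlength(bs);
 *
 *     while (offset < total) {
 *         int ret = bdrv_block_status(bs, offset, total - offset,
 *                                     &pnum, &map, &file);
 *         if (ret < 0) {
 *             break;
 *         }
 *         ... ret describes BDRV_BLOCK_DATA/ZERO/OFFSET_VALID for
 *             [offset, offset + pnum) ...
 *         offset += pnum;
 *     }
 */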
2549int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
2550                                   int64_t bytes, int64_t *pnum)
2551{
2552    int ret;
2553    int64_t dummy;
2554
2555    ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
2556                                         bytes, pnum ? pnum : &dummy, NULL,
2557                                         NULL);
2558    if (ret < 0) {
2559        return ret;
2560    }
2561    return !!(ret & BDRV_BLOCK_ALLOCATED);
2562}
2563
2564/*
2565 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2566 *
2567 * Return 1 if (a prefix of) the given range is allocated in any image
2568 * between BASE and TOP (BASE is only included if include_base is set).
2569 * BASE can be NULL to check if the given offset is allocated in any
2570 * image of the chain.  Return 0 otherwise, or negative errno on
2571 * failure.
2572 *
2573 * 'pnum' is set to the number of bytes (including and immediately
2574 * following the specified offset) that are known to be in the same
2575 * allocated/unallocated state.  Note that a subsequent call starting
2576 * at 'offset + *pnum' may return the same allocation status (in other
2577 * words, the result is not necessarily the maximum possible range);
2578 * but 'pnum' will only be 0 when end of file is reached.
2579 *
2580 */
2581int bdrv_is_allocated_above(BlockDriverState *top,
2582                            BlockDriverState *base,
2583                            bool include_base, int64_t offset,
2584                            int64_t bytes, int64_t *pnum)
2585{
2586    BlockDriverState *intermediate;
2587    int ret;
2588    int64_t n = bytes;
2589
2590    assert(base || !include_base);
2591
2592    intermediate = top;
2593    while (include_base || intermediate != base) {
2594        int64_t pnum_inter;
2595        int64_t size_inter;
2596
2597        assert(intermediate);
2598        ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
2599        if (ret < 0) {
2600            return ret;
2601        }
2602        if (ret) {
2603            *pnum = pnum_inter;
2604            return 1;
2605        }
2606
2607        size_inter = bdrv_getlength(intermediate);
2608        if (size_inter < 0) {
2609            return size_inter;
2610        }
2611        if (n > pnum_inter &&
2612            (intermediate == top || offset + pnum_inter < size_inter)) {
2613            n = pnum_inter;
2614        }
2615
2616        if (intermediate == base) {
2617            break;
2618        }
2619
2620        intermediate = backing_bs(intermediate);
2621    }
2622
2623    *pnum = n;
2624    return 0;
2625}
2626
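/*
 * Illustrative sketch ('cluster_size' is a hypothetical example value): check
 * whether anything above 'base' in the chain of 'top' allocates the first
 * cluster:
 *
 *     int64_t pnum;
 *     int ret = bdrv_is_allocated_above(top, base, false, 0, cluster_size,
 *                                       &pnum);
 *
 * ret == 1 means the first pnum bytes are allocated somewhere above 'base';
 * ret == 0 means they are not; ret < 0 is an error.
 */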
2627typedef struct BdrvVmstateCo {
2628    BlockDriverState    *bs;
2629    QEMUIOVector        *qiov;
2630    int64_t             pos;
2631    bool                is_read;
2632    int                 ret;
2633} BdrvVmstateCo;
2634
2635static int coroutine_fn
2636bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
2637                   bool is_read)
2638{
2639    BlockDriver *drv = bs->drv;
2640    int ret = -ENOTSUP;
2641
2642    bdrv_inc_in_flight(bs);
2643
2644    if (!drv) {
2645        ret = -ENOMEDIUM;
2646    } else if (drv->bdrv_load_vmstate) {
2647        if (is_read) {
2648            ret = drv->bdrv_load_vmstate(bs, qiov, pos);
2649        } else {
2650            ret = drv->bdrv_save_vmstate(bs, qiov, pos);
2651        }
2652    } else if (bs->file) {
2653        ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
2654    }
2655
2656    bdrv_dec_in_flight(bs);
2657    return ret;
2658}
2659
2660static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
2661{
2662    BdrvVmstateCo *co = opaque;
2663    co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
2664    aio_wait_kick();
2665}
2666
2667static inline int
2668bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
2669                bool is_read)
2670{
2671    if (qemu_in_coroutine()) {
2672        return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
2673    } else {
2674        BdrvVmstateCo data = {
2675            .bs         = bs,
2676            .qiov       = qiov,
2677            .pos        = pos,
2678            .is_read    = is_read,
2679            .ret        = -EINPROGRESS,
2680        };
2681        Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
2682
2683        bdrv_coroutine_enter(bs, co);
2684        BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
2685        return data.ret;
2686    }
2687}
2688
2689int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2690                      int64_t pos, int size)
2691{
2692    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2693    int ret;
2694
2695    ret = bdrv_writev_vmstate(bs, &qiov, pos);
2696    if (ret < 0) {
2697        return ret;
2698    }
2699
2700    return size;
2701}
2702
2703int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2704{
2705    return bdrv_rw_vmstate(bs, qiov, pos, false);
2706}
2707
2708int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2709                      int64_t pos, int size)
2710{
2711    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2712    int ret;
2713
2714    ret = bdrv_readv_vmstate(bs, &qiov, pos);
2715    if (ret < 0) {
2716        return ret;
2717    }
2718
2719    return size;
2720}
2721
2722int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2723{
2724    return bdrv_rw_vmstate(bs, qiov, pos, true);
2725}
2726
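/*
 * Illustrative sketch (buffer contents and position are arbitrary examples):
 * store a small blob of VM state through the active layer and read it back:
 *
 *     uint8_t blob[512];
 *     ...
 *     bdrv_save_vmstate(bs, blob, 0, sizeof(blob));
 *     ...
 *     bdrv_load_vmstate(bs, blob, 0, sizeof(blob));
 *
 * Both helpers return the number of bytes on success and a negative errno on
 * failure.
 */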
2727/**************************************************************/
2728/* async I/Os */
2729
2730void bdrv_aio_cancel(BlockAIOCB *acb)
2731{
2732    qemu_aio_ref(acb);
2733    bdrv_aio_cancel_async(acb);
2734    while (acb->refcnt > 1) {
2735        if (acb->aiocb_info->get_aio_context) {
2736            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
2737        } else if (acb->bs) {
2738            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
2739             * assert that we're not using an I/O thread.  Thread-safe
2740             * code should use bdrv_aio_cancel_async exclusively.
2741             */
2742            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
2743            aio_poll(bdrv_get_aio_context(acb->bs), true);
2744        } else {
2745            abort();
2746        }
2747    }
2748    qemu_aio_unref(acb);
2749}
2750
2751/* Async version of aio cancel. The caller is not blocked if the acb implements
2752 * cancel_async; otherwise we do nothing and let the request complete normally.
2753 * In either case the completion callback must be called. */
2754void bdrv_aio_cancel_async(BlockAIOCB *acb)
2755{
2756    if (acb->aiocb_info->cancel_async) {
2757        acb->aiocb_info->cancel_async(acb);
2758    }
2759}
2760
2761/**************************************************************/
2762/* Coroutine block device emulation */
2763
2764typedef struct FlushCo {
2765    BlockDriverState *bs;
2766    int ret;
2767} FlushCo;
2768
2769
2770static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2771{
2772    FlushCo *rwco = opaque;
2773
2774    rwco->ret = bdrv_co_flush(rwco->bs);
2775    aio_wait_kick();
2776}
2777
2778int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2779{
2780    int current_gen;
2781    int ret = 0;
2782
2783    bdrv_inc_in_flight(bs);
2784
2785    if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
2786        bdrv_is_sg(bs)) {
2787        goto early_exit;
2788    }
2789
2790    qemu_co_mutex_lock(&bs->reqs_lock);
2791    current_gen = atomic_read(&bs->write_gen);
2792
2793    /* Wait until any previous flushes are completed */
2794    while (bs->active_flush_req) {
2795        qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
2796    }
2797
2798    /* Flushes reach this point in nondecreasing current_gen order.  */
2799    bs->active_flush_req = true;
2800    qemu_co_mutex_unlock(&bs->reqs_lock);
2801
2802    /* Write back all layers by calling one driver function */
2803    if (bs->drv->bdrv_co_flush) {
2804        ret = bs->drv->bdrv_co_flush(bs);
2805        goto out;
2806    }
2807
2808    /* Write back cached data to the OS even with cache=unsafe */
2809    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2810    if (bs->drv->bdrv_co_flush_to_os) {
2811        ret = bs->drv->bdrv_co_flush_to_os(bs);
2812        if (ret < 0) {
2813            goto out;
2814        }
2815    }
2816
2817    /* But don't actually force it to the disk with cache=unsafe */
2818    if (bs->open_flags & BDRV_O_NO_FLUSH) {
2819        goto flush_parent;
2820    }
2821
2822    /* Check if we really need to flush anything */
2823    if (bs->flushed_gen == current_gen) {
2824        goto flush_parent;
2825    }
2826
2827    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2828    if (!bs->drv) {
2829        /* bs->drv->bdrv_co_flush() might have ejected the BDS
2830         * (even in case of apparent success) */
2831        ret = -ENOMEDIUM;
2832        goto out;
2833    }
2834    if (bs->drv->bdrv_co_flush_to_disk) {
2835        ret = bs->drv->bdrv_co_flush_to_disk(bs);
2836    } else if (bs->drv->bdrv_aio_flush) {
2837        BlockAIOCB *acb;
2838        CoroutineIOCompletion co = {
2839            .coroutine = qemu_coroutine_self(),
2840        };
2841
2842        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2843        if (acb == NULL) {
2844            ret = -EIO;
2845        } else {
2846            qemu_coroutine_yield();
2847            ret = co.ret;
2848        }
2849    } else {
2850        /*
2851         * Some block drivers always operate in either writethrough or unsafe
2852         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
2853         * know how the server works (because the behaviour is hardcoded or
2854         * depends on server-side configuration), so we can't ensure that
2855         * everything is safe on disk. Returning an error doesn't work because
2856         * that would break guests even if the server operates in writethrough
2857         * mode.
2858         *
2859         * Let's hope the user knows what they're doing.
2860         */
2861        ret = 0;
2862    }
2863
2864    if (ret < 0) {
2865        goto out;
2866    }
2867
2868    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
2869     * in the case of cache=unsafe, so there are no useless flushes.
2870     */
2871flush_parent:
2872    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2873out:
2874    /* Notify any pending flushes that we have completed */
2875    if (ret == 0) {
2876        bs->flushed_gen = current_gen;
2877    }
2878
2879    qemu_co_mutex_lock(&bs->reqs_lock);
2880    bs->active_flush_req = false;
2881    /* Return value is ignored - it's ok if wait queue is empty */
2882    qemu_co_queue_next(&bs->flush_queue);
2883    qemu_co_mutex_unlock(&bs->reqs_lock);
2884
2885early_exit:
2886    bdrv_dec_in_flight(bs);
2887    return ret;
2888}
2889
2890int bdrv_flush(BlockDriverState *bs)
2891{
2892    Coroutine *co;
2893    FlushCo flush_co = {
2894        .bs = bs,
2895        .ret = NOT_DONE,
2896    };
2897
2898    if (qemu_in_coroutine()) {
2899        /* Fast-path if already in coroutine context */
2900        bdrv_flush_co_entry(&flush_co);
2901    } else {
2902        co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
2903        bdrv_coroutine_enter(bs, co);
2904        BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
2905    }
2906
2907    return flush_co.ret;
2908}
2909
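/*
 * Illustrative sketch: a synchronous (non-coroutine) caller that has finished
 * a batch of writes and wants them on stable storage simply does
 *
 *     ret = bdrv_flush(bs);
 *
 * A return of 0 means the current write generation has been flushed, or that
 * the driver cannot flush at all (see the fallback comment in bdrv_co_flush()
 * above).
 */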
2910typedef struct DiscardCo {
2911    BdrvChild *child;
2912    int64_t offset;
2913    int64_t bytes;
2914    int ret;
2915} DiscardCo;
2916static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
2917{
2918    DiscardCo *rwco = opaque;
2919
2920    rwco->ret = bdrv_co_pdiscard(rwco->child, rwco->offset, rwco->bytes);
2921    aio_wait_kick();
2922}
2923
2924int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
2925                                  int64_t bytes)
2926{
2927    BdrvTrackedRequest req;
2928    int max_pdiscard, ret;
2929    int head, tail, align;
2930    BlockDriverState *bs = child->bs;
2931
2932    if (!bs || !bs->drv || !bdrv_is_inserted(bs)) {
2933        return -ENOMEDIUM;
2934    }
2935
2936    if (bdrv_has_readonly_bitmaps(bs)) {
2937        return -EPERM;
2938    }
2939
2940    if (offset < 0 || bytes < 0 || bytes > INT64_MAX - offset) {
2941        return -EIO;
2942    }
2943
2944    /* Do nothing if disabled.  */
2945    if (!(bs->open_flags & BDRV_O_UNMAP)) {
2946        return 0;
2947    }
2948
2949    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
2950        return 0;
2951    }
2952
2953    /* Discard is advisory, but some devices track and coalesce
2954     * unaligned requests, so we must pass everything down rather than
2955     * round here.  Still, most devices will just silently ignore
2956     * unaligned requests (by returning -ENOTSUP), so we must fragment
2957     * the request accordingly.  */
2958    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
2959    assert(align % bs->bl.request_alignment == 0);
2960    head = offset % align;
2961    tail = (offset + bytes) % align;
2962
2963    bdrv_inc_in_flight(bs);
2964    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);
2965
2966    ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0);
2967    if (ret < 0) {
2968        goto out;
2969    }
2970
2971    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
2972                                   align);
2973    assert(max_pdiscard >= bs->bl.request_alignment);
2974
2975    while (bytes > 0) {
2976        int64_t num = bytes;
2977
2978        if (head) {
2979            /* Make small requests to get to alignment boundaries. */
2980            num = MIN(bytes, align - head);
2981            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
2982                num %= bs->bl.request_alignment;
2983            }
2984            head = (head + num) % align;
2985            assert(num < max_pdiscard);
2986        } else if (tail) {
2987            if (num > align) {
2988                /* Shorten the request to the last aligned cluster.  */
2989                num -= tail;
2990            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
2991                       tail > bs->bl.request_alignment) {
2992                tail %= bs->bl.request_alignment;
2993                num -= tail;
2994            }
2995        }
2996        /* limit request size */
2997        if (num > max_pdiscard) {
2998            num = max_pdiscard;
2999        }
3000
3001        if (!bs->drv) {
3002            ret = -ENOMEDIUM;
3003            goto out;
3004        }
3005        if (bs->drv->bdrv_co_pdiscard) {
3006            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
3007        } else {
3008            BlockAIOCB *acb;
3009            CoroutineIOCompletion co = {
3010                .coroutine = qemu_coroutine_self(),
3011            };
3012
3013            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
3014                                             bdrv_co_io_em_complete, &co);
3015            if (acb == NULL) {
3016                ret = -EIO;
3017                goto out;
3018            } else {
3019                qemu_coroutine_yield();
3020                ret = co.ret;
3021            }
3022        }
3023        if (ret && ret != -ENOTSUP) {
3024            goto out;
3025        }
3026
3027        offset += num;
3028        bytes -= num;
3029    }
3030    ret = 0;
3031out:
3032    bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret);
3033    tracked_request_end(&req);
3034    bdrv_dec_in_flight(bs);
3035    return ret;
3036}
3037
3038int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes)
3039{
3040    Coroutine *co;
3041    DiscardCo rwco = {
3042        .child = child,
3043        .offset = offset,
3044        .bytes = bytes,
3045        .ret = NOT_DONE,
3046    };
3047
3048    if (qemu_in_coroutine()) {
3049        /* Fast-path if already in coroutine context */
3050        bdrv_pdiscard_co_entry(&rwco);
3051    } else {
3052        co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
3053        bdrv_coroutine_enter(child->bs, co);
3054        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
3055    }
3056
3057    return rwco.ret;
3058}
3059
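/*
 * Illustrative sketch (offset and length are arbitrary examples): advise the
 * driver that a 64 KiB region is no longer needed:
 *
 *     ret = bdrv_pdiscard(child, 0, 64 * 1024);
 *
 * A return of 0 does not guarantee the blocks were unmapped: discard is
 * advisory, and disabled or unsupported requests are silently ignored.
 */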
3060int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
3061{
3062    BlockDriver *drv = bs->drv;
3063    CoroutineIOCompletion co = {
3064        .coroutine = qemu_coroutine_self(),
3065    };
3066    BlockAIOCB *acb;
3067
3068    bdrv_inc_in_flight(bs);
3069    if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
3070        co.ret = -ENOTSUP;
3071        goto out;
3072    }
3073
3074    if (drv->bdrv_co_ioctl) {
3075        co.ret = drv->bdrv_co_ioctl(bs, req, buf);
3076    } else {
3077        acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
3078        if (!acb) {
3079            co.ret = -ENOTSUP;
3080            goto out;
3081        }
3082        qemu_coroutine_yield();
3083    }
3084out:
3085    bdrv_dec_in_flight(bs);
3086    return co.ret;
3087}
3088
3089void *qemu_blockalign(BlockDriverState *bs, size_t size)
3090{
3091    return qemu_memalign(bdrv_opt_mem_align(bs), size);
3092}
3093
3094void *qemu_blockalign0(BlockDriverState *bs, size_t size)
3095{
3096    return memset(qemu_blockalign(bs, size), 0, size);
3097}
3098
3099void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
3100{
3101    size_t align = bdrv_opt_mem_align(bs);
3102
3103    /* Ensure that NULL is never returned on success */
3104    assert(align > 0);
3105    if (size == 0) {
3106        size = align;
3107    }
3108
3109    return qemu_try_memalign(align, size);
3110}
3111
3112void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
3113{
3114    void *mem = qemu_try_blockalign(bs, size);
3115
3116    if (mem) {
3117        memset(mem, 0, size);
3118    }
3119
3120    return mem;
3121}
3122
3123/*
3124 * Check if all memory in this vector is aligned to bdrv_min_mem_align(bs).
3125 */
3126bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
3127{
3128    int i;
3129    size_t alignment = bdrv_min_mem_align(bs);
3130
3131    for (i = 0; i < qiov->niov; i++) {
3132        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
3133            return false;
3134        }
3135        if (qiov->iov[i].iov_len % alignment) {
3136            return false;
3137        }
3138    }
3139
3140    return true;
3141}
3142
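/*
 * Illustrative sketch: buffers obtained from qemu_blockalign() (or the try_
 * variants above) are aligned to bdrv_opt_mem_align(), which in practice
 * satisfies the base-address check in bdrv_qiov_is_aligned(); the length
 * check additionally wants a multiple of bdrv_min_mem_align():
 *
 *     void *buf = qemu_blockalign(bs, len);
 *     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, len);
 *     ... bdrv_qiov_is_aligned(bs, &qiov) holds if len is suitably aligned ...
 */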
3143void bdrv_add_before_write_notifier(BlockDriverState *bs,
3144                                    NotifierWithReturn *notifier)
3145{
3146    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
3147}
3148
3149void bdrv_io_plug(BlockDriverState *bs)
3150{
3151    BdrvChild *child;
3152
3153    QLIST_FOREACH(child, &bs->children, next) {
3154        bdrv_io_plug(child->bs);
3155    }
3156
3157    if (atomic_fetch_inc(&bs->io_plugged) == 0) {
3158        BlockDriver *drv = bs->drv;
3159        if (drv && drv->bdrv_io_plug) {
3160            drv->bdrv_io_plug(bs);
3161        }
3162    }
3163}
3164
3165void bdrv_io_unplug(BlockDriverState *bs)
3166{
3167    BdrvChild *child;
3168
3169    assert(bs->io_plugged);
3170    if (atomic_fetch_dec(&bs->io_plugged) == 1) {
3171        BlockDriver *drv = bs->drv;
3172        if (drv && drv->bdrv_io_unplug) {
3173            drv->bdrv_io_unplug(bs);
3174        }
3175    }
3176
3177    QLIST_FOREACH(child, &bs->children, next) {
3178        bdrv_io_unplug(child->bs);
3179    }
3180}
3181
3182void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
3183{
3184    BdrvChild *child;
3185
3186    if (bs->drv && bs->drv->bdrv_register_buf) {
3187        bs->drv->bdrv_register_buf(bs, host, size);
3188    }
3189    QLIST_FOREACH(child, &bs->children, next) {
3190        bdrv_register_buf(child->bs, host, size);
3191    }
3192}
3193
3194void bdrv_unregister_buf(BlockDriverState *bs, void *host)
3195{
3196    BdrvChild *child;
3197
3198    if (bs->drv && bs->drv->bdrv_unregister_buf) {
3199        bs->drv->bdrv_unregister_buf(bs, host);
3200    }
3201    QLIST_FOREACH(child, &bs->children, next) {
3202        bdrv_unregister_buf(child->bs, host);
3203    }
3204}
3205
3206static int coroutine_fn bdrv_co_copy_range_internal(
3207        BdrvChild *src, uint64_t src_offset, BdrvChild *dst,
3208        uint64_t dst_offset, uint64_t bytes,
3209        BdrvRequestFlags read_flags, BdrvRequestFlags write_flags,
3210        bool recurse_src)
3211{
3212    BdrvTrackedRequest req;
3213    int ret;
3214
3215    /* TODO We can support BDRV_REQ_NO_FALLBACK here */
3216    assert(!(read_flags & BDRV_REQ_NO_FALLBACK));
3217    assert(!(write_flags & BDRV_REQ_NO_FALLBACK));
3218
3219    if (!dst || !dst->bs) {
3220        return -ENOMEDIUM;
3221    }
3222    ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes);
3223    if (ret) {
3224        return ret;
3225    }
3226    if (write_flags & BDRV_REQ_ZERO_WRITE) {
3227        return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags);
3228    }
3229
3230    if (!src || !src->bs) {
3231        return -ENOMEDIUM;
3232    }
3233    ret = bdrv_check_byte_request(src->bs, src_offset, bytes);
3234    if (ret) {
3235        return ret;
3236    }
3237
3238    if (!src->bs->drv->bdrv_co_copy_range_from
3239        || !dst->bs->drv->bdrv_co_copy_range_to
3240        || src->bs->encrypted || dst->bs->encrypted) {
3241        return -ENOTSUP;
3242    }
3243
3244    if (recurse_src) {
3245        bdrv_inc_in_flight(src->bs);
3246        tracked_request_begin(&req, src->bs, src_offset, bytes,
3247                              BDRV_TRACKED_READ);
3248
3249        /* BDRV_REQ_SERIALISING is only for write operation */
3250        assert(!(read_flags & BDRV_REQ_SERIALISING));
3251        if (!(read_flags & BDRV_REQ_NO_SERIALISING)) {
3252            bdrv_wait_serialising_requests(&req);
3253        }
3254
3255        ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
3256                                                    src, src_offset,
3257                                                    dst, dst_offset,
3258                                                    bytes,
3259                                                    read_flags, write_flags);
3260
3261        tracked_request_end(&req);
3262        bdrv_dec_in_flight(src->bs);
3263    } else {
3264        bdrv_inc_in_flight(dst->bs);
3265        tracked_request_begin(&req, dst->bs, dst_offset, bytes,
3266                              BDRV_TRACKED_WRITE);
3267        ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req,
3268                                        write_flags);
3269        if (!ret) {
3270            ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
3271                                                      src, src_offset,
3272                                                      dst, dst_offset,
3273                                                      bytes,
3274                                                      read_flags, write_flags);
3275        }
3276        bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret);
3277        tracked_request_end(&req);
3278        bdrv_dec_in_flight(dst->bs);
3279    }
3280
3281    return ret;
3282}
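
/*
 * Hedged driver-side sketch (illustrative only): the offloaded path above is
 * taken only when both drivers provide bdrv_co_copy_range_from/_to.  A simple
 * pass-through filter could satisfy that by forwarding to its bs->file child,
 * roughly as in this hypothetical hook (the _to hook would mirror it).
 */
static int coroutine_fn example_co_copy_range_from(BlockDriverState *bs,
                                                   BdrvChild *src,
                                                   uint64_t src_offset,
                                                   BdrvChild *dst,
                                                   uint64_t dst_offset,
                                                   uint64_t bytes,
                                                   BdrvRequestFlags read_flags,
                                                   BdrvRequestFlags write_flags)
{
    /* Re-enter the generic path one level down, with bs->file as the source */
    return bdrv_co_copy_range_from(bs->file, src_offset, dst, dst_offset,
                                   bytes, read_flags, write_flags);
}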
3283
3284/* Copy range from @src to @dst.
3285 *
3286 * See the documentation of bdrv_co_copy_range() for the parameter and
3287 * return value semantics. */
3288int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset,
3289                                         BdrvChild *dst, uint64_t dst_offset,
3290                                         uint64_t bytes,
3291                                         BdrvRequestFlags read_flags,
3292                                         BdrvRequestFlags write_flags)
3293{
3294    trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes,
3295                                  read_flags, write_flags);
3296    return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
3297                                       bytes, read_flags, write_flags, true);
3298}
3299
3300/* Copy range from @src to @dst.
3301 *
3302 * See the documentation of bdrv_co_copy_range() for the parameter and
3303 * return value semantics. */
3304int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset,
3305                                       BdrvChild *dst, uint64_t dst_offset,
3306                                       uint64_t bytes,
3307                                       BdrvRequestFlags read_flags,
3308                                       BdrvRequestFlags write_flags)
3309{
3310    trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3311                                read_flags, write_flags);
3312    return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
3313                                       bytes, read_flags, write_flags, false);
3314}
3315
3316int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset,
3317                                    BdrvChild *dst, uint64_t dst_offset,
3318                                    uint64_t bytes, BdrvRequestFlags read_flags,
3319                                    BdrvRequestFlags write_flags)
3320{
3321    return bdrv_co_copy_range_from(src, src_offset,
3322                                   dst, dst_offset,
3323                                   bytes, read_flags, write_flags);
3324}
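
/*
 * Hedged caller-side sketch (not part of io.c): from coroutine context, try
 * the offloaded copy first and fall back to a bounce-buffer read/write when
 * the drivers involved return -ENOTSUP.  The fallback is deliberately naive
 * and assumes 'bytes' fits into a single request.
 */
static int coroutine_fn example_copy_block(BdrvChild *src, BdrvChild *dst,
                                           int64_t offset, unsigned int bytes)
{
    void *buf;
    int ret = bdrv_co_copy_range(src, offset, dst, offset, bytes, 0, 0);

    if (ret != -ENOTSUP) {
        return ret;
    }

    /* Fallback: read into a temporary buffer, then write it back out */
    buf = qemu_blockalign(src->bs, bytes);
    {
        QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);

        ret = bdrv_co_preadv(src, offset, bytes, &qiov, 0);
        if (ret >= 0) {
            ret = bdrv_co_pwritev(dst, offset, bytes, &qiov, 0);
        }
    }
    qemu_vfree(buf);
    return ret;
}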
3325
3326static void bdrv_parent_cb_resize(BlockDriverState *bs)
3327{
3328    BdrvChild *c;
3329    QLIST_FOREACH(c, &bs->parents, next_parent) {
3330        if (c->role->resize) {
3331            c->role->resize(c);
3332        }
3333    }
3334}
3335
3336/**
3337 * Truncate file to 'offset' bytes (needed only for file protocols)
3338 *
3339 * If 'exact' is true, the file must be resized to exactly the given
3340 * 'offset'.  Otherwise, it is sufficient for the node to be at least
3341 * 'offset' bytes in length.
3342 */
3343int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
3344                                  PreallocMode prealloc, Error **errp)
3345{
3346    BlockDriverState *bs = child->bs;
3347    BlockDriver *drv = bs->drv;
3348    BdrvTrackedRequest req;
3349    int64_t old_size, new_bytes;
3350    int ret;
3351
3352
3353    /* If bs->drv == NULL, bs is closed, so it cannot be truncated */
3354    if (!drv) {
3355        error_setg(errp, "No medium inserted");
3356        return -ENOMEDIUM;
3357    }
3358    if (offset < 0) {
3359        error_setg(errp, "Image size cannot be negative");
3360        return -EINVAL;
3361    }
3362
3363    old_size = bdrv_getlength(bs);
3364    if (old_size < 0) {
3365        error_setg_errno(errp, -old_size, "Failed to get old image size");
3366        return old_size;
3367    }
3368
3369    if (offset > old_size) {
3370        new_bytes = offset - old_size;
3371    } else {
3372        new_bytes = 0;
3373    }
3374
3375    bdrv_inc_in_flight(bs);
3376    tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
3377                          BDRV_TRACKED_TRUNCATE);
3378
3379    /* If we are growing the image and potentially using preallocation for the
3380     * new area, we need to make sure that no write requests are made to it
3381     * concurrently or they might be overwritten by preallocation. */
3382    if (new_bytes) {
3383        bdrv_mark_request_serialising(&req, 1);
3384    }
3385    if (bs->read_only) {
3386        error_setg(errp, "Image is read-only");
3387        ret = -EACCES;
3388        goto out;
3389    }
3390    ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req,
3391                                    0);
3392    if (ret < 0) {
3393        error_setg_errno(errp, -ret,
3394                         "Failed to prepare request for truncation");
3395        goto out;
3396    }
3397
3398    if (drv->bdrv_co_truncate) {
3399        ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, errp);
3400    } else if (bs->file && drv->is_filter) {
3401        ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, errp);
3402    } else {
3403        error_setg(errp, "Image format driver does not support resize");
3404        ret = -ENOTSUP;
3405        goto out;
3406    }
3407    if (ret < 0) {
3408        goto out;
3409    }
3410
3411    ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3412    if (ret < 0) {
3413        error_setg_errno(errp, -ret, "Could not refresh total sector count");
3414    } else {
3415        offset = bs->total_sectors * BDRV_SECTOR_SIZE;
3416    }
3417    /* It's possible that truncation succeeded while refresh_total_sectors
3418     * failed; the latter does not affect how we should finish the request.
3419     * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. */
3420    bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0);
3421
3422out:
3423    tracked_request_end(&req);
3424    bdrv_dec_in_flight(bs);
3425
3426    return ret;
3427}
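
/*
 * Hedged driver-side sketch (illustrative only): the .bdrv_co_truncate hook
 * dispatched above.  This hypothetical implementation accepts only
 * PREALLOC_MODE_OFF; real drivers (file-posix, qcow2, ...) also have to
 * grow or shrink the underlying storage.
 */
static int coroutine_fn example_co_truncate(BlockDriverState *bs,
                                            int64_t offset, bool exact,
                                            PreallocMode prealloc, Error **errp)
{
    if (prealloc != PREALLOC_MODE_OFF) {
        error_setg(errp, "Unsupported preallocation mode '%s'",
                   PreallocMode_str(prealloc));
        return -ENOTSUP;
    }

    /*
     * A real driver would resize its backing storage here; with 'exact' being
     * false it may round the size up, e.g. to its cluster size.
     */
    bs->total_sectors = DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE);
    return 0;
}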
3428
3429typedef struct TruncateCo {
3430    BdrvChild *child;
3431    int64_t offset;
3432    bool exact;
3433    PreallocMode prealloc;
3434    Error **errp;
3435    int ret;
3436} TruncateCo;
3437
3438static void coroutine_fn bdrv_truncate_co_entry(void *opaque)
3439{
3440    TruncateCo *tco = opaque;
3441    tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->exact,
3442                                tco->prealloc, tco->errp);
3443    aio_wait_kick();
3444}
3445
3446int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
3447                  PreallocMode prealloc, Error **errp)
3448{
3449    Coroutine *co;
3450    TruncateCo tco = {
3451        .child      = child,
3452        .offset     = offset,
3453        .exact      = exact,
3454        .prealloc   = prealloc,
3455        .errp       = errp,
3456        .ret        = NOT_DONE,
3457    };
3458
3459    if (qemu_in_coroutine()) {
3460        /* Fast-path if already in coroutine context */
3461        bdrv_truncate_co_entry(&tco);
3462    } else {
3463        co = qemu_coroutine_create(bdrv_truncate_co_entry, &tco);
3464        bdrv_coroutine_enter(child->bs, co);
3465        BDRV_POLL_WHILE(child->bs, tco.ret == NOT_DONE);
3466    }
3467
3468    return tco.ret;
3469}
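
/*
 * Hedged usage sketch (not part of io.c): growing an image from synchronous,
 * non-coroutine code.  bdrv_truncate() spawns the coroutine above and polls
 * until it finishes, so callers may treat it as a blocking call.
 */
static int example_grow_image(BdrvChild *child, int64_t new_size, Error **errp)
{
    /* exact=false: the node only needs to end up at least new_size bytes */
    return bdrv_truncate(child, new_size, false, PREALLOC_MODE_OFF, errp);
}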
3470