qemu/block/io.c
   1/*
   2 * Block layer I/O functions
   3 *
   4 * Copyright (c) 2003 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "trace.h"
  27#include "sysemu/block-backend.h"
  28#include "block/aio-wait.h"
  29#include "block/blockjob.h"
  30#include "block/blockjob_int.h"
  31#include "block/block_int.h"
  32#include "qemu/cutils.h"
  33#include "qapi/error.h"
  34#include "qemu/error-report.h"
  35#include "qemu/main-loop.h"
  36#include "sysemu/replay.h"
  37
  38#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
  39
  40/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
  41#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
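
/*
 * For reference: with BDRV_SECTOR_BITS == 9 (512-byte sectors) this works
 * out to 32768 << 9 = 16 MiB, i.e. copy-on-read and emulated write-zeroes
 * never bounce more than 16 MiB at a time.
 */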
  42
  43static void bdrv_parent_cb_resize(BlockDriverState *bs);
  44static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
  45    int64_t offset, int bytes, BdrvRequestFlags flags);
  46
  47static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
  48                                      bool ignore_bds_parents)
  49{
  50    BdrvChild *c, *next;
  51
  52    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
  53        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
  54            continue;
  55        }
  56        bdrv_parent_drained_begin_single(c, false);
  57    }
  58}
  59
  60static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c,
  61                                                   int *drained_end_counter)
  62{
  63    assert(c->parent_quiesce_counter > 0);
  64    c->parent_quiesce_counter--;
  65    if (c->role->drained_end) {
  66        c->role->drained_end(c, drained_end_counter);
  67    }
  68}
  69
  70void bdrv_parent_drained_end_single(BdrvChild *c)
  71{
  72    int drained_end_counter = 0;
  73    bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter);
  74    BDRV_POLL_WHILE(c->bs, atomic_read(&drained_end_counter) > 0);
  75}
  76
  77static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
  78                                    bool ignore_bds_parents,
  79                                    int *drained_end_counter)
  80{
  81    BdrvChild *c;
  82
  83    QLIST_FOREACH(c, &bs->parents, next_parent) {
  84        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
  85            continue;
  86        }
  87        bdrv_parent_drained_end_single_no_poll(c, drained_end_counter);
  88    }
  89}
  90
  91static bool bdrv_parent_drained_poll_single(BdrvChild *c)
  92{
  93    if (c->role->drained_poll) {
  94        return c->role->drained_poll(c);
  95    }
  96    return false;
  97}
  98
  99static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
 100                                     bool ignore_bds_parents)
 101{
 102    BdrvChild *c, *next;
 103    bool busy = false;
 104
 105    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
 106        if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
 107            continue;
 108        }
 109        busy |= bdrv_parent_drained_poll_single(c);
 110    }
 111
 112    return busy;
 113}
 114
 115void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
 116{
 117    c->parent_quiesce_counter++;
 118    if (c->role->drained_begin) {
 119        c->role->drained_begin(c);
 120    }
 121    if (poll) {
 122        BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
 123    }
 124}
 125
 126static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
 127{
 128    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
 129    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
 130    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
 131                                 src->opt_mem_alignment);
 132    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
 133                                 src->min_mem_alignment);
 134    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
 135}
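
/*
 * Note on the merge above: a value of 0 in these BlockLimits fields means
 * "no limit" / "no preference", which is why MIN_NON_ZERO() is used instead
 * of a plain MIN().  A sketch, assuming a parent with no max_transfer limit
 * and a child capped at 1 MiB:
 *
 *     dst->max_transfer = 0;                      no limit
 *     src->max_transfer = 1 * MiB;
 *     MIN_NON_ZERO(0, 1 * MiB) == 1 * MiB         child limit is kept
 *
 * A plain MIN() would pick 0 and silently drop the child's restriction.
 */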
 136
 137void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
 138{
 139    BlockDriver *drv = bs->drv;
 140    Error *local_err = NULL;
 141
 142    memset(&bs->bl, 0, sizeof(bs->bl));
 143
 144    if (!drv) {
 145        return;
 146    }
 147
 148    /* Default alignment based on whether driver has byte interface */
 149    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
 150                                drv->bdrv_aio_preadv ||
 151                                drv->bdrv_co_preadv_part) ? 1 : 512;
 152
 153    /* Take some limits from the children as a default */
 154    if (bs->file) {
 155        bdrv_refresh_limits(bs->file->bs, &local_err);
 156        if (local_err) {
 157            error_propagate(errp, local_err);
 158            return;
 159        }
 160        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
 161    } else {
 162        bs->bl.min_mem_alignment = 512;
 163        bs->bl.opt_mem_alignment = qemu_real_host_page_size;
 164
 165        /* Safe default since most protocols use readv()/writev()/etc */
 166        bs->bl.max_iov = IOV_MAX;
 167    }
 168
 169    if (bs->backing) {
 170        bdrv_refresh_limits(bs->backing->bs, &local_err);
 171        if (local_err) {
 172            error_propagate(errp, local_err);
 173            return;
 174        }
 175        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
 176    }
 177
 178    /* Then let the driver override it */
 179    if (drv->bdrv_refresh_limits) {
 180        drv->bdrv_refresh_limits(bs, errp);
 181    }
 182}
 183
 184/**
 185 * The copy-on-read flag is actually a reference count so multiple users may
 186 * use the feature without worrying about clobbering its previous state.
 187 * Copy-on-read stays enabled until all users have called to disable it.
 188 */
 189void bdrv_enable_copy_on_read(BlockDriverState *bs)
 190{
 191    atomic_inc(&bs->copy_on_read);
 192}
 193
 194void bdrv_disable_copy_on_read(BlockDriverState *bs)
 195{
 196    int old = atomic_fetch_dec(&bs->copy_on_read);
 197    assert(old >= 1);
 198}
 199
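/*
 * Illustrative sketch (hypothetical caller): since the flag is a reference
 * count, every enable must be balanced by exactly one disable:
 *
 *     bdrv_enable_copy_on_read(bs);
 *     ... reads issued here also populate the top layer ...
 *     bdrv_disable_copy_on_read(bs);
 *
 * Copy-on-read only stops once the last user has disabled it and the
 * counter is back to zero.
 */
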
 200typedef struct {
 201    Coroutine *co;
 202    BlockDriverState *bs;
 203    bool done;
 204    bool begin;
 205    bool recursive;
 206    bool poll;
 207    BdrvChild *parent;
 208    bool ignore_bds_parents;
 209    int *drained_end_counter;
 210} BdrvCoDrainData;
 211
 212static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
 213{
 214    BdrvCoDrainData *data = opaque;
 215    BlockDriverState *bs = data->bs;
 216
 217    if (data->begin) {
 218        bs->drv->bdrv_co_drain_begin(bs);
 219    } else {
 220        bs->drv->bdrv_co_drain_end(bs);
 221    }
 222
 223    /* Set data->done and decrement drained_end_counter before bdrv_wakeup() */
 224    atomic_mb_set(&data->done, true);
 225    if (!data->begin) {
 226        atomic_dec(data->drained_end_counter);
 227    }
 228    bdrv_dec_in_flight(bs);
 229
 230    g_free(data);
 231}
 232
 233/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
 234static void bdrv_drain_invoke(BlockDriverState *bs, bool begin,
 235                              int *drained_end_counter)
 236{
 237    BdrvCoDrainData *data;
 238
 239    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
 240            (!begin && !bs->drv->bdrv_co_drain_end)) {
 241        return;
 242    }
 243
 244    data = g_new(BdrvCoDrainData, 1);
 245    *data = (BdrvCoDrainData) {
 246        .bs = bs,
 247        .done = false,
 248        .begin = begin,
 249        .drained_end_counter = drained_end_counter,
 250    };
 251
 252    if (!begin) {
 253        atomic_inc(drained_end_counter);
 254    }
 255
 256    /* Make sure the driver callback completes during the polling phase for
 257     * drain_begin. */
 258    bdrv_inc_in_flight(bs);
 259    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
 260    aio_co_schedule(bdrv_get_aio_context(bs), data->co);
 261}
 262
 263/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
 264bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
 265                     BdrvChild *ignore_parent, bool ignore_bds_parents)
 266{
 267    BdrvChild *child, *next;
 268
 269    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
 270        return true;
 271    }
 272
 273    if (atomic_read(&bs->in_flight)) {
 274        return true;
 275    }
 276
 277    if (recursive) {
 278        assert(!ignore_bds_parents);
 279        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
 280            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
 281                return true;
 282            }
 283        }
 284    }
 285
 286    return false;
 287}
 288
 289static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
 290                                      BdrvChild *ignore_parent)
 291{
 292    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
 293}
 294
 295static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
 296                                  BdrvChild *parent, bool ignore_bds_parents,
 297                                  bool poll);
 298static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
 299                                BdrvChild *parent, bool ignore_bds_parents,
 300                                int *drained_end_counter);
 301
 302static void bdrv_co_drain_bh_cb(void *opaque)
 303{
 304    BdrvCoDrainData *data = opaque;
 305    Coroutine *co = data->co;
 306    BlockDriverState *bs = data->bs;
 307
 308    if (bs) {
 309        AioContext *ctx = bdrv_get_aio_context(bs);
 310        AioContext *co_ctx = qemu_coroutine_get_aio_context(co);
 311
 312        /*
 313         * When the coroutine yielded, the lock for its home context was
 314         * released, so we need to re-acquire it here. If it explicitly
 315         * acquired a different context, the lock is still held and we don't
 316         * want to lock it a second time (or AIO_WAIT_WHILE() would hang).
 317         */
 318        if (ctx == co_ctx) {
 319            aio_context_acquire(ctx);
 320        }
 321        bdrv_dec_in_flight(bs);
 322        if (data->begin) {
 323            assert(!data->drained_end_counter);
 324            bdrv_do_drained_begin(bs, data->recursive, data->parent,
 325                                  data->ignore_bds_parents, data->poll);
 326        } else {
 327            assert(!data->poll);
 328            bdrv_do_drained_end(bs, data->recursive, data->parent,
 329                                data->ignore_bds_parents,
 330                                data->drained_end_counter);
 331        }
 332        if (ctx == co_ctx) {
 333            aio_context_release(ctx);
 334        }
 335    } else {
 336        assert(data->begin);
 337        bdrv_drain_all_begin();
 338    }
 339
 340    data->done = true;
 341    aio_co_wake(co);
 342}
 343
 344static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
 345                                                bool begin, bool recursive,
 346                                                BdrvChild *parent,
 347                                                bool ignore_bds_parents,
 348                                                bool poll,
 349                                                int *drained_end_counter)
 350{
 351    BdrvCoDrainData data;
 352
 353    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
 354     * other coroutines run if they were queued by aio_co_enter(). */
 355
 356    assert(qemu_in_coroutine());
 357    data = (BdrvCoDrainData) {
 358        .co = qemu_coroutine_self(),
 359        .bs = bs,
 360        .done = false,
 361        .begin = begin,
 362        .recursive = recursive,
 363        .parent = parent,
 364        .ignore_bds_parents = ignore_bds_parents,
 365        .poll = poll,
 366        .drained_end_counter = drained_end_counter,
 367    };
 368
 369    if (bs) {
 370        bdrv_inc_in_flight(bs);
 371    }
 372    replay_bh_schedule_oneshot_event(bdrv_get_aio_context(bs),
 373                                     bdrv_co_drain_bh_cb, &data);
 374
 375    qemu_coroutine_yield();
 376    /* If we are resumed from some other event (such as an aio completion or a
 377     * timer callback), it is a bug in the caller that should be fixed. */
 378    assert(data.done);
 379}
 380
 381void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
 382                                   BdrvChild *parent, bool ignore_bds_parents)
 383{
 384    assert(!qemu_in_coroutine());
 385
 386    /* Stop things in parent-to-child order */
 387    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
 388        aio_disable_external(bdrv_get_aio_context(bs));
 389    }
 390
 391    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
 392    bdrv_drain_invoke(bs, true, NULL);
 393}
 394
 395static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
 396                                  BdrvChild *parent, bool ignore_bds_parents,
 397                                  bool poll)
 398{
 399    BdrvChild *child, *next;
 400
 401    if (qemu_in_coroutine()) {
 402        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
 403                               poll, NULL);
 404        return;
 405    }
 406
 407    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);
 408
 409    if (recursive) {
 410        assert(!ignore_bds_parents);
 411        bs->recursive_quiesce_counter++;
 412        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
 413            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
 414                                  false);
 415        }
 416    }
 417
 418    /*
 419     * Wait for drained requests to finish.
 420     *
 421     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
 422     * call is needed so things in this AioContext can make progress even
 423     * though we don't return to the main AioContext loop - this automatically
 424     * includes other nodes in the same AioContext and therefore all child
 425     * nodes.
 426     */
 427    if (poll) {
 428        assert(!ignore_bds_parents);
 429        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
 430    }
 431}
 432
 433void bdrv_drained_begin(BlockDriverState *bs)
 434{
 435    bdrv_do_drained_begin(bs, false, NULL, false, true);
 436}
 437
 438void bdrv_subtree_drained_begin(BlockDriverState *bs)
 439{
 440    bdrv_do_drained_begin(bs, true, NULL, false, true);
 441}
 442
 443/**
 444 * This function does not poll, nor must any of its recursively called
 445 * functions.  The *drained_end_counter pointee will be incremented
 446 * once for every background operation scheduled, and decremented once
 447 * the operation settles.  Therefore, the pointer must remain valid
 448 * until the pointee reaches 0.  That implies that whoever sets up the
 449 * pointee has to poll until it is 0.
 450 *
 451 * We use atomic operations to access *drained_end_counter, because
 452 * (1) when called from bdrv_set_aio_context_ignore(), the subgraph of
 453 *     @bs may contain nodes in different AioContexts,
 454 * (2) bdrv_drain_all_end() uses the same counter for all nodes,
 455 *     regardless of which AioContext they are in.
 456 */
 457static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
 458                                BdrvChild *parent, bool ignore_bds_parents,
 459                                int *drained_end_counter)
 460{
 461    BdrvChild *child;
 462    int old_quiesce_counter;
 463
 464    assert(drained_end_counter != NULL);
 465
 466    if (qemu_in_coroutine()) {
 467        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
 468                               false, drained_end_counter);
 469        return;
 470    }
 471    assert(bs->quiesce_counter > 0);
 472
 473    /* Re-enable things in child-to-parent order */
 474    bdrv_drain_invoke(bs, false, drained_end_counter);
 475    bdrv_parent_drained_end(bs, parent, ignore_bds_parents,
 476                            drained_end_counter);
 477
 478    old_quiesce_counter = atomic_fetch_dec(&bs->quiesce_counter);
 479    if (old_quiesce_counter == 1) {
 480        aio_enable_external(bdrv_get_aio_context(bs));
 481    }
 482
 483    if (recursive) {
 484        assert(!ignore_bds_parents);
 485        bs->recursive_quiesce_counter--;
 486        QLIST_FOREACH(child, &bs->children, next) {
 487            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents,
 488                                drained_end_counter);
 489        }
 490    }
 491}
 492
 493void bdrv_drained_end(BlockDriverState *bs)
 494{
 495    int drained_end_counter = 0;
 496    bdrv_do_drained_end(bs, false, NULL, false, &drained_end_counter);
 497    BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0);
 498}
 499
 500void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter)
 501{
 502    bdrv_do_drained_end(bs, false, NULL, false, drained_end_counter);
 503}
 504
 505void bdrv_subtree_drained_end(BlockDriverState *bs)
 506{
 507    int drained_end_counter = 0;
 508    bdrv_do_drained_end(bs, true, NULL, false, &drained_end_counter);
 509    BDRV_POLL_WHILE(bs, atomic_read(&drained_end_counter) > 0);
 510}
 511
 512void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
 513{
 514    int i;
 515
 516    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
 517        bdrv_do_drained_begin(child->bs, true, child, false, true);
 518    }
 519}
 520
 521void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
 522{
 523    int drained_end_counter = 0;
 524    int i;
 525
 526    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
 527        bdrv_do_drained_end(child->bs, true, child, false,
 528                            &drained_end_counter);
 529    }
 530
 531    BDRV_POLL_WHILE(child->bs, atomic_read(&drained_end_counter) > 0);
 532}
 533
 534/*
 535 * Wait for pending requests to complete on a single BlockDriverState subtree,
  536 * and suspend the block driver's internal I/O until the next request arrives.
 537 *
 538 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 539 * AioContext.
 540 */
 541void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
 542{
 543    assert(qemu_in_coroutine());
 544    bdrv_drained_begin(bs);
 545    bdrv_drained_end(bs);
 546}
 547
 548void bdrv_drain(BlockDriverState *bs)
 549{
 550    bdrv_drained_begin(bs);
 551    bdrv_drained_end(bs);
 552}
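
/*
 * Illustrative sketch (hypothetical caller): the drained section is used as
 * a bracket around work that must not race with in-flight I/O:
 *
 *     bdrv_drained_begin(bs);
 *     ... reconfigure the graph or other state touched by requests ...
 *     bdrv_drained_end(bs);
 *
 * The begin/end pairs nest via bs->quiesce_counter, so callers need not
 * know whether the node is already drained.
 */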
 553
 554static void bdrv_drain_assert_idle(BlockDriverState *bs)
 555{
 556    BdrvChild *child, *next;
 557
 558    assert(atomic_read(&bs->in_flight) == 0);
 559    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
 560        bdrv_drain_assert_idle(child->bs);
 561    }
 562}
 563
 564unsigned int bdrv_drain_all_count = 0;
 565
 566static bool bdrv_drain_all_poll(void)
 567{
 568    BlockDriverState *bs = NULL;
 569    bool result = false;
 570
 571    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
 572     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
 573    while ((bs = bdrv_next_all_states(bs))) {
 574        AioContext *aio_context = bdrv_get_aio_context(bs);
 575        aio_context_acquire(aio_context);
 576        result |= bdrv_drain_poll(bs, false, NULL, true);
 577        aio_context_release(aio_context);
 578    }
 579
 580    return result;
 581}
 582
 583/*
 584 * Wait for pending requests to complete across all BlockDriverStates
 585 *
 586 * This function does not flush data to disk, use bdrv_flush_all() for that
 587 * after calling this function.
 588 *
 589 * This pauses all block jobs and disables external clients. It must
 590 * be paired with bdrv_drain_all_end().
 591 *
 592 * NOTE: no new block jobs or BlockDriverStates can be created between
 593 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 594 */
 595void bdrv_drain_all_begin(void)
 596{
 597    BlockDriverState *bs = NULL;
 598
 599    if (qemu_in_coroutine()) {
 600        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true, NULL);
 601        return;
 602    }
 603
  604    /*
  605     * The bdrv queue is managed by record/replay;
  606     * waiting for the in-flight I/O requests to finish
  607     * may never terminate.
  608     */
 609    if (replay_events_enabled()) {
 610        return;
 611    }
 612
 613    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
 614     * loop AioContext, so make sure we're in the main context. */
 615    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 616    assert(bdrv_drain_all_count < INT_MAX);
 617    bdrv_drain_all_count++;
 618
 619    /* Quiesce all nodes, without polling in-flight requests yet. The graph
 620     * cannot change during this loop. */
 621    while ((bs = bdrv_next_all_states(bs))) {
 622        AioContext *aio_context = bdrv_get_aio_context(bs);
 623
 624        aio_context_acquire(aio_context);
 625        bdrv_do_drained_begin(bs, false, NULL, true, false);
 626        aio_context_release(aio_context);
 627    }
 628
 629    /* Now poll the in-flight requests */
 630    AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());
 631
 632    while ((bs = bdrv_next_all_states(bs))) {
 633        bdrv_drain_assert_idle(bs);
 634    }
 635}
 636
 637void bdrv_drain_all_end(void)
 638{
 639    BlockDriverState *bs = NULL;
 640    int drained_end_counter = 0;
 641
  642    /*
  643     * The bdrv queue is managed by record/replay;
  644     * waiting for the in-flight I/O requests to finish
  645     * may never terminate.
  646     */
 647    if (replay_events_enabled()) {
 648        return;
 649    }
 650
 651    while ((bs = bdrv_next_all_states(bs))) {
 652        AioContext *aio_context = bdrv_get_aio_context(bs);
 653
 654        aio_context_acquire(aio_context);
 655        bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
 656        aio_context_release(aio_context);
 657    }
 658
 659    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 660    AIO_WAIT_WHILE(NULL, atomic_read(&drained_end_counter) > 0);
 661
 662    assert(bdrv_drain_all_count > 0);
 663    bdrv_drain_all_count--;
 664}
 665
 666void bdrv_drain_all(void)
 667{
 668    bdrv_drain_all_begin();
 669    bdrv_drain_all_end();
 670}
 671
 672/**
 673 * Remove an active request from the tracked requests list
 674 *
 675 * This function should be called when a tracked request is completing.
 676 */
 677static void tracked_request_end(BdrvTrackedRequest *req)
 678{
 679    if (req->serialising) {
 680        atomic_dec(&req->bs->serialising_in_flight);
 681    }
 682
 683    qemu_co_mutex_lock(&req->bs->reqs_lock);
 684    QLIST_REMOVE(req, list);
 685    qemu_co_queue_restart_all(&req->wait_queue);
 686    qemu_co_mutex_unlock(&req->bs->reqs_lock);
 687}
 688
 689/**
 690 * Add an active request to the tracked requests list
 691 */
 692static void tracked_request_begin(BdrvTrackedRequest *req,
 693                                  BlockDriverState *bs,
 694                                  int64_t offset,
 695                                  uint64_t bytes,
 696                                  enum BdrvTrackedRequestType type)
 697{
 698    assert(bytes <= INT64_MAX && offset <= INT64_MAX - bytes);
 699
 700    *req = (BdrvTrackedRequest){
 701        .bs = bs,
 702        .offset         = offset,
 703        .bytes          = bytes,
 704        .type           = type,
 705        .co             = qemu_coroutine_self(),
 706        .serialising    = false,
 707        .overlap_offset = offset,
 708        .overlap_bytes  = bytes,
 709    };
 710
 711    qemu_co_queue_init(&req->wait_queue);
 712
 713    qemu_co_mutex_lock(&bs->reqs_lock);
 714    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
 715    qemu_co_mutex_unlock(&bs->reqs_lock);
 716}
 717
 718void bdrv_mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
 719{
 720    int64_t overlap_offset = req->offset & ~(align - 1);
 721    uint64_t overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
 722                               - overlap_offset;
 723
 724    if (!req->serialising) {
 725        atomic_inc(&req->bs->serialising_in_flight);
 726        req->serialising = true;
 727    }
 728
 729    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
 730    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
 731}
 732
 733static bool is_request_serialising_and_aligned(BdrvTrackedRequest *req)
 734{
 735    /*
 736     * If the request is serialising, overlap_offset and overlap_bytes are set,
 737     * so we can check if the request is aligned. Otherwise, don't care and
 738     * return false.
 739     */
 740
 741    return req->serialising && (req->offset == req->overlap_offset) &&
 742           (req->bytes == req->overlap_bytes);
 743}
 744
 745/**
 746 * Return the tracked request on @bs for the current coroutine, or
 747 * NULL if there is none.
 748 */
 749BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
 750{
 751    BdrvTrackedRequest *req;
 752    Coroutine *self = qemu_coroutine_self();
 753
 754    QLIST_FOREACH(req, &bs->tracked_requests, list) {
 755        if (req->co == self) {
 756            return req;
 757        }
 758    }
 759
 760    return NULL;
 761}
 762
 763/**
 764 * Round a region to cluster boundaries
 765 */
 766void bdrv_round_to_clusters(BlockDriverState *bs,
 767                            int64_t offset, int64_t bytes,
 768                            int64_t *cluster_offset,
 769                            int64_t *cluster_bytes)
 770{
 771    BlockDriverInfo bdi;
 772
 773    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
 774        *cluster_offset = offset;
 775        *cluster_bytes = bytes;
 776    } else {
 777        int64_t c = bdi.cluster_size;
 778        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
 779        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
 780    }
 781}
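
/*
 * Worked example for the rounding above, assuming bdi.cluster_size == 65536:
 *
 *     offset = 70000, bytes = 1000
 *     cluster_offset = QEMU_ALIGN_DOWN(70000, 65536)              = 65536
 *     cluster_bytes  = QEMU_ALIGN_UP(70000 - 65536 + 1000, 65536) = 65536
 *
 * i.e. the region is widened to the one cluster that fully contains it.
 */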
 782
 783static int bdrv_get_cluster_size(BlockDriverState *bs)
 784{
 785    BlockDriverInfo bdi;
 786    int ret;
 787
 788    ret = bdrv_get_info(bs, &bdi);
 789    if (ret < 0 || bdi.cluster_size == 0) {
 790        return bs->bl.request_alignment;
 791    } else {
 792        return bdi.cluster_size;
 793    }
 794}
 795
 796static bool tracked_request_overlaps(BdrvTrackedRequest *req,
 797                                     int64_t offset, uint64_t bytes)
 798{
 799    /*        aaaa   bbbb */
 800    if (offset >= req->overlap_offset + req->overlap_bytes) {
 801        return false;
 802    }
 803    /* bbbb   aaaa        */
 804    if (req->overlap_offset >= offset + bytes) {
 805        return false;
 806    }
 807    return true;
 808}
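
/*
 * The two early returns above are the usual half-open interval overlap
 * test: for example, [0, 4096) and [4096, 8192) do not overlap (the first
 * check fires), while [0, 4096) and [4095, 8192) do.
 */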
 809
 810void bdrv_inc_in_flight(BlockDriverState *bs)
 811{
 812    atomic_inc(&bs->in_flight);
 813}
 814
 815void bdrv_wakeup(BlockDriverState *bs)
 816{
 817    aio_wait_kick();
 818}
 819
 820void bdrv_dec_in_flight(BlockDriverState *bs)
 821{
 822    atomic_dec(&bs->in_flight);
 823    bdrv_wakeup(bs);
 824}
 825
 826bool coroutine_fn bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
 827{
 828    BlockDriverState *bs = self->bs;
 829    BdrvTrackedRequest *req;
 830    bool retry;
 831    bool waited = false;
 832
 833    if (!atomic_read(&bs->serialising_in_flight)) {
 834        return false;
 835    }
 836
 837    do {
 838        retry = false;
 839        qemu_co_mutex_lock(&bs->reqs_lock);
 840        QLIST_FOREACH(req, &bs->tracked_requests, list) {
 841            if (req == self || (!req->serialising && !self->serialising)) {
 842                continue;
 843            }
 844            if (tracked_request_overlaps(req, self->overlap_offset,
 845                                         self->overlap_bytes))
 846            {
 847                /* Hitting this means there was a reentrant request, for
 848                 * example, a block driver issuing nested requests.  This must
 849                 * never happen since it means deadlock.
 850                 */
 851                assert(qemu_coroutine_self() != req->co);
 852
 853                /* If the request is already (indirectly) waiting for us, or
 854                 * will wait for us as soon as it wakes up, then just go on
 855                 * (instead of producing a deadlock in the former case). */
 856                if (!req->waiting_for) {
 857                    self->waiting_for = req;
 858                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
 859                    self->waiting_for = NULL;
 860                    retry = true;
 861                    waited = true;
 862                    break;
 863                }
 864            }
 865        }
 866        qemu_co_mutex_unlock(&bs->reqs_lock);
 867    } while (retry);
 868
 869    return waited;
 870}
 871
 872static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
 873                                   size_t size)
 874{
 875    if (size > BDRV_REQUEST_MAX_BYTES) {
 876        return -EIO;
 877    }
 878
 879    if (!bdrv_is_inserted(bs)) {
 880        return -ENOMEDIUM;
 881    }
 882
 883    if (offset < 0) {
 884        return -EIO;
 885    }
 886
 887    return 0;
 888}
 889
 890typedef struct RwCo {
 891    BdrvChild *child;
 892    int64_t offset;
 893    QEMUIOVector *qiov;
 894    bool is_write;
 895    int ret;
 896    BdrvRequestFlags flags;
 897} RwCo;
 898
 899static void coroutine_fn bdrv_rw_co_entry(void *opaque)
 900{
 901    RwCo *rwco = opaque;
 902
 903    if (!rwco->is_write) {
 904        rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
 905                                   rwco->qiov->size, rwco->qiov,
 906                                   rwco->flags);
 907    } else {
 908        rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
 909                                    rwco->qiov->size, rwco->qiov,
 910                                    rwco->flags);
 911    }
 912    aio_wait_kick();
 913}
 914
 915/*
 916 * Process a vectored synchronous request using coroutines
 917 */
 918static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
 919                        QEMUIOVector *qiov, bool is_write,
 920                        BdrvRequestFlags flags)
 921{
 922    Coroutine *co;
 923    RwCo rwco = {
 924        .child = child,
 925        .offset = offset,
 926        .qiov = qiov,
 927        .is_write = is_write,
 928        .ret = NOT_DONE,
 929        .flags = flags,
 930    };
 931
 932    if (qemu_in_coroutine()) {
 933        /* Fast-path if already in coroutine context */
 934        bdrv_rw_co_entry(&rwco);
 935    } else {
 936        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
 937        bdrv_coroutine_enter(child->bs, co);
 938        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
 939    }
 940    return rwco.ret;
 941}
 942
 943int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
 944                       int bytes, BdrvRequestFlags flags)
 945{
 946    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, NULL, bytes);
 947
 948    return bdrv_prwv_co(child, offset, &qiov, true,
 949                        BDRV_REQ_ZERO_WRITE | flags);
 950}
 951
 952/*
 953 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 954 * The operation is sped up by checking the block status and only writing
 955 * zeroes to the device if they currently do not return zeroes. Optional
 956 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 957 * BDRV_REQ_FUA).
 958 *
  959 * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite().
 960 */
 961int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
 962{
 963    int ret;
 964    int64_t target_size, bytes, offset = 0;
 965    BlockDriverState *bs = child->bs;
 966
 967    target_size = bdrv_getlength(bs);
 968    if (target_size < 0) {
 969        return target_size;
 970    }
 971
 972    for (;;) {
 973        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
 974        if (bytes <= 0) {
 975            return 0;
 976        }
 977        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
 978        if (ret < 0) {
 979            return ret;
 980        }
 981        if (ret & BDRV_BLOCK_ZERO) {
 982            offset += bytes;
 983            continue;
 984        }
 985        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
 986        if (ret < 0) {
 987            return ret;
 988        }
 989        offset += bytes;
 990    }
 991}
 992
 993int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
 994{
 995    int ret;
 996
 997    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
 998    if (ret < 0) {
 999        return ret;
1000    }
1001
1002    return qiov->size;
1003}
1004
1005/* See bdrv_pwrite() for the return codes */
1006int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
1007{
1008    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
1009
1010    if (bytes < 0) {
1011        return -EINVAL;
1012    }
1013
1014    return bdrv_preadv(child, offset, &qiov);
1015}
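
/*
 * Illustrative sketch (hypothetical caller, buffer name assumed): a
 * synchronous read through the helper above, e.g. to load a 512-byte header:
 *
 *     uint8_t header[512];
 *     int ret = bdrv_pread(child, 0, header, sizeof(header));
 *     if (ret < 0) {
 *         return ret;
 *     }
 *
 * On success the return value is the number of bytes read (here 512).  The
 * call may be made outside coroutine context because bdrv_prwv_co() creates
 * and polls a coroutine internally.
 */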
1016
1017int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
1018{
1019    int ret;
1020
1021    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
1022    if (ret < 0) {
1023        return ret;
1024    }
1025
1026    return qiov->size;
1027}
1028
 1029/* Return the number of bytes on success or < 0 on error. Important errors are:
 1030  -EIO         generic I/O error (may happen for all errors)
 1031  -ENOMEDIUM   no media inserted
 1032  -EINVAL      invalid offset or number of bytes
 1033  -EACCES      trying to write to a read-only device
 1034*/
1035int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
1036{
1037    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
1038
1039    if (bytes < 0) {
1040        return -EINVAL;
1041    }
1042
1043    return bdrv_pwritev(child, offset, &qiov);
1044}
1045
1046/*
1047 * Writes to the file and ensures that no writes are reordered across this
1048 * request (acts as a barrier)
1049 *
1050 * Returns 0 on success, -errno in error cases.
1051 */
1052int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
1053                     const void *buf, int count)
1054{
1055    int ret;
1056
1057    ret = bdrv_pwrite(child, offset, buf, count);
1058    if (ret < 0) {
1059        return ret;
1060    }
1061
1062    ret = bdrv_flush(child->bs);
1063    if (ret < 0) {
1064        return ret;
1065    }
1066
1067    return 0;
1068}
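
/*
 * Illustrative sketch (hypothetical image-format code): the barrier above
 * is typically used for metadata that later writes depend on:
 *
 *     ret = bdrv_pwrite_sync(bs->file, header_offset, &header,
 *                            sizeof(header));
 *     if (ret < 0) {
 *         return ret;
 *     }
 *     ... only now issue writes that assume the header update is persistent ...
 */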
1069
1070typedef struct CoroutineIOCompletion {
1071    Coroutine *coroutine;
1072    int ret;
1073} CoroutineIOCompletion;
1074
1075static void bdrv_co_io_em_complete(void *opaque, int ret)
1076{
1077    CoroutineIOCompletion *co = opaque;
1078
1079    co->ret = ret;
1080    aio_co_wake(co->coroutine);
1081}
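
/*
 * The helper above bridges callback-style AIO into coroutine context.  The
 * callers below all follow the same pattern (sketch):
 *
 *     CoroutineIOCompletion co = {
 *         .coroutine = qemu_coroutine_self(),
 *     };
 *
 *     acb = drv->bdrv_aio_preadv(..., bdrv_co_io_em_complete, &co);
 *     if (acb) {
 *         qemu_coroutine_yield();     yields until the callback fires
 *         ret = co.ret;               result stored by bdrv_co_io_em_complete()
 *     }
 *
 * The callback records the result and re-enters the yielded coroutine via
 * aio_co_wake().
 */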
1082
1083static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
1084                                           uint64_t offset, uint64_t bytes,
1085                                           QEMUIOVector *qiov,
1086                                           size_t qiov_offset, int flags)
1087{
1088    BlockDriver *drv = bs->drv;
1089    int64_t sector_num;
1090    unsigned int nb_sectors;
1091    QEMUIOVector local_qiov;
1092    int ret;
1093
1094    assert(!(flags & ~BDRV_REQ_MASK));
1095    assert(!(flags & BDRV_REQ_NO_FALLBACK));
1096
1097    if (!drv) {
1098        return -ENOMEDIUM;
1099    }
1100
1101    if (drv->bdrv_co_preadv_part) {
1102        return drv->bdrv_co_preadv_part(bs, offset, bytes, qiov, qiov_offset,
1103                                        flags);
1104    }
1105
1106    if (qiov_offset > 0 || bytes != qiov->size) {
1107        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
1108        qiov = &local_qiov;
1109    }
1110
1111    if (drv->bdrv_co_preadv) {
1112        ret = drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
1113        goto out;
1114    }
1115
1116    if (drv->bdrv_aio_preadv) {
1117        BlockAIOCB *acb;
1118        CoroutineIOCompletion co = {
1119            .coroutine = qemu_coroutine_self(),
1120        };
1121
1122        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
1123                                   bdrv_co_io_em_complete, &co);
1124        if (acb == NULL) {
1125            ret = -EIO;
1126            goto out;
1127        } else {
1128            qemu_coroutine_yield();
1129            ret = co.ret;
1130            goto out;
1131        }
1132    }
1133
1134    sector_num = offset >> BDRV_SECTOR_BITS;
1135    nb_sectors = bytes >> BDRV_SECTOR_BITS;
1136
1137    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
1138    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
1139    assert(bytes <= BDRV_REQUEST_MAX_BYTES);
1140    assert(drv->bdrv_co_readv);
1141
1142    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1143
1144out:
1145    if (qiov == &local_qiov) {
1146        qemu_iovec_destroy(&local_qiov);
1147    }
1148
1149    return ret;
1150}
1151
1152static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
1153                                            uint64_t offset, uint64_t bytes,
1154                                            QEMUIOVector *qiov,
1155                                            size_t qiov_offset, int flags)
1156{
1157    BlockDriver *drv = bs->drv;
1158    int64_t sector_num;
1159    unsigned int nb_sectors;
1160    QEMUIOVector local_qiov;
1161    int ret;
1162
1163    assert(!(flags & ~BDRV_REQ_MASK));
1164    assert(!(flags & BDRV_REQ_NO_FALLBACK));
1165
1166    if (!drv) {
1167        return -ENOMEDIUM;
1168    }
1169
1170    if (drv->bdrv_co_pwritev_part) {
1171        ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset,
1172                                        flags & bs->supported_write_flags);
1173        flags &= ~bs->supported_write_flags;
1174        goto emulate_flags;
1175    }
1176
1177    if (qiov_offset > 0 || bytes != qiov->size) {
1178        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
1179        qiov = &local_qiov;
1180    }
1181
1182    if (drv->bdrv_co_pwritev) {
1183        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
1184                                   flags & bs->supported_write_flags);
1185        flags &= ~bs->supported_write_flags;
1186        goto emulate_flags;
1187    }
1188
1189    if (drv->bdrv_aio_pwritev) {
1190        BlockAIOCB *acb;
1191        CoroutineIOCompletion co = {
1192            .coroutine = qemu_coroutine_self(),
1193        };
1194
1195        acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
1196                                    flags & bs->supported_write_flags,
1197                                    bdrv_co_io_em_complete, &co);
1198        flags &= ~bs->supported_write_flags;
1199        if (acb == NULL) {
1200            ret = -EIO;
1201        } else {
1202            qemu_coroutine_yield();
1203            ret = co.ret;
1204        }
1205        goto emulate_flags;
1206    }
1207
1208    sector_num = offset >> BDRV_SECTOR_BITS;
1209    nb_sectors = bytes >> BDRV_SECTOR_BITS;
1210
1211    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
1212    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
1213    assert(bytes <= BDRV_REQUEST_MAX_BYTES);
1214
1215    assert(drv->bdrv_co_writev);
1216    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
1217                              flags & bs->supported_write_flags);
1218    flags &= ~bs->supported_write_flags;
1219
1220emulate_flags:
1221    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
1222        ret = bdrv_co_flush(bs);
1223    }
1224
1225    if (qiov == &local_qiov) {
1226        qemu_iovec_destroy(&local_qiov);
1227    }
1228
1229    return ret;
1230}
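
/*
 * Note on the flag handling above: only the bits advertised in
 * bs->supported_write_flags are passed down to the driver; they are then
 * cleared from @flags, so whatever remains has to be emulated.  For example,
 * if a driver does not advertise BDRV_REQ_FUA, the FUA bit survives until
 * the emulate_flags label and is honoured by an explicit bdrv_co_flush()
 * after a successful write.
 */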
1231
1232static int coroutine_fn
1233bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
1234                               uint64_t bytes, QEMUIOVector *qiov,
1235                               size_t qiov_offset)
1236{
1237    BlockDriver *drv = bs->drv;
1238    QEMUIOVector local_qiov;
1239    int ret;
1240
1241    if (!drv) {
1242        return -ENOMEDIUM;
1243    }
1244
1245    if (!block_driver_can_compress(drv)) {
1246        return -ENOTSUP;
1247    }
1248
1249    if (drv->bdrv_co_pwritev_compressed_part) {
1250        return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes,
1251                                                    qiov, qiov_offset);
1252    }
1253
1254    if (qiov_offset == 0) {
1255        return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
1256    }
1257
1258    qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
1259    ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov);
1260    qemu_iovec_destroy(&local_qiov);
1261
1262    return ret;
1263}
1264
1265static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
1266        int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1267        size_t qiov_offset, int flags)
1268{
1269    BlockDriverState *bs = child->bs;
1270
1271    /* Perform I/O through a temporary buffer so that users who scribble over
1272     * their read buffer while the operation is in progress do not end up
1273     * modifying the image file.  This is critical for zero-copy guest I/O
1274     * where anything might happen inside guest memory.
1275     */
1276    void *bounce_buffer = NULL;
1277
1278    BlockDriver *drv = bs->drv;
1279    int64_t cluster_offset;
1280    int64_t cluster_bytes;
1281    size_t skip_bytes;
1282    int ret;
1283    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
1284                                    BDRV_REQUEST_MAX_BYTES);
1285    unsigned int progress = 0;
1286    bool skip_write;
1287
1288    if (!drv) {
1289        return -ENOMEDIUM;
1290    }
1291
1292    /*
1293     * Do not write anything when the BDS is inactive.  That is not
1294     * allowed, and it would not help.
1295     */
1296    skip_write = (bs->open_flags & BDRV_O_INACTIVE);
1297
1298    /* FIXME We cannot require callers to have write permissions when all they
1299     * are doing is a read request. If we did things right, write permissions
1300     * would be obtained anyway, but internally by the copy-on-read code. As
 1301     * long as it is implemented here rather than in a separate filter driver,
 1302     * however, the copy-on-read code doesn't have its own BdrvChild for which
 1303     * it could request permissions. Therefore we have to bypass the permission
1304     * system for the moment. */
1305    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
1306
1307    /* Cover entire cluster so no additional backing file I/O is required when
1308     * allocating cluster in the image file.  Note that this value may exceed
1309     * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
1310     * is one reason we loop rather than doing it all at once.
1311     */
1312    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
1313    skip_bytes = offset - cluster_offset;
1314
1315    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
1316                                   cluster_offset, cluster_bytes);
1317
1318    while (cluster_bytes) {
1319        int64_t pnum;
1320
1321        if (skip_write) {
1322            ret = 1; /* "already allocated", so nothing will be copied */
1323            pnum = MIN(cluster_bytes, max_transfer);
1324        } else {
1325            ret = bdrv_is_allocated(bs, cluster_offset,
1326                                    MIN(cluster_bytes, max_transfer), &pnum);
1327            if (ret < 0) {
1328                /*
1329                 * Safe to treat errors in querying allocation as if
1330                 * unallocated; we'll probably fail again soon on the
1331                 * read, but at least that will set a decent errno.
1332                 */
1333                pnum = MIN(cluster_bytes, max_transfer);
1334            }
1335
1336            /* Stop at EOF if the image ends in the middle of the cluster */
1337            if (ret == 0 && pnum == 0) {
1338                assert(progress >= bytes);
1339                break;
1340            }
1341
1342            assert(skip_bytes < pnum);
1343        }
1344
1345        if (ret <= 0) {
1346            QEMUIOVector local_qiov;
1347
1348            /* Must copy-on-read; use the bounce buffer */
1349            pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
1350            if (!bounce_buffer) {
1351                int64_t max_we_need = MAX(pnum, cluster_bytes - pnum);
1352                int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER);
1353                int64_t bounce_buffer_len = MIN(max_we_need, max_allowed);
1354
1355                bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len);
1356                if (!bounce_buffer) {
1357                    ret = -ENOMEM;
1358                    goto err;
1359                }
1360            }
1361            qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);
1362
1363            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
1364                                     &local_qiov, 0, 0);
1365            if (ret < 0) {
1366                goto err;
1367            }
1368
1369            bdrv_debug_event(bs, BLKDBG_COR_WRITE);
1370            if (drv->bdrv_co_pwrite_zeroes &&
1371                buffer_is_zero(bounce_buffer, pnum)) {
1372                /* FIXME: Should we (perhaps conditionally) be setting
1373                 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
1374                 * that still correctly reads as zero? */
1375                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
1376                                               BDRV_REQ_WRITE_UNCHANGED);
1377            } else {
1378                /* This does not change the data on the disk, it is not
1379                 * necessary to flush even in cache=writethrough mode.
1380                 */
1381                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
1382                                          &local_qiov, 0,
1383                                          BDRV_REQ_WRITE_UNCHANGED);
1384            }
1385
1386            if (ret < 0) {
1387                /* It might be okay to ignore write errors for guest
1388                 * requests.  If this is a deliberate copy-on-read
1389                 * then we don't want to ignore the error.  Simply
1390                 * report it in all cases.
1391                 */
1392                goto err;
1393            }
1394
1395            if (!(flags & BDRV_REQ_PREFETCH)) {
1396                qemu_iovec_from_buf(qiov, qiov_offset + progress,
1397                                    bounce_buffer + skip_bytes,
1398                                    pnum - skip_bytes);
1399            }
1400        } else if (!(flags & BDRV_REQ_PREFETCH)) {
1401            /* Read directly into the destination */
1402            ret = bdrv_driver_preadv(bs, offset + progress,
1403                                     MIN(pnum - skip_bytes, bytes - progress),
1404                                     qiov, qiov_offset + progress, 0);
1405            if (ret < 0) {
1406                goto err;
1407            }
1408        }
1409
1410        cluster_offset += pnum;
1411        cluster_bytes -= pnum;
1412        progress += pnum - skip_bytes;
1413        skip_bytes = 0;
1414    }
1415    ret = 0;
1416
1417err:
1418    qemu_vfree(bounce_buffer);
1419    return ret;
1420}
1421
1422/*
1423 * Forwards an already correctly aligned request to the BlockDriver. This
1424 * handles copy on read, zeroing after EOF, and fragmentation of large
1425 * reads; any other features must be implemented by the caller.
1426 */
1427static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
1428    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1429    int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags)
1430{
1431    BlockDriverState *bs = child->bs;
1432    int64_t total_bytes, max_bytes;
1433    int ret = 0;
1434    uint64_t bytes_remaining = bytes;
1435    int max_transfer;
1436
1437    assert(is_power_of_2(align));
1438    assert((offset & (align - 1)) == 0);
1439    assert((bytes & (align - 1)) == 0);
1440    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1441    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1442                                   align);
1443
1444    /* TODO: We would need a per-BDS .supported_read_flags and
1445     * potential fallback support, if we ever implement any read flags
1446     * to pass through to drivers.  For now, there aren't any
1447     * passthrough flags.  */
1448    assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ |
1449                       BDRV_REQ_PREFETCH)));
1450
1451    /* Handle Copy on Read and associated serialisation */
1452    if (flags & BDRV_REQ_COPY_ON_READ) {
1453        /* If we touch the same cluster it counts as an overlap.  This
1454         * guarantees that allocating writes will be serialized and not race
1455         * with each other for the same cluster.  For example, in copy-on-read
1456         * it ensures that the CoR read and write operations are atomic and
1457         * guest writes cannot interleave between them. */
1458        bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
1459    }
1460
 1461    /* BDRV_REQ_SERIALISING is only for write operations */
1462    assert(!(flags & BDRV_REQ_SERIALISING));
1463
1464    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
1465        bdrv_wait_serialising_requests(req);
1466    }
1467
1468    if (flags & BDRV_REQ_COPY_ON_READ) {
1469        int64_t pnum;
1470
1471        ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
1472        if (ret < 0) {
1473            goto out;
1474        }
1475
1476        if (!ret || pnum != bytes) {
1477            ret = bdrv_co_do_copy_on_readv(child, offset, bytes,
1478                                           qiov, qiov_offset, flags);
1479            goto out;
1480        } else if (flags & BDRV_REQ_PREFETCH) {
1481            goto out;
1482        }
1483    }
1484
1485    /* Forward the request to the BlockDriver, possibly fragmenting it */
1486    total_bytes = bdrv_getlength(bs);
1487    if (total_bytes < 0) {
1488        ret = total_bytes;
1489        goto out;
1490    }
1491
1492    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
1493    if (bytes <= max_bytes && bytes <= max_transfer) {
1494        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, 0);
1495        goto out;
1496    }
1497
1498    while (bytes_remaining) {
1499        int num;
1500
1501        if (max_bytes) {
1502            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
1503            assert(num);
1504
1505            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
1506                                     num, qiov, bytes - bytes_remaining, 0);
1507            max_bytes -= num;
1508        } else {
1509            num = bytes_remaining;
1510            ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
1511                                    bytes_remaining);
1512        }
1513        if (ret < 0) {
1514            goto out;
1515        }
1516        bytes_remaining -= num;
1517    }
1518
1519out:
1520    return ret < 0 ? ret : 0;
1521}
1522
1523/*
1524 * Request padding
1525 *
1526 *  |<---- align ----->|                     |<----- align ---->|
1527 *  |<- head ->|<------------- bytes ------------->|<-- tail -->|
1528 *  |          |       |                     |     |            |
1529 * -*----------$-------*-------- ... --------*-----$------------*---
1530 *  |          |       |                     |     |            |
1531 *  |          offset  |                     |     end          |
1532 *  ALIGN_DOWN(offset) ALIGN_UP(offset)      ALIGN_DOWN(end)   ALIGN_UP(end)
1533 *  [buf   ... )                             [tail_buf          )
1534 *
1535 * @buf is an aligned allocation needed to store @head and @tail paddings. @head
1536 * is placed at the beginning of @buf and @tail at the @end.
1537 *
1538 * @tail_buf is a pointer to sub-buffer, corresponding to align-sized chunk
1539 * around tail, if tail exists.
1540 *
 1541 * @merge_reads is true for small requests, i.e. when
 1542 * @buf_len == @head + bytes + @tail. In this case it is possible that both
1543 * head and tail exist but @buf_len == align and @tail_buf == @buf.
1544 */
1545typedef struct BdrvRequestPadding {
1546    uint8_t *buf;
1547    size_t buf_len;
1548    uint8_t *tail_buf;
1549    size_t head;
1550    size_t tail;
1551    bool merge_reads;
1552    QEMUIOVector local_qiov;
1553} BdrvRequestPadding;
1554
1555static bool bdrv_init_padding(BlockDriverState *bs,
1556                              int64_t offset, int64_t bytes,
1557                              BdrvRequestPadding *pad)
1558{
1559    uint64_t align = bs->bl.request_alignment;
1560    size_t sum;
1561
1562    memset(pad, 0, sizeof(*pad));
1563
1564    pad->head = offset & (align - 1);
1565    pad->tail = ((offset + bytes) & (align - 1));
1566    if (pad->tail) {
1567        pad->tail = align - pad->tail;
1568    }
1569
1570    if ((!pad->head && !pad->tail) || !bytes) {
1571        return false;
1572    }
1573
1574    sum = pad->head + bytes + pad->tail;
1575    pad->buf_len = (sum > align && pad->head && pad->tail) ? 2 * align : align;
1576    pad->buf = qemu_blockalign(bs, pad->buf_len);
1577    pad->merge_reads = sum == pad->buf_len;
1578    if (pad->tail) {
1579        pad->tail_buf = pad->buf + pad->buf_len - align;
1580    }
1581
1582    return true;
1583}
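
/*
 * Worked example for the computation above, assuming
 * bs->bl.request_alignment == 512:
 *
 *     offset = 1000, bytes = 2000
 *     head    = 1000 & 511                   = 488
 *     tail    = 512 - ((1000 + 2000) & 511)  = 72
 *     sum     = 488 + 2000 + 72              = 2560
 *     buf_len = 2 * 512                      = 1024   (head and tail both set,
 *                                                      sum > align)
 *     merge_reads = (sum == buf_len)         = false
 *
 * so the head padding lives at the start of pad->buf and the tail padding
 * in the second 512-byte half, pad->tail_buf = pad->buf + 512.
 */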
1584
1585static int bdrv_padding_rmw_read(BdrvChild *child,
1586                                 BdrvTrackedRequest *req,
1587                                 BdrvRequestPadding *pad,
1588                                 bool zero_middle)
1589{
1590    QEMUIOVector local_qiov;
1591    BlockDriverState *bs = child->bs;
1592    uint64_t align = bs->bl.request_alignment;
1593    int ret;
1594
1595    assert(req->serialising && pad->buf);
1596
1597    if (pad->head || pad->merge_reads) {
1598        uint64_t bytes = pad->merge_reads ? pad->buf_len : align;
1599
1600        qemu_iovec_init_buf(&local_qiov, pad->buf, bytes);
1601
1602        if (pad->head) {
1603            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1604        }
1605        if (pad->merge_reads && pad->tail) {
1606            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1607        }
1608        ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes,
1609                                  align, &local_qiov, 0, 0);
1610        if (ret < 0) {
1611            return ret;
1612        }
1613        if (pad->head) {
1614            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1615        }
1616        if (pad->merge_reads && pad->tail) {
1617            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1618        }
1619
1620        if (pad->merge_reads) {
1621            goto zero_mem;
1622        }
1623    }
1624
1625    if (pad->tail) {
1626        qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align);
1627
1628        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1629        ret = bdrv_aligned_preadv(
1630                child, req,
1631                req->overlap_offset + req->overlap_bytes - align,
1632                align, align, &local_qiov, 0, 0);
1633        if (ret < 0) {
1634            return ret;
1635        }
1636        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1637    }
1638
1639zero_mem:
1640    if (zero_middle) {
1641        memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail);
1642    }
1643
1644    return 0;
1645}
1646
1647static void bdrv_padding_destroy(BdrvRequestPadding *pad)
1648{
1649    if (pad->buf) {
1650        qemu_vfree(pad->buf);
1651        qemu_iovec_destroy(&pad->local_qiov);
1652    }
1653}
1654
1655/*
1656 * bdrv_pad_request
1657 *
1658 * Exchange request parameters with padded request if needed. Don't include RMW
1659 * read of padding, bdrv_padding_rmw_read() should be called separately if
1660 * needed.
1661 *
1662 * All parameters except @bs are in-out: on entry they describe the original
1663 * request, and on return the padded request (if padding was needed).
1664 *
1665 * Always succeeds; the return value tells whether padding was applied.
1666 */
1667static bool bdrv_pad_request(BlockDriverState *bs,
1668                             QEMUIOVector **qiov, size_t *qiov_offset,
1669                             int64_t *offset, unsigned int *bytes,
1670                             BdrvRequestPadding *pad)
1671{
1672    if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
1673        return false;
1674    }
1675
1676    qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
1677                             *qiov, *qiov_offset, *bytes,
1678                             pad->buf + pad->buf_len - pad->tail, pad->tail);
1679    *bytes += pad->head + pad->tail;
1680    *offset -= pad->head;
1681    *qiov = &pad->local_qiov;
1682    *qiov_offset = 0;
1683
1684    return true;
1685}
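
/*
 * Illustrative note, not part of QEMU: with a 4096-byte request_alignment, a
 * caller request of offset=1536, bytes=1024 is rewritten by bdrv_pad_request()
 * above to offset=0, bytes=4096; *qiov is redirected to pad->local_qiov, which
 * covers [head padding | caller data | tail padding], and *qiov_offset is
 * reset to 0.  bdrv_padding_rmw_read() can then fill the padding areas with
 * existing disk contents before an unaligned write is carried out.
 */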
1686
1687int coroutine_fn bdrv_co_preadv(BdrvChild *child,
1688    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1689    BdrvRequestFlags flags)
1690{
1691    return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags);
1692}
1693
1694int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
1695    int64_t offset, unsigned int bytes,
1696    QEMUIOVector *qiov, size_t qiov_offset,
1697    BdrvRequestFlags flags)
1698{
1699    BlockDriverState *bs = child->bs;
1700    BdrvTrackedRequest req;
1701    BdrvRequestPadding pad;
1702    int ret;
1703
1704    trace_bdrv_co_preadv(bs, offset, bytes, flags);
1705
1706    ret = bdrv_check_byte_request(bs, offset, bytes);
1707    if (ret < 0) {
1708        return ret;
1709    }
1710
1711    bdrv_inc_in_flight(bs);
1712
1713    /* Don't do copy-on-read if we read data before a write operation */
1714    if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
1715        flags |= BDRV_REQ_COPY_ON_READ;
1716    }
1717
1718    bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad);
1719
1720    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
1721    ret = bdrv_aligned_preadv(child, &req, offset, bytes,
1722                              bs->bl.request_alignment,
1723                              qiov, qiov_offset, flags);
1724    tracked_request_end(&req);
1725    bdrv_dec_in_flight(bs);
1726
1727    bdrv_padding_destroy(&pad);
1728
1729    return ret;
1730}
1731
1732static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
1733    int64_t offset, int bytes, BdrvRequestFlags flags)
1734{
1735    BlockDriver *drv = bs->drv;
1736    QEMUIOVector qiov;
1737    void *buf = NULL;
1738    int ret = 0;
1739    bool need_flush = false;
1740    int head = 0;
1741    int tail = 0;
1742
1743    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
1744    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
1745                        bs->bl.request_alignment);
1746    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);
1747
1748    if (!drv) {
1749        return -ENOMEDIUM;
1750    }
1751
1752    if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) {
1753        return -ENOTSUP;
1754    }
1755
1756    assert(alignment % bs->bl.request_alignment == 0);
1757    head = offset % alignment;
1758    tail = (offset + bytes) % alignment;
1759    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
1760    assert(max_write_zeroes >= bs->bl.request_alignment);
1761
1762    while (bytes > 0 && !ret) {
1763        int num = bytes;
1764
1765        /* Align request.  Block drivers can expect the "bulk" of the request
1766         * to be aligned, and that unaligned requests do not cross cluster
1767         * boundaries.
1768         */
1769        if (head) {
1770            /* Make a small request up to the first aligned sector. For
1771             * convenience, limit this request to max_transfer even if
1772             * we don't need to fall back to writes.  */
1773            num = MIN(MIN(bytes, max_transfer), alignment - head);
1774            head = (head + num) % alignment;
1775            assert(num < max_write_zeroes);
1776        } else if (tail && num > alignment) {
1777            /* Shorten the request to the last aligned sector.  */
1778            num -= tail;
1779        }
1780
1781        /* limit request size */
1782        if (num > max_write_zeroes) {
1783            num = max_write_zeroes;
1784        }
1785
1786        ret = -ENOTSUP;
1787        /* First try the efficient write zeroes operation */
1788        if (drv->bdrv_co_pwrite_zeroes) {
1789            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1790                                             flags & bs->supported_zero_flags);
1791            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1792                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1793                need_flush = true;
1794            }
1795        } else {
1796            assert(!bs->supported_zero_flags);
1797        }
1798
1799        if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) {
1800            /* Fall back to bounce buffer if write zeroes is unsupported */
1801            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1802
1803            if ((flags & BDRV_REQ_FUA) &&
1804                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1805                /* No need for bdrv_driver_pwritev() to do a fallback
1806                 * flush on each chunk; use just one at the end */
1807                write_flags &= ~BDRV_REQ_FUA;
1808                need_flush = true;
1809            }
1810            num = MIN(num, max_transfer);
1811            if (buf == NULL) {
1812                buf = qemu_try_blockalign0(bs, num);
1813                if (buf == NULL) {
1814                    ret = -ENOMEM;
1815                    goto fail;
1816                }
1817            }
1818            qemu_iovec_init_buf(&qiov, buf, num);
1819
1820            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags);
1821
1822            /* Keep the bounce buffer around if it is big enough for
1823             * all future requests.
1824             */
1825            if (num < max_transfer) {
1826                qemu_vfree(buf);
1827                buf = NULL;
1828            }
1829        }
1830
1831        offset += num;
1832        bytes -= num;
1833    }
1834
1835fail:
1836    if (ret == 0 && need_flush) {
1837        ret = bdrv_co_flush(bs);
1838    }
1839    qemu_vfree(buf);
1840    return ret;
1841}
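
/*
 * Illustrative note, not part of QEMU: assuming alignment == 4096 and large
 * max_transfer/max_write_zeroes limits, a call with offset=1536, bytes=12288
 * is split by the loop above into three pieces: an unaligned head of 2560
 * bytes (up to the first aligned boundary), an aligned body of 8192 bytes,
 * and an unaligned tail of 1536 bytes.  Only the aligned body is expected to
 * take the driver's most efficient zeroing path.
 */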
1842
1843static inline int coroutine_fn
1844bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, uint64_t bytes,
1845                          BdrvTrackedRequest *req, int flags)
1846{
1847    BlockDriverState *bs = child->bs;
1848    bool waited;
1849    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1850
1851    if (bs->read_only) {
1852        return -EPERM;
1853    }
1854
1855    /* BDRV_REQ_NO_SERIALISING is only for read operations */
1856    assert(!(flags & BDRV_REQ_NO_SERIALISING));
1857    assert(!(bs->open_flags & BDRV_O_INACTIVE));
1858    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1859    assert(!(flags & ~BDRV_REQ_MASK));
1860
1861    if (flags & BDRV_REQ_SERIALISING) {
1862        bdrv_mark_request_serialising(req, bdrv_get_cluster_size(bs));
1863    }
1864
1865    waited = bdrv_wait_serialising_requests(req);
1866
1867    assert(!waited || !req->serialising ||
1868           is_request_serialising_and_aligned(req));
1869    assert(req->overlap_offset <= offset);
1870    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1871    assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
1872
1873    switch (req->type) {
1874    case BDRV_TRACKED_WRITE:
1875    case BDRV_TRACKED_DISCARD:
1876        if (flags & BDRV_REQ_WRITE_UNCHANGED) {
1877            assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
1878        } else {
1879            assert(child->perm & BLK_PERM_WRITE);
1880        }
1881        return notifier_with_return_list_notify(&bs->before_write_notifiers,
1882                                                req);
1883    case BDRV_TRACKED_TRUNCATE:
1884        assert(child->perm & BLK_PERM_RESIZE);
1885        return 0;
1886    default:
1887        abort();
1888    }
1889}
1890
1891static inline void coroutine_fn
1892bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, uint64_t bytes,
1893                         BdrvTrackedRequest *req, int ret)
1894{
1895    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1896    BlockDriverState *bs = child->bs;
1897
1898    atomic_inc(&bs->write_gen);
1899
1900    /*
1901     * Discard cannot extend the image, but in error handling cases, such as
1902     * when reverting a qcow2 cluster allocation, the discarded range can
1903     * extend past the end of the image file, so we cannot assert about
1904     * BDRV_TRACKED_DISCARD here. Instead, just skip it, since semantically a
1905     * discard request beyond EOF cannot expand the image anyway.
1906     */
1907    if (ret == 0 &&
1908        (req->type == BDRV_TRACKED_TRUNCATE ||
1909         end_sector > bs->total_sectors) &&
1910        req->type != BDRV_TRACKED_DISCARD) {
1911        bs->total_sectors = end_sector;
1912        bdrv_parent_cb_resize(bs);
1913        bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS);
1914    }
1915    if (req->bytes) {
1916        switch (req->type) {
1917        case BDRV_TRACKED_WRITE:
1918            stat64_max(&bs->wr_highest_offset, offset + bytes);
1919            /* fall through, to set dirty bits */
1920        case BDRV_TRACKED_DISCARD:
1921            bdrv_set_dirty(bs, offset, bytes);
1922            break;
1923        default:
1924            break;
1925        }
1926    }
1927}
1928
1929/*
1930 * Forwards an already correctly aligned write request to the BlockDriver,
1931 * after possibly fragmenting it.
1932 */
1933static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
1934    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1935    int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags)
1936{
1937    BlockDriverState *bs = child->bs;
1938    BlockDriver *drv = bs->drv;
1939    int ret;
1940
1941    uint64_t bytes_remaining = bytes;
1942    int max_transfer;
1943
1944    if (!drv) {
1945        return -ENOMEDIUM;
1946    }
1947
1948    if (bdrv_has_readonly_bitmaps(bs)) {
1949        return -EPERM;
1950    }
1951
1952    assert(is_power_of_2(align));
1953    assert((offset & (align - 1)) == 0);
1954    assert((bytes & (align - 1)) == 0);
1955    assert(!qiov || qiov_offset + bytes <= qiov->size);
1956    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1957                                   align);
1958
1959    ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags);
1960
1961    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1962        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
1963        qemu_iovec_is_zero(qiov, qiov_offset, bytes)) {
1964        flags |= BDRV_REQ_ZERO_WRITE;
1965        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
1966            flags |= BDRV_REQ_MAY_UNMAP;
1967        }
1968    }
1969
1970    if (ret < 0) {
1971        /* Do nothing, write notifier decided to fail this request */
1972    } else if (flags & BDRV_REQ_ZERO_WRITE) {
1973        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
1974        ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
1975    } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
1976        ret = bdrv_driver_pwritev_compressed(bs, offset, bytes,
1977                                             qiov, qiov_offset);
1978    } else if (bytes <= max_transfer) {
1979        bdrv_debug_event(bs, BLKDBG_PWRITEV);
1980        ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags);
1981    } else {
1982        bdrv_debug_event(bs, BLKDBG_PWRITEV);
1983        while (bytes_remaining) {
1984            int num = MIN(bytes_remaining, max_transfer);
1985            int local_flags = flags;
1986
1987            assert(num);
1988            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
1989                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1990                /* If FUA is going to be emulated by flush, we only
1991                 * need to flush on the last iteration */
1992                local_flags &= ~BDRV_REQ_FUA;
1993            }
1994
1995            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
1996                                      num, qiov, bytes - bytes_remaining,
1997                                      local_flags);
1998            if (ret < 0) {
1999                break;
2000            }
2001            bytes_remaining -= num;
2002        }
2003    }
2004    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
2005
2006    if (ret >= 0) {
2007        ret = 0;
2008    }
2009    bdrv_co_write_req_finish(child, offset, bytes, req, ret);
2010
2011    return ret;
2012}
2013
2014static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
2015                                                int64_t offset,
2016                                                unsigned int bytes,
2017                                                BdrvRequestFlags flags,
2018                                                BdrvTrackedRequest *req)
2019{
2020    BlockDriverState *bs = child->bs;
2021    QEMUIOVector local_qiov;
2022    uint64_t align = bs->bl.request_alignment;
2023    int ret = 0;
2024    bool padding;
2025    BdrvRequestPadding pad;
2026
2027    padding = bdrv_init_padding(bs, offset, bytes, &pad);
2028    if (padding) {
2029        bdrv_mark_request_serialising(req, align);
2030        bdrv_wait_serialising_requests(req);
2031
2032        bdrv_padding_rmw_read(child, req, &pad, true);
2033
2034        if (pad.head || pad.merge_reads) {
2035            int64_t aligned_offset = offset & ~(align - 1);
2036            int64_t write_bytes = pad.merge_reads ? pad.buf_len : align;
2037
2038            qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes);
2039            ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes,
2040                                       align, &local_qiov, 0,
2041                                       flags & ~BDRV_REQ_ZERO_WRITE);
2042            if (ret < 0 || pad.merge_reads) {
2043                /* Error or all work is done */
2044                goto out;
2045            }
2046            offset += write_bytes - pad.head;
2047            bytes -= write_bytes - pad.head;
2048        }
2049    }
2050
2051    assert(!bytes || (offset & (align - 1)) == 0);
2052    if (bytes >= align) {
2053        /* Write the aligned part in the middle. */
2054        uint64_t aligned_bytes = bytes & ~(align - 1);
2055        ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
2056                                   NULL, 0, flags);
2057        if (ret < 0) {
2058            goto out;
2059        }
2060        bytes -= aligned_bytes;
2061        offset += aligned_bytes;
2062    }
2063
2064    assert(!bytes || (offset & (align - 1)) == 0);
2065    if (bytes) {
2066        assert(align == pad.tail + bytes);
2067
2068        qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align);
2069        ret = bdrv_aligned_pwritev(child, req, offset, align, align,
2070                                   &local_qiov, 0,
2071                                   flags & ~BDRV_REQ_ZERO_WRITE);
2072    }
2073
2074out:
2075    bdrv_padding_destroy(&pad);
2076
2077    return ret;
2078}
2079
2080/*
2081 * Handle a write request in coroutine context
2082 */
2083int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
2084    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
2085    BdrvRequestFlags flags)
2086{
2087    return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags);
2088}
2089
2090int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
2091    int64_t offset, unsigned int bytes, QEMUIOVector *qiov, size_t qiov_offset,
2092    BdrvRequestFlags flags)
2093{
2094    BlockDriverState *bs = child->bs;
2095    BdrvTrackedRequest req;
2096    uint64_t align = bs->bl.request_alignment;
2097    BdrvRequestPadding pad;
2098    int ret;
2099
2100    trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);
2101
2102    if (!bs->drv) {
2103        return -ENOMEDIUM;
2104    }
2105
2106    ret = bdrv_check_byte_request(bs, offset, bytes);
2107    if (ret < 0) {
2108        return ret;
2109    }
2110
2111    /* If the request is misaligned then we can't make it efficient */
2112    if ((flags & BDRV_REQ_NO_FALLBACK) &&
2113        !QEMU_IS_ALIGNED(offset | bytes, align))
2114    {
2115        return -ENOTSUP;
2116    }
2117
2118    bdrv_inc_in_flight(bs);
2119    /*
2120     * Align write if necessary by performing a read-modify-write cycle.
2121     * Pad qiov with the read parts and be sure to have a tracked request not
2122     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
2123     */
2124    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
2125
2126    if (flags & BDRV_REQ_ZERO_WRITE) {
2127        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
2128        goto out;
2129    }
2130
2131    if (bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad)) {
2132        bdrv_mark_request_serialising(&req, align);
2133        bdrv_wait_serialising_requests(&req);
2134        bdrv_padding_rmw_read(child, &req, &pad, false);
2135    }
2136
2137    ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
2138                               qiov, qiov_offset, flags);
2139
2140    bdrv_padding_destroy(&pad);
2141
2142out:
2143    tracked_request_end(&req);
2144    bdrv_dec_in_flight(bs);
2145
2146    return ret;
2147}
2148
2149int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
2150                                       int bytes, BdrvRequestFlags flags)
2151{
2152    trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
2153
2154    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
2155        flags &= ~BDRV_REQ_MAY_UNMAP;
2156    }
2157
2158    return bdrv_co_pwritev(child, offset, bytes, NULL,
2159                           BDRV_REQ_ZERO_WRITE | flags);
2160}
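
/*
 * Illustrative caller sketch, not part of QEMU (kept out of the build with
 * #if 0).  The child, offset and length are hypothetical; as the wrapper
 * above shows, the call is just a bdrv_co_pwritev() with BDRV_REQ_ZERO_WRITE
 * added, and BDRV_REQ_MAY_UNMAP is dropped when BDRV_O_UNMAP is not set.
 */
#if 0
static int coroutine_fn example_zero_first_mib(BdrvChild *child)
{
    /* Zero the first MiB, allowing the driver to punch holes if possible */
    return bdrv_co_pwrite_zeroes(child, 0, 1 << 20, BDRV_REQ_MAY_UNMAP);
}
#endif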
2161
2162/*
2163 * Flush ALL BDSes, whether or not they are reachable via a BlockBackend.
2164 */
2165int bdrv_flush_all(void)
2166{
2167    BdrvNextIterator it;
2168    BlockDriverState *bs = NULL;
2169    int result = 0;
2170
2171    /*
2172     * The bdrv queue is managed by record/replay;
2173     * creating a new flush request for stopping
2174     * the VM may break determinism.
2175     */
2176    if (replay_events_enabled()) {
2177        return result;
2178    }
2179
2180    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
2181        AioContext *aio_context = bdrv_get_aio_context(bs);
2182        int ret;
2183
2184        aio_context_acquire(aio_context);
2185        ret = bdrv_flush(bs);
2186        if (ret < 0 && !result) {
2187            result = ret;
2188        }
2189        aio_context_release(aio_context);
2190    }
2191
2192    return result;
2193}
2194
2195
2196typedef struct BdrvCoBlockStatusData {
2197    BlockDriverState *bs;
2198    BlockDriverState *base;
2199    bool want_zero;
2200    int64_t offset;
2201    int64_t bytes;
2202    int64_t *pnum;
2203    int64_t *map;
2204    BlockDriverState **file;
2205    int ret;
2206    bool done;
2207} BdrvCoBlockStatusData;
2208
2209int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
2210                                                bool want_zero,
2211                                                int64_t offset,
2212                                                int64_t bytes,
2213                                                int64_t *pnum,
2214                                                int64_t *map,
2215                                                BlockDriverState **file)
2216{
2217    assert(bs->file && bs->file->bs);
2218    *pnum = bytes;
2219    *map = offset;
2220    *file = bs->file->bs;
2221    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
2222}
2223
2224int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
2225                                                   bool want_zero,
2226                                                   int64_t offset,
2227                                                   int64_t bytes,
2228                                                   int64_t *pnum,
2229                                                   int64_t *map,
2230                                                   BlockDriverState **file)
2231{
2232    assert(bs->backing && bs->backing->bs);
2233    *pnum = bytes;
2234    *map = offset;
2235    *file = bs->backing->bs;
2236    return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
2237}
2238
2239/*
2240 * Returns the allocation status of the specified sectors.
2241 * Drivers not implementing the functionality are assumed to not support
2242 * backing files, hence all their sectors are reported as allocated.
2243 *
2244 * If 'want_zero' is true, the caller is querying for mapping
2245 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
2246 * _ZERO where possible; otherwise, the result favors larger 'pnum',
2247 * with a focus on accurate BDRV_BLOCK_ALLOCATED.
2248 *
2249 * If 'offset' is beyond the end of the disk image the return value is
2250 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
2251 *
2252 * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
2253 * beyond the end of the disk image it will be clamped; if 'pnum' is set to
2254 * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
2255 *
2256 * 'pnum' is set to the number of bytes (including and immediately
2257 * following the specified offset) that are easily known to be in the
2258 * same allocated/unallocated state.  Note that a second call starting
2259 * at the original offset plus returned pnum may have the same status.
2260 * The returned value is non-zero on success except at end-of-file.
2261 *
2262 * Returns negative errno on failure.  Otherwise, if the
2263 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
2264 * set to the host mapping and BDS corresponding to the guest offset.
2265 */
2266static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
2267                                             bool want_zero,
2268                                             int64_t offset, int64_t bytes,
2269                                             int64_t *pnum, int64_t *map,
2270                                             BlockDriverState **file)
2271{
2272    int64_t total_size;
2273    int64_t n; /* bytes */
2274    int ret;
2275    int64_t local_map = 0;
2276    BlockDriverState *local_file = NULL;
2277    int64_t aligned_offset, aligned_bytes;
2278    uint32_t align;
2279
2280    assert(pnum);
2281    *pnum = 0;
2282    total_size = bdrv_getlength(bs);
2283    if (total_size < 0) {
2284        ret = total_size;
2285        goto early_out;
2286    }
2287
2288    if (offset >= total_size) {
2289        ret = BDRV_BLOCK_EOF;
2290        goto early_out;
2291    }
2292    if (!bytes) {
2293        ret = 0;
2294        goto early_out;
2295    }
2296
2297    n = total_size - offset;
2298    if (n < bytes) {
2299        bytes = n;
2300    }
2301
2302    /* Must be non-NULL or bdrv_getlength() would have failed */
2303    assert(bs->drv);
2304    if (!bs->drv->bdrv_co_block_status) {
2305        *pnum = bytes;
2306        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
2307        if (offset + bytes == total_size) {
2308            ret |= BDRV_BLOCK_EOF;
2309        }
2310        if (bs->drv->protocol_name) {
2311            ret |= BDRV_BLOCK_OFFSET_VALID;
2312            local_map = offset;
2313            local_file = bs;
2314        }
2315        goto early_out;
2316    }
2317
2318    bdrv_inc_in_flight(bs);
2319
2320    /* Round out to request_alignment boundaries */
2321    align = bs->bl.request_alignment;
2322    aligned_offset = QEMU_ALIGN_DOWN(offset, align);
2323    aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;
2324
2325    ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
2326                                        aligned_bytes, pnum, &local_map,
2327                                        &local_file);
2328    if (ret < 0) {
2329        *pnum = 0;
2330        goto out;
2331    }
2332
2333    /*
2334     * The driver's result must be a non-zero multiple of request_alignment.
2335     * Clamp pnum and adjust map to original request.
2336     */
2337    assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
2338           align > offset - aligned_offset);
2339    if (ret & BDRV_BLOCK_RECURSE) {
2340        assert(ret & BDRV_BLOCK_DATA);
2341        assert(ret & BDRV_BLOCK_OFFSET_VALID);
2342        assert(!(ret & BDRV_BLOCK_ZERO));
2343    }
2344
2345    *pnum -= offset - aligned_offset;
2346    if (*pnum > bytes) {
2347        *pnum = bytes;
2348    }
2349    if (ret & BDRV_BLOCK_OFFSET_VALID) {
2350        local_map += offset - aligned_offset;
2351    }
2352
2353    if (ret & BDRV_BLOCK_RAW) {
2354        assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
2355        ret = bdrv_co_block_status(local_file, want_zero, local_map,
2356                                   *pnum, pnum, &local_map, &local_file);
2357        goto out;
2358    }
2359
2360    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
2361        ret |= BDRV_BLOCK_ALLOCATED;
2362    } else if (want_zero) {
2363        if (bdrv_unallocated_blocks_are_zero(bs)) {
2364            ret |= BDRV_BLOCK_ZERO;
2365        } else if (bs->backing) {
2366            BlockDriverState *bs2 = bs->backing->bs;
2367            int64_t size2 = bdrv_getlength(bs2);
2368
2369            if (size2 >= 0 && offset >= size2) {
2370                ret |= BDRV_BLOCK_ZERO;
2371            }
2372        }
2373    }
2374
2375    if (want_zero && ret & BDRV_BLOCK_RECURSE &&
2376        local_file && local_file != bs &&
2377        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
2378        (ret & BDRV_BLOCK_OFFSET_VALID)) {
2379        int64_t file_pnum;
2380        int ret2;
2381
2382        ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
2383                                    *pnum, &file_pnum, NULL, NULL);
2384        if (ret2 >= 0) {
2385            /* Ignore errors.  This is just providing extra information;
2386             * it is useful but not necessary.
2387             */
2388            if (ret2 & BDRV_BLOCK_EOF &&
2389                (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
2390                /*
2391                 * It is valid for the format block driver to read
2392                 * beyond the end of the underlying file's current
2393                 * size; such areas read as zero.
2394                 */
2395                ret |= BDRV_BLOCK_ZERO;
2396            } else {
2397                /* Limit request to the range reported by the protocol driver */
2398                *pnum = file_pnum;
2399                ret |= (ret2 & BDRV_BLOCK_ZERO);
2400            }
2401        }
2402    }
2403
2404out:
2405    bdrv_dec_in_flight(bs);
2406    if (ret >= 0 && offset + *pnum == total_size) {
2407        ret |= BDRV_BLOCK_EOF;
2408    }
2409early_out:
2410    if (file) {
2411        *file = local_file;
2412    }
2413    if (map) {
2414        *map = local_map;
2415    }
2416    return ret;
2417}
2418
2419static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
2420                                                   BlockDriverState *base,
2421                                                   bool want_zero,
2422                                                   int64_t offset,
2423                                                   int64_t bytes,
2424                                                   int64_t *pnum,
2425                                                   int64_t *map,
2426                                                   BlockDriverState **file)
2427{
2428    BlockDriverState *p;
2429    int ret = 0;
2430    bool first = true;
2431
2432    assert(bs != base);
2433    for (p = bs; p != base; p = backing_bs(p)) {
2434        ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
2435                                   file);
2436        if (ret < 0) {
2437            break;
2438        }
2439        if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
2440            /*
2441             * Reading beyond the end of the file continues to read
2442             * zeroes, but we can only widen the result to the
2443             * unallocated length we learned from an earlier
2444             * iteration.
2445             */
2446            *pnum = bytes;
2447        }
2448        if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
2449            break;
2450        }
2451        /* [offset, pnum] unallocated on this layer, which could be only
2452         * the first part of [offset, bytes].  */
2453        bytes = MIN(bytes, *pnum);
2454        first = false;
2455    }
2456    return ret;
2457}
2458
2459/* Coroutine wrapper for bdrv_block_status_above() */
2460static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque)
2461{
2462    BdrvCoBlockStatusData *data = opaque;
2463
2464    data->ret = bdrv_co_block_status_above(data->bs, data->base,
2465                                           data->want_zero,
2466                                           data->offset, data->bytes,
2467                                           data->pnum, data->map, data->file);
2468    data->done = true;
2469    aio_wait_kick();
2470}
2471
2472/*
2473 * Synchronous wrapper around bdrv_co_block_status_above().
2474 *
2475 * See bdrv_co_block_status_above() for details.
2476 */
2477static int bdrv_common_block_status_above(BlockDriverState *bs,
2478                                          BlockDriverState *base,
2479                                          bool want_zero, int64_t offset,
2480                                          int64_t bytes, int64_t *pnum,
2481                                          int64_t *map,
2482                                          BlockDriverState **file)
2483{
2484    Coroutine *co;
2485    BdrvCoBlockStatusData data = {
2486        .bs = bs,
2487        .base = base,
2488        .want_zero = want_zero,
2489        .offset = offset,
2490        .bytes = bytes,
2491        .pnum = pnum,
2492        .map = map,
2493        .file = file,
2494        .done = false,
2495    };
2496
2497    if (qemu_in_coroutine()) {
2498        /* Fast-path if already in coroutine context */
2499        bdrv_block_status_above_co_entry(&data);
2500    } else {
2501        co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data);
2502        bdrv_coroutine_enter(bs, co);
2503        BDRV_POLL_WHILE(bs, !data.done);
2504    }
2505    return data.ret;
2506}
2507
2508int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
2509                            int64_t offset, int64_t bytes, int64_t *pnum,
2510                            int64_t *map, BlockDriverState **file)
2511{
2512    return bdrv_common_block_status_above(bs, base, true, offset, bytes,
2513                                          pnum, map, file);
2514}
2515
2516int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
2517                      int64_t *pnum, int64_t *map, BlockDriverState **file)
2518{
2519    return bdrv_block_status_above(bs, backing_bs(bs),
2520                                   offset, bytes, pnum, map, file);
2521}
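
/*
 * Illustrative sketch, not part of QEMU (kept out of the build with #if 0),
 * showing how a caller can walk an image's allocation map with
 * bdrv_block_status().  The function name and the fprintf() reporting are
 * hypothetical; *pnum advances the cursor and is non-zero except at EOF.
 */
#if 0
static void example_walk_allocation(BlockDriverState *bs)
{
    int64_t size = bdrv_getlength(bs);
    int64_t offset = 0;

    while (offset < size) {
        int64_t pnum;
        int ret = bdrv_block_status(bs, offset, size - offset, &pnum,
                                    NULL, NULL);

        if (ret < 0 || pnum == 0) {
            break;  /* error, or end of file */
        }
        fprintf(stderr, "%" PRId64 "+%" PRId64 ": %s%s\n", offset, pnum,
                ret & BDRV_BLOCK_ALLOCATED ? "allocated" : "unallocated",
                ret & BDRV_BLOCK_ZERO ? " (reads as zero)" : "");
        offset += pnum;
    }
}
#endif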
2522
2523int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
2524                                   int64_t bytes, int64_t *pnum)
2525{
2526    int ret;
2527    int64_t dummy;
2528
2529    ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
2530                                         bytes, pnum ? pnum : &dummy, NULL,
2531                                         NULL);
2532    if (ret < 0) {
2533        return ret;
2534    }
2535    return !!(ret & BDRV_BLOCK_ALLOCATED);
2536}
2537
2538/*
2539 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2540 *
2541 * Return 1 if (a prefix of) the given range is allocated in any image
2542 * between BASE and TOP (BASE is only included if include_base is set).
2543 * BASE can be NULL to check if the given offset is allocated in any
2544 * image of the chain.  Return 0 otherwise, or negative errno on
2545 * failure.
2546 *
2547 * 'pnum' is set to the number of bytes (including and immediately
2548 * following the specified offset) that are known to be in the same
2549 * allocated/unallocated state.  Note that a subsequent call starting
2550 * at 'offset + *pnum' may return the same allocation status (in other
2551 * words, the result is not necessarily the maximum possible range);
2552 * but 'pnum' will only be 0 when end of file is reached.
2553 *
2554 */
2555int bdrv_is_allocated_above(BlockDriverState *top,
2556                            BlockDriverState *base,
2557                            bool include_base, int64_t offset,
2558                            int64_t bytes, int64_t *pnum)
2559{
2560    BlockDriverState *intermediate;
2561    int ret;
2562    int64_t n = bytes;
2563
2564    assert(base || !include_base);
2565
2566    intermediate = top;
2567    while (include_base || intermediate != base) {
2568        int64_t pnum_inter;
2569        int64_t size_inter;
2570
2571        assert(intermediate);
2572        ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
2573        if (ret < 0) {
2574            return ret;
2575        }
2576        if (ret) {
2577            *pnum = pnum_inter;
2578            return 1;
2579        }
2580
2581        size_inter = bdrv_getlength(intermediate);
2582        if (size_inter < 0) {
2583            return size_inter;
2584        }
2585        if (n > pnum_inter &&
2586            (intermediate == top || offset + pnum_inter < size_inter)) {
2587            n = pnum_inter;
2588        }
2589
2590        if (intermediate == base) {
2591            break;
2592        }
2593
2594        intermediate = backing_bs(intermediate);
2595    }
2596
2597    *pnum = n;
2598    return 0;
2599}
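
/*
 * Illustrative note, not part of QEMU: for a hypothetical chain
 * base <- mid <- top in which a range is unallocated in top but allocated in
 * mid, bdrv_is_allocated_above(top, base, false, offset, bytes, &pnum)
 * returns 1 and sets pnum to the allocated extent reported by mid; a return
 * of 0 means the first pnum bytes are unallocated in every layer above base
 * (base itself excluded because include_base is false).
 */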
2600
2601typedef struct BdrvVmstateCo {
2602    BlockDriverState    *bs;
2603    QEMUIOVector        *qiov;
2604    int64_t             pos;
2605    bool                is_read;
2606    int                 ret;
2607} BdrvVmstateCo;
2608
2609static int coroutine_fn
2610bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
2611                   bool is_read)
2612{
2613    BlockDriver *drv = bs->drv;
2614    int ret = -ENOTSUP;
2615
2616    bdrv_inc_in_flight(bs);
2617
2618    if (!drv) {
2619        ret = -ENOMEDIUM;
2620    } else if (drv->bdrv_load_vmstate) {
2621        if (is_read) {
2622            ret = drv->bdrv_load_vmstate(bs, qiov, pos);
2623        } else {
2624            ret = drv->bdrv_save_vmstate(bs, qiov, pos);
2625        }
2626    } else if (bs->file) {
2627        ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
2628    }
2629
2630    bdrv_dec_in_flight(bs);
2631    return ret;
2632}
2633
2634static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
2635{
2636    BdrvVmstateCo *co = opaque;
2637    co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
2638    aio_wait_kick();
2639}
2640
2641static inline int
2642bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
2643                bool is_read)
2644{
2645    if (qemu_in_coroutine()) {
2646        return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
2647    } else {
2648        BdrvVmstateCo data = {
2649            .bs         = bs,
2650            .qiov       = qiov,
2651            .pos        = pos,
2652            .is_read    = is_read,
2653            .ret        = -EINPROGRESS,
2654        };
2655        Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
2656
2657        bdrv_coroutine_enter(bs, co);
2658        BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
2659        return data.ret;
2660    }
2661}
2662
2663int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2664                      int64_t pos, int size)
2665{
2666    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2667    int ret;
2668
2669    ret = bdrv_writev_vmstate(bs, &qiov, pos);
2670    if (ret < 0) {
2671        return ret;
2672    }
2673
2674    return size;
2675}
2676
2677int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2678{
2679    return bdrv_rw_vmstate(bs, qiov, pos, false);
2680}
2681
2682int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2683                      int64_t pos, int size)
2684{
2685    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2686    int ret;
2687
2688    ret = bdrv_readv_vmstate(bs, &qiov, pos);
2689    if (ret < 0) {
2690        return ret;
2691    }
2692
2693    return size;
2694}
2695
2696int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2697{
2698    return bdrv_rw_vmstate(bs, qiov, pos, true);
2699}
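
/*
 * Illustrative round-trip sketch, not part of QEMU (kept out of the build
 * with #if 0).  Buffer contents and the vmstate position are hypothetical;
 * as the wrappers above show, both helpers return @size on success and a
 * negative errno on failure.
 */
#if 0
static void example_vmstate_roundtrip(BlockDriverState *bs)
{
    uint8_t out[512] = { 0xaa };
    uint8_t in[512];

    if (bdrv_save_vmstate(bs, out, 0, sizeof(out)) == (int)sizeof(out)) {
        /* Read the same bytes back from vmstate position 0 */
        bdrv_load_vmstate(bs, in, 0, sizeof(in));
    }
}
#endif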
2700
2701/**************************************************************/
2702/* async I/Os */
2703
2704void bdrv_aio_cancel(BlockAIOCB *acb)
2705{
2706    qemu_aio_ref(acb);
2707    bdrv_aio_cancel_async(acb);
2708    while (acb->refcnt > 1) {
2709        if (acb->aiocb_info->get_aio_context) {
2710            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
2711        } else if (acb->bs) {
2712            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
2713             * assert that we're not using an I/O thread.  Thread-safe
2714             * code should use bdrv_aio_cancel_async exclusively.
2715             */
2716            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
2717            aio_poll(bdrv_get_aio_context(acb->bs), true);
2718        } else {
2719            abort();
2720        }
2721    }
2722    qemu_aio_unref(acb);
2723}
2724
2725/* Async version of aio cancel. The caller is not blocked if the acb implements
2726 * cancel_async; otherwise we do nothing and let the request complete normally.
2727 * In either case the completion callback must be called. */
2728void bdrv_aio_cancel_async(BlockAIOCB *acb)
2729{
2730    if (acb->aiocb_info->cancel_async) {
2731        acb->aiocb_info->cancel_async(acb);
2732    }
2733}
2734
2735/**************************************************************/
2736/* Coroutine block device emulation */
2737
2738typedef struct FlushCo {
2739    BlockDriverState *bs;
2740    int ret;
2741} FlushCo;
2742
2743
2744static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2745{
2746    FlushCo *rwco = opaque;
2747
2748    rwco->ret = bdrv_co_flush(rwco->bs);
2749    aio_wait_kick();
2750}
2751
2752int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2753{
2754    int current_gen;
2755    int ret = 0;
2756
2757    bdrv_inc_in_flight(bs);
2758
2759    if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
2760        bdrv_is_sg(bs)) {
2761        goto early_exit;
2762    }
2763
2764    qemu_co_mutex_lock(&bs->reqs_lock);
2765    current_gen = atomic_read(&bs->write_gen);
2766
2767    /* Wait until any previous flushes are completed */
2768    while (bs->active_flush_req) {
2769        qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
2770    }
2771
2772    /* Flushes reach this point in nondecreasing current_gen order.  */
2773    bs->active_flush_req = true;
2774    qemu_co_mutex_unlock(&bs->reqs_lock);
2775
2776    /* Write back all layers by calling one driver function */
2777    if (bs->drv->bdrv_co_flush) {
2778        ret = bs->drv->bdrv_co_flush(bs);
2779        goto out;
2780    }
2781
2782    /* Write back cached data to the OS even with cache=unsafe */
2783    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2784    if (bs->drv->bdrv_co_flush_to_os) {
2785        ret = bs->drv->bdrv_co_flush_to_os(bs);
2786        if (ret < 0) {
2787            goto out;
2788        }
2789    }
2790
2791    /* But don't actually force it to the disk with cache=unsafe */
2792    if (bs->open_flags & BDRV_O_NO_FLUSH) {
2793        goto flush_parent;
2794    }
2795
2796    /* Check if we really need to flush anything */
2797    if (bs->flushed_gen == current_gen) {
2798        goto flush_parent;
2799    }
2800
2801    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2802    if (!bs->drv) {
2803        /* The flush callbacks invoked above (e.g. bdrv_co_flush_to_os())
2804         * might have ejected the BDS (even in case of apparent success) */
2805        ret = -ENOMEDIUM;
2806        goto out;
2807    }
2808    if (bs->drv->bdrv_co_flush_to_disk) {
2809        ret = bs->drv->bdrv_co_flush_to_disk(bs);
2810    } else if (bs->drv->bdrv_aio_flush) {
2811        BlockAIOCB *acb;
2812        CoroutineIOCompletion co = {
2813            .coroutine = qemu_coroutine_self(),
2814        };
2815
2816        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2817        if (acb == NULL) {
2818            ret = -EIO;
2819        } else {
2820            qemu_coroutine_yield();
2821            ret = co.ret;
2822        }
2823    } else {
2824        /*
2825         * Some block drivers always operate in either writethrough or unsafe
2826         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
2827         * know how the server works (because the behaviour is hardcoded or
2828         * depends on server-side configuration), so we can't ensure that
2829         * everything is safe on disk. Returning an error doesn't work because
2830         * that would break guests even if the server operates in writethrough
2831         * mode.
2832         *
2833         * Let's hope the user knows what they're doing.
2834         */
2835        ret = 0;
2836    }
2837
2838    if (ret < 0) {
2839        goto out;
2840    }
2841
2842    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
2843     * in the case of cache=unsafe, so there are no useless flushes.
2844     */
2845flush_parent:
2846    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2847out:
2848    /* Notify any pending flushes that we have completed */
2849    if (ret == 0) {
2850        bs->flushed_gen = current_gen;
2851    }
2852
2853    qemu_co_mutex_lock(&bs->reqs_lock);
2854    bs->active_flush_req = false;
2855    /* Return value is ignored - it's ok if wait queue is empty */
2856    qemu_co_queue_next(&bs->flush_queue);
2857    qemu_co_mutex_unlock(&bs->reqs_lock);
2858
2859early_exit:
2860    bdrv_dec_in_flight(bs);
2861    return ret;
2862}
2863
2864int bdrv_flush(BlockDriverState *bs)
2865{
2866    Coroutine *co;
2867    FlushCo flush_co = {
2868        .bs = bs,
2869        .ret = NOT_DONE,
2870    };
2871
2872    if (qemu_in_coroutine()) {
2873        /* Fast-path if already in coroutine context */
2874        bdrv_flush_co_entry(&flush_co);
2875    } else {
2876        co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
2877        bdrv_coroutine_enter(bs, co);
2878        BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
2879    }
2880
2881    return flush_co.ret;
2882}
2883
2884typedef struct DiscardCo {
2885    BdrvChild *child;
2886    int64_t offset;
2887    int64_t bytes;
2888    int ret;
2889} DiscardCo;
2890static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
2891{
2892    DiscardCo *rwco = opaque;
2893
2894    rwco->ret = bdrv_co_pdiscard(rwco->child, rwco->offset, rwco->bytes);
2895    aio_wait_kick();
2896}
2897
2898int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
2899                                  int64_t bytes)
2900{
2901    BdrvTrackedRequest req;
2902    int max_pdiscard, ret;
2903    int head, tail, align;
2904    BlockDriverState *bs = child->bs;
2905
2906    if (!bs || !bs->drv || !bdrv_is_inserted(bs)) {
2907        return -ENOMEDIUM;
2908    }
2909
2910    if (bdrv_has_readonly_bitmaps(bs)) {
2911        return -EPERM;
2912    }
2913
2914    if (offset < 0 || bytes < 0 || bytes > INT64_MAX - offset) {
2915        return -EIO;
2916    }
2917
2918    /* Do nothing if disabled.  */
2919    if (!(bs->open_flags & BDRV_O_UNMAP)) {
2920        return 0;
2921    }
2922
2923    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
2924        return 0;
2925    }
2926
2927    /* Discard is advisory, but some devices track and coalesce
2928     * unaligned requests, so we must pass everything down rather than
2929     * rounding here.  Still, most devices will just silently ignore
2930     * unaligned requests (by returning -ENOTSUP), so we must fragment
2931     * the request accordingly.  */
2932    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
2933    assert(align % bs->bl.request_alignment == 0);
2934    head = offset % align;
2935    tail = (offset + bytes) % align;
2936
2937    bdrv_inc_in_flight(bs);
2938    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);
2939
2940    ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0);
2941    if (ret < 0) {
2942        goto out;
2943    }
2944
2945    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
2946                                   align);
2947    assert(max_pdiscard >= bs->bl.request_alignment);
2948
2949    while (bytes > 0) {
2950        int64_t num = bytes;
2951
2952        if (head) {
2953            /* Make small requests to get to alignment boundaries. */
2954            num = MIN(bytes, align - head);
2955            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
2956                num %= bs->bl.request_alignment;
2957            }
2958            head = (head + num) % align;
2959            assert(num < max_pdiscard);
2960        } else if (tail) {
2961            if (num > align) {
2962                /* Shorten the request to the last aligned cluster.  */
2963                num -= tail;
2964            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
2965                       tail > bs->bl.request_alignment) {
2966                tail %= bs->bl.request_alignment;
2967                num -= tail;
2968            }
2969        }
2970        /* limit request size */
2971        if (num > max_pdiscard) {
2972            num = max_pdiscard;
2973        }
2974
2975        if (!bs->drv) {
2976            ret = -ENOMEDIUM;
2977            goto out;
2978        }
2979        if (bs->drv->bdrv_co_pdiscard) {
2980            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
2981        } else {
2982            BlockAIOCB *acb;
2983            CoroutineIOCompletion co = {
2984                .coroutine = qemu_coroutine_self(),
2985            };
2986
2987            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
2988                                             bdrv_co_io_em_complete, &co);
2989            if (acb == NULL) {
2990                ret = -EIO;
2991                goto out;
2992            } else {
2993                qemu_coroutine_yield();
2994                ret = co.ret;
2995            }
2996        }
2997        if (ret && ret != -ENOTSUP) {
2998            goto out;
2999        }
3000
3001        offset += num;
3002        bytes -= num;
3003    }
3004    ret = 0;
3005out:
3006    bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret);
3007    tracked_request_end(&req);
3008    bdrv_dec_in_flight(bs);
3009    return ret;
3010}
3011
3012int bdrv_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes)
3013{
3014    Coroutine *co;
3015    DiscardCo rwco = {
3016        .child = child,
3017        .offset = offset,
3018        .bytes = bytes,
3019        .ret = NOT_DONE,
3020    };
3021
3022    if (qemu_in_coroutine()) {
3023        /* Fast-path if already in coroutine context */
3024        bdrv_pdiscard_co_entry(&rwco);
3025    } else {
3026        co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
3027        bdrv_coroutine_enter(child->bs, co);
3028        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
3029    }
3030
3031    return rwco.ret;
3032}
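
/*
 * Illustrative caller sketch, not part of QEMU (kept out of the build with
 * #if 0).  Offset and length are hypothetical.  As the code above shows, the
 * request silently succeeds when BDRV_O_UNMAP is not set or the driver has no
 * discard callback, and unaligned requests are fragmented rather than rounded.
 */
#if 0
static int example_discard_region(BdrvChild *child)
{
    /* Advise the driver that 4 MiB starting at 64 KiB are no longer needed */
    return bdrv_pdiscard(child, 64 * 1024, 4 * 1024 * 1024);
}
#endif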
3033
3034int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
3035{
3036    BlockDriver *drv = bs->drv;
3037    CoroutineIOCompletion co = {
3038        .coroutine = qemu_coroutine_self(),
3039    };
3040    BlockAIOCB *acb;
3041
3042    bdrv_inc_in_flight(bs);
3043    if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
3044        co.ret = -ENOTSUP;
3045        goto out;
3046    }
3047
3048    if (drv->bdrv_co_ioctl) {
3049        co.ret = drv->bdrv_co_ioctl(bs, req, buf);
3050    } else {
3051        acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
3052        if (!acb) {
3053            co.ret = -ENOTSUP;
3054            goto out;
3055        }
3056        qemu_coroutine_yield();
3057    }
3058out:
3059    bdrv_dec_in_flight(bs);
3060    return co.ret;
3061}
3062
3063void *qemu_blockalign(BlockDriverState *bs, size_t size)
3064{
3065    return qemu_memalign(bdrv_opt_mem_align(bs), size);
3066}
3067
3068void *qemu_blockalign0(BlockDriverState *bs, size_t size)
3069{
3070    return memset(qemu_blockalign(bs, size), 0, size);
3071}
3072
3073void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
3074{
3075    size_t align = bdrv_opt_mem_align(bs);
3076
3077    /* Ensure that NULL is never returned on success */
3078    assert(align > 0);
3079    if (size == 0) {
3080        size = align;
3081    }
3082
3083    return qemu_try_memalign(align, size);
3084}
3085
3086void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
3087{
3088    void *mem = qemu_try_blockalign(bs, size);
3089
3090    if (mem) {
3091        memset(mem, 0, size);
3092    }
3093
3094    return mem;
3095}
3096
3097/*
3098 * Check if all memory in this vector is aligned to bdrv_min_mem_align(bs).
3099 */
3100bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
3101{
3102    int i;
3103    size_t alignment = bdrv_min_mem_align(bs);
3104
3105    for (i = 0; i < qiov->niov; i++) {
3106        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
3107            return false;
3108        }
3109        if (qiov->iov[i].iov_len % alignment) {
3110            return false;
3111        }
3112    }
3113
3114    return true;
3115}
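
/*
 * Illustrative sketch, not part of QEMU (kept out of the build with #if 0).
 * A buffer from qemu_blockalign() honours bdrv_opt_mem_align(bs), which is
 * typically at least as strict as the minimum memory alignment checked above,
 * so a single-buffer QEMUIOVector built from it is expected to satisfy
 * bdrv_qiov_is_aligned() for the usual power-of-two alignments; the 64 KiB
 * length is hypothetical.
 */
#if 0
static void example_aligned_vector(BlockDriverState *bs)
{
    size_t len = 64 * 1024;
    void *buf = qemu_blockalign(bs, len);
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, len);

    assert(bdrv_qiov_is_aligned(bs, &qiov));
    qemu_vfree(buf);
}
#endif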
3116
3117void bdrv_add_before_write_notifier(BlockDriverState *bs,
3118                                    NotifierWithReturn *notifier)
3119{
3120    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
3121}
3122
3123void bdrv_io_plug(BlockDriverState *bs)
3124{
3125    BdrvChild *child;
3126
3127    QLIST_FOREACH(child, &bs->children, next) {
3128        bdrv_io_plug(child->bs);
3129    }
3130
3131    if (atomic_fetch_inc(&bs->io_plugged) == 0) {
3132        BlockDriver *drv = bs->drv;
3133        if (drv && drv->bdrv_io_plug) {
3134            drv->bdrv_io_plug(bs);
3135        }
3136    }
3137}
3138
3139void bdrv_io_unplug(BlockDriverState *bs)
3140{
3141    BdrvChild *child;
3142
3143    assert(bs->io_plugged);
3144    if (atomic_fetch_dec(&bs->io_plugged) == 1) {
3145        BlockDriver *drv = bs->drv;
3146        if (drv && drv->bdrv_io_unplug) {
3147            drv->bdrv_io_unplug(bs);
3148        }
3149    }
3150
3151    QLIST_FOREACH(child, &bs->children, next) {
3152        bdrv_io_unplug(child->bs);
3153    }
3154}
3155
3156void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
3157{
3158    BdrvChild *child;
3159
3160    if (bs->drv && bs->drv->bdrv_register_buf) {
3161        bs->drv->bdrv_register_buf(bs, host, size);
3162    }
3163    QLIST_FOREACH(child, &bs->children, next) {
3164        bdrv_register_buf(child->bs, host, size);
3165    }
3166}
3167
3168void bdrv_unregister_buf(BlockDriverState *bs, void *host)
3169{
3170    BdrvChild *child;
3171
3172    if (bs->drv && bs->drv->bdrv_unregister_buf) {
3173        bs->drv->bdrv_unregister_buf(bs, host);
3174    }
3175    QLIST_FOREACH(child, &bs->children, next) {
3176        bdrv_unregister_buf(child->bs, host);
3177    }
3178}
3179
3180static int coroutine_fn bdrv_co_copy_range_internal(
3181        BdrvChild *src, uint64_t src_offset, BdrvChild *dst,
3182        uint64_t dst_offset, uint64_t bytes,
3183        BdrvRequestFlags read_flags, BdrvRequestFlags write_flags,
3184        bool recurse_src)
3185{
3186    BdrvTrackedRequest req;
3187    int ret;
3188
3189    /* TODO We can support BDRV_REQ_NO_FALLBACK here */
3190    assert(!(read_flags & BDRV_REQ_NO_FALLBACK));
3191    assert(!(write_flags & BDRV_REQ_NO_FALLBACK));
3192
3193    if (!dst || !dst->bs) {
3194        return -ENOMEDIUM;
3195    }
3196    ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes);
3197    if (ret) {
3198        return ret;
3199    }
3200    if (write_flags & BDRV_REQ_ZERO_WRITE) {
3201        return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags);
3202    }
3203
3204    if (!src || !src->bs) {
3205        return -ENOMEDIUM;
3206    }
3207    ret = bdrv_check_byte_request(src->bs, src_offset, bytes);
3208    if (ret) {
3209        return ret;
3210    }
3211
3212    if (!src->bs->drv->bdrv_co_copy_range_from
3213        || !dst->bs->drv->bdrv_co_copy_range_to
3214        || src->bs->encrypted || dst->bs->encrypted) {
3215        return -ENOTSUP;
3216    }
3217
3218    if (recurse_src) {
3219        bdrv_inc_in_flight(src->bs);
3220        tracked_request_begin(&req, src->bs, src_offset, bytes,
3221                              BDRV_TRACKED_READ);
3222
3223        /* BDRV_REQ_SERIALISING is only for write operations */
3224        assert(!(read_flags & BDRV_REQ_SERIALISING));
3225        if (!(read_flags & BDRV_REQ_NO_SERIALISING)) {
3226            bdrv_wait_serialising_requests(&req);
3227        }
3228
3229        ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
3230                                                    src, src_offset,
3231                                                    dst, dst_offset,
3232                                                    bytes,
3233                                                    read_flags, write_flags);
3234
3235        tracked_request_end(&req);
3236        bdrv_dec_in_flight(src->bs);
3237    } else {
3238        bdrv_inc_in_flight(dst->bs);
3239        tracked_request_begin(&req, dst->bs, dst_offset, bytes,
3240                              BDRV_TRACKED_WRITE);
3241        ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req,
3242                                        write_flags);
3243        if (!ret) {
3244            ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
3245                                                      src, src_offset,
3246                                                      dst, dst_offset,
3247                                                      bytes,
3248                                                      read_flags, write_flags);
3249        }
3250        bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret);
3251        tracked_request_end(&req);
3252        bdrv_dec_in_flight(dst->bs);
3253    }
3254
3255    return ret;
3256}
3257
3258/* Copy range from @src to @dst.
3259 *
3260 * See the comment of bdrv_co_copy_range for the parameter and return value
3261 * semantics. */
3262int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset,
3263                                         BdrvChild *dst, uint64_t dst_offset,
3264                                         uint64_t bytes,
3265                                         BdrvRequestFlags read_flags,
3266                                         BdrvRequestFlags write_flags)
3267{
3268    trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes,
3269                                  read_flags, write_flags);
3270    return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
3271                                       bytes, read_flags, write_flags, true);
3272}
3273
3274/* Copy range from @src to @dst, driven from the destination side.
3275 *
3276 * See the comment on bdrv_co_copy_range() for the parameter and return
3277 * value semantics. */
3278int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset,
3279                                       BdrvChild *dst, uint64_t dst_offset,
3280                                       uint64_t bytes,
3281                                       BdrvRequestFlags read_flags,
3282                                       BdrvRequestFlags write_flags)
3283{
3284    trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3285                                read_flags, write_flags);
3286    return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
3287                                       bytes, read_flags, write_flags, false);
3288}
3289
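    /*
     * Convenience wrapper around bdrv_co_copy_range_from(): the copy is
     * started on the source side, which in turn involves the destination
     * through the driver's copy_range callbacks.
     */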
3290int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset,
3291                                    BdrvChild *dst, uint64_t dst_offset,
3292                                    uint64_t bytes, BdrvRequestFlags read_flags,
3293                                    BdrvRequestFlags write_flags)
3294{
3295    return bdrv_co_copy_range_from(src, src_offset,
3296                                   dst, dst_offset,
3297                                   bytes, read_flags, write_flags);
3298}
3299
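    /* Notify all parents of @bs, through their child role's ->resize()
     * callback, that the node has changed size. */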
3300static void bdrv_parent_cb_resize(BlockDriverState *bs)
3301{
3302    BdrvChild *c;
3303    QLIST_FOREACH(c, &bs->parents, next_parent) {
3304        if (c->role->resize) {
3305            c->role->resize(c);
3306        }
3307    }
3308}
3309
3310/**
3311 * Truncate file to 'offset' bytes (needed only for file protocols)
3312 *
3313 * If 'exact' is true, the file must be resized to exactly the given
3314 * 'offset'.  Otherwise, it is sufficient for the node to be at least
3315 * 'offset' bytes in length.
3316 */
3317int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
3318                                  PreallocMode prealloc, Error **errp)
3319{
3320    BlockDriverState *bs = child->bs;
3321    BlockDriver *drv = bs->drv;
3322    BdrvTrackedRequest req;
3323    int64_t old_size, new_bytes;
3324    int ret;
3325
3326
3327    /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
3328    if (!drv) {
3329        error_setg(errp, "No medium inserted");
3330        return -ENOMEDIUM;
3331    }
3332    if (offset < 0) {
3333        error_setg(errp, "Image size cannot be negative");
3334        return -EINVAL;
3335    }
3336
3337    old_size = bdrv_getlength(bs);
3338    if (old_size < 0) {
3339        error_setg_errno(errp, -old_size, "Failed to get old image size");
3340        return old_size;
3341    }
3342
3343    if (offset > old_size) {
3344        new_bytes = offset - old_size;
3345    } else {
3346        new_bytes = 0;
3347    }
3348
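        /*
         * The tracked request covers only the area that is being added,
         * i.e. [old_size, offset); for a shrinking or same-size truncate it
         * is an empty request at @offset.
         */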
3349    bdrv_inc_in_flight(bs);
3350    tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
3351                          BDRV_TRACKED_TRUNCATE);
3352
3353    /* If we are growing the image and potentially using preallocation for the
3354     * new area, we need to make sure that no write requests are made to it
3355     * concurrently or they might be overwritten by preallocation. */
3356    if (new_bytes) {
3357        bdrv_mark_request_serialising(&req, 1);
3358    }
3359    if (bs->read_only) {
3360        error_setg(errp, "Image is read-only");
3361        ret = -EACCES;
3362        goto out;
3363    }
3364    ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req,
3365                                    0);
3366    if (ret < 0) {
3367        error_setg_errno(errp, -ret,
3368                         "Failed to prepare request for truncation");
3369        goto out;
3370    }
3371
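        /*
         * Prefer the driver's own truncate implementation; a filter driver
         * without one simply forwards the request to its file child.  For
         * any other driver without it, resizing is not supported.
         */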
3372    if (drv->bdrv_co_truncate) {
3373        ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, errp);
3374    } else if (bs->file && drv->is_filter) {
3375        ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, errp);
3376    } else {
3377        error_setg(errp, "Image format driver does not support resize");
3378        ret = -ENOTSUP;
3379        goto out;
3380    }
3381    if (ret < 0) {
3382        goto out;
3383    }
3384
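        /*
         * On success, re-read the node's length so that bs->total_sectors
         * matches the new size, and recompute @offset from it so that the
         * request is finished with the size the node actually ended up
         * with.
         */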
3385    ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3386    if (ret < 0) {
3387        error_setg_errno(errp, -ret, "Could not refresh total sector count");
3388    } else {
3389        offset = bs->total_sectors * BDRV_SECTOR_SIZE;
3390    }
3391    /* It's possible that truncation succeeded but refresh_total_sectors
3392     * failed, but the latter doesn't affect how we should finish the request.
3393     * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. */
3394    bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0);
3395
3396out:
3397    tracked_request_end(&req);
3398    bdrv_dec_in_flight(bs);
3399
3400    return ret;
3401}
3402
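    /*
     * Glue for running bdrv_co_truncate() from synchronous callers: the
     * arguments and the result travel through a TruncateCo, and ret stays
     * NOT_DONE until the coroutine has finished.
     */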
3403typedef struct TruncateCo {
3404    BdrvChild *child;
3405    int64_t offset;
3406    bool exact;
3407    PreallocMode prealloc;
3408    Error **errp;
3409    int ret;
3410} TruncateCo;
3411
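    /* Coroutine entry point: store the result and kick the AIO wait loop so
     * that BDRV_POLL_WHILE() in bdrv_truncate() notices completion. */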
3412static void coroutine_fn bdrv_truncate_co_entry(void *opaque)
3413{
3414    TruncateCo *tco = opaque;
3415    tco->ret = bdrv_co_truncate(tco->child, tco->offset, tco->exact,
3416                                tco->prealloc, tco->errp);
3417    aio_wait_kick();
3418}
3419
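    /*
     * Synchronous wrapper around bdrv_co_truncate(): when already in
     * coroutine context the entry function is called directly, otherwise a
     * coroutine is created and polled until tco.ret is set.
     *
     * Illustrative use (hypothetical caller; @child is some existing
     * BdrvChild, error handling shortened):
     *
     *     Error *local_err = NULL;
     *     if (bdrv_truncate(child, 2 * 1024 * 1024, false,
     *                       PREALLOC_MODE_OFF, &local_err) < 0) {
     *         error_report_err(local_err);
     *     }
     */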
3420int bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
3421                  PreallocMode prealloc, Error **errp)
3422{
3423    Coroutine *co;
3424    TruncateCo tco = {
3425        .child      = child,
3426        .offset     = offset,
3427        .exact      = exact,
3428        .prealloc   = prealloc,
3429        .errp       = errp,
3430        .ret        = NOT_DONE,
3431    };
3432
3433    if (qemu_in_coroutine()) {
3434        /* Fast-path if already in coroutine context */
3435        bdrv_truncate_co_entry(&tco);
3436    } else {
3437        co = qemu_coroutine_create(bdrv_truncate_co_entry, &tco);
3438        bdrv_coroutine_enter(child->bs, co);
3439        BDRV_POLL_WHILE(child->bs, tco.ret == NOT_DONE);
3440    }
3441
3442    return tco.ret;
3443}
3444