qemu/block/io.c
   1/*
   2 * Block layer I/O functions
   3 *
   4 * Copyright (c) 2003 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "trace.h"
  27#include "sysemu/block-backend.h"
  28#include "block/aio-wait.h"
  29#include "block/blockjob.h"
  30#include "block/blockjob_int.h"
  31#include "block/block_int.h"
  32#include "block/coroutines.h"
  33#include "block/write-threshold.h"
  34#include "qemu/cutils.h"
  35#include "qemu/memalign.h"
  36#include "qapi/error.h"
  37#include "qemu/error-report.h"
  38#include "qemu/main-loop.h"
  39#include "sysemu/replay.h"
  40
  41/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
  42#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
  43
  44static void bdrv_parent_cb_resize(BlockDriverState *bs);
  45static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
  46    int64_t offset, int64_t bytes, BdrvRequestFlags flags);
  47
  48static void bdrv_parent_drained_begin(BlockDriverState *bs, BdrvChild *ignore,
  49                                      bool ignore_bds_parents)
  50{
  51    BdrvChild *c, *next;
  52
  53    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
  54        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
  55            continue;
  56        }
  57        bdrv_parent_drained_begin_single(c, false);
  58    }
  59}
  60
  61static void bdrv_parent_drained_end_single_no_poll(BdrvChild *c,
  62                                                   int *drained_end_counter)
  63{
  64    assert(c->parent_quiesce_counter > 0);
  65    c->parent_quiesce_counter--;
  66    if (c->klass->drained_end) {
  67        c->klass->drained_end(c, drained_end_counter);
  68    }
  69}
  70
  71void bdrv_parent_drained_end_single(BdrvChild *c)
  72{
  73    int drained_end_counter = 0;
  74    IO_OR_GS_CODE();
  75    bdrv_parent_drained_end_single_no_poll(c, &drained_end_counter);
  76    BDRV_POLL_WHILE(c->bs, qatomic_read(&drained_end_counter) > 0);
  77}
  78
  79static void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild *ignore,
  80                                    bool ignore_bds_parents,
  81                                    int *drained_end_counter)
  82{
  83    BdrvChild *c;
  84
  85    QLIST_FOREACH(c, &bs->parents, next_parent) {
  86        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
  87            continue;
  88        }
  89        bdrv_parent_drained_end_single_no_poll(c, drained_end_counter);
  90    }
  91}
  92
  93static bool bdrv_parent_drained_poll_single(BdrvChild *c)
  94{
  95    if (c->klass->drained_poll) {
  96        return c->klass->drained_poll(c);
  97    }
  98    return false;
  99}
 100
 101static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
 102                                     bool ignore_bds_parents)
 103{
 104    BdrvChild *c, *next;
 105    bool busy = false;
 106
 107    QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
 108        if (c == ignore || (ignore_bds_parents && c->klass->parent_is_bds)) {
 109            continue;
 110        }
 111        busy |= bdrv_parent_drained_poll_single(c);
 112    }
 113
 114    return busy;
 115}
 116
 117void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
 118{
 119    IO_OR_GS_CODE();
 120    c->parent_quiesce_counter++;
 121    if (c->klass->drained_begin) {
 122        c->klass->drained_begin(c);
 123    }
 124    if (poll) {
 125        BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
 126    }
 127}
 128
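     /*
      * Merge a child's BlockLimits into the parent's: alignments and optimal
      * values take the larger of the two, while transfer sizes and iov counts
      * take the smallest non-zero value, so the result satisfies both nodes.
      */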
 129static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
 130{
 131    dst->pdiscard_alignment = MAX(dst->pdiscard_alignment,
 132                                  src->pdiscard_alignment);
 133    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
 134    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
 135    dst->max_hw_transfer = MIN_NON_ZERO(dst->max_hw_transfer,
 136                                        src->max_hw_transfer);
 137    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
 138                                 src->opt_mem_alignment);
 139    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
 140                                 src->min_mem_alignment);
 141    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
 142    dst->max_hw_iov = MIN_NON_ZERO(dst->max_hw_iov, src->max_hw_iov);
 143}
 144
 145typedef struct BdrvRefreshLimitsState {
 146    BlockDriverState *bs;
 147    BlockLimits old_bl;
 148} BdrvRefreshLimitsState;
 149
 150static void bdrv_refresh_limits_abort(void *opaque)
 151{
 152    BdrvRefreshLimitsState *s = opaque;
 153
 154    s->bs->bl = s->old_bl;
 155}
 156
 157static TransactionActionDrv bdrv_refresh_limits_drv = {
 158    .abort = bdrv_refresh_limits_abort,
 159    .clean = g_free,
 160};
 161
  162/* @tran is allowed to be NULL; in this case no rollback is possible. */
 163void bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error **errp)
 164{
 165    ERRP_GUARD();
 166    BlockDriver *drv = bs->drv;
 167    BdrvChild *c;
 168    bool have_limits;
 169
 170    GLOBAL_STATE_CODE();
 171
 172    if (tran) {
 173        BdrvRefreshLimitsState *s = g_new(BdrvRefreshLimitsState, 1);
 174        *s = (BdrvRefreshLimitsState) {
 175            .bs = bs,
 176            .old_bl = bs->bl,
 177        };
 178        tran_add(tran, &bdrv_refresh_limits_drv, s);
 179    }
 180
 181    memset(&bs->bl, 0, sizeof(bs->bl));
 182
 183    if (!drv) {
 184        return;
 185    }
 186
  187    /* Default alignment based on whether the driver has a byte interface */
 188    bs->bl.request_alignment = (drv->bdrv_co_preadv ||
 189                                drv->bdrv_aio_preadv ||
 190                                drv->bdrv_co_preadv_part) ? 1 : 512;
 191
 192    /* Take some limits from the children as a default */
 193    have_limits = false;
 194    QLIST_FOREACH(c, &bs->children, next) {
 195        if (c->role & (BDRV_CHILD_DATA | BDRV_CHILD_FILTERED | BDRV_CHILD_COW))
 196        {
 197            bdrv_merge_limits(&bs->bl, &c->bs->bl);
 198            have_limits = true;
 199        }
 200    }
 201
 202    if (!have_limits) {
 203        bs->bl.min_mem_alignment = 512;
 204        bs->bl.opt_mem_alignment = qemu_real_host_page_size;
 205
 206        /* Safe default since most protocols use readv()/writev()/etc */
 207        bs->bl.max_iov = IOV_MAX;
 208    }
 209
 210    /* Then let the driver override it */
 211    if (drv->bdrv_refresh_limits) {
 212        drv->bdrv_refresh_limits(bs, errp);
 213        if (*errp) {
 214            return;
 215        }
 216    }
 217
 218    if (bs->bl.request_alignment > BDRV_MAX_ALIGNMENT) {
 219        error_setg(errp, "Driver requires too large request alignment");
 220    }
 221}
 222
 223/**
 224 * The copy-on-read flag is actually a reference count so multiple users may
 225 * use the feature without worrying about clobbering its previous state.
 226 * Copy-on-read stays enabled until all users have called to disable it.
 227 */
 228void bdrv_enable_copy_on_read(BlockDriverState *bs)
 229{
 230    IO_CODE();
 231    qatomic_inc(&bs->copy_on_read);
 232}
 233
 234void bdrv_disable_copy_on_read(BlockDriverState *bs)
 235{
 236    int old = qatomic_fetch_dec(&bs->copy_on_read);
 237    IO_CODE();
 238    assert(old >= 1);
 239}
 240
 241typedef struct {
 242    Coroutine *co;
 243    BlockDriverState *bs;
 244    bool done;
 245    bool begin;
 246    bool recursive;
 247    bool poll;
 248    BdrvChild *parent;
 249    bool ignore_bds_parents;
 250    int *drained_end_counter;
 251} BdrvCoDrainData;
 252
 253static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
 254{
 255    BdrvCoDrainData *data = opaque;
 256    BlockDriverState *bs = data->bs;
 257
 258    if (data->begin) {
 259        bs->drv->bdrv_co_drain_begin(bs);
 260    } else {
 261        bs->drv->bdrv_co_drain_end(bs);
 262    }
 263
 264    /* Set data->done and decrement drained_end_counter before bdrv_wakeup() */
 265    qatomic_mb_set(&data->done, true);
 266    if (!data->begin) {
 267        qatomic_dec(data->drained_end_counter);
 268    }
 269    bdrv_dec_in_flight(bs);
 270
 271    g_free(data);
 272}
 273
 274/* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
 275static void bdrv_drain_invoke(BlockDriverState *bs, bool begin,
 276                              int *drained_end_counter)
 277{
 278    BdrvCoDrainData *data;
 279
 280    if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
 281            (!begin && !bs->drv->bdrv_co_drain_end)) {
 282        return;
 283    }
 284
 285    data = g_new(BdrvCoDrainData, 1);
 286    *data = (BdrvCoDrainData) {
 287        .bs = bs,
 288        .done = false,
 289        .begin = begin,
 290        .drained_end_counter = drained_end_counter,
 291    };
 292
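         /* Balanced by the qatomic_dec() in bdrv_drain_invoke_entry() once
          * the driver's drain_end callback has run. */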
 293    if (!begin) {
 294        qatomic_inc(drained_end_counter);
 295    }
 296
 297    /* Make sure the driver callback completes during the polling phase for
 298     * drain_begin. */
 299    bdrv_inc_in_flight(bs);
 300    data->co = qemu_coroutine_create(bdrv_drain_invoke_entry, data);
 301    aio_co_schedule(bdrv_get_aio_context(bs), data->co);
 302}
 303
 304/* Returns true if BDRV_POLL_WHILE() should go into a blocking aio_poll() */
 305bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
 306                     BdrvChild *ignore_parent, bool ignore_bds_parents)
 307{
 308    BdrvChild *child, *next;
 309    IO_OR_GS_CODE();
 310
 311    if (bdrv_parent_drained_poll(bs, ignore_parent, ignore_bds_parents)) {
 312        return true;
 313    }
 314
 315    if (qatomic_read(&bs->in_flight)) {
 316        return true;
 317    }
 318
 319    if (recursive) {
 320        assert(!ignore_bds_parents);
 321        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
 322            if (bdrv_drain_poll(child->bs, recursive, child, false)) {
 323                return true;
 324            }
 325        }
 326    }
 327
 328    return false;
 329}
 330
 331static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
 332                                      BdrvChild *ignore_parent)
 333{
 334    return bdrv_drain_poll(bs, recursive, ignore_parent, false);
 335}
 336
 337static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
 338                                  BdrvChild *parent, bool ignore_bds_parents,
 339                                  bool poll);
 340static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
 341                                BdrvChild *parent, bool ignore_bds_parents,
 342                                int *drained_end_counter);
 343
 344static void bdrv_co_drain_bh_cb(void *opaque)
 345{
 346    BdrvCoDrainData *data = opaque;
 347    Coroutine *co = data->co;
 348    BlockDriverState *bs = data->bs;
 349
 350    if (bs) {
 351        AioContext *ctx = bdrv_get_aio_context(bs);
 352        aio_context_acquire(ctx);
 353        bdrv_dec_in_flight(bs);
 354        if (data->begin) {
 355            assert(!data->drained_end_counter);
 356            bdrv_do_drained_begin(bs, data->recursive, data->parent,
 357                                  data->ignore_bds_parents, data->poll);
 358        } else {
 359            assert(!data->poll);
 360            bdrv_do_drained_end(bs, data->recursive, data->parent,
 361                                data->ignore_bds_parents,
 362                                data->drained_end_counter);
 363        }
 364        aio_context_release(ctx);
 365    } else {
 366        assert(data->begin);
 367        bdrv_drain_all_begin();
 368    }
 369
 370    data->done = true;
 371    aio_co_wake(co);
 372}
 373
 374static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
 375                                                bool begin, bool recursive,
 376                                                BdrvChild *parent,
 377                                                bool ignore_bds_parents,
 378                                                bool poll,
 379                                                int *drained_end_counter)
 380{
 381    BdrvCoDrainData data;
 382    Coroutine *self = qemu_coroutine_self();
 383    AioContext *ctx = bdrv_get_aio_context(bs);
 384    AioContext *co_ctx = qemu_coroutine_get_aio_context(self);
 385
 386    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
 387     * other coroutines run if they were queued by aio_co_enter(). */
 388
 389    assert(qemu_in_coroutine());
 390    data = (BdrvCoDrainData) {
 391        .co = self,
 392        .bs = bs,
 393        .done = false,
 394        .begin = begin,
 395        .recursive = recursive,
 396        .parent = parent,
 397        .ignore_bds_parents = ignore_bds_parents,
 398        .poll = poll,
 399        .drained_end_counter = drained_end_counter,
 400    };
 401
 402    if (bs) {
 403        bdrv_inc_in_flight(bs);
 404    }
 405
 406    /*
 407     * Temporarily drop the lock across yield or we would get deadlocks.
  408     * bdrv_co_drain_bh_cb() reacquires the lock as needed.
 409     *
 410     * When we yield below, the lock for the current context will be
 411     * released, so if this is actually the lock that protects bs, don't drop
 412     * it a second time.
 413     */
 414    if (ctx != co_ctx) {
 415        aio_context_release(ctx);
 416    }
 417    replay_bh_schedule_oneshot_event(ctx, bdrv_co_drain_bh_cb, &data);
 418
 419    qemu_coroutine_yield();
 420    /* If we are resumed from some other event (such as an aio completion or a
 421     * timer callback), it is a bug in the caller that should be fixed. */
 422    assert(data.done);
 423
  424    /* Reacquire the AioContext of bs if we dropped it */
 425    if (ctx != co_ctx) {
 426        aio_context_acquire(ctx);
 427    }
 428}
 429
 430void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
 431                                   BdrvChild *parent, bool ignore_bds_parents)
 432{
 433    IO_OR_GS_CODE();
 434    assert(!qemu_in_coroutine());
 435
 436    /* Stop things in parent-to-child order */
 437    if (qatomic_fetch_inc(&bs->quiesce_counter) == 0) {
 438        aio_disable_external(bdrv_get_aio_context(bs));
 439    }
 440
 441    bdrv_parent_drained_begin(bs, parent, ignore_bds_parents);
 442    bdrv_drain_invoke(bs, true, NULL);
 443}
 444
 445static void bdrv_do_drained_begin(BlockDriverState *bs, bool recursive,
 446                                  BdrvChild *parent, bool ignore_bds_parents,
 447                                  bool poll)
 448{
 449    BdrvChild *child, *next;
 450
 451    if (qemu_in_coroutine()) {
 452        bdrv_co_yield_to_drain(bs, true, recursive, parent, ignore_bds_parents,
 453                               poll, NULL);
 454        return;
 455    }
 456
 457    bdrv_do_drained_begin_quiesce(bs, parent, ignore_bds_parents);
 458
 459    if (recursive) {
 460        assert(!ignore_bds_parents);
 461        bs->recursive_quiesce_counter++;
 462        QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
 463            bdrv_do_drained_begin(child->bs, true, child, ignore_bds_parents,
 464                                  false);
 465        }
 466    }
 467
 468    /*
 469     * Wait for drained requests to finish.
 470     *
 471     * Calling BDRV_POLL_WHILE() only once for the top-level node is okay: The
 472     * call is needed so things in this AioContext can make progress even
 473     * though we don't return to the main AioContext loop - this automatically
 474     * includes other nodes in the same AioContext and therefore all child
 475     * nodes.
 476     */
 477    if (poll) {
 478        assert(!ignore_bds_parents);
 479        BDRV_POLL_WHILE(bs, bdrv_drain_poll_top_level(bs, recursive, parent));
 480    }
 481}
 482
 483void bdrv_drained_begin(BlockDriverState *bs)
 484{
 485    IO_OR_GS_CODE();
 486    bdrv_do_drained_begin(bs, false, NULL, false, true);
 487}
 488
 489void bdrv_subtree_drained_begin(BlockDriverState *bs)
 490{
 491    IO_OR_GS_CODE();
 492    bdrv_do_drained_begin(bs, true, NULL, false, true);
 493}
 494
 495/**
 496 * This function does not poll, nor must any of its recursively called
 497 * functions.  The *drained_end_counter pointee will be incremented
 498 * once for every background operation scheduled, and decremented once
 499 * the operation settles.  Therefore, the pointer must remain valid
 500 * until the pointee reaches 0.  That implies that whoever sets up the
 501 * pointee has to poll until it is 0.
 502 *
 503 * We use atomic operations to access *drained_end_counter, because
 504 * (1) when called from bdrv_set_aio_context_ignore(), the subgraph of
 505 *     @bs may contain nodes in different AioContexts,
 506 * (2) bdrv_drain_all_end() uses the same counter for all nodes,
 507 *     regardless of which AioContext they are in.
 508 */
 509static void bdrv_do_drained_end(BlockDriverState *bs, bool recursive,
 510                                BdrvChild *parent, bool ignore_bds_parents,
 511                                int *drained_end_counter)
 512{
 513    BdrvChild *child;
 514    int old_quiesce_counter;
 515
 516    assert(drained_end_counter != NULL);
 517
 518    if (qemu_in_coroutine()) {
 519        bdrv_co_yield_to_drain(bs, false, recursive, parent, ignore_bds_parents,
 520                               false, drained_end_counter);
 521        return;
 522    }
 523    assert(bs->quiesce_counter > 0);
 524
 525    /* Re-enable things in child-to-parent order */
 526    bdrv_drain_invoke(bs, false, drained_end_counter);
 527    bdrv_parent_drained_end(bs, parent, ignore_bds_parents,
 528                            drained_end_counter);
 529
 530    old_quiesce_counter = qatomic_fetch_dec(&bs->quiesce_counter);
 531    if (old_quiesce_counter == 1) {
 532        aio_enable_external(bdrv_get_aio_context(bs));
 533    }
 534
 535    if (recursive) {
 536        assert(!ignore_bds_parents);
 537        bs->recursive_quiesce_counter--;
 538        QLIST_FOREACH(child, &bs->children, next) {
 539            bdrv_do_drained_end(child->bs, true, child, ignore_bds_parents,
 540                                drained_end_counter);
 541        }
 542    }
 543}
 544
 545void bdrv_drained_end(BlockDriverState *bs)
 546{
 547    int drained_end_counter = 0;
 548    IO_OR_GS_CODE();
 549    bdrv_do_drained_end(bs, false, NULL, false, &drained_end_counter);
 550    BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
 551}
 552
 553void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter)
 554{
 555    IO_CODE();
 556    bdrv_do_drained_end(bs, false, NULL, false, drained_end_counter);
 557}
 558
 559void bdrv_subtree_drained_end(BlockDriverState *bs)
 560{
 561    int drained_end_counter = 0;
 562    IO_OR_GS_CODE();
 563    bdrv_do_drained_end(bs, true, NULL, false, &drained_end_counter);
 564    BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
 565}
 566
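     /*
      * Bring a newly attached @child up to the drain depth of @new_parent:
      * begin one recursive drained section on the child's subtree for each
      * recursive drain currently active on the new parent.
      */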
 567void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent)
 568{
 569    int i;
 570    IO_OR_GS_CODE();
 571
 572    for (i = 0; i < new_parent->recursive_quiesce_counter; i++) {
 573        bdrv_do_drained_begin(child->bs, true, child, false, true);
 574    }
 575}
 576
 577void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent)
 578{
 579    int drained_end_counter = 0;
 580    int i;
 581    IO_OR_GS_CODE();
 582
 583    for (i = 0; i < old_parent->recursive_quiesce_counter; i++) {
 584        bdrv_do_drained_end(child->bs, true, child, false,
 585                            &drained_end_counter);
 586    }
 587
 588    BDRV_POLL_WHILE(child->bs, qatomic_read(&drained_end_counter) > 0);
 589}
 590
 591/*
 592 * Wait for pending requests to complete on a single BlockDriverState subtree,
  593 * and suspend the block driver's internal I/O until the next request arrives.
 594 *
  595 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState's
  596 * AioContext lock.
 597 */
 598void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
 599{
 600    IO_OR_GS_CODE();
 601    assert(qemu_in_coroutine());
 602    bdrv_drained_begin(bs);
 603    bdrv_drained_end(bs);
 604}
 605
 606void bdrv_drain(BlockDriverState *bs)
 607{
 608    IO_OR_GS_CODE();
 609    bdrv_drained_begin(bs);
 610    bdrv_drained_end(bs);
 611}
 612
 613static void bdrv_drain_assert_idle(BlockDriverState *bs)
 614{
 615    BdrvChild *child, *next;
 616
 617    assert(qatomic_read(&bs->in_flight) == 0);
 618    QLIST_FOREACH_SAFE(child, &bs->children, next, next) {
 619        bdrv_drain_assert_idle(child->bs);
 620    }
 621}
 622
 623unsigned int bdrv_drain_all_count = 0;
 624
 625static bool bdrv_drain_all_poll(void)
 626{
 627    BlockDriverState *bs = NULL;
 628    bool result = false;
 629    GLOBAL_STATE_CODE();
 630
 631    /* bdrv_drain_poll() can't make changes to the graph and we are holding the
 632     * main AioContext lock, so iterating bdrv_next_all_states() is safe. */
 633    while ((bs = bdrv_next_all_states(bs))) {
 634        AioContext *aio_context = bdrv_get_aio_context(bs);
 635        aio_context_acquire(aio_context);
 636        result |= bdrv_drain_poll(bs, false, NULL, true);
 637        aio_context_release(aio_context);
 638    }
 639
 640    return result;
 641}
 642
 643/*
 644 * Wait for pending requests to complete across all BlockDriverStates
 645 *
  646 * This function does not flush data to disk; use bdrv_flush_all() for that
 647 * after calling this function.
 648 *
 649 * This pauses all block jobs and disables external clients. It must
 650 * be paired with bdrv_drain_all_end().
 651 *
 652 * NOTE: no new block jobs or BlockDriverStates can be created between
 653 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 654 */
 655void bdrv_drain_all_begin(void)
 656{
 657    BlockDriverState *bs = NULL;
 658    GLOBAL_STATE_CODE();
 659
 660    if (qemu_in_coroutine()) {
 661        bdrv_co_yield_to_drain(NULL, true, false, NULL, true, true, NULL);
 662        return;
 663    }
 664
  665    /*
  666     * The bdrv queue is managed by record/replay; waiting for
  667     * the in-flight I/O requests to finish could block
  668     * forever.
  669     */
 670    if (replay_events_enabled()) {
 671        return;
 672    }
 673
 674    /* AIO_WAIT_WHILE() with a NULL context can only be called from the main
 675     * loop AioContext, so make sure we're in the main context. */
 676    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 677    assert(bdrv_drain_all_count < INT_MAX);
 678    bdrv_drain_all_count++;
 679
 680    /* Quiesce all nodes, without polling in-flight requests yet. The graph
 681     * cannot change during this loop. */
 682    while ((bs = bdrv_next_all_states(bs))) {
 683        AioContext *aio_context = bdrv_get_aio_context(bs);
 684
 685        aio_context_acquire(aio_context);
 686        bdrv_do_drained_begin(bs, false, NULL, true, false);
 687        aio_context_release(aio_context);
 688    }
 689
 690    /* Now poll the in-flight requests */
 691    AIO_WAIT_WHILE(NULL, bdrv_drain_all_poll());
 692
 693    while ((bs = bdrv_next_all_states(bs))) {
 694        bdrv_drain_assert_idle(bs);
 695    }
 696}
 697
 698void bdrv_drain_all_end_quiesce(BlockDriverState *bs)
 699{
 700    int drained_end_counter = 0;
 701    GLOBAL_STATE_CODE();
 702
 703    g_assert(bs->quiesce_counter > 0);
 704    g_assert(!bs->refcnt);
 705
 706    while (bs->quiesce_counter) {
 707        bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
 708    }
 709    BDRV_POLL_WHILE(bs, qatomic_read(&drained_end_counter) > 0);
 710}
 711
 712void bdrv_drain_all_end(void)
 713{
 714    BlockDriverState *bs = NULL;
 715    int drained_end_counter = 0;
 716    GLOBAL_STATE_CODE();
 717
  718    /*
  719     * The bdrv queue is managed by record/replay; waiting for
  720     * the in-flight I/O requests to finish could block
  721     * forever.
  722     */
 723    if (replay_events_enabled()) {
 724        return;
 725    }
 726
 727    while ((bs = bdrv_next_all_states(bs))) {
 728        AioContext *aio_context = bdrv_get_aio_context(bs);
 729
 730        aio_context_acquire(aio_context);
 731        bdrv_do_drained_end(bs, false, NULL, true, &drained_end_counter);
 732        aio_context_release(aio_context);
 733    }
 734
 735    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 736    AIO_WAIT_WHILE(NULL, qatomic_read(&drained_end_counter) > 0);
 737
 738    assert(bdrv_drain_all_count > 0);
 739    bdrv_drain_all_count--;
 740}
 741
 742void bdrv_drain_all(void)
 743{
 744    GLOBAL_STATE_CODE();
 745    bdrv_drain_all_begin();
 746    bdrv_drain_all_end();
 747}
 748
 749/**
 750 * Remove an active request from the tracked requests list
 751 *
 752 * This function should be called when a tracked request is completing.
 753 */
 754static void tracked_request_end(BdrvTrackedRequest *req)
 755{
 756    if (req->serialising) {
 757        qatomic_dec(&req->bs->serialising_in_flight);
 758    }
 759
 760    qemu_co_mutex_lock(&req->bs->reqs_lock);
 761    QLIST_REMOVE(req, list);
 762    qemu_co_queue_restart_all(&req->wait_queue);
 763    qemu_co_mutex_unlock(&req->bs->reqs_lock);
 764}
 765
 766/**
 767 * Add an active request to the tracked requests list
 768 */
 769static void tracked_request_begin(BdrvTrackedRequest *req,
 770                                  BlockDriverState *bs,
 771                                  int64_t offset,
 772                                  int64_t bytes,
 773                                  enum BdrvTrackedRequestType type)
 774{
 775    bdrv_check_request(offset, bytes, &error_abort);
 776
 777    *req = (BdrvTrackedRequest){
 778        .bs = bs,
 779        .offset         = offset,
 780        .bytes          = bytes,
 781        .type           = type,
 782        .co             = qemu_coroutine_self(),
 783        .serialising    = false,
 784        .overlap_offset = offset,
 785        .overlap_bytes  = bytes,
 786    };
 787
 788    qemu_co_queue_init(&req->wait_queue);
 789
 790    qemu_co_mutex_lock(&bs->reqs_lock);
 791    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
 792    qemu_co_mutex_unlock(&bs->reqs_lock);
 793}
 794
 795static bool tracked_request_overlaps(BdrvTrackedRequest *req,
 796                                     int64_t offset, int64_t bytes)
 797{
 798    bdrv_check_request(offset, bytes, &error_abort);
 799
 800    /*        aaaa   bbbb */
 801    if (offset >= req->overlap_offset + req->overlap_bytes) {
 802        return false;
 803    }
 804    /* bbbb   aaaa        */
 805    if (req->overlap_offset >= offset + bytes) {
 806        return false;
 807    }
 808    return true;
 809}
 810
 811/* Called with self->bs->reqs_lock held */
 812static BdrvTrackedRequest *
 813bdrv_find_conflicting_request(BdrvTrackedRequest *self)
 814{
 815    BdrvTrackedRequest *req;
 816
 817    QLIST_FOREACH(req, &self->bs->tracked_requests, list) {
 818        if (req == self || (!req->serialising && !self->serialising)) {
 819            continue;
 820        }
 821        if (tracked_request_overlaps(req, self->overlap_offset,
 822                                     self->overlap_bytes))
 823        {
 824            /*
 825             * Hitting this means there was a reentrant request, for
 826             * example, a block driver issuing nested requests.  This must
 827             * never happen since it means deadlock.
 828             */
 829            assert(qemu_coroutine_self() != req->co);
 830
 831            /*
 832             * If the request is already (indirectly) waiting for us, or
 833             * will wait for us as soon as it wakes up, then just go on
 834             * (instead of producing a deadlock in the former case).
 835             */
 836            if (!req->waiting_for) {
 837                return req;
 838            }
 839        }
 840    }
 841
 842    return NULL;
 843}
 844
 845/* Called with self->bs->reqs_lock held */
 846static bool coroutine_fn
 847bdrv_wait_serialising_requests_locked(BdrvTrackedRequest *self)
 848{
 849    BdrvTrackedRequest *req;
 850    bool waited = false;
 851
 852    while ((req = bdrv_find_conflicting_request(self))) {
 853        self->waiting_for = req;
 854        qemu_co_queue_wait(&req->wait_queue, &self->bs->reqs_lock);
 855        self->waiting_for = NULL;
 856        waited = true;
 857    }
 858
 859    return waited;
 860}
 861
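     /*
      * Widen the request's overlap region to @align boundaries and mark it
      * serialising, so that overlapping requests started afterwards wait for
      * it in bdrv_wait_serialising_requests_locked().
      */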
 862/* Called with req->bs->reqs_lock held */
 863static void tracked_request_set_serialising(BdrvTrackedRequest *req,
 864                                            uint64_t align)
 865{
 866    int64_t overlap_offset = req->offset & ~(align - 1);
 867    int64_t overlap_bytes =
 868        ROUND_UP(req->offset + req->bytes, align) - overlap_offset;
 869
 870    bdrv_check_request(req->offset, req->bytes, &error_abort);
 871
 872    if (!req->serialising) {
 873        qatomic_inc(&req->bs->serialising_in_flight);
 874        req->serialising = true;
 875    }
 876
 877    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
 878    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
 879}
 880
 881/**
 882 * Return the tracked request on @bs for the current coroutine, or
 883 * NULL if there is none.
 884 */
 885BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs)
 886{
 887    BdrvTrackedRequest *req;
 888    Coroutine *self = qemu_coroutine_self();
 889    IO_CODE();
 890
 891    QLIST_FOREACH(req, &bs->tracked_requests, list) {
 892        if (req->co == self) {
 893            return req;
 894        }
 895    }
 896
 897    return NULL;
 898}
 899
 900/**
 901 * Round a region to cluster boundaries
 902 */
 903void bdrv_round_to_clusters(BlockDriverState *bs,
 904                            int64_t offset, int64_t bytes,
 905                            int64_t *cluster_offset,
 906                            int64_t *cluster_bytes)
 907{
 908    BlockDriverInfo bdi;
 909    IO_CODE();
 910    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
 911        *cluster_offset = offset;
 912        *cluster_bytes = bytes;
 913    } else {
 914        int64_t c = bdi.cluster_size;
 915        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
 916        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
 917    }
 918}
 919
 920static int bdrv_get_cluster_size(BlockDriverState *bs)
 921{
 922    BlockDriverInfo bdi;
 923    int ret;
 924
 925    ret = bdrv_get_info(bs, &bdi);
 926    if (ret < 0 || bdi.cluster_size == 0) {
 927        return bs->bl.request_alignment;
 928    } else {
 929        return bdi.cluster_size;
 930    }
 931}
 932
 933void bdrv_inc_in_flight(BlockDriverState *bs)
 934{
 935    IO_CODE();
 936    qatomic_inc(&bs->in_flight);
 937}
 938
 939void bdrv_wakeup(BlockDriverState *bs)
 940{
 941    IO_CODE();
 942    aio_wait_kick();
 943}
 944
 945void bdrv_dec_in_flight(BlockDriverState *bs)
 946{
 947    IO_CODE();
 948    qatomic_dec(&bs->in_flight);
 949    bdrv_wakeup(bs);
 950}
 951
 952static bool coroutine_fn bdrv_wait_serialising_requests(BdrvTrackedRequest *self)
 953{
 954    BlockDriverState *bs = self->bs;
 955    bool waited = false;
 956
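         /* Fast path without taking reqs_lock: if no request on this BDS is
          * marked serialising, nothing can conflict with this one. */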
 957    if (!qatomic_read(&bs->serialising_in_flight)) {
 958        return false;
 959    }
 960
 961    qemu_co_mutex_lock(&bs->reqs_lock);
 962    waited = bdrv_wait_serialising_requests_locked(self);
 963    qemu_co_mutex_unlock(&bs->reqs_lock);
 964
 965    return waited;
 966}
 967
 968bool coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req,
 969                                                uint64_t align)
 970{
 971    bool waited;
 972    IO_CODE();
 973
 974    qemu_co_mutex_lock(&req->bs->reqs_lock);
 975
 976    tracked_request_set_serialising(req, align);
 977    waited = bdrv_wait_serialising_requests_locked(req);
 978
 979    qemu_co_mutex_unlock(&req->bs->reqs_lock);
 980
 981    return waited;
 982}
 983
 984int bdrv_check_qiov_request(int64_t offset, int64_t bytes,
 985                            QEMUIOVector *qiov, size_t qiov_offset,
 986                            Error **errp)
 987{
 988    /*
 989     * Check generic offset/bytes correctness
 990     */
 991
 992    if (offset < 0) {
 993        error_setg(errp, "offset is negative: %" PRIi64, offset);
 994        return -EIO;
 995    }
 996
 997    if (bytes < 0) {
 998        error_setg(errp, "bytes is negative: %" PRIi64, bytes);
 999        return -EIO;
1000    }
1001
1002    if (bytes > BDRV_MAX_LENGTH) {
1003        error_setg(errp, "bytes(%" PRIi64 ") exceeds maximum(%" PRIi64 ")",
1004                   bytes, BDRV_MAX_LENGTH);
1005        return -EIO;
1006    }
1007
1008    if (offset > BDRV_MAX_LENGTH) {
1009        error_setg(errp, "offset(%" PRIi64 ") exceeds maximum(%" PRIi64 ")",
1010                   offset, BDRV_MAX_LENGTH);
1011        return -EIO;
1012    }
1013
1014    if (offset > BDRV_MAX_LENGTH - bytes) {
1015        error_setg(errp, "sum of offset(%" PRIi64 ") and bytes(%" PRIi64 ") "
1016                   "exceeds maximum(%" PRIi64 ")", offset, bytes,
1017                   BDRV_MAX_LENGTH);
1018        return -EIO;
1019    }
1020
1021    if (!qiov) {
1022        return 0;
1023    }
1024
1025    /*
1026     * Check qiov and qiov_offset
1027     */
1028
1029    if (qiov_offset > qiov->size) {
1030        error_setg(errp, "qiov_offset(%zu) overflow io vector size(%zu)",
1031                   qiov_offset, qiov->size);
1032        return -EIO;
1033    }
1034
1035    if (bytes > qiov->size - qiov_offset) {
1036        error_setg(errp, "bytes(%" PRIi64 ") + qiov_offset(%zu) overflow io "
1037                   "vector size(%zu)", bytes, qiov_offset, qiov->size);
1038        return -EIO;
1039    }
1040
1041    return 0;
1042}
1043
1044int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp)
1045{
1046    return bdrv_check_qiov_request(offset, bytes, NULL, 0, errp);
1047}
1048
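     /*
      * Like bdrv_check_qiov_request(), but additionally rejects requests
      * larger than BDRV_REQUEST_MAX_BYTES, so the byte count also fits the
      * 32-bit limits assumed by older code paths.
      */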
1049static int bdrv_check_request32(int64_t offset, int64_t bytes,
1050                                QEMUIOVector *qiov, size_t qiov_offset)
1051{
1052    int ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL);
1053    if (ret < 0) {
1054        return ret;
1055    }
1056
1057    if (bytes > BDRV_REQUEST_MAX_BYTES) {
1058        return -EIO;
1059    }
1060
1061    return 0;
1062}
1063
1064int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
1065                       int64_t bytes, BdrvRequestFlags flags)
1066{
1067    IO_CODE();
1068    return bdrv_pwritev(child, offset, bytes, NULL,
1069                        BDRV_REQ_ZERO_WRITE | flags);
1070}
1071
1072/*
1073 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
1074 * The operation is sped up by checking the block status and only writing
 1075 * zeroes to regions that do not already read back as zeroes. Optional
1076 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
1077 * BDRV_REQ_FUA).
1078 *
1079 * Returns < 0 on error, 0 on success. For error codes see bdrv_pwrite().
1080 */
1081int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
1082{
1083    int ret;
1084    int64_t target_size, bytes, offset = 0;
1085    BlockDriverState *bs = child->bs;
1086    IO_CODE();
1087
1088    target_size = bdrv_getlength(bs);
1089    if (target_size < 0) {
1090        return target_size;
1091    }
1092
1093    for (;;) {
1094        bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
1095        if (bytes <= 0) {
1096            return 0;
1097        }
1098        ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
1099        if (ret < 0) {
1100            return ret;
1101        }
1102        if (ret & BDRV_BLOCK_ZERO) {
1103            offset += bytes;
1104            continue;
1105        }
1106        ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
1107        if (ret < 0) {
1108            return ret;
1109        }
1110        offset += bytes;
1111    }
1112}
1113
1114/* See bdrv_pwrite() for the return codes */
1115int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int64_t bytes)
1116{
1117    int ret;
1118    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
1119    IO_CODE();
1120
1121    if (bytes < 0) {
1122        return -EINVAL;
1123    }
1124
 1125    ret = bdrv_preadv(child, offset, bytes, &qiov, 0);
1126
1127    return ret < 0 ? ret : bytes;
1128}
1129
 1130/* Return the number of bytes on success or < 0 on error. Important errors are:
 1131  -EIO         generic I/O error (may be returned for any kind of failure)
 1132  -ENOMEDIUM   no medium inserted
 1133  -EINVAL      invalid offset or number of bytes
 1134  -EACCES      trying to write to a read-only device
 1135*/
1136int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf,
1137                int64_t bytes)
1138{
1139    int ret;
1140    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
1141    IO_CODE();
1142
1143    if (bytes < 0) {
1144        return -EINVAL;
1145    }
1146
1147    ret = bdrv_pwritev(child, offset, bytes, &qiov, 0);
1148
1149    return ret < 0 ? ret : bytes;
1150}
1151
1152/*
1153 * Writes to the file and ensures that no writes are reordered across this
1154 * request (acts as a barrier)
1155 *
1156 * Returns 0 on success, -errno in error cases.
1157 */
1158int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
1159                     const void *buf, int64_t count)
1160{
1161    int ret;
1162    IO_CODE();
1163
1164    ret = bdrv_pwrite(child, offset, buf, count);
1165    if (ret < 0) {
1166        return ret;
1167    }
1168
1169    ret = bdrv_flush(child->bs);
1170    if (ret < 0) {
1171        return ret;
1172    }
1173
1174    return 0;
1175}
1176
1177typedef struct CoroutineIOCompletion {
1178    Coroutine *coroutine;
1179    int ret;
1180} CoroutineIOCompletion;
1181
1182static void bdrv_co_io_em_complete(void *opaque, int ret)
1183{
1184    CoroutineIOCompletion *co = opaque;
1185
1186    co->ret = ret;
1187    aio_co_wake(co->coroutine);
1188}
1189
1190static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
1191                                           int64_t offset, int64_t bytes,
1192                                           QEMUIOVector *qiov,
1193                                           size_t qiov_offset, int flags)
1194{
1195    BlockDriver *drv = bs->drv;
1196    int64_t sector_num;
1197    unsigned int nb_sectors;
1198    QEMUIOVector local_qiov;
1199    int ret;
1200
1201    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
1202    assert(!(flags & ~BDRV_REQ_MASK));
1203    assert(!(flags & BDRV_REQ_NO_FALLBACK));
1204
1205    if (!drv) {
1206        return -ENOMEDIUM;
1207    }
1208
1209    if (drv->bdrv_co_preadv_part) {
1210        return drv->bdrv_co_preadv_part(bs, offset, bytes, qiov, qiov_offset,
1211                                        flags);
1212    }
1213
1214    if (qiov_offset > 0 || bytes != qiov->size) {
1215        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
1216        qiov = &local_qiov;
1217    }
1218
1219    if (drv->bdrv_co_preadv) {
1220        ret = drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
1221        goto out;
1222    }
1223
1224    if (drv->bdrv_aio_preadv) {
1225        BlockAIOCB *acb;
1226        CoroutineIOCompletion co = {
1227            .coroutine = qemu_coroutine_self(),
1228        };
1229
1230        acb = drv->bdrv_aio_preadv(bs, offset, bytes, qiov, flags,
1231                                   bdrv_co_io_em_complete, &co);
1232        if (acb == NULL) {
1233            ret = -EIO;
1234            goto out;
1235        } else {
1236            qemu_coroutine_yield();
1237            ret = co.ret;
1238            goto out;
1239        }
1240    }
1241
1242    sector_num = offset >> BDRV_SECTOR_BITS;
1243    nb_sectors = bytes >> BDRV_SECTOR_BITS;
1244
1245    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
1246    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
1247    assert(bytes <= BDRV_REQUEST_MAX_BYTES);
1248    assert(drv->bdrv_co_readv);
1249
1250    ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
1251
1252out:
1253    if (qiov == &local_qiov) {
1254        qemu_iovec_destroy(&local_qiov);
1255    }
1256
1257    return ret;
1258}
1259
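     /*
      * Dispatch a write to the driver using the same fallback chain as
      * bdrv_driver_preadv().  Flags not advertised in supported_write_flags
      * are emulated here; in particular, BDRV_REQ_FUA falls back to a
      * separate bdrv_co_flush() after the write completes.
      */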
1260static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
1261                                            int64_t offset, int64_t bytes,
1262                                            QEMUIOVector *qiov,
1263                                            size_t qiov_offset,
1264                                            BdrvRequestFlags flags)
1265{
1266    BlockDriver *drv = bs->drv;
1267    int64_t sector_num;
1268    unsigned int nb_sectors;
1269    QEMUIOVector local_qiov;
1270    int ret;
1271
1272    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
1273    assert(!(flags & ~BDRV_REQ_MASK));
1274    assert(!(flags & BDRV_REQ_NO_FALLBACK));
1275
1276    if (!drv) {
1277        return -ENOMEDIUM;
1278    }
1279
1280    if (drv->bdrv_co_pwritev_part) {
1281        ret = drv->bdrv_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset,
1282                                        flags & bs->supported_write_flags);
1283        flags &= ~bs->supported_write_flags;
1284        goto emulate_flags;
1285    }
1286
1287    if (qiov_offset > 0 || bytes != qiov->size) {
1288        qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
1289        qiov = &local_qiov;
1290    }
1291
1292    if (drv->bdrv_co_pwritev) {
1293        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
1294                                   flags & bs->supported_write_flags);
1295        flags &= ~bs->supported_write_flags;
1296        goto emulate_flags;
1297    }
1298
1299    if (drv->bdrv_aio_pwritev) {
1300        BlockAIOCB *acb;
1301        CoroutineIOCompletion co = {
1302            .coroutine = qemu_coroutine_self(),
1303        };
1304
1305        acb = drv->bdrv_aio_pwritev(bs, offset, bytes, qiov,
1306                                    flags & bs->supported_write_flags,
1307                                    bdrv_co_io_em_complete, &co);
1308        flags &= ~bs->supported_write_flags;
1309        if (acb == NULL) {
1310            ret = -EIO;
1311        } else {
1312            qemu_coroutine_yield();
1313            ret = co.ret;
1314        }
1315        goto emulate_flags;
1316    }
1317
1318    sector_num = offset >> BDRV_SECTOR_BITS;
1319    nb_sectors = bytes >> BDRV_SECTOR_BITS;
1320
1321    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
1322    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE));
1323    assert(bytes <= BDRV_REQUEST_MAX_BYTES);
1324
1325    assert(drv->bdrv_co_writev);
1326    ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov,
1327                              flags & bs->supported_write_flags);
1328    flags &= ~bs->supported_write_flags;
1329
1330emulate_flags:
1331    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
1332        ret = bdrv_co_flush(bs);
1333    }
1334
1335    if (qiov == &local_qiov) {
1336        qemu_iovec_destroy(&local_qiov);
1337    }
1338
1339    return ret;
1340}
1341
1342static int coroutine_fn
1343bdrv_driver_pwritev_compressed(BlockDriverState *bs, int64_t offset,
1344                               int64_t bytes, QEMUIOVector *qiov,
1345                               size_t qiov_offset)
1346{
1347    BlockDriver *drv = bs->drv;
1348    QEMUIOVector local_qiov;
1349    int ret;
1350
1351    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
1352
1353    if (!drv) {
1354        return -ENOMEDIUM;
1355    }
1356
1357    if (!block_driver_can_compress(drv)) {
1358        return -ENOTSUP;
1359    }
1360
1361    if (drv->bdrv_co_pwritev_compressed_part) {
1362        return drv->bdrv_co_pwritev_compressed_part(bs, offset, bytes,
1363                                                    qiov, qiov_offset);
1364    }
1365
1366    if (qiov_offset == 0) {
1367        return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
1368    }
1369
1370    qemu_iovec_init_slice(&local_qiov, qiov, qiov_offset, bytes);
1371    ret = drv->bdrv_co_pwritev_compressed(bs, offset, bytes, &local_qiov);
1372    qemu_iovec_destroy(&local_qiov);
1373
1374    return ret;
1375}
1376
1377static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
1378        int64_t offset, int64_t bytes, QEMUIOVector *qiov,
1379        size_t qiov_offset, int flags)
1380{
1381    BlockDriverState *bs = child->bs;
1382
1383    /* Perform I/O through a temporary buffer so that users who scribble over
1384     * their read buffer while the operation is in progress do not end up
1385     * modifying the image file.  This is critical for zero-copy guest I/O
1386     * where anything might happen inside guest memory.
1387     */
1388    void *bounce_buffer = NULL;
1389
1390    BlockDriver *drv = bs->drv;
1391    int64_t cluster_offset;
1392    int64_t cluster_bytes;
1393    int64_t skip_bytes;
1394    int ret;
1395    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
1396                                    BDRV_REQUEST_MAX_BYTES);
1397    int64_t progress = 0;
1398    bool skip_write;
1399
1400    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
1401
1402    if (!drv) {
1403        return -ENOMEDIUM;
1404    }
1405
1406    /*
1407     * Do not write anything when the BDS is inactive.  That is not
1408     * allowed, and it would not help.
1409     */
1410    skip_write = (bs->open_flags & BDRV_O_INACTIVE);
1411
1412    /* FIXME We cannot require callers to have write permissions when all they
1413     * are doing is a read request. If we did things right, write permissions
1414     * would be obtained anyway, but internally by the copy-on-read code. As
1415     * long as it is implemented here rather than in a separate filter driver,
1416     * the copy-on-read code doesn't have its own BdrvChild, however, for which
1417     * it could request permissions. Therefore we have to bypass the permission
1418     * system for the moment. */
1419    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
1420
 1421    /* Cover the entire cluster so that no additional backing file I/O is required
 1422     * when allocating a cluster in the image file.  Note that this value may exceed
1423     * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
1424     * is one reason we loop rather than doing it all at once.
1425     */
1426    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
1427    skip_bytes = offset - cluster_offset;
1428
1429    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
1430                                   cluster_offset, cluster_bytes);
1431
1432    while (cluster_bytes) {
1433        int64_t pnum;
1434
1435        if (skip_write) {
1436            ret = 1; /* "already allocated", so nothing will be copied */
1437            pnum = MIN(cluster_bytes, max_transfer);
1438        } else {
1439            ret = bdrv_is_allocated(bs, cluster_offset,
1440                                    MIN(cluster_bytes, max_transfer), &pnum);
1441            if (ret < 0) {
1442                /*
1443                 * Safe to treat errors in querying allocation as if
1444                 * unallocated; we'll probably fail again soon on the
1445                 * read, but at least that will set a decent errno.
1446                 */
1447                pnum = MIN(cluster_bytes, max_transfer);
1448            }
1449
1450            /* Stop at EOF if the image ends in the middle of the cluster */
1451            if (ret == 0 && pnum == 0) {
1452                assert(progress >= bytes);
1453                break;
1454            }
1455
1456            assert(skip_bytes < pnum);
1457        }
1458
1459        if (ret <= 0) {
1460            QEMUIOVector local_qiov;
1461
1462            /* Must copy-on-read; use the bounce buffer */
1463            pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
1464            if (!bounce_buffer) {
1465                int64_t max_we_need = MAX(pnum, cluster_bytes - pnum);
1466                int64_t max_allowed = MIN(max_transfer, MAX_BOUNCE_BUFFER);
1467                int64_t bounce_buffer_len = MIN(max_we_need, max_allowed);
1468
1469                bounce_buffer = qemu_try_blockalign(bs, bounce_buffer_len);
1470                if (!bounce_buffer) {
1471                    ret = -ENOMEM;
1472                    goto err;
1473                }
1474            }
1475            qemu_iovec_init_buf(&local_qiov, bounce_buffer, pnum);
1476
1477            ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
1478                                     &local_qiov, 0, 0);
1479            if (ret < 0) {
1480                goto err;
1481            }
1482
1483            bdrv_debug_event(bs, BLKDBG_COR_WRITE);
1484            if (drv->bdrv_co_pwrite_zeroes &&
1485                buffer_is_zero(bounce_buffer, pnum)) {
1486                /* FIXME: Should we (perhaps conditionally) be setting
1487                 * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
1488                 * that still correctly reads as zero? */
1489                ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum,
1490                                               BDRV_REQ_WRITE_UNCHANGED);
1491            } else {
 1492                /* This does not change the data on the disk; it is not
1493                 * necessary to flush even in cache=writethrough mode.
1494                 */
1495                ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
1496                                          &local_qiov, 0,
1497                                          BDRV_REQ_WRITE_UNCHANGED);
1498            }
1499
1500            if (ret < 0) {
1501                /* It might be okay to ignore write errors for guest
1502                 * requests.  If this is a deliberate copy-on-read
1503                 * then we don't want to ignore the error.  Simply
1504                 * report it in all cases.
1505                 */
1506                goto err;
1507            }
1508
1509            if (!(flags & BDRV_REQ_PREFETCH)) {
1510                qemu_iovec_from_buf(qiov, qiov_offset + progress,
1511                                    bounce_buffer + skip_bytes,
1512                                    MIN(pnum - skip_bytes, bytes - progress));
1513            }
1514        } else if (!(flags & BDRV_REQ_PREFETCH)) {
1515            /* Read directly into the destination */
1516            ret = bdrv_driver_preadv(bs, offset + progress,
1517                                     MIN(pnum - skip_bytes, bytes - progress),
1518                                     qiov, qiov_offset + progress, 0);
1519            if (ret < 0) {
1520                goto err;
1521            }
1522        }
1523
1524        cluster_offset += pnum;
1525        cluster_bytes -= pnum;
1526        progress += pnum - skip_bytes;
1527        skip_bytes = 0;
1528    }
1529    ret = 0;
1530
1531err:
1532    qemu_vfree(bounce_buffer);
1533    return ret;
1534}
1535
1536/*
1537 * Forwards an already correctly aligned request to the BlockDriver. This
1538 * handles copy on read, zeroing after EOF, and fragmentation of large
1539 * reads; any other features must be implemented by the caller.
1540 */
1541static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
1542    BdrvTrackedRequest *req, int64_t offset, int64_t bytes,
1543    int64_t align, QEMUIOVector *qiov, size_t qiov_offset, int flags)
1544{
1545    BlockDriverState *bs = child->bs;
1546    int64_t total_bytes, max_bytes;
1547    int ret = 0;
1548    int64_t bytes_remaining = bytes;
1549    int max_transfer;
1550
1551    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
1552    assert(is_power_of_2(align));
1553    assert((offset & (align - 1)) == 0);
1554    assert((bytes & (align - 1)) == 0);
1555    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1556    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1557                                   align);
1558
1559    /* TODO: We would need a per-BDS .supported_read_flags and
1560     * potential fallback support, if we ever implement any read flags
1561     * to pass through to drivers.  For now, there aren't any
1562     * passthrough flags.  */
1563    assert(!(flags & ~(BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH)));
1564
1565    /* Handle Copy on Read and associated serialisation */
1566    if (flags & BDRV_REQ_COPY_ON_READ) {
1567        /* If we touch the same cluster it counts as an overlap.  This
1568         * guarantees that allocating writes will be serialized and not race
1569         * with each other for the same cluster.  For example, in copy-on-read
1570         * it ensures that the CoR read and write operations are atomic and
1571         * guest writes cannot interleave between them. */
1572        bdrv_make_request_serialising(req, bdrv_get_cluster_size(bs));
1573    } else {
1574        bdrv_wait_serialising_requests(req);
1575    }
1576
1577    if (flags & BDRV_REQ_COPY_ON_READ) {
1578        int64_t pnum;
1579
1580        /* The flag BDRV_REQ_COPY_ON_READ has reached its addressee */
1581        flags &= ~BDRV_REQ_COPY_ON_READ;
1582
1583        ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
1584        if (ret < 0) {
1585            goto out;
1586        }
1587
1588        if (!ret || pnum != bytes) {
1589            ret = bdrv_co_do_copy_on_readv(child, offset, bytes,
1590                                           qiov, qiov_offset, flags);
1591            goto out;
1592        } else if (flags & BDRV_REQ_PREFETCH) {
1593            goto out;
1594        }
1595    }
1596
1597    /* Forward the request to the BlockDriver, possibly fragmenting it */
1598    total_bytes = bdrv_getlength(bs);
1599    if (total_bytes < 0) {
1600        ret = total_bytes;
1601        goto out;
1602    }
1603
1604    assert(!(flags & ~bs->supported_read_flags));
1605
1606    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
1607    if (bytes <= max_bytes && bytes <= max_transfer) {
1608        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, flags);
1609        goto out;
1610    }
1611
1612    while (bytes_remaining) {
1613        int64_t num;
1614
1615        if (max_bytes) {
1616            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
1617            assert(num);
1618
1619            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
1620                                     num, qiov,
1621                                     qiov_offset + bytes - bytes_remaining,
1622                                     flags);
1623            max_bytes -= num;
1624        } else {
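                 /* Beyond EOF: the remainder of the request reads as zeroes */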
1625            num = bytes_remaining;
1626            ret = qemu_iovec_memset(qiov, qiov_offset + bytes - bytes_remaining,
1627                                    0, bytes_remaining);
1628        }
1629        if (ret < 0) {
1630            goto out;
1631        }
1632        bytes_remaining -= num;
1633    }
1634
1635out:
1636    return ret < 0 ? ret : 0;
1637}
1638
1639/*
1640 * Request padding
1641 *
1642 *  |<---- align ----->|                     |<----- align ---->|
1643 *  |<- head ->|<------------- bytes ------------->|<-- tail -->|
1644 *  |          |       |                     |     |            |
1645 * -*----------$-------*-------- ... --------*-----$------------*---
1646 *  |          |       |                     |     |            |
1647 *  |          offset  |                     |     end          |
1648 *  ALIGN_DOWN(offset) ALIGN_UP(offset)      ALIGN_DOWN(end)   ALIGN_UP(end)
1649 *  [buf   ... )                             [tail_buf          )
1650 *
1651 * @buf is an aligned allocation needed to store @head and @tail paddings. @head
1652 * is placed at the beginning of @buf and @tail at the end.
1653 *
1654 * @tail_buf is a pointer to the sub-buffer corresponding to the align-sized
1655 * chunk around the tail, if a tail exists.
1656 *
1657 * @merge_reads is true for small requests, i.e. when
1658 * @buf_len == @head + bytes + @tail. In this case both head and tail may
1659 * exist while @buf_len == align and @tail_buf == @buf.
1660 */
1661typedef struct BdrvRequestPadding {
1662    uint8_t *buf;
1663    size_t buf_len;
1664    uint8_t *tail_buf;
1665    size_t head;
1666    size_t tail;
1667    bool merge_reads;
1668    QEMUIOVector local_qiov;
1669} BdrvRequestPadding;
1670
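/*
 * For example (illustrative values): with align = 512, offset = 700 and
 * bytes = 2000, bdrv_init_padding() yields head = 188, tail = 372,
 * buf_len = 2 * align = 1024, merge_reads = false and tail_buf = buf + 512;
 * bdrv_pad_request() then turns this into the padded request
 * offset = 512, bytes = 2560.
 */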
1671static bool bdrv_init_padding(BlockDriverState *bs,
1672                              int64_t offset, int64_t bytes,
1673                              BdrvRequestPadding *pad)
1674{
1675    int64_t align = bs->bl.request_alignment;
1676    int64_t sum;
1677
1678    bdrv_check_request(offset, bytes, &error_abort);
1679    assert(align <= INT_MAX); /* documented in block/block_int.h */
1680    assert(align <= SIZE_MAX / 2); /* so we can allocate the buffer */
1681
1682    memset(pad, 0, sizeof(*pad));
1683
1684    pad->head = offset & (align - 1);
1685    pad->tail = ((offset + bytes) & (align - 1));
1686    if (pad->tail) {
1687        pad->tail = align - pad->tail;
1688    }
1689
1690    if (!pad->head && !pad->tail) {
1691        return false;
1692    }
1693
1694    assert(bytes); /* Nothing good in aligning zero-length requests */
1695
1696    sum = pad->head + bytes + pad->tail;
1697    pad->buf_len = (sum > align && pad->head && pad->tail) ? 2 * align : align;
1698    pad->buf = qemu_blockalign(bs, pad->buf_len);
1699    pad->merge_reads = sum == pad->buf_len;
1700    if (pad->tail) {
1701        pad->tail_buf = pad->buf + pad->buf_len - align;
1702    }
1703
1704    return true;
1705}
1706
1707static int bdrv_padding_rmw_read(BdrvChild *child,
1708                                 BdrvTrackedRequest *req,
1709                                 BdrvRequestPadding *pad,
1710                                 bool zero_middle)
1711{
1712    QEMUIOVector local_qiov;
1713    BlockDriverState *bs = child->bs;
1714    uint64_t align = bs->bl.request_alignment;
1715    int ret;
1716
1717    assert(req->serialising && pad->buf);
1718
1719    if (pad->head || pad->merge_reads) {
1720        int64_t bytes = pad->merge_reads ? pad->buf_len : align;
1721
1722        qemu_iovec_init_buf(&local_qiov, pad->buf, bytes);
1723
1724        if (pad->head) {
1725            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1726        }
1727        if (pad->merge_reads && pad->tail) {
1728            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1729        }
1730        ret = bdrv_aligned_preadv(child, req, req->overlap_offset, bytes,
1731                                  align, &local_qiov, 0, 0);
1732        if (ret < 0) {
1733            return ret;
1734        }
1735        if (pad->head) {
1736            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1737        }
1738        if (pad->merge_reads && pad->tail) {
1739            bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1740        }
1741
1742        if (pad->merge_reads) {
1743            goto zero_mem;
1744        }
1745    }
1746
1747    if (pad->tail) {
1748        qemu_iovec_init_buf(&local_qiov, pad->tail_buf, align);
1749
1750        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1751        ret = bdrv_aligned_preadv(
1752                child, req,
1753                req->overlap_offset + req->overlap_bytes - align,
1754                align, align, &local_qiov, 0, 0);
1755        if (ret < 0) {
1756            return ret;
1757        }
1758        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1759    }
1760
1761zero_mem:
1762    if (zero_middle) {
1763        memset(pad->buf + pad->head, 0, pad->buf_len - pad->head - pad->tail);
1764    }
1765
1766    return 0;
1767}
1768
1769static void bdrv_padding_destroy(BdrvRequestPadding *pad)
1770{
1771    if (pad->buf) {
1772        qemu_vfree(pad->buf);
1773        qemu_iovec_destroy(&pad->local_qiov);
1774    }
1775    memset(pad, 0, sizeof(*pad));
1776}
1777
1778/*
1779 * bdrv_pad_request
1780 *
1781 * Exchange request parameters with padded request if needed. Don't include RMW
1782 * read of padding, bdrv_padding_rmw_read() should be called separately if
1783 * needed.
1784 *
1785 * Request parameters (@qiov, @qiov_offset, @offset, @bytes) are in-out:
1786 *  - on function start they represent the original request
1787 *  - on failure or when padding is not needed they are unchanged
1788 *  - on success, when padding is needed, they represent the padded request
1789 */
1790static int bdrv_pad_request(BlockDriverState *bs,
1791                            QEMUIOVector **qiov, size_t *qiov_offset,
1792                            int64_t *offset, int64_t *bytes,
1793                            BdrvRequestPadding *pad, bool *padded)
1794{
1795    int ret;
1796
1797    bdrv_check_qiov_request(*offset, *bytes, *qiov, *qiov_offset, &error_abort);
1798
1799    if (!bdrv_init_padding(bs, *offset, *bytes, pad)) {
1800        if (padded) {
1801            *padded = false;
1802        }
1803        return 0;
1804    }
1805
1806    ret = qemu_iovec_init_extended(&pad->local_qiov, pad->buf, pad->head,
1807                                   *qiov, *qiov_offset, *bytes,
1808                                   pad->buf + pad->buf_len - pad->tail,
1809                                   pad->tail);
1810    if (ret < 0) {
1811        bdrv_padding_destroy(pad);
1812        return ret;
1813    }
1814    *bytes += pad->head + pad->tail;
1815    *offset -= pad->head;
1816    *qiov = &pad->local_qiov;
1817    *qiov_offset = 0;
1818    if (padded) {
1819        *padded = true;
1820    }
1821
1822    return 0;
1823}
1824
1825int coroutine_fn bdrv_co_preadv(BdrvChild *child,
1826    int64_t offset, int64_t bytes, QEMUIOVector *qiov,
1827    BdrvRequestFlags flags)
1828{
1829    IO_CODE();
1830    return bdrv_co_preadv_part(child, offset, bytes, qiov, 0, flags);
1831}
1832
1833int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
1834    int64_t offset, int64_t bytes,
1835    QEMUIOVector *qiov, size_t qiov_offset,
1836    BdrvRequestFlags flags)
1837{
1838    BlockDriverState *bs = child->bs;
1839    BdrvTrackedRequest req;
1840    BdrvRequestPadding pad;
1841    int ret;
1842    IO_CODE();
1843
1844    trace_bdrv_co_preadv_part(bs, offset, bytes, flags);
1845
1846    if (!bdrv_is_inserted(bs)) {
1847        return -ENOMEDIUM;
1848    }
1849
1850    ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset);
1851    if (ret < 0) {
1852        return ret;
1853    }
1854
1855    if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
1856        /*
1857         * Aligning a zero-length request is nonsense. Even if the driver gives
1858         * zero length a special meaning (like qcow2_co_pwritev_compressed_part),
1859         * we can't pass it to the driver due to request_alignment.
1860         *
1861         * Still, there is no reason to return an error if someone does an
1862         * unaligned zero-length read occasionally.
1863         */
1864        return 0;
1865    }
1866
1867    bdrv_inc_in_flight(bs);
1868
1869    /* Don't do copy-on-read if we read data before write operation */
1870    if (qatomic_read(&bs->copy_on_read)) {
1871        flags |= BDRV_REQ_COPY_ON_READ;
1872    }
1873
1874    ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
1875                           NULL);
1876    if (ret < 0) {
1877        goto fail;
1878    }
1879
1880    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
1881    ret = bdrv_aligned_preadv(child, &req, offset, bytes,
1882                              bs->bl.request_alignment,
1883                              qiov, qiov_offset, flags);
1884    tracked_request_end(&req);
1885    bdrv_padding_destroy(&pad);
1886
1887fail:
1888    bdrv_dec_in_flight(bs);
1889
1890    return ret;
1891}
1892
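/*
 * Worker for write-zeroes requests: fragments the request at the zero-write
 * alignment, tries the driver's efficient .bdrv_co_pwrite_zeroes first and,
 * unless BDRV_REQ_NO_FALLBACK is set, falls back to writing a zeroed bounce
 * buffer.  A trailing flush is issued when FUA was requested but has to be
 * emulated.
 */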
1893static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
1894    int64_t offset, int64_t bytes, BdrvRequestFlags flags)
1895{
1896    BlockDriver *drv = bs->drv;
1897    QEMUIOVector qiov;
1898    void *buf = NULL;
1899    int ret = 0;
1900    bool need_flush = false;
1901    int head = 0;
1902    int tail = 0;
1903
1904    int64_t max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes,
1905                                            INT64_MAX);
1906    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
1907                        bs->bl.request_alignment);
1908    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);
1909
1910    bdrv_check_request(offset, bytes, &error_abort);
1911
1912    if (!drv) {
1913        return -ENOMEDIUM;
1914    }
1915
1916    if ((flags & ~bs->supported_zero_flags) & BDRV_REQ_NO_FALLBACK) {
1917        return -ENOTSUP;
1918    }
1919
1920    /* Invalidate the cached block-status data range if this write overlaps */
1921    bdrv_bsc_invalidate_range(bs, offset, bytes);
1922
1923    assert(alignment % bs->bl.request_alignment == 0);
1924    head = offset % alignment;
1925    tail = (offset + bytes) % alignment;
1926    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
1927    assert(max_write_zeroes >= bs->bl.request_alignment);
1928
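    /*
     * Illustrative example, assuming max_write_zeroes and max_transfer are
     * large enough: with alignment = 64K, offset = 1000 and bytes = 200000,
     * the loop below issues three operations: an unaligned head of 64536
     * bytes, an aligned bulk of 131072 bytes and an unaligned tail of 4392
     * bytes.
     */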
1929    while (bytes > 0 && !ret) {
1930        int64_t num = bytes;
1931
1932        /* Align request.  Block drivers can expect the "bulk" of the request
1933         * to be aligned, and that unaligned requests do not cross cluster
1934         * boundaries.
1935         */
1936        if (head) {
1937            /* Make a small request up to the first aligned sector. For
1938             * convenience, limit this request to max_transfer even if
1939             * we don't need to fall back to writes.  */
1940            num = MIN(MIN(bytes, max_transfer), alignment - head);
1941            head = (head + num) % alignment;
1942            assert(num < max_write_zeroes);
1943        } else if (tail && num > alignment) {
1944            /* Shorten the request to the last aligned sector.  */
1945            num -= tail;
1946        }
1947
1948        /* limit request size */
1949        if (num > max_write_zeroes) {
1950            num = max_write_zeroes;
1951        }
1952
1953        ret = -ENOTSUP;
1954        /* First try the efficient write zeroes operation */
1955        if (drv->bdrv_co_pwrite_zeroes) {
1956            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1957                                             flags & bs->supported_zero_flags);
1958            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1959                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1960                need_flush = true;
1961            }
1962        } else {
1963            assert(!bs->supported_zero_flags);
1964        }
1965
1966        if (ret == -ENOTSUP && !(flags & BDRV_REQ_NO_FALLBACK)) {
1967            /* Fall back to bounce buffer if write zeroes is unsupported */
1968            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1969
1970            if ((flags & BDRV_REQ_FUA) &&
1971                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1972                /* No need for bdrv_driver_pwrite() to do a fallback
1973                 * flush on each chunk; use just one at the end */
1974                write_flags &= ~BDRV_REQ_FUA;
1975                need_flush = true;
1976            }
1977            num = MIN(num, max_transfer);
1978            if (buf == NULL) {
1979                buf = qemu_try_blockalign0(bs, num);
1980                if (buf == NULL) {
1981                    ret = -ENOMEM;
1982                    goto fail;
1983                }
1984            }
1985            qemu_iovec_init_buf(&qiov, buf, num);
1986
1987            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, 0, write_flags);
1988
1989            /* Keep the bounce buffer around if it is big enough for
1990             * all future requests.
1991             */
1992            if (num < max_transfer) {
1993                qemu_vfree(buf);
1994                buf = NULL;
1995            }
1996        }
1997
1998        offset += num;
1999        bytes -= num;
2000    }
2001
2002fail:
2003    if (ret == 0 && need_flush) {
2004        ret = bdrv_co_flush(bs);
2005    }
2006    qemu_vfree(buf);
2007    return ret;
2008}
2009
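/*
 * Prepare a tracked write request: refuse writes to read-only nodes, apply
 * request serialising when BDRV_REQ_SERIALISING is set (returning -EBUSY for
 * BDRV_REQ_NO_WAIT instead of waiting on a conflict), wait for overlapping
 * serialising requests, assert the child's permissions and check the
 * configured write threshold.  Returns 0 on success or a negative errno.
 */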
2010static inline int coroutine_fn
2011bdrv_co_write_req_prepare(BdrvChild *child, int64_t offset, int64_t bytes,
2012                          BdrvTrackedRequest *req, int flags)
2013{
2014    BlockDriverState *bs = child->bs;
2015
2016    bdrv_check_request(offset, bytes, &error_abort);
2017
2018    if (bdrv_is_read_only(bs)) {
2019        return -EPERM;
2020    }
2021
2022    assert(!(bs->open_flags & BDRV_O_INACTIVE));
2023    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
2024    assert(!(flags & ~BDRV_REQ_MASK));
2025    assert(!((flags & BDRV_REQ_NO_WAIT) && !(flags & BDRV_REQ_SERIALISING)));
2026
2027    if (flags & BDRV_REQ_SERIALISING) {
2028        QEMU_LOCK_GUARD(&bs->reqs_lock);
2029
2030        tracked_request_set_serialising(req, bdrv_get_cluster_size(bs));
2031
2032        if ((flags & BDRV_REQ_NO_WAIT) && bdrv_find_conflicting_request(req)) {
2033            return -EBUSY;
2034        }
2035
2036        bdrv_wait_serialising_requests_locked(req);
2037    } else {
2038        bdrv_wait_serialising_requests(req);
2039    }
2040
2041    assert(req->overlap_offset <= offset);
2042    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
2043    assert(offset + bytes <= bs->total_sectors * BDRV_SECTOR_SIZE ||
2044           child->perm & BLK_PERM_RESIZE);
2045
2046    switch (req->type) {
2047    case BDRV_TRACKED_WRITE:
2048    case BDRV_TRACKED_DISCARD:
2049        if (flags & BDRV_REQ_WRITE_UNCHANGED) {
2050            assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
2051        } else {
2052            assert(child->perm & BLK_PERM_WRITE);
2053        }
2054        bdrv_write_threshold_check_write(bs, offset, bytes);
2055        return 0;
2056    case BDRV_TRACKED_TRUNCATE:
2057        assert(child->perm & BLK_PERM_RESIZE);
2058        return 0;
2059    default:
2060        abort();
2061    }
2062}
2063
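/*
 * Complete a tracked write request: bump the node's write generation, grow
 * bs->total_sectors when a successful write or truncate extended the image
 * (discards never grow it), and update dirty bitmaps and the
 * wr_highest_offset statistic.
 */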
2064static inline void coroutine_fn
2065bdrv_co_write_req_finish(BdrvChild *child, int64_t offset, int64_t bytes,
2066                         BdrvTrackedRequest *req, int ret)
2067{
2068    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
2069    BlockDriverState *bs = child->bs;
2070
2071    bdrv_check_request(offset, bytes, &error_abort);
2072
2073    qatomic_inc(&bs->write_gen);
2074
2075    /*
2076     * Discard cannot extend the image, but in error handling cases, such as
2077     * when reverting a qcow2 cluster allocation, the discarded range can go
2078     * past the end of the image file, so we cannot assert about BDRV_TRACKED_DISCARD
2079     * here. Instead, just skip it, since semantically a discard request
2080     * beyond EOF cannot expand the image anyway.
2081     */
2082    if (ret == 0 &&
2083        (req->type == BDRV_TRACKED_TRUNCATE ||
2084         end_sector > bs->total_sectors) &&
2085        req->type != BDRV_TRACKED_DISCARD) {
2086        bs->total_sectors = end_sector;
2087        bdrv_parent_cb_resize(bs);
2088        bdrv_dirty_bitmap_truncate(bs, end_sector << BDRV_SECTOR_BITS);
2089    }
2090    if (req->bytes) {
2091        switch (req->type) {
2092        case BDRV_TRACKED_WRITE:
2093            stat64_max(&bs->wr_highest_offset, offset + bytes);
2094            /* fall through, to set dirty bits */
2095        case BDRV_TRACKED_DISCARD:
2096            bdrv_set_dirty(bs, offset, bytes);
2097            break;
2098        default:
2099            break;
2100        }
2101    }
2102}
2103
2104/*
2105 * Forwards an already correctly aligned write request to the BlockDriver,
2106 * after possibly fragmenting it.
2107 */
2108static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
2109    BdrvTrackedRequest *req, int64_t offset, int64_t bytes,
2110    int64_t align, QEMUIOVector *qiov, size_t qiov_offset,
2111    BdrvRequestFlags flags)
2112{
2113    BlockDriverState *bs = child->bs;
2114    BlockDriver *drv = bs->drv;
2115    int ret;
2116
2117    int64_t bytes_remaining = bytes;
2118    int max_transfer;
2119
2120    bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, &error_abort);
2121
2122    if (!drv) {
2123        return -ENOMEDIUM;
2124    }
2125
2126    if (bdrv_has_readonly_bitmaps(bs)) {
2127        return -EPERM;
2128    }
2129
2130    assert(is_power_of_2(align));
2131    assert((offset & (align - 1)) == 0);
2132    assert((bytes & (align - 1)) == 0);
2133    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
2134                                   align);
2135
2136    ret = bdrv_co_write_req_prepare(child, offset, bytes, req, flags);
2137
2138    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
2139        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
2140        qemu_iovec_is_zero(qiov, qiov_offset, bytes)) {
2141        flags |= BDRV_REQ_ZERO_WRITE;
2142        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
2143            flags |= BDRV_REQ_MAY_UNMAP;
2144        }
2145    }
2146
2147    if (ret < 0) {
2148        /* Do nothing, bdrv_co_write_req_prepare() decided to fail this request */
2149    } else if (flags & BDRV_REQ_ZERO_WRITE) {
2150        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
2151        ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
2152    } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
2153        ret = bdrv_driver_pwritev_compressed(bs, offset, bytes,
2154                                             qiov, qiov_offset);
2155    } else if (bytes <= max_transfer) {
2156        bdrv_debug_event(bs, BLKDBG_PWRITEV);
2157        ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, qiov_offset, flags);
2158    } else {
2159        bdrv_debug_event(bs, BLKDBG_PWRITEV);
2160        while (bytes_remaining) {
2161            int num = MIN(bytes_remaining, max_transfer);
2162            int local_flags = flags;
2163
2164            assert(num);
2165            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
2166                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
2167                /* If FUA is going to be emulated by flush, we only
2168                 * need to flush on the last iteration */
2169                local_flags &= ~BDRV_REQ_FUA;
2170            }
2171
2172            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
2173                                      num, qiov,
2174                                      qiov_offset + bytes - bytes_remaining,
2175                                      local_flags);
2176            if (ret < 0) {
2177                break;
2178            }
2179            bytes_remaining -= num;
2180        }
2181    }
2182    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
2183
2184    if (ret >= 0) {
2185        ret = 0;
2186    }
2187    bdrv_co_write_req_finish(child, offset, bytes, req, ret);
2188
2189    return ret;
2190}
2191
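/*
 * Zero a possibly unaligned region: an unaligned head or tail is handled with
 * a serialised read-modify-write of the surrounding aligned block (the block
 * is read into the pad buffer, the requested part is zeroed in memory and the
 * buffer is written back), while the aligned middle keeps BDRV_REQ_ZERO_WRITE.
 */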
2192static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
2193                                                int64_t offset,
2194                                                int64_t bytes,
2195                                                BdrvRequestFlags flags,
2196                                                BdrvTrackedRequest *req)
2197{
2198    BlockDriverState *bs = child->bs;
2199    QEMUIOVector local_qiov;
2200    uint64_t align = bs->bl.request_alignment;
2201    int ret = 0;
2202    bool padding;
2203    BdrvRequestPadding pad;
2204
2205    padding = bdrv_init_padding(bs, offset, bytes, &pad);
2206    if (padding) {
2207        assert(!(flags & BDRV_REQ_NO_WAIT));
2208        bdrv_make_request_serialising(req, align);
2209
2210        bdrv_padding_rmw_read(child, req, &pad, true);
2211
2212        if (pad.head || pad.merge_reads) {
2213            int64_t aligned_offset = offset & ~(align - 1);
2214            int64_t write_bytes = pad.merge_reads ? pad.buf_len : align;
2215
2216            qemu_iovec_init_buf(&local_qiov, pad.buf, write_bytes);
2217            ret = bdrv_aligned_pwritev(child, req, aligned_offset, write_bytes,
2218                                       align, &local_qiov, 0,
2219                                       flags & ~BDRV_REQ_ZERO_WRITE);
2220            if (ret < 0 || pad.merge_reads) {
2221                /* Error or all work is done */
2222                goto out;
2223            }
2224            offset += write_bytes - pad.head;
2225            bytes -= write_bytes - pad.head;
2226        }
2227    }
2228
2229    assert(!bytes || (offset & (align - 1)) == 0);
2230    if (bytes >= align) {
2231        /* Write the aligned part in the middle. */
2232        int64_t aligned_bytes = bytes & ~(align - 1);
2233        ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
2234                                   NULL, 0, flags);
2235        if (ret < 0) {
2236            goto out;
2237        }
2238        bytes -= aligned_bytes;
2239        offset += aligned_bytes;
2240    }
2241
2242    assert(!bytes || (offset & (align - 1)) == 0);
2243    if (bytes) {
2244        assert(align == pad.tail + bytes);
2245
2246        qemu_iovec_init_buf(&local_qiov, pad.tail_buf, align);
2247        ret = bdrv_aligned_pwritev(child, req, offset, align, align,
2248                                   &local_qiov, 0,
2249                                   flags & ~BDRV_REQ_ZERO_WRITE);
2250    }
2251
2252out:
2253    bdrv_padding_destroy(&pad);
2254
2255    return ret;
2256}
2257
2258/*
2259 * Handle a write request in coroutine context
2260 */
2261int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
2262    int64_t offset, int64_t bytes, QEMUIOVector *qiov,
2263    BdrvRequestFlags flags)
2264{
2265    IO_CODE();
2266    return bdrv_co_pwritev_part(child, offset, bytes, qiov, 0, flags);
2267}
2268
2269int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
2270    int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset,
2271    BdrvRequestFlags flags)
2272{
2273    BlockDriverState *bs = child->bs;
2274    BdrvTrackedRequest req;
2275    uint64_t align = bs->bl.request_alignment;
2276    BdrvRequestPadding pad;
2277    int ret;
2278    bool padded = false;
2279    IO_CODE();
2280
2281    trace_bdrv_co_pwritev_part(child->bs, offset, bytes, flags);
2282
2283    if (!bdrv_is_inserted(bs)) {
2284        return -ENOMEDIUM;
2285    }
2286
2287    if (flags & BDRV_REQ_ZERO_WRITE) {
2288        ret = bdrv_check_qiov_request(offset, bytes, qiov, qiov_offset, NULL);
2289    } else {
2290        ret = bdrv_check_request32(offset, bytes, qiov, qiov_offset);
2291    }
2292    if (ret < 0) {
2293        return ret;
2294    }
2295
2296    /* If the request is misaligned then we can't make it efficient */
2297    if ((flags & BDRV_REQ_NO_FALLBACK) &&
2298        !QEMU_IS_ALIGNED(offset | bytes, align))
2299    {
2300        return -ENOTSUP;
2301    }
2302
2303    if (bytes == 0 && !QEMU_IS_ALIGNED(offset, bs->bl.request_alignment)) {
2304        /*
2305         * Aligning a zero-length request is nonsense. Even if the driver gives
2306         * zero length a special meaning (like qcow2_co_pwritev_compressed_part),
2307         * we can't pass it to the driver due to request_alignment.
2308         *
2309         * Still, there is no reason to return an error if someone does an
2310         * unaligned zero-length write occasionally.
2311         */
2312        return 0;
2313    }
2314
2315    if (!(flags & BDRV_REQ_ZERO_WRITE)) {
2316        /*
2317         * Pad request for following read-modify-write cycle.
2318         * bdrv_co_do_zero_pwritev() does aligning by itself, so, we do
2319         * alignment only if there is no ZERO flag.
2320         */
2321        ret = bdrv_pad_request(bs, &qiov, &qiov_offset, &offset, &bytes, &pad,
2322                               &padded);
2323        if (ret < 0) {
2324            return ret;
2325        }
2326    }
2327
2328    bdrv_inc_in_flight(bs);
2329    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
2330
2331    if (flags & BDRV_REQ_ZERO_WRITE) {
2332        assert(!padded);
2333        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
2334        goto out;
2335    }
2336
2337    if (padded) {
2338        /*
2339         * Request was unaligned to request_alignment and therefore
2340         * padded.  We are going to do read-modify-write, and must
2341         * serialize the request to prevent interactions of the
2342         * widened region with other transactions.
2343         */
2344        assert(!(flags & BDRV_REQ_NO_WAIT));
2345        bdrv_make_request_serialising(&req, align);
2346        bdrv_padding_rmw_read(child, &req, &pad, false);
2347    }
2348
2349    ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
2350                               qiov, qiov_offset, flags);
2351
2352    bdrv_padding_destroy(&pad);
2353
2354out:
2355    tracked_request_end(&req);
2356    bdrv_dec_in_flight(bs);
2357
2358    return ret;
2359}
2360
2361int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
2362                                       int64_t bytes, BdrvRequestFlags flags)
2363{
2364    IO_CODE();
2365    trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
2366
2367    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
2368        flags &= ~BDRV_REQ_MAY_UNMAP;
2369    }
2370
2371    return bdrv_co_pwritev(child, offset, bytes, NULL,
2372                           BDRV_REQ_ZERO_WRITE | flags);
2373}
2374
2375/*
2376 * Flush ALL BDSes regardless of whether they are reachable via a BlockBackend.
2377 */
2378int bdrv_flush_all(void)
2379{
2380    BdrvNextIterator it;
2381    BlockDriverState *bs = NULL;
2382    int result = 0;
2383
2384    GLOBAL_STATE_CODE();
2385
2386    /*
2387     * The bdrv queue is managed by record/replay;
2388     * creating a new flush request for stopping
2389     * the VM may break determinism.
2390     */
2391    if (replay_events_enabled()) {
2392        return result;
2393    }
2394
2395    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
2396        AioContext *aio_context = bdrv_get_aio_context(bs);
2397        int ret;
2398
2399        aio_context_acquire(aio_context);
2400        ret = bdrv_flush(bs);
2401        if (ret < 0 && !result) {
2402            result = ret;
2403        }
2404        aio_context_release(aio_context);
2405    }
2406
2407    return result;
2408}
2409
2410/*
2411 * Returns the allocation status of the specified sectors.
2412 * Drivers not implementing the functionality are assumed to not support
2413 * backing files, hence all their sectors are reported as allocated.
2414 *
2415 * If 'want_zero' is true, the caller is querying for mapping
2416 * purposes, with a focus on valid BDRV_BLOCK_OFFSET_VALID, _DATA, and
2417 * _ZERO where possible; otherwise, the result favors larger 'pnum',
2418 * with a focus on accurate BDRV_BLOCK_ALLOCATED.
2419 *
2420 * If 'offset' is beyond the end of the disk image the return value is
2421 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
2422 *
2423 * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
2424 * beyond the end of the disk image it will be clamped; if 'pnum' is set to
2425 * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
2426 *
2427 * 'pnum' is set to the number of bytes (including and immediately
2428 * following the specified offset) that are easily known to be in the
2429 * same allocated/unallocated state.  Note that a second call starting
2430 * at the original offset plus returned pnum may have the same status.
2431 * The returned value is non-zero on success except at end-of-file.
2432 *
2433 * Returns negative errno on failure.  Otherwise, if the
2434 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
2435 * set to the host mapping and BDS corresponding to the guest offset.
2436 */
2437static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
2438                                             bool want_zero,
2439                                             int64_t offset, int64_t bytes,
2440                                             int64_t *pnum, int64_t *map,
2441                                             BlockDriverState **file)
2442{
2443    int64_t total_size;
2444    int64_t n; /* bytes */
2445    int ret;
2446    int64_t local_map = 0;
2447    BlockDriverState *local_file = NULL;
2448    int64_t aligned_offset, aligned_bytes;
2449    uint32_t align;
2450    bool has_filtered_child;
2451
2452    assert(pnum);
2453    *pnum = 0;
2454    total_size = bdrv_getlength(bs);
2455    if (total_size < 0) {
2456        ret = total_size;
2457        goto early_out;
2458    }
2459
2460    if (offset >= total_size) {
2461        ret = BDRV_BLOCK_EOF;
2462        goto early_out;
2463    }
2464    if (!bytes) {
2465        ret = 0;
2466        goto early_out;
2467    }
2468
2469    n = total_size - offset;
2470    if (n < bytes) {
2471        bytes = n;
2472    }
2473
2474    /* Must be non-NULL or bdrv_getlength() would have failed */
2475    assert(bs->drv);
2476    has_filtered_child = bdrv_filter_child(bs);
2477    if (!bs->drv->bdrv_co_block_status && !has_filtered_child) {
2478        *pnum = bytes;
2479        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
2480        if (offset + bytes == total_size) {
2481            ret |= BDRV_BLOCK_EOF;
2482        }
2483        if (bs->drv->protocol_name) {
2484            ret |= BDRV_BLOCK_OFFSET_VALID;
2485            local_map = offset;
2486            local_file = bs;
2487        }
2488        goto early_out;
2489    }
2490
2491    bdrv_inc_in_flight(bs);
2492
2493    /* Round out to request_alignment boundaries */
2494    align = bs->bl.request_alignment;
2495    aligned_offset = QEMU_ALIGN_DOWN(offset, align);
2496    aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;
2497
2498    if (bs->drv->bdrv_co_block_status) {
2499        /*
2500         * Use the block-status cache only for protocol nodes: Format
2501         * drivers are generally quick to inquire the status, but protocol
2502         * drivers often need to get information from outside of qemu, so
2503         * we do not have control over the actual implementation.  There
2504         * have been cases where inquiring the status took an unreasonably
2505         * long time, and we can do nothing in qemu to fix it.
2506         * This is especially problematic for images with large data areas,
2507         * because finding the few holes in them and giving them special
2508         * treatment does not gain much performance.  Therefore, we try to
2509         * cache the last-identified data region.
2510         *
2511         * Second, limiting ourselves to protocol nodes allows us to assume
2512         * the block status for data regions to be DATA | OFFSET_VALID, and
2513         * that the host offset is the same as the guest offset.
2514         *
2515         * Note that it is possible that external writers zero parts of
2516         * the cached regions without the cache being invalidated, and so
2517         * we may report zeroes as data.  This is not catastrophic,
2518         * however, because reporting zeroes as data is fine.
2519         */
2520        if (QLIST_EMPTY(&bs->children) &&
2521            bdrv_bsc_is_data(bs, aligned_offset, pnum))
2522        {
2523            ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
2524            local_file = bs;
2525            local_map = aligned_offset;
2526        } else {
2527            ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset,
2528                                                aligned_bytes, pnum, &local_map,
2529                                                &local_file);
2530
2531            /*
2532             * Note that checking QLIST_EMPTY(&bs->children) is also done when
2533             * the cache is queried above.  Technically, we do not need to check
2534             * it here; the worst that can happen is that we fill the cache for
2535             * non-protocol nodes, and then it is never used.  However, filling
2536             * the cache requires an RCU update, so double check here to avoid
2537             * such an update if possible.
2538             *
2539             * Check want_zero, because we only want to update the cache when we
2540             * have accurate information about what is zero and what is data.
2541             */
2542            if (want_zero &&
2543                ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) &&
2544                QLIST_EMPTY(&bs->children))
2545            {
2546                /*
2547                 * When a protocol driver reports BLOCK_OFFSET_VALID, the
2548                 * returned local_map value must be the same as the offset we
2549                 * have passed (aligned_offset), and local_file must be the node
2550                 * itself.
2551                 * Assert this, because we follow this rule when reading from
2552                 * the cache (see the `local_file = bs` and
2553                 * `local_map = aligned_offset` assignments above), and the
2554                 * result the cache delivers must be the same as the driver
2555                 * would deliver.
2556                 */
2557                assert(local_file == bs);
2558                assert(local_map == aligned_offset);
2559                bdrv_bsc_fill(bs, aligned_offset, *pnum);
2560            }
2561        }
2562    } else {
2563        /* Default code for filters */
2564
2565        local_file = bdrv_filter_bs(bs);
2566        assert(local_file);
2567
2568        *pnum = aligned_bytes;
2569        local_map = aligned_offset;
2570        ret = BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
2571    }
2572    if (ret < 0) {
2573        *pnum = 0;
2574        goto out;
2575    }
2576
2577    /*
2578     * The driver's result must be a non-zero multiple of request_alignment.
2579     * Clamp pnum and adjust map to original request.
2580     */
2581    assert(*pnum && QEMU_IS_ALIGNED(*pnum, align) &&
2582           align > offset - aligned_offset);
2583    if (ret & BDRV_BLOCK_RECURSE) {
2584        assert(ret & BDRV_BLOCK_DATA);
2585        assert(ret & BDRV_BLOCK_OFFSET_VALID);
2586        assert(!(ret & BDRV_BLOCK_ZERO));
2587    }
2588
2589    *pnum -= offset - aligned_offset;
2590    if (*pnum > bytes) {
2591        *pnum = bytes;
2592    }
2593    if (ret & BDRV_BLOCK_OFFSET_VALID) {
2594        local_map += offset - aligned_offset;
2595    }
2596
2597    if (ret & BDRV_BLOCK_RAW) {
2598        assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
2599        ret = bdrv_co_block_status(local_file, want_zero, local_map,
2600                                   *pnum, pnum, &local_map, &local_file);
2601        goto out;
2602    }
2603
2604    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
2605        ret |= BDRV_BLOCK_ALLOCATED;
2606    } else if (bs->drv->supports_backing) {
2607        BlockDriverState *cow_bs = bdrv_cow_bs(bs);
2608
2609        if (!cow_bs) {
2610            ret |= BDRV_BLOCK_ZERO;
2611        } else if (want_zero) {
2612            int64_t size2 = bdrv_getlength(cow_bs);
2613
2614            if (size2 >= 0 && offset >= size2) {
2615                ret |= BDRV_BLOCK_ZERO;
2616            }
2617        }
2618    }
2619
2620    if (want_zero && ret & BDRV_BLOCK_RECURSE &&
2621        local_file && local_file != bs &&
2622        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
2623        (ret & BDRV_BLOCK_OFFSET_VALID)) {
2624        int64_t file_pnum;
2625        int ret2;
2626
2627        ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
2628                                    *pnum, &file_pnum, NULL, NULL);
2629        if (ret2 >= 0) {
2630            /* Ignore errors.  This is just providing extra information, it
2631             * is useful but not necessary.
2632             */
2633            if (ret2 & BDRV_BLOCK_EOF &&
2634                (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
2635                /*
2636                 * It is valid for the format block driver to read
2637                 * beyond the end of the underlying file's current
2638                 * size; such areas read as zero.
2639                 */
2640                ret |= BDRV_BLOCK_ZERO;
2641            } else {
2642                /* Limit request to the range reported by the protocol driver */
2643                *pnum = file_pnum;
2644                ret |= (ret2 & BDRV_BLOCK_ZERO);
2645            }
2646        }
2647    }
2648
2649out:
2650    bdrv_dec_in_flight(bs);
2651    if (ret >= 0 && offset + *pnum == total_size) {
2652        ret |= BDRV_BLOCK_EOF;
2653    }
2654early_out:
2655    if (file) {
2656        *file = local_file;
2657    }
2658    if (map) {
2659        *map = local_map;
2660    }
2661    return ret;
2662}
2663
2664int coroutine_fn
2665bdrv_co_common_block_status_above(BlockDriverState *bs,
2666                                  BlockDriverState *base,
2667                                  bool include_base,
2668                                  bool want_zero,
2669                                  int64_t offset,
2670                                  int64_t bytes,
2671                                  int64_t *pnum,
2672                                  int64_t *map,
2673                                  BlockDriverState **file,
2674                                  int *depth)
2675{
2676    int ret;
2677    BlockDriverState *p;
2678    int64_t eof = 0;
2679    int dummy;
2680    IO_CODE();
2681
2682    assert(!include_base || base); /* Can't include NULL base */
2683
2684    if (!depth) {
2685        depth = &dummy;
2686    }
2687    *depth = 0;
2688
2689    if (!include_base && bs == base) {
2690        *pnum = bytes;
2691        return 0;
2692    }
2693
2694    ret = bdrv_co_block_status(bs, want_zero, offset, bytes, pnum, map, file);
2695    ++*depth;
2696    if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED || bs == base) {
2697        return ret;
2698    }
2699
2700    if (ret & BDRV_BLOCK_EOF) {
2701        eof = offset + *pnum;
2702    }
2703
2704    assert(*pnum <= bytes);
2705    bytes = *pnum;
2706
2707    for (p = bdrv_filter_or_cow_bs(bs); include_base || p != base;
2708         p = bdrv_filter_or_cow_bs(p))
2709    {
2710        ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
2711                                   file);
2712        ++*depth;
2713        if (ret < 0) {
2714            return ret;
2715        }
2716        if (*pnum == 0) {
2717            /*
2718             * The top layer deferred to this layer, and because this layer is
2719             * short, any zeroes that we synthesize beyond EOF behave as if they
2720             * were allocated at this layer.
2721             *
2722             * We don't include BDRV_BLOCK_EOF into ret, as upper layer may be
2723             * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see
2724             * below.
2725             */
2726            assert(ret & BDRV_BLOCK_EOF);
2727            *pnum = bytes;
2728            if (file) {
2729                *file = p;
2730            }
2731            ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_ALLOCATED;
2732            break;
2733        }
2734        if (ret & BDRV_BLOCK_ALLOCATED) {
2735            /*
2736             * We've found the node and the status, we must break.
2737             *
2738             * Drop BDRV_BLOCK_EOF, as it's not for upper layer, which may be
2739             * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see
2740             * below.
2741             */
2742            ret &= ~BDRV_BLOCK_EOF;
2743            break;
2744        }
2745
2746        if (p == base) {
2747            assert(include_base);
2748            break;
2749        }
2750
2751        /*
2752         * OK, the [offset, offset + *pnum) region is unallocated on this layer,
2753         * so let's continue diving down the chain.
2754         */
2755        assert(*pnum <= bytes);
2756        bytes = *pnum;
2757    }
2758
2759    if (offset + *pnum == eof) {
2760        ret |= BDRV_BLOCK_EOF;
2761    }
2762
2763    return ret;
2764}
2765
2766int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
2767                            int64_t offset, int64_t bytes, int64_t *pnum,
2768                            int64_t *map, BlockDriverState **file)
2769{
2770    IO_CODE();
2771    return bdrv_common_block_status_above(bs, base, false, true, offset, bytes,
2772                                          pnum, map, file, NULL);
2773}
2774
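/*
 * Callers that need the status of a whole range typically iterate, e.g.
 * (illustrative, assuming the range lies within the image):
 *
 *     for ( ; bytes > 0; offset += pnum, bytes -= pnum) {
 *         ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
 *         if (ret < 0) {
 *             break;
 *         }
 *         ...
 *     }
 *
 * because each call only describes one contiguous extent starting at offset.
 */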
2775int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
2776                      int64_t *pnum, int64_t *map, BlockDriverState **file)
2777{
2778    IO_CODE();
2779    return bdrv_block_status_above(bs, bdrv_filter_or_cow_bs(bs),
2780                                   offset, bytes, pnum, map, file);
2781}
2782
2783/*
2784 * Check @bs (and its backing chain) to see if the range defined
2785 * by @offset and @bytes is known to read as zeroes.
2786 * Return 1 if that is the case, 0 otherwise and -errno on error.
2787 * This test is meant to be fast rather than accurate so returning 0
2788 * does not guarantee non-zero data.
2789 */
2790int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
2791                                      int64_t bytes)
2792{
2793    int ret;
2794    int64_t pnum = bytes;
2795    IO_CODE();
2796
2797    if (!bytes) {
2798        return 1;
2799    }
2800
2801    ret = bdrv_common_block_status_above(bs, NULL, false, false, offset,
2802                                         bytes, &pnum, NULL, NULL, NULL);
2803
2804    if (ret < 0) {
2805        return ret;
2806    }
2807
2808    return (pnum == bytes) && (ret & BDRV_BLOCK_ZERO);
2809}
2810
2811int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
2812                                   int64_t bytes, int64_t *pnum)
2813{
2814    int ret;
2815    int64_t dummy;
2816    IO_CODE();
2817
2818    ret = bdrv_common_block_status_above(bs, bs, true, false, offset,
2819                                         bytes, pnum ? pnum : &dummy, NULL,
2820                                         NULL, NULL);
2821    if (ret < 0) {
2822        return ret;
2823    }
2824    return !!(ret & BDRV_BLOCK_ALLOCATED);
2825}
2826
2827/*
2828 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2829 *
2830 * Return a positive depth if (a prefix of) the given range is allocated
2831 * in any image between BASE and TOP (BASE is only included if include_base
2832 * is set).  Depth 1 is TOP, 2 is the first backing layer, and so forth.
2833 * BASE can be NULL to check if the given offset is allocated in any
2834 * image of the chain.  Return 0 otherwise, or negative errno on
2835 * failure.
2836 *
2837 * 'pnum' is set to the number of bytes (including and immediately
2838 * following the specified offset) that are known to be in the same
2839 * allocated/unallocated state.  Note that a subsequent call starting
2840 * at 'offset + *pnum' may return the same allocation status (in other
2841 * words, the result is not necessarily the maximum possible range);
2842 * but 'pnum' will only be 0 when end of file is reached.
2843 */
2844int bdrv_is_allocated_above(BlockDriverState *top,
2845                            BlockDriverState *base,
2846                            bool include_base, int64_t offset,
2847                            int64_t bytes, int64_t *pnum)
2848{
2849    int depth;
2850    int ret = bdrv_common_block_status_above(top, base, include_base, false,
2851                                             offset, bytes, pnum, NULL, NULL,
2852                                             &depth);
2853    IO_CODE();
2854    if (ret < 0) {
2855        return ret;
2856    }
2857
2858    if (ret & BDRV_BLOCK_ALLOCATED) {
2859        return depth;
2860    }
2861    return 0;
2862}
2863
2864int coroutine_fn
2865bdrv_co_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2866{
2867    BlockDriver *drv = bs->drv;
2868    BlockDriverState *child_bs = bdrv_primary_bs(bs);
2869    int ret;
2870    IO_CODE();
2871
2872    ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL);
2873    if (ret < 0) {
2874        return ret;
2875    }
2876
2877    if (!drv) {
2878        return -ENOMEDIUM;
2879    }
2880
2881    bdrv_inc_in_flight(bs);
2882
2883    if (drv->bdrv_load_vmstate) {
2884        ret = drv->bdrv_load_vmstate(bs, qiov, pos);
2885    } else if (child_bs) {
2886        ret = bdrv_co_readv_vmstate(child_bs, qiov, pos);
2887    } else {
2888        ret = -ENOTSUP;
2889    }
2890
2891    bdrv_dec_in_flight(bs);
2892
2893    return ret;
2894}
2895
2896int coroutine_fn
2897bdrv_co_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2898{
2899    BlockDriver *drv = bs->drv;
2900    BlockDriverState *child_bs = bdrv_primary_bs(bs);
2901    int ret;
2902    IO_CODE();
2903
2904    ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL);
2905    if (ret < 0) {
2906        return ret;
2907    }
2908
2909    if (!drv) {
2910        return -ENOMEDIUM;
2911    }
2912
2913    bdrv_inc_in_flight(bs);
2914
2915    if (drv->bdrv_save_vmstate) {
2916        ret = drv->bdrv_save_vmstate(bs, qiov, pos);
2917    } else if (child_bs) {
2918        ret = bdrv_co_writev_vmstate(child_bs, qiov, pos);
2919    } else {
2920        ret = -ENOTSUP;
2921    }
2922
2923    bdrv_dec_in_flight(bs);
2924
2925    return ret;
2926}
2927
2928int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2929                      int64_t pos, int size)
2930{
2931    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2932    int ret = bdrv_writev_vmstate(bs, &qiov, pos);
2933    IO_CODE();
2934
2935    return ret < 0 ? ret : size;
2936}
2937
2938int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2939                      int64_t pos, int size)
2940{
2941    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, size);
2942    int ret = bdrv_readv_vmstate(bs, &qiov, pos);
2943    IO_CODE();
2944
2945    return ret < 0 ? ret : size;
2946}
2947
2948/**************************************************************/
2949/* async I/Os */
2950
2951void bdrv_aio_cancel(BlockAIOCB *acb)
2952{
2953    IO_CODE();
2954    qemu_aio_ref(acb);
2955    bdrv_aio_cancel_async(acb);
2956    while (acb->refcnt > 1) {
2957        if (acb->aiocb_info->get_aio_context) {
2958            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
2959        } else if (acb->bs) {
2960            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
2961             * assert that we're not using an I/O thread.  Thread-safe
2962             * code should use bdrv_aio_cancel_async exclusively.
2963             */
2964            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
2965            aio_poll(bdrv_get_aio_context(acb->bs), true);
2966        } else {
2967            abort();
2968        }
2969    }
2970    qemu_aio_unref(acb);
2971}
2972
2973/* Async version of aio cancel. The caller is not blocked if the acb implements
2974 * cancel_async; otherwise we do nothing and let the request complete normally.
2975 * In either case the completion callback must be called. */
2976void bdrv_aio_cancel_async(BlockAIOCB *acb)
2977{
2978    IO_CODE();
2979    if (acb->aiocb_info->cancel_async) {
2980        acb->aiocb_info->cancel_async(acb);
2981    }
2982}
2983
2984/**************************************************************/
2985/* Coroutine block device emulation */
2986
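/*
 * Flush bs to stable storage: wait for previously submitted flushes, then
 * either call the driver's all-in-one .bdrv_co_flush, or flush data to the OS
 * and to the disk (the disk step is skipped with cache=unsafe or when nothing
 * was written since the last flush) and then flush all writable children.
 */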
2987int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2988{
2989    BdrvChild *primary_child = bdrv_primary_child(bs);
2990    BdrvChild *child;
2991    int current_gen;
2992    int ret = 0;
2993    IO_CODE();
2994
2995    bdrv_inc_in_flight(bs);
2996
2997    if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
2998        bdrv_is_sg(bs)) {
2999        goto early_exit;
3000    }
3001
3002    qemu_co_mutex_lock(&bs->reqs_lock);
3003    current_gen = qatomic_read(&bs->write_gen);
3004
3005    /* Wait until any previous flushes are completed */
3006    while (bs->active_flush_req) {
3007        qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
3008    }
3009
3010    /* Flushes reach this point in nondecreasing current_gen order.  */
3011    bs->active_flush_req = true;
3012    qemu_co_mutex_unlock(&bs->reqs_lock);
3013
3014    /* Write back all layers by calling one driver function */
3015    if (bs->drv->bdrv_co_flush) {
3016        ret = bs->drv->bdrv_co_flush(bs);
3017        goto out;
3018    }
3019
3020    /* Write back cached data to the OS even with cache=unsafe */
3021    BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_OS);
3022    if (bs->drv->bdrv_co_flush_to_os) {
3023        ret = bs->drv->bdrv_co_flush_to_os(bs);
3024        if (ret < 0) {
3025            goto out;
3026        }
3027    }
3028
3029    /* But don't actually force it to the disk with cache=unsafe */
3030    if (bs->open_flags & BDRV_O_NO_FLUSH) {
3031        goto flush_children;
3032    }
3033
3034    /* Check if we really need to flush anything */
3035    if (bs->flushed_gen == current_gen) {
3036        goto flush_children;
3037    }
3038
3039    BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_DISK);
3040    if (!bs->drv) {
3041        /* bs->drv->bdrv_co_flush() might have ejected the BDS
3042         * (even in case of apparent success) */
3043        ret = -ENOMEDIUM;
3044        goto out;
3045    }
3046    if (bs->drv->bdrv_co_flush_to_disk) {
3047        ret = bs->drv->bdrv_co_flush_to_disk(bs);
3048    } else if (bs->drv->bdrv_aio_flush) {
3049        BlockAIOCB *acb;
3050        CoroutineIOCompletion co = {
3051            .coroutine = qemu_coroutine_self(),
3052        };
3053
3054        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
3055        if (acb == NULL) {
3056            ret = -EIO;
3057        } else {
3058            qemu_coroutine_yield();
3059            ret = co.ret;
3060        }
3061    } else {
3062        /*
3063         * Some block drivers always operate in either writethrough or unsafe
3064         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
3065         * know how the server works (because the behaviour is hardcoded or
3066         * depends on server-side configuration), so we can't ensure that
3067         * everything is safe on disk. Returning an error doesn't work because
3068         * that would break guests even if the server operates in writethrough
3069         * mode.
3070         *
3071         * Let's hope the user knows what he's doing.
3072         */
3073        ret = 0;
3074    }
3075
3076    if (ret < 0) {
3077        goto out;
3078    }
3079
3080    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
3081     * in the case of cache=unsafe, so there are no useless flushes.
3082     */
3083flush_children:
3084    ret = 0;
3085    QLIST_FOREACH(child, &bs->children, next) {
3086        if (child->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED)) {
3087            int this_child_ret = bdrv_co_flush(child->bs);
3088            if (!ret) {
3089                ret = this_child_ret;
3090            }
3091        }
3092    }
3093
3094out:
3095    /* Notify any pending flushes that we have completed */
3096    if (ret == 0) {
3097        bs->flushed_gen = current_gen;
3098    }
3099
3100    qemu_co_mutex_lock(&bs->reqs_lock);
3101    bs->active_flush_req = false;
3102    /* Return value is ignored - it's ok if wait queue is empty */
3103    qemu_co_queue_next(&bs->flush_queue);
3104    qemu_co_mutex_unlock(&bs->reqs_lock);
3105
3106early_exit:
3107    bdrv_dec_in_flight(bs);
3108    return ret;
3109}
3110
3111int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, int64_t offset,
3112                                  int64_t bytes)
3113{
3114    BdrvTrackedRequest req;
3115    int ret;
3116    int64_t max_pdiscard;
3117    int head, tail, align;
3118    BlockDriverState *bs = child->bs;
3119    IO_CODE();
3120
3121    if (!bs || !bs->drv || !bdrv_is_inserted(bs)) {
3122        return -ENOMEDIUM;
3123    }
3124
3125    if (bdrv_has_readonly_bitmaps(bs)) {
3126        return -EPERM;
3127    }
3128
3129    ret = bdrv_check_request(offset, bytes, NULL);
3130    if (ret < 0) {
3131        return ret;
3132    }
3133
3134    /* Do nothing if disabled.  */
3135    if (!(bs->open_flags & BDRV_O_UNMAP)) {
3136        return 0;
3137    }
3138
3139    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
3140        return 0;
3141    }
3142
3143    /* Invalidate the cached block-status data range if this discard overlaps */
3144    bdrv_bsc_invalidate_range(bs, offset, bytes);
3145
3146    /* Discard is advisory, but some devices track and coalesce
3147     * unaligned requests, so we must pass everything down rather than
3148     * round here.  Still, most devices will just silently ignore
3149     * unaligned requests (by returning -ENOTSUP), so we must fragment
3150     * the request accordingly.  */
3151    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
3152    assert(align % bs->bl.request_alignment == 0);
3153    head = offset % align;
3154    tail = (offset + bytes) % align;
3155
3156    bdrv_inc_in_flight(bs);
3157    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);
3158
3159    ret = bdrv_co_write_req_prepare(child, offset, bytes, &req, 0);
3160    if (ret < 0) {
3161        goto out;
3162    }
3163
3164    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT64_MAX),
3165                                   align);
3166    assert(max_pdiscard >= bs->bl.request_alignment);
3167
3168    while (bytes > 0) {
3169        int64_t num = bytes;
3170
3171        if (head) {
3172            /* Make small requests to get to alignment boundaries. */
3173            num = MIN(bytes, align - head);
3174            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
3175                num %= bs->bl.request_alignment;
3176            }
3177            head = (head + num) % align;
3178            assert(num < max_pdiscard);
3179        } else if (tail) {
3180            if (num > align) {
3181                /* Shorten the request to the last aligned cluster.  */
3182                num -= tail;
3183            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
3184                       tail > bs->bl.request_alignment) {
3185                tail %= bs->bl.request_alignment;
3186                num -= tail;
3187            }
3188        }
3189        /* limit request size */
3190        if (num > max_pdiscard) {
3191            num = max_pdiscard;
3192        }
3193
3194        if (!bs->drv) {
3195            ret = -ENOMEDIUM;
3196            goto out;
3197        }
3198        if (bs->drv->bdrv_co_pdiscard) {
3199            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
3200        } else {
3201            BlockAIOCB *acb;
3202            CoroutineIOCompletion co = {
3203                .coroutine = qemu_coroutine_self(),
3204            };
3205
3206            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
3207                                             bdrv_co_io_em_complete, &co);
3208            if (acb == NULL) {
3209                ret = -EIO;
3210                goto out;
3211            } else {
3212                qemu_coroutine_yield();
3213                ret = co.ret;
3214            }
3215        }
3216        if (ret && ret != -ENOTSUP) {
3217            goto out;
3218        }
3219
3220        offset += num;
3221        bytes -= num;
3222    }
3223    ret = 0;
3224out:
3225    bdrv_co_write_req_finish(child, req.offset, req.bytes, &req, ret);
3226    tracked_request_end(&req);
3227    bdrv_dec_in_flight(bs);
3228    return ret;
3229}
3230
3231int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
3232{
3233    BlockDriver *drv = bs->drv;
3234    CoroutineIOCompletion co = {
3235        .coroutine = qemu_coroutine_self(),
3236    };
3237    BlockAIOCB *acb;
3238    IO_CODE();
3239
3240    bdrv_inc_in_flight(bs);
3241    if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
3242        co.ret = -ENOTSUP;
3243        goto out;
3244    }
3245
3246    if (drv->bdrv_co_ioctl) {
3247        co.ret = drv->bdrv_co_ioctl(bs, req, buf);
3248    } else {
3249        acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
3250        if (!acb) {
3251            co.ret = -ENOTSUP;
3252            goto out;
3253        }
3254        qemu_coroutine_yield();
3255    }
3256out:
3257    bdrv_dec_in_flight(bs);
3258    return co.ret;
3259}
3260
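    /*
     * Buffer allocation helpers: qemu_blockalign() returns memory aligned to
     * the node's optimal memory alignment (bdrv_opt_mem_align()) and aborts
     * on allocation failure; qemu_blockalign0() additionally zero-fills the
     * buffer.
     */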
3261void *qemu_blockalign(BlockDriverState *bs, size_t size)
3262{
3263    IO_CODE();
3264    return qemu_memalign(bdrv_opt_mem_align(bs), size);
3265}
3266
3267void *qemu_blockalign0(BlockDriverState *bs, size_t size)
3268{
3269    IO_CODE();
3270    return memset(qemu_blockalign(bs, size), 0, size);
3271}
3272
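    /*
     * Like qemu_blockalign(), but returns NULL on allocation failure instead
     * of aborting.  A zero size is rounded up to the alignment so that a
     * successful allocation never returns NULL.  Typical use (illustrative
     * names):
     *
     *     buf = qemu_try_blockalign(bs, len);
     *     if (buf == NULL) {
     *         return -ENOMEM;
     *     }
     */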
3273void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
3274{
3275    size_t align = bdrv_opt_mem_align(bs);
3276    IO_CODE();
3277
3278    /* Ensure that NULL is never returned on success */
3279    assert(align > 0);
3280    if (size == 0) {
3281        size = align;
3282    }
3283
3284    return qemu_try_memalign(align, size);
3285}
3286
3287void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
3288{
3289    void *mem = qemu_try_blockalign(bs, size);
3290    IO_CODE();
3291
3292    if (mem) {
3293        memset(mem, 0, size);
3294    }
3295
3296    return mem;
3297}
3298
3299/*
3300 * Check if all memory in this vector is aligned to bdrv_min_mem_align().
3301 */
3302bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
3303{
3304    int i;
3305    size_t alignment = bdrv_min_mem_align(bs);
3306    IO_CODE();
3307
3308    for (i = 0; i < qiov->niov; i++) {
3309        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
3310            return false;
3311        }
3312        if (qiov->iov[i].iov_len % alignment) {
3313            return false;
3314        }
3315    }
3316
3317    return true;
3318}
3319
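    /*
     * Request batching ("plugging"): bdrv_io_plug()/bdrv_io_unplug() recurse
     * into all children and only invoke the driver's plug/unplug hooks when
     * bs->io_plugged transitions between 0 and 1, so nested plug sections
     * from multiple parents are coalesced.  A caller typically plugs, queues
     * several requests and then unplugs, letting the driver submit the whole
     * batch at once.
     */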
3320void bdrv_io_plug(BlockDriverState *bs)
3321{
3322    BdrvChild *child;
3323    IO_CODE();
3324
3325    QLIST_FOREACH(child, &bs->children, next) {
3326        bdrv_io_plug(child->bs);
3327    }
3328
3329    if (qatomic_fetch_inc(&bs->io_plugged) == 0) {
3330        BlockDriver *drv = bs->drv;
3331        if (drv && drv->bdrv_io_plug) {
3332            drv->bdrv_io_plug(bs);
3333        }
3334    }
3335}
3336
3337void bdrv_io_unplug(BlockDriverState *bs)
3338{
3339    BdrvChild *child;
3340    IO_CODE();
3341
3342    assert(bs->io_plugged);
3343    if (qatomic_fetch_dec(&bs->io_plugged) == 1) {
3344        BlockDriver *drv = bs->drv;
3345        if (drv && drv->bdrv_io_unplug) {
3346            drv->bdrv_io_unplug(bs);
3347        }
3348    }
3349
3350    QLIST_FOREACH(child, &bs->children, next) {
3351        bdrv_io_unplug(child->bs);
3352    }
3353}
3354
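    /*
     * Buffer (un)registration: lets drivers that can pre-map host memory
     * (e.g. for DMA) pin a buffer ahead of I/O.  The call is propagated to
     * all children; drivers without the hook simply ignore it.
     */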
3355void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size)
3356{
3357    BdrvChild *child;
3358
3359    GLOBAL_STATE_CODE();
3360    if (bs->drv && bs->drv->bdrv_register_buf) {
3361        bs->drv->bdrv_register_buf(bs, host, size);
3362    }
3363    QLIST_FOREACH(child, &bs->children, next) {
3364        bdrv_register_buf(child->bs, host, size);
3365    }
3366}
3367
3368void bdrv_unregister_buf(BlockDriverState *bs, void *host)
3369{
3370    BdrvChild *child;
3371
3372    GLOBAL_STATE_CODE();
3373    if (bs->drv && bs->drv->bdrv_unregister_buf) {
3374        bs->drv->bdrv_unregister_buf(bs, host);
3375    }
3376    QLIST_FOREACH(child, &bs->children, next) {
3377        bdrv_unregister_buf(child->bs, host);
3378    }
3379}
3380
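    /*
     * Common helper behind bdrv_co_copy_range_from()/_to().  @recurse_src
     * selects whether request tracking and in-flight accounting happen on
     * the source (read side) or on the destination (write side).  Zero
     * writes are redirected to bdrv_co_pwrite_zeroes(), and -ENOTSUP is
     * returned when either driver lacks a copy-range callback or a node is
     * encrypted, so callers can fall back to an ordinary read/write copy.
     */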
3381static int coroutine_fn bdrv_co_copy_range_internal(
3382        BdrvChild *src, int64_t src_offset, BdrvChild *dst,
3383        int64_t dst_offset, int64_t bytes,
3384        BdrvRequestFlags read_flags, BdrvRequestFlags write_flags,
3385        bool recurse_src)
3386{
3387    BdrvTrackedRequest req;
3388    int ret;
3389
3390    /* TODO We can support BDRV_REQ_NO_FALLBACK here */
3391    assert(!(read_flags & BDRV_REQ_NO_FALLBACK));
3392    assert(!(write_flags & BDRV_REQ_NO_FALLBACK));
3393    assert(!(read_flags & BDRV_REQ_NO_WAIT));
3394    assert(!(write_flags & BDRV_REQ_NO_WAIT));
3395
3396    if (!dst || !dst->bs || !bdrv_is_inserted(dst->bs)) {
3397        return -ENOMEDIUM;
3398    }
3399    ret = bdrv_check_request32(dst_offset, bytes, NULL, 0);
3400    if (ret) {
3401        return ret;
3402    }
3403    if (write_flags & BDRV_REQ_ZERO_WRITE) {
3404        return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, write_flags);
3405    }
3406
3407    if (!src || !src->bs || !bdrv_is_inserted(src->bs)) {
3408        return -ENOMEDIUM;
3409    }
3410    ret = bdrv_check_request32(src_offset, bytes, NULL, 0);
3411    if (ret) {
3412        return ret;
3413    }
3414
3415    if (!src->bs->drv->bdrv_co_copy_range_from
3416        || !dst->bs->drv->bdrv_co_copy_range_to
3417        || src->bs->encrypted || dst->bs->encrypted) {
3418        return -ENOTSUP;
3419    }
3420
3421    if (recurse_src) {
3422        bdrv_inc_in_flight(src->bs);
3423        tracked_request_begin(&req, src->bs, src_offset, bytes,
3424                              BDRV_TRACKED_READ);
3425
3426        /* BDRV_REQ_SERIALISING is only for write operations */
3427        assert(!(read_flags & BDRV_REQ_SERIALISING));
3428        bdrv_wait_serialising_requests(&req);
3429
3430        ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
3431                                                    src, src_offset,
3432                                                    dst, dst_offset,
3433                                                    bytes,
3434                                                    read_flags, write_flags);
3435
3436        tracked_request_end(&req);
3437        bdrv_dec_in_flight(src->bs);
3438    } else {
3439        bdrv_inc_in_flight(dst->bs);
3440        tracked_request_begin(&req, dst->bs, dst_offset, bytes,
3441                              BDRV_TRACKED_WRITE);
3442        ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req,
3443                                        write_flags);
3444        if (!ret) {
3445            ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
3446                                                      src, src_offset,
3447                                                      dst, dst_offset,
3448                                                      bytes,
3449                                                      read_flags, write_flags);
3450        }
3451        bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret);
3452        tracked_request_end(&req);
3453        bdrv_dec_in_flight(dst->bs);
3454    }
3455
3456    return ret;
3457}
3458
3459/* Copy range from @src to @dst.
3460 *
3461 * See the comment on bdrv_co_copy_range for the parameter and return value
3462 * semantics. */
3463int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, int64_t src_offset,
3464                                         BdrvChild *dst, int64_t dst_offset,
3465                                         int64_t bytes,
3466                                         BdrvRequestFlags read_flags,
3467                                         BdrvRequestFlags write_flags)
3468{
3469    IO_CODE();
3470    trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes,
3471                                  read_flags, write_flags);
3472    return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
3473                                       bytes, read_flags, write_flags, true);
3474}
3475
3476/* Copy range from @src to @dst.
3477 *
3478 * See the comment on bdrv_co_copy_range for the parameter and return value
3479 * semantics. */
3480int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, int64_t src_offset,
3481                                       BdrvChild *dst, int64_t dst_offset,
3482                                       int64_t bytes,
3483                                       BdrvRequestFlags read_flags,
3484                                       BdrvRequestFlags write_flags)
3485{
3486    IO_CODE();
3487    trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
3488                                read_flags, write_flags);
3489    return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
3490                                       bytes, read_flags, write_flags, false);
3491}
3492
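    /*
     * Public entry point for copy offloading; a convenience wrapper around
     * bdrv_co_copy_range_from().
     */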
3493int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset,
3494                                    BdrvChild *dst, int64_t dst_offset,
3495                                    int64_t bytes, BdrvRequestFlags read_flags,
3496                                    BdrvRequestFlags write_flags)
3497{
3498    IO_CODE();
3499    return bdrv_co_copy_range_from(src, src_offset,
3500                                   dst, dst_offset,
3501                                   bytes, read_flags, write_flags);
3502}
3503
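    /* Notify all parents of @bs that the node has been resized. */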
3504static void bdrv_parent_cb_resize(BlockDriverState *bs)
3505{
3506    BdrvChild *c;
3507    QLIST_FOREACH(c, &bs->parents, next_parent) {
3508        if (c->klass->resize) {
3509            c->klass->resize(c);
3510        }
3511    }
3512}
3513
3514/**
3515 * Truncate file to 'offset' bytes (needed only for file protocols)
3516 *
3517 * If 'exact' is true, the file must be resized to exactly the given
3518 * 'offset'.  Otherwise, it is sufficient for the node to be at least
3519 * 'offset' bytes in length.
3520 */
3521int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
3522                                  PreallocMode prealloc, BdrvRequestFlags flags,
3523                                  Error **errp)
3524{
3525    BlockDriverState *bs = child->bs;
3526    BdrvChild *filtered, *backing;
3527    BlockDriver *drv = bs->drv;
3528    BdrvTrackedRequest req;
3529    int64_t old_size, new_bytes;
3530    int ret;
3531    IO_CODE();
3532
3533    /* if bs->drv == NULL, bs is closed, so there's nothing to do here */
3534    if (!drv) {
3535        error_setg(errp, "No medium inserted");
3536        return -ENOMEDIUM;
3537    }
3538    if (offset < 0) {
3539        error_setg(errp, "Image size cannot be negative");
3540        return -EINVAL;
3541    }
3542
3543    ret = bdrv_check_request(offset, 0, errp);
3544    if (ret < 0) {
3545        return ret;
3546    }
3547
3548    old_size = bdrv_getlength(bs);
3549    if (old_size < 0) {
3550        error_setg_errno(errp, -old_size, "Failed to get old image size");
3551        return old_size;
3552    }
3553
3554    if (bdrv_is_read_only(bs)) {
3555        error_setg(errp, "Image is read-only");
3556        return -EACCES;
3557    }
3558
3559    if (offset > old_size) {
3560        new_bytes = offset - old_size;
3561    } else {
3562        new_bytes = 0;
3563    }
3564
3565    bdrv_inc_in_flight(bs);
3566    tracked_request_begin(&req, bs, offset - new_bytes, new_bytes,
3567                          BDRV_TRACKED_TRUNCATE);
3568
3569    /* If we are growing the image and potentially using preallocation for the
3570     * new area, we need to make sure that no write requests are made to it
3571     * concurrently or they might be overwritten by preallocation. */
3572    if (new_bytes) {
3573        bdrv_make_request_serialising(&req, 1);
3574    }
3575    ret = bdrv_co_write_req_prepare(child, offset - new_bytes, new_bytes, &req,
3576                                    0);
3577    if (ret < 0) {
3578        error_setg_errno(errp, -ret,
3579                         "Failed to prepare request for truncation");
3580        goto out;
3581    }
3582
3583    filtered = bdrv_filter_child(bs);
3584    backing = bdrv_cow_child(bs);
3585
3586    /*
3587     * If the image has a backing file that is large enough that it would
3588     * provide data for the new area, we cannot leave it unallocated because
3589     * then the backing file content would become visible. Instead, zero-fill
3590     * the new area.
3591     *
3592     * Note that if the image has a backing file but was opened without it,
3593     * keeping the new area consistent with that backing file is the user's
3594     * responsibility.
3595     */
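        /* For example (hypothetical sizes): growing a 1 GiB overlay to 2 GiB
         * while its backing file is 1.5 GiB would otherwise expose the
         * backing data between 1 GiB and 1.5 GiB, so the whole new area is
         * written with BDRV_REQ_ZERO_WRITE. */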
3596    if (new_bytes && backing) {
3597        int64_t backing_len;
3598
3599        backing_len = bdrv_getlength(backing->bs);
3600        if (backing_len < 0) {
3601            ret = backing_len;
3602            error_setg_errno(errp, -ret, "Could not get backing file size");
3603            goto out;
3604        }
3605
3606        if (backing_len > old_size) {
3607            flags |= BDRV_REQ_ZERO_WRITE;
3608        }
3609    }
3610
3611    if (drv->bdrv_co_truncate) {
3612        if (flags & ~bs->supported_truncate_flags) {
3613            error_setg(errp, "Block driver does not support requested flags");
3614            ret = -ENOTSUP;
3615            goto out;
3616        }
3617        ret = drv->bdrv_co_truncate(bs, offset, exact, prealloc, flags, errp);
3618    } else if (filtered) {
3619        ret = bdrv_co_truncate(filtered, offset, exact, prealloc, flags, errp);
3620    } else {
3621        error_setg(errp, "Image format driver does not support resize");
3622        ret = -ENOTSUP;
3623        goto out;
3624    }
3625    if (ret < 0) {
3626        goto out;
3627    }
3628
3629    ret = refresh_total_sectors(bs, offset >> BDRV_SECTOR_BITS);
3630    if (ret < 0) {
3631        error_setg_errno(errp, -ret, "Could not refresh total sector count");
3632    } else {
3633        offset = bs->total_sectors * BDRV_SECTOR_SIZE;
3634    }
3635    /* Truncation may have succeeded even though refresh_total_sectors()
3636     * failed; that failure does not affect how we should finish the request.
3637     * Pass 0 as the last parameter so that dirty bitmaps etc. are handled. */
3638    bdrv_co_write_req_finish(child, offset - new_bytes, new_bytes, &req, 0);
3639
3640out:
3641    tracked_request_end(&req);
3642    bdrv_dec_in_flight(bs);
3643
3644    return ret;
3645}
3646
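    /*
     * Best-effort cancellation of this node's in-flight requests: forwarded
     * to the driver if it implements bdrv_cancel_in_flight(), otherwise a
     * no-op.
     */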
3647void bdrv_cancel_in_flight(BlockDriverState *bs)
3648{
3649    GLOBAL_STATE_CODE();
3650    if (!bs || !bs->drv) {
3651        return;
3652    }
3653
3654    if (bs->drv->bdrv_cancel_in_flight) {
3655        bs->drv->bdrv_cancel_in_flight(bs);
3656    }
3657}
3658
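    /*
     * Snapshot-access wrappers: the three functions below forward a read, a
     * block-status query and a discard to drivers that implement the
     * corresponding *_snapshot callbacks, bracketing each call with
     * in-flight accounting.  They return -ENOMEDIUM without a driver and
     * -ENOTSUP when the driver lacks the callback.
     */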
3659int coroutine_fn
3660bdrv_co_preadv_snapshot(BdrvChild *child, int64_t offset, int64_t bytes,
3661                        QEMUIOVector *qiov, size_t qiov_offset)
3662{
3663    BlockDriverState *bs = child->bs;
3664    BlockDriver *drv = bs->drv;
3665    int ret;
3666    IO_CODE();
3667
3668    if (!drv) {
3669        return -ENOMEDIUM;
3670    }
3671
3672    if (!drv->bdrv_co_preadv_snapshot) {
3673        return -ENOTSUP;
3674    }
3675
3676    bdrv_inc_in_flight(bs);
3677    ret = drv->bdrv_co_preadv_snapshot(bs, offset, bytes, qiov, qiov_offset);
3678    bdrv_dec_in_flight(bs);
3679
3680    return ret;
3681}
3682
3683int coroutine_fn
3684bdrv_co_snapshot_block_status(BlockDriverState *bs,
3685                              bool want_zero, int64_t offset, int64_t bytes,
3686                              int64_t *pnum, int64_t *map,
3687                              BlockDriverState **file)
3688{
3689    BlockDriver *drv = bs->drv;
3690    int ret;
3691    IO_CODE();
3692
3693    if (!drv) {
3694        return -ENOMEDIUM;
3695    }
3696
3697    if (!drv->bdrv_co_snapshot_block_status) {
3698        return -ENOTSUP;
3699    }
3700
3701    bdrv_inc_in_flight(bs);
3702    ret = drv->bdrv_co_snapshot_block_status(bs, want_zero, offset, bytes,
3703                                             pnum, map, file);
3704    bdrv_dec_in_flight(bs);
3705
3706    return ret;
3707}
3708
3709int coroutine_fn
3710bdrv_co_pdiscard_snapshot(BlockDriverState *bs, int64_t offset, int64_t bytes)
3711{
3712    BlockDriver *drv = bs->drv;
3713    int ret;
3714    IO_CODE();
3715
3716    if (!drv) {
3717        return -ENOMEDIUM;
3718    }
3719
3720    if (!drv->bdrv_co_pdiscard_snapshot) {
3721        return -ENOTSUP;
3722    }
3723
3724    bdrv_inc_in_flight(bs);
3725    ret = drv->bdrv_co_pdiscard_snapshot(bs, offset, bytes);
3726    bdrv_dec_in_flight(bs);
3727
3728    return ret;
3729}
3730