qemu/block/io.c
   1/*
   2 * Block layer I/O functions
   3 *
   4 * Copyright (c) 2003 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "trace.h"
  27#include "sysemu/block-backend.h"
  28#include "block/blockjob.h"
  29#include "block/block_int.h"
  30#include "qemu/cutils.h"
  31#include "qapi/error.h"
  32#include "qemu/error-report.h"
  33
   34#define NOT_DONE 0x7fffffff /* used while an emulated sync operation is in progress */
  35
  36static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
  37                                          int64_t offset,
  38                                          QEMUIOVector *qiov,
  39                                          BdrvRequestFlags flags,
  40                                          BlockCompletionFunc *cb,
  41                                          void *opaque,
  42                                          bool is_write);
  43static void coroutine_fn bdrv_co_do_rw(void *opaque);
  44static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
  45    int64_t offset, int count, BdrvRequestFlags flags);
  46
  47static void bdrv_parent_drained_begin(BlockDriverState *bs)
  48{
  49    BdrvChild *c;
  50
  51    QLIST_FOREACH(c, &bs->parents, next_parent) {
  52        if (c->role->drained_begin) {
  53            c->role->drained_begin(c);
  54        }
  55    }
  56}
  57
  58static void bdrv_parent_drained_end(BlockDriverState *bs)
  59{
  60    BdrvChild *c;
  61
  62    QLIST_FOREACH(c, &bs->parents, next_parent) {
  63        if (c->role->drained_end) {
  64            c->role->drained_end(c);
  65        }
  66    }
  67}
  68
  69static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
  70{
  71    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
  72    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
  73    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
  74                                 src->opt_mem_alignment);
  75    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
  76                                 src->min_mem_alignment);
  77    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
  78}
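
/*
 * Worked example with hypothetical values (not taken from a real driver):
 * merging a parent that reports no transfer limit with a child that caps
 * transfers at 1 MiB keeps the child's cap, because 0 means "unlimited" and
 * MIN_NON_ZERO() ignores it:
 *
 *     dst->max_transfer = MIN_NON_ZERO(0, 1 MiB)   ->  1 MiB
 *     dst->opt_transfer = MAX(4 KiB, 64 KiB)       -> 64 KiB
 */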
  79
  80void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
  81{
  82    BlockDriver *drv = bs->drv;
  83    Error *local_err = NULL;
  84
  85    memset(&bs->bl, 0, sizeof(bs->bl));
  86
  87    if (!drv) {
  88        return;
  89    }
  90
  91    /* Default alignment based on whether driver has byte interface */
  92    bs->bl.request_alignment = drv->bdrv_co_preadv ? 1 : 512;
  93
  94    /* Take some limits from the children as a default */
  95    if (bs->file) {
  96        bdrv_refresh_limits(bs->file->bs, &local_err);
  97        if (local_err) {
  98            error_propagate(errp, local_err);
  99            return;
 100        }
 101        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
 102    } else {
 103        bs->bl.min_mem_alignment = 512;
 104        bs->bl.opt_mem_alignment = getpagesize();
 105
 106        /* Safe default since most protocols use readv()/writev()/etc */
 107        bs->bl.max_iov = IOV_MAX;
 108    }
 109
 110    if (bs->backing) {
 111        bdrv_refresh_limits(bs->backing->bs, &local_err);
 112        if (local_err) {
 113            error_propagate(errp, local_err);
 114            return;
 115        }
 116        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
 117    }
 118
 119    /* Then let the driver override it */
 120    if (drv->bdrv_refresh_limits) {
 121        drv->bdrv_refresh_limits(bs, errp);
 122    }
 123}
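
/*
 * Illustrative caller (a sketch, mirroring how this function calls itself on
 * bs->file and bs->backing above; @errp is assumed to be the caller's error
 * pointer):
 *
 *     Error *local_err = NULL;
 *
 *     bdrv_refresh_limits(bs, &local_err);
 *     if (local_err) {
 *         error_propagate(errp, local_err);
 *         return;
 *     }
 */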
 124
 125/**
 126 * The copy-on-read flag is actually a reference count so multiple users may
 127 * use the feature without worrying about clobbering its previous state.
  128 * Copy-on-read stays enabled until all users have disabled it again.
 129 */
 130void bdrv_enable_copy_on_read(BlockDriverState *bs)
 131{
 132    bs->copy_on_read++;
 133}
 134
 135void bdrv_disable_copy_on_read(BlockDriverState *bs)
 136{
 137    assert(bs->copy_on_read > 0);
 138    bs->copy_on_read--;
 139}
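
/*
 * Illustrative pairing (a sketch, not a caller from this file): because the
 * flag is a reference count, nested users simply bracket their work:
 *
 *     bdrv_enable_copy_on_read(bs);
 *     ... reads issued here may trigger copy-on-read ...
 *     bdrv_disable_copy_on_read(bs);
 *
 * Copy-on-read only really stops once the last user has disabled it.
 */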
 140
 141/* Check if any requests are in-flight (including throttled requests) */
 142bool bdrv_requests_pending(BlockDriverState *bs)
 143{
 144    BdrvChild *child;
 145
 146    if (atomic_read(&bs->in_flight)) {
 147        return true;
 148    }
 149
 150    QLIST_FOREACH(child, &bs->children, next) {
 151        if (bdrv_requests_pending(child->bs)) {
 152            return true;
 153        }
 154    }
 155
 156    return false;
 157}
 158
 159static bool bdrv_drain_recurse(BlockDriverState *bs)
 160{
 161    BdrvChild *child;
 162    bool waited;
 163
 164    waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
 165
 166    if (bs->drv && bs->drv->bdrv_drain) {
 167        bs->drv->bdrv_drain(bs);
 168    }
 169
 170    QLIST_FOREACH(child, &bs->children, next) {
 171        waited |= bdrv_drain_recurse(child->bs);
 172    }
 173
 174    return waited;
 175}
 176
 177typedef struct {
 178    Coroutine *co;
 179    BlockDriverState *bs;
 180    bool done;
 181} BdrvCoDrainData;
 182
 183static void bdrv_co_drain_bh_cb(void *opaque)
 184{
 185    BdrvCoDrainData *data = opaque;
 186    Coroutine *co = data->co;
 187    BlockDriverState *bs = data->bs;
 188
 189    bdrv_dec_in_flight(bs);
 190    bdrv_drained_begin(bs);
 191    data->done = true;
 192    qemu_coroutine_enter(co);
 193}
 194
 195static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
 196{
 197    BdrvCoDrainData data;
 198
 199    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
 200     * other coroutines run if they were queued from
 201     * qemu_co_queue_run_restart(). */
 202
 203    assert(qemu_in_coroutine());
 204    data = (BdrvCoDrainData) {
 205        .co = qemu_coroutine_self(),
 206        .bs = bs,
 207        .done = false,
 208    };
 209    bdrv_inc_in_flight(bs);
 210    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
 211                            bdrv_co_drain_bh_cb, &data);
 212
 213    qemu_coroutine_yield();
 214    /* If we are resumed from some other event (such as an aio completion or a
 215     * timer callback), it is a bug in the caller that should be fixed. */
 216    assert(data.done);
 217}
 218
 219void bdrv_drained_begin(BlockDriverState *bs)
 220{
 221    if (qemu_in_coroutine()) {
 222        bdrv_co_yield_to_drain(bs);
 223        return;
 224    }
 225
 226    if (!bs->quiesce_counter++) {
 227        aio_disable_external(bdrv_get_aio_context(bs));
 228        bdrv_parent_drained_begin(bs);
 229    }
 230
 231    bdrv_io_unplugged_begin(bs);
 232    bdrv_drain_recurse(bs);
 233    bdrv_io_unplugged_end(bs);
 234}
 235
 236void bdrv_drained_end(BlockDriverState *bs)
 237{
 238    assert(bs->quiesce_counter > 0);
 239    if (--bs->quiesce_counter > 0) {
 240        return;
 241    }
 242
 243    bdrv_parent_drained_end(bs);
 244    aio_enable_external(bdrv_get_aio_context(bs));
 245}
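
/*
 * Usage sketch (assumed caller, not taken from this file): code that must not
 * see new requests on @bs brackets its critical section with the pair above:
 *
 *     bdrv_drained_begin(bs);
 *     ... parents are quiesced and in-flight requests have completed ...
 *     bdrv_drained_end(bs);
 */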
 246
 247/*
 248 * Wait for pending requests to complete on a single BlockDriverState subtree,
 249 * and suspend block driver's internal I/O until next request arrives.
 250 *
 251 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 252 * AioContext.
 253 *
 254 * Only this BlockDriverState's AioContext is run, so in-flight requests must
 255 * not depend on events in other AioContexts.  In that case, use
 256 * bdrv_drain_all() instead.
 257 */
 258void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
 259{
 260    assert(qemu_in_coroutine());
 261    bdrv_drained_begin(bs);
 262    bdrv_drained_end(bs);
 263}
 264
 265void bdrv_drain(BlockDriverState *bs)
 266{
 267    bdrv_drained_begin(bs);
 268    bdrv_drained_end(bs);
 269}
 270
 271/*
 272 * Wait for pending requests to complete across all BlockDriverStates
 273 *
 274 * This function does not flush data to disk, use bdrv_flush_all() for that
 275 * after calling this function.
 276 *
 277 * This pauses all block jobs and disables external clients. It must
 278 * be paired with bdrv_drain_all_end().
 279 *
 280 * NOTE: no new block jobs or BlockDriverStates can be created between
 281 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 282 */
 283void bdrv_drain_all_begin(void)
 284{
 285    /* Always run first iteration so any pending completion BHs run */
 286    bool waited = true;
 287    BlockDriverState *bs;
 288    BdrvNextIterator it;
 289    BlockJob *job = NULL;
 290    GSList *aio_ctxs = NULL, *ctx;
 291
 292    while ((job = block_job_next(job))) {
 293        AioContext *aio_context = blk_get_aio_context(job->blk);
 294
 295        aio_context_acquire(aio_context);
 296        block_job_pause(job);
 297        aio_context_release(aio_context);
 298    }
 299
 300    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 301        AioContext *aio_context = bdrv_get_aio_context(bs);
 302
 303        aio_context_acquire(aio_context);
 304        bdrv_parent_drained_begin(bs);
 305        bdrv_io_unplugged_begin(bs);
 306        aio_disable_external(aio_context);
 307        aio_context_release(aio_context);
 308
 309        if (!g_slist_find(aio_ctxs, aio_context)) {
 310            aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
 311        }
 312    }
 313
 314    /* Note that completion of an asynchronous I/O operation can trigger any
 315     * number of other I/O operations on other devices---for example a
 316     * coroutine can submit an I/O request to another device in response to
  317 * request completion.  Therefore we must keep looping until there is no
 318     * more activity rather than simply draining each device independently.
 319     */
 320    while (waited) {
 321        waited = false;
 322
 323        for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
 324            AioContext *aio_context = ctx->data;
 325
 326            aio_context_acquire(aio_context);
 327            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 328                if (aio_context == bdrv_get_aio_context(bs)) {
 329                    waited |= bdrv_drain_recurse(bs);
 330                }
 331            }
 332            aio_context_release(aio_context);
 333        }
 334    }
 335
 336    g_slist_free(aio_ctxs);
 337}
 338
 339void bdrv_drain_all_end(void)
 340{
 341    BlockDriverState *bs;
 342    BdrvNextIterator it;
 343    BlockJob *job = NULL;
 344
 345    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 346        AioContext *aio_context = bdrv_get_aio_context(bs);
 347
 348        aio_context_acquire(aio_context);
 349        aio_enable_external(aio_context);
 350        bdrv_io_unplugged_end(bs);
 351        bdrv_parent_drained_end(bs);
 352        aio_context_release(aio_context);
 353    }
 354
 355    while ((job = block_job_next(job))) {
 356        AioContext *aio_context = blk_get_aio_context(job->blk);
 357
 358        aio_context_acquire(aio_context);
 359        block_job_resume(job);
 360        aio_context_release(aio_context);
 361    }
 362}
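
/*
 * Sketch of the intended pairing described in the comment above
 * bdrv_drain_all_begin():
 *
 *     bdrv_drain_all_begin();
 *     ... a global operation that must not race with block I/O or jobs ...
 *     bdrv_drain_all_end();
 *
 * bdrv_drain_all() below is exactly this pair with nothing in between.
 */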
 363
 364void bdrv_drain_all(void)
 365{
 366    bdrv_drain_all_begin();
 367    bdrv_drain_all_end();
 368}
 369
 370/**
 371 * Remove an active request from the tracked requests list
 372 *
 373 * This function should be called when a tracked request is completing.
 374 */
 375static void tracked_request_end(BdrvTrackedRequest *req)
 376{
 377    if (req->serialising) {
 378        req->bs->serialising_in_flight--;
 379    }
 380
 381    QLIST_REMOVE(req, list);
 382    qemu_co_queue_restart_all(&req->wait_queue);
 383}
 384
 385/**
 386 * Add an active request to the tracked requests list
 387 */
 388static void tracked_request_begin(BdrvTrackedRequest *req,
 389                                  BlockDriverState *bs,
 390                                  int64_t offset,
 391                                  unsigned int bytes,
 392                                  enum BdrvTrackedRequestType type)
 393{
 394    *req = (BdrvTrackedRequest){
 395        .bs = bs,
 396        .offset         = offset,
 397        .bytes          = bytes,
 398        .type           = type,
 399        .co             = qemu_coroutine_self(),
 400        .serialising    = false,
 401        .overlap_offset = offset,
 402        .overlap_bytes  = bytes,
 403    };
 404
 405    qemu_co_queue_init(&req->wait_queue);
 406
 407    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
 408}
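
/*
 * Typical lifetime of a tracked request (this mirrors the pattern used by
 * bdrv_co_preadv() later in this file):
 *
 *     BdrvTrackedRequest req;
 *
 *     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
 *     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, qiov, flags);
 *     tracked_request_end(&req);
 */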
 409
 410static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
 411{
 412    int64_t overlap_offset = req->offset & ~(align - 1);
 413    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
 414                               - overlap_offset;
 415
 416    if (!req->serialising) {
 417        req->bs->serialising_in_flight++;
 418        req->serialising = true;
 419    }
 420
 421    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
 422    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
 423}
 424
 425/**
 426 * Round a region to cluster boundaries (sector-based)
 427 */
 428void bdrv_round_sectors_to_clusters(BlockDriverState *bs,
 429                                    int64_t sector_num, int nb_sectors,
 430                                    int64_t *cluster_sector_num,
 431                                    int *cluster_nb_sectors)
 432{
 433    BlockDriverInfo bdi;
 434
 435    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
 436        *cluster_sector_num = sector_num;
 437        *cluster_nb_sectors = nb_sectors;
 438    } else {
 439        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
 440        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
 441        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
 442                                            nb_sectors, c);
 443    }
 444}
 445
 446/**
 447 * Round a region to cluster boundaries
 448 */
 449void bdrv_round_to_clusters(BlockDriverState *bs,
 450                            int64_t offset, unsigned int bytes,
 451                            int64_t *cluster_offset,
 452                            unsigned int *cluster_bytes)
 453{
 454    BlockDriverInfo bdi;
 455
 456    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
 457        *cluster_offset = offset;
 458        *cluster_bytes = bytes;
 459    } else {
 460        int64_t c = bdi.cluster_size;
 461        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
 462        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
 463    }
 464}
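
/*
 * Worked example with hypothetical numbers: for a 64 KiB cluster size, a
 * 1000-byte request at offset 70000 is widened to the containing cluster:
 *
 *     cluster_offset = QEMU_ALIGN_DOWN(70000, 65536)              = 65536
 *     cluster_bytes  = QEMU_ALIGN_UP(70000 - 65536 + 1000, 65536) = 65536
 */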
 465
 466static int bdrv_get_cluster_size(BlockDriverState *bs)
 467{
 468    BlockDriverInfo bdi;
 469    int ret;
 470
 471    ret = bdrv_get_info(bs, &bdi);
 472    if (ret < 0 || bdi.cluster_size == 0) {
 473        return bs->bl.request_alignment;
 474    } else {
 475        return bdi.cluster_size;
 476    }
 477}
 478
 479static bool tracked_request_overlaps(BdrvTrackedRequest *req,
 480                                     int64_t offset, unsigned int bytes)
 481{
 482    /*        aaaa   bbbb */
 483    if (offset >= req->overlap_offset + req->overlap_bytes) {
 484        return false;
 485    }
 486    /* bbbb   aaaa        */
 487    if (req->overlap_offset >= offset + bytes) {
 488        return false;
 489    }
 490    return true;
 491}
 492
 493void bdrv_inc_in_flight(BlockDriverState *bs)
 494{
 495    atomic_inc(&bs->in_flight);
 496}
 497
 498static void dummy_bh_cb(void *opaque)
 499{
 500}
 501
 502void bdrv_wakeup(BlockDriverState *bs)
 503{
 504    if (bs->wakeup) {
 505        aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
 506    }
 507}
 508
 509void bdrv_dec_in_flight(BlockDriverState *bs)
 510{
 511    atomic_dec(&bs->in_flight);
 512    bdrv_wakeup(bs);
 513}
 514
 515static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
 516{
 517    BlockDriverState *bs = self->bs;
 518    BdrvTrackedRequest *req;
 519    bool retry;
 520    bool waited = false;
 521
 522    if (!bs->serialising_in_flight) {
 523        return false;
 524    }
 525
 526    do {
 527        retry = false;
 528        QLIST_FOREACH(req, &bs->tracked_requests, list) {
 529            if (req == self || (!req->serialising && !self->serialising)) {
 530                continue;
 531            }
 532            if (tracked_request_overlaps(req, self->overlap_offset,
 533                                         self->overlap_bytes))
 534            {
 535                /* Hitting this means there was a reentrant request, for
 536                 * example, a block driver issuing nested requests.  This must
 537                 * never happen since it means deadlock.
 538                 */
 539                assert(qemu_coroutine_self() != req->co);
 540
 541                /* If the request is already (indirectly) waiting for us, or
 542                 * will wait for us as soon as it wakes up, then just go on
 543                 * (instead of producing a deadlock in the former case). */
 544                if (!req->waiting_for) {
 545                    self->waiting_for = req;
 546                    qemu_co_queue_wait(&req->wait_queue);
 547                    self->waiting_for = NULL;
 548                    retry = true;
 549                    waited = true;
 550                    break;
 551                }
 552            }
 553        }
 554    } while (retry);
 555
 556    return waited;
 557}
 558
 559static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
 560                                   size_t size)
 561{
 562    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
 563        return -EIO;
 564    }
 565
 566    if (!bdrv_is_inserted(bs)) {
 567        return -ENOMEDIUM;
 568    }
 569
 570    if (offset < 0) {
 571        return -EIO;
 572    }
 573
 574    return 0;
 575}
 576
 577typedef struct RwCo {
 578    BdrvChild *child;
 579    int64_t offset;
 580    QEMUIOVector *qiov;
 581    bool is_write;
 582    int ret;
 583    BdrvRequestFlags flags;
 584} RwCo;
 585
 586static void coroutine_fn bdrv_rw_co_entry(void *opaque)
 587{
 588    RwCo *rwco = opaque;
 589
 590    if (!rwco->is_write) {
 591        rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
 592                                   rwco->qiov->size, rwco->qiov,
 593                                   rwco->flags);
 594    } else {
 595        rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
 596                                    rwco->qiov->size, rwco->qiov,
 597                                    rwco->flags);
 598    }
 599}
 600
 601/*
 602 * Process a vectored synchronous request using coroutines
 603 */
 604static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
 605                        QEMUIOVector *qiov, bool is_write,
 606                        BdrvRequestFlags flags)
 607{
 608    Coroutine *co;
 609    RwCo rwco = {
 610        .child = child,
 611        .offset = offset,
 612        .qiov = qiov,
 613        .is_write = is_write,
 614        .ret = NOT_DONE,
 615        .flags = flags,
 616    };
 617
 618    if (qemu_in_coroutine()) {
 619        /* Fast-path if already in coroutine context */
 620        bdrv_rw_co_entry(&rwco);
 621    } else {
 622        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
 623        qemu_coroutine_enter(co);
 624        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
 625    }
 626    return rwco.ret;
 627}
 628
 629/*
 630 * Process a synchronous request using coroutines
 631 */
 632static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf,
 633                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
 634{
 635    QEMUIOVector qiov;
 636    struct iovec iov = {
 637        .iov_base = (void *)buf,
 638        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
 639    };
 640
 641    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
 642        return -EINVAL;
 643    }
 644
 645    qemu_iovec_init_external(&qiov, &iov, 1);
 646    return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS,
 647                        &qiov, is_write, flags);
 648}
 649
 650/* return < 0 if error. See bdrv_write() for the return codes */
 651int bdrv_read(BdrvChild *child, int64_t sector_num,
 652              uint8_t *buf, int nb_sectors)
 653{
 654    return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0);
 655}
 656
 657/* Return < 0 if error. Important errors are:
 658  -EIO         generic I/O error (may happen for all errors)
 659  -ENOMEDIUM   No media inserted.
 660  -EINVAL      Invalid sector number or nb_sectors
 661  -EACCES      Trying to write a read-only device
 662*/
 663int bdrv_write(BdrvChild *child, int64_t sector_num,
 664               const uint8_t *buf, int nb_sectors)
 665{
 666    return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
 667}
 668
 669int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
 670                       int count, BdrvRequestFlags flags)
 671{
 672    QEMUIOVector qiov;
 673    struct iovec iov = {
 674        .iov_base = NULL,
 675        .iov_len = count,
 676    };
 677
 678    qemu_iovec_init_external(&qiov, &iov, 1);
 679    return bdrv_prwv_co(child, offset, &qiov, true,
 680                        BDRV_REQ_ZERO_WRITE | flags);
 681}
 682
 683/*
 684 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 685 * The operation is sped up by checking the block status and only writing
  686 * zeroes to regions that do not already read back as zeroes. Optional
 687 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 688 * BDRV_REQ_FUA).
 689 *
 690 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 691 */
 692int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
 693{
 694    int64_t target_sectors, ret, nb_sectors, sector_num = 0;
 695    BlockDriverState *bs = child->bs;
 696    BlockDriverState *file;
 697    int n;
 698
 699    target_sectors = bdrv_nb_sectors(bs);
 700    if (target_sectors < 0) {
 701        return target_sectors;
 702    }
 703
 704    for (;;) {
 705        nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
 706        if (nb_sectors <= 0) {
 707            return 0;
 708        }
 709        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file);
 710        if (ret < 0) {
 711            error_report("error getting block status at sector %" PRId64 ": %s",
 712                         sector_num, strerror(-ret));
 713            return ret;
 714        }
 715        if (ret & BDRV_BLOCK_ZERO) {
 716            sector_num += n;
 717            continue;
 718        }
 719        ret = bdrv_pwrite_zeroes(child, sector_num << BDRV_SECTOR_BITS,
 720                                 n << BDRV_SECTOR_BITS, flags);
 721        if (ret < 0) {
 722            error_report("error writing zeroes at sector %" PRId64 ": %s",
 723                         sector_num, strerror(-ret));
 724            return ret;
 725        }
 726        sector_num += n;
 727    }
 728}
 729
 730int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
 731{
 732    int ret;
 733
 734    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
 735    if (ret < 0) {
 736        return ret;
 737    }
 738
 739    return qiov->size;
 740}
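
/*
 * Sketch of a vectored read through bdrv_preadv() (buf1, buf2, len1 and len2
 * are illustrative names, not defined in this file):
 *
 *     QEMUIOVector qiov;
 *     struct iovec iov[2] = {
 *         { .iov_base = buf1, .iov_len = len1 },
 *         { .iov_base = buf2, .iov_len = len2 },
 *     };
 *
 *     qemu_iovec_init_external(&qiov, iov, 2);
 *     ret = bdrv_preadv(child, offset, &qiov);
 */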
 741
 742int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
 743{
 744    QEMUIOVector qiov;
 745    struct iovec iov = {
 746        .iov_base = (void *)buf,
 747        .iov_len = bytes,
 748    };
 749
 750    if (bytes < 0) {
 751        return -EINVAL;
 752    }
 753
 754    qemu_iovec_init_external(&qiov, &iov, 1);
 755    return bdrv_preadv(child, offset, &qiov);
 756}
 757
 758int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
 759{
 760    int ret;
 761
 762    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
 763    if (ret < 0) {
 764        return ret;
 765    }
 766
 767    return qiov->size;
 768}
 769
 770int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
 771{
 772    QEMUIOVector qiov;
 773    struct iovec iov = {
 774        .iov_base   = (void *) buf,
 775        .iov_len    = bytes,
 776    };
 777
 778    if (bytes < 0) {
 779        return -EINVAL;
 780    }
 781
 782    qemu_iovec_init_external(&qiov, &iov, 1);
 783    return bdrv_pwritev(child, offset, &qiov);
 784}
 785
 786/*
 787 * Writes to the file and ensures that no writes are reordered across this
 788 * request (acts as a barrier)
 789 *
 790 * Returns 0 on success, -errno in error cases.
 791 */
 792int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
 793                     const void *buf, int count)
 794{
 795    int ret;
 796
 797    ret = bdrv_pwrite(child, offset, buf, count);
 798    if (ret < 0) {
 799        return ret;
 800    }
 801
 802    ret = bdrv_flush(child->bs);
 803    if (ret < 0) {
 804        return ret;
 805    }
 806
 807    return 0;
 808}
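
/*
 * Usage sketch (assumed caller; header_offset, header_buf and header_len are
 * illustrative names): metadata updates that must not be reordered with later
 * writes go through the _sync variant:
 *
 *     ret = bdrv_pwrite_sync(child, header_offset, header_buf, header_len);
 *     if (ret < 0) {
 *         return ret;
 *     }
 */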
 809
 810typedef struct CoroutineIOCompletion {
 811    Coroutine *coroutine;
 812    int ret;
 813} CoroutineIOCompletion;
 814
 815static void bdrv_co_io_em_complete(void *opaque, int ret)
 816{
 817    CoroutineIOCompletion *co = opaque;
 818
 819    co->ret = ret;
 820    qemu_coroutine_enter(co->coroutine);
 821}
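
/*
 * This callback implements the AIO-to-coroutine bridge used below, e.g. in
 * bdrv_driver_preadv(): the coroutine submits an AIO request with
 * bdrv_co_io_em_complete() as its completion callback and yields; the
 * callback stores the result and re-enters the coroutine:
 *
 *     CoroutineIOCompletion co = { .coroutine = qemu_coroutine_self() };
 *
 *     acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
 *                                   bdrv_co_io_em_complete, &co);
 *     if (acb) {
 *         qemu_coroutine_yield();
 *         ret = co.ret;
 *     }
 */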
 822
 823static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
 824                                           uint64_t offset, uint64_t bytes,
 825                                           QEMUIOVector *qiov, int flags)
 826{
 827    BlockDriver *drv = bs->drv;
 828    int64_t sector_num;
 829    unsigned int nb_sectors;
 830
 831    assert(!(flags & ~BDRV_REQ_MASK));
 832
 833    if (drv->bdrv_co_preadv) {
 834        return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
 835    }
 836
 837    sector_num = offset >> BDRV_SECTOR_BITS;
 838    nb_sectors = bytes >> BDRV_SECTOR_BITS;
 839
 840    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
 841    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 842    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
 843
 844    if (drv->bdrv_co_readv) {
 845        return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
 846    } else {
 847        BlockAIOCB *acb;
 848        CoroutineIOCompletion co = {
 849            .coroutine = qemu_coroutine_self(),
 850        };
 851
 852        acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
 853                                      bdrv_co_io_em_complete, &co);
 854        if (acb == NULL) {
 855            return -EIO;
 856        } else {
 857            qemu_coroutine_yield();
 858            return co.ret;
 859        }
 860    }
 861}
 862
 863static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
 864                                            uint64_t offset, uint64_t bytes,
 865                                            QEMUIOVector *qiov, int flags)
 866{
 867    BlockDriver *drv = bs->drv;
 868    int64_t sector_num;
 869    unsigned int nb_sectors;
 870    int ret;
 871
 872    assert(!(flags & ~BDRV_REQ_MASK));
 873
 874    if (drv->bdrv_co_pwritev) {
 875        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
 876                                   flags & bs->supported_write_flags);
 877        flags &= ~bs->supported_write_flags;
 878        goto emulate_flags;
 879    }
 880
 881    sector_num = offset >> BDRV_SECTOR_BITS;
 882    nb_sectors = bytes >> BDRV_SECTOR_BITS;
 883
 884    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
 885    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 886    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
 887
 888    if (drv->bdrv_co_writev_flags) {
 889        ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
 890                                        flags & bs->supported_write_flags);
 891        flags &= ~bs->supported_write_flags;
 892    } else if (drv->bdrv_co_writev) {
 893        assert(!bs->supported_write_flags);
 894        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
 895    } else {
 896        BlockAIOCB *acb;
 897        CoroutineIOCompletion co = {
 898            .coroutine = qemu_coroutine_self(),
 899        };
 900
 901        acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
 902                                       bdrv_co_io_em_complete, &co);
 903        if (acb == NULL) {
 904            ret = -EIO;
 905        } else {
 906            qemu_coroutine_yield();
 907            ret = co.ret;
 908        }
 909    }
 910
 911emulate_flags:
 912    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
 913        ret = bdrv_co_flush(bs);
 914    }
 915
 916    return ret;
 917}
 918
 919static int coroutine_fn
 920bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
 921                               uint64_t bytes, QEMUIOVector *qiov)
 922{
 923    BlockDriver *drv = bs->drv;
 924
 925    if (!drv->bdrv_co_pwritev_compressed) {
 926        return -ENOTSUP;
 927    }
 928
 929    return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
 930}
 931
 932static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
 933        int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
 934{
 935    /* Perform I/O through a temporary buffer so that users who scribble over
 936     * their read buffer while the operation is in progress do not end up
 937     * modifying the image file.  This is critical for zero-copy guest I/O
 938     * where anything might happen inside guest memory.
 939     */
 940    void *bounce_buffer;
 941
 942    BlockDriver *drv = bs->drv;
 943    struct iovec iov;
 944    QEMUIOVector bounce_qiov;
 945    int64_t cluster_offset;
 946    unsigned int cluster_bytes;
 947    size_t skip_bytes;
 948    int ret;
 949
 950    /* Cover entire cluster so no additional backing file I/O is required when
 951     * allocating cluster in the image file.
 952     */
 953    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
 954
 955    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
 956                                   cluster_offset, cluster_bytes);
 957
 958    iov.iov_len = cluster_bytes;
 959    iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
 960    if (bounce_buffer == NULL) {
 961        ret = -ENOMEM;
 962        goto err;
 963    }
 964
 965    qemu_iovec_init_external(&bounce_qiov, &iov, 1);
 966
 967    ret = bdrv_driver_preadv(bs, cluster_offset, cluster_bytes,
 968                             &bounce_qiov, 0);
 969    if (ret < 0) {
 970        goto err;
 971    }
 972
 973    if (drv->bdrv_co_pwrite_zeroes &&
 974        buffer_is_zero(bounce_buffer, iov.iov_len)) {
 975        /* FIXME: Should we (perhaps conditionally) be setting
 976         * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
 977         * that still correctly reads as zero? */
 978        ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, cluster_bytes, 0);
 979    } else {
  980        /* This does not change the data on the disk, so it is not necessary
 981         * to flush even in cache=writethrough mode.
 982         */
 983        ret = bdrv_driver_pwritev(bs, cluster_offset, cluster_bytes,
 984                                  &bounce_qiov, 0);
 985    }
 986
 987    if (ret < 0) {
 988        /* It might be okay to ignore write errors for guest requests.  If this
 989         * is a deliberate copy-on-read then we don't want to ignore the error.
 990         * Simply report it in all cases.
 991         */
 992        goto err;
 993    }
 994
 995    skip_bytes = offset - cluster_offset;
 996    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, bytes);
 997
 998err:
 999    qemu_vfree(bounce_buffer);
1000    return ret;
1001}
1002
1003/*
1004 * Forwards an already correctly aligned request to the BlockDriver. This
1005 * handles copy on read, zeroing after EOF, and fragmentation of large
1006 * reads; any other features must be implemented by the caller.
1007 */
1008static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
1009    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1010    int64_t align, QEMUIOVector *qiov, int flags)
1011{
1012    int64_t total_bytes, max_bytes;
1013    int ret = 0;
1014    uint64_t bytes_remaining = bytes;
1015    int max_transfer;
1016
1017    assert(is_power_of_2(align));
1018    assert((offset & (align - 1)) == 0);
1019    assert((bytes & (align - 1)) == 0);
1020    assert(!qiov || bytes == qiov->size);
1021    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1022    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1023                                   align);
1024
1025    /* TODO: We would need a per-BDS .supported_read_flags and
1026     * potential fallback support, if we ever implement any read flags
1027     * to pass through to drivers.  For now, there aren't any
1028     * passthrough flags.  */
1029    assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));
1030
1031    /* Handle Copy on Read and associated serialisation */
1032    if (flags & BDRV_REQ_COPY_ON_READ) {
1033        /* If we touch the same cluster it counts as an overlap.  This
1034         * guarantees that allocating writes will be serialized and not race
1035         * with each other for the same cluster.  For example, in copy-on-read
1036         * it ensures that the CoR read and write operations are atomic and
1037         * guest writes cannot interleave between them. */
1038        mark_request_serialising(req, bdrv_get_cluster_size(bs));
1039    }
1040
1041    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
1042        wait_serialising_requests(req);
1043    }
1044
1045    if (flags & BDRV_REQ_COPY_ON_READ) {
1046        int64_t start_sector = offset >> BDRV_SECTOR_BITS;
1047        int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1048        unsigned int nb_sectors = end_sector - start_sector;
1049        int pnum;
1050
1051        ret = bdrv_is_allocated(bs, start_sector, nb_sectors, &pnum);
1052        if (ret < 0) {
1053            goto out;
1054        }
1055
1056        if (!ret || pnum != nb_sectors) {
1057            ret = bdrv_co_do_copy_on_readv(bs, offset, bytes, qiov);
1058            goto out;
1059        }
1060    }
1061
1062    /* Forward the request to the BlockDriver, possibly fragmenting it */
1063    total_bytes = bdrv_getlength(bs);
1064    if (total_bytes < 0) {
1065        ret = total_bytes;
1066        goto out;
1067    }
1068
1069    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
1070    if (bytes <= max_bytes && bytes <= max_transfer) {
1071        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
1072        goto out;
1073    }
1074
1075    while (bytes_remaining) {
1076        int num;
1077
1078        if (max_bytes) {
1079            QEMUIOVector local_qiov;
1080
1081            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
1082            assert(num);
1083            qemu_iovec_init(&local_qiov, qiov->niov);
1084            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
1085
1086            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
1087                                     num, &local_qiov, 0);
1088            max_bytes -= num;
1089            qemu_iovec_destroy(&local_qiov);
1090        } else {
1091            num = bytes_remaining;
1092            ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
1093                                    bytes_remaining);
1094        }
1095        if (ret < 0) {
1096            goto out;
1097        }
1098        bytes_remaining -= num;
1099    }
1100
1101out:
1102    return ret < 0 ? ret : 0;
1103}
1104
1105/*
1106 * Handle a read request in coroutine context
1107 */
1108int coroutine_fn bdrv_co_preadv(BdrvChild *child,
1109    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1110    BdrvRequestFlags flags)
1111{
1112    BlockDriverState *bs = child->bs;
1113    BlockDriver *drv = bs->drv;
1114    BdrvTrackedRequest req;
1115
1116    uint64_t align = bs->bl.request_alignment;
1117    uint8_t *head_buf = NULL;
1118    uint8_t *tail_buf = NULL;
1119    QEMUIOVector local_qiov;
1120    bool use_local_qiov = false;
1121    int ret;
1122
1123    if (!drv) {
1124        return -ENOMEDIUM;
1125    }
1126
1127    ret = bdrv_check_byte_request(bs, offset, bytes);
1128    if (ret < 0) {
1129        return ret;
1130    }
1131
1132    bdrv_inc_in_flight(bs);
1133
 1134    /* Don't do copy-on-read if we read data before a write operation */
1135    if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
1136        flags |= BDRV_REQ_COPY_ON_READ;
1137    }
1138
1139    /* Align read if necessary by padding qiov */
1140    if (offset & (align - 1)) {
1141        head_buf = qemu_blockalign(bs, align);
1142        qemu_iovec_init(&local_qiov, qiov->niov + 2);
1143        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1144        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1145        use_local_qiov = true;
1146
1147        bytes += offset & (align - 1);
1148        offset = offset & ~(align - 1);
1149    }
1150
1151    if ((offset + bytes) & (align - 1)) {
1152        if (!use_local_qiov) {
1153            qemu_iovec_init(&local_qiov, qiov->niov + 1);
1154            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1155            use_local_qiov = true;
1156        }
1157        tail_buf = qemu_blockalign(bs, align);
1158        qemu_iovec_add(&local_qiov, tail_buf,
1159                       align - ((offset + bytes) & (align - 1)));
1160
1161        bytes = ROUND_UP(bytes, align);
1162    }
1163
1164    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
1165    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
1166                              use_local_qiov ? &local_qiov : qiov,
1167                              flags);
1168    tracked_request_end(&req);
1169    bdrv_dec_in_flight(bs);
1170
1171    if (use_local_qiov) {
1172        qemu_iovec_destroy(&local_qiov);
1173        qemu_vfree(head_buf);
1174        qemu_vfree(tail_buf);
1175    }
1176
1177    return ret;
1178}
1179
1180static int coroutine_fn bdrv_co_do_readv(BdrvChild *child,
1181    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1182    BdrvRequestFlags flags)
1183{
1184    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1185        return -EINVAL;
1186    }
1187
1188    return bdrv_co_preadv(child, sector_num << BDRV_SECTOR_BITS,
1189                          nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1190}
1191
1192int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num,
1193                               int nb_sectors, QEMUIOVector *qiov)
1194{
1195    trace_bdrv_co_readv(child->bs, sector_num, nb_sectors);
1196
1197    return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0);
1198}
1199
1200/* Maximum buffer for write zeroes fallback, in bytes */
1201#define MAX_WRITE_ZEROES_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
1202
1203static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
1204    int64_t offset, int count, BdrvRequestFlags flags)
1205{
1206    BlockDriver *drv = bs->drv;
1207    QEMUIOVector qiov;
1208    struct iovec iov = {0};
1209    int ret = 0;
1210    bool need_flush = false;
1211    int head = 0;
1212    int tail = 0;
1213
1214    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
1215    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
1216                        bs->bl.request_alignment);
1217    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
1218                                    MAX_WRITE_ZEROES_BOUNCE_BUFFER);
1219
1220    assert(alignment % bs->bl.request_alignment == 0);
1221    head = offset % alignment;
1222    tail = (offset + count) % alignment;
1223    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
1224    assert(max_write_zeroes >= bs->bl.request_alignment);
1225
1226    while (count > 0 && !ret) {
1227        int num = count;
1228
 1229        /* Align the request.  Block drivers can expect that the "bulk" of the
 1230         * request is aligned and that unaligned requests do not cross cluster
1231         * boundaries.
1232         */
1233        if (head) {
1234            /* Make a small request up to the first aligned sector. For
1235             * convenience, limit this request to max_transfer even if
1236             * we don't need to fall back to writes.  */
1237            num = MIN(MIN(count, max_transfer), alignment - head);
1238            head = (head + num) % alignment;
1239            assert(num < max_write_zeroes);
1240        } else if (tail && num > alignment) {
1241            /* Shorten the request to the last aligned sector.  */
1242            num -= tail;
1243        }
1244
1245        /* limit request size */
1246        if (num > max_write_zeroes) {
1247            num = max_write_zeroes;
1248        }
1249
1250        ret = -ENOTSUP;
1251        /* First try the efficient write zeroes operation */
1252        if (drv->bdrv_co_pwrite_zeroes) {
1253            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1254                                             flags & bs->supported_zero_flags);
1255            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1256                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1257                need_flush = true;
1258            }
1259        } else {
1260            assert(!bs->supported_zero_flags);
1261        }
1262
1263        if (ret == -ENOTSUP) {
1264            /* Fall back to bounce buffer if write zeroes is unsupported */
1265            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1266
1267            if ((flags & BDRV_REQ_FUA) &&
1268                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1269                /* No need for bdrv_driver_pwrite() to do a fallback
1270                 * flush on each chunk; use just one at the end */
1271                write_flags &= ~BDRV_REQ_FUA;
1272                need_flush = true;
1273            }
1274            num = MIN(num, max_transfer);
1275            iov.iov_len = num;
1276            if (iov.iov_base == NULL) {
1277                iov.iov_base = qemu_try_blockalign(bs, num);
1278                if (iov.iov_base == NULL) {
1279                    ret = -ENOMEM;
1280                    goto fail;
1281                }
1282                memset(iov.iov_base, 0, num);
1283            }
1284            qemu_iovec_init_external(&qiov, &iov, 1);
1285
1286            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);
1287
 1288            /* Keep the bounce buffer around if it is big enough for all
 1289             * future requests.
1290             */
1291            if (num < max_transfer) {
1292                qemu_vfree(iov.iov_base);
1293                iov.iov_base = NULL;
1294            }
1295        }
1296
1297        offset += num;
1298        count -= num;
1299    }
1300
1301fail:
1302    if (ret == 0 && need_flush) {
1303        ret = bdrv_co_flush(bs);
1304    }
1305    qemu_vfree(iov.iov_base);
1306    return ret;
1307}
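
/*
 * Worked example with hypothetical limits: for alignment == 4096, a request
 * with offset == 5000 and count == 10000 is processed as three fragments,
 * none of which crosses an alignment boundary:
 *
 *     head  : 5000..8191    (3192 bytes, up to the first aligned offset)
 *     middle: 8192..12287   (4096 bytes, fully aligned)
 *     tail  : 12288..14999  (2712 bytes)
 */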
1308
1309/*
1310 * Forwards an already correctly aligned write request to the BlockDriver,
1311 * after possibly fragmenting it.
1312 */
1313static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
1314    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1315    int64_t align, QEMUIOVector *qiov, int flags)
1316{
1317    BlockDriver *drv = bs->drv;
1318    bool waited;
1319    int ret;
1320
1321    int64_t start_sector = offset >> BDRV_SECTOR_BITS;
1322    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1323    uint64_t bytes_remaining = bytes;
1324    int max_transfer;
1325
1326    assert(is_power_of_2(align));
1327    assert((offset & (align - 1)) == 0);
1328    assert((bytes & (align - 1)) == 0);
1329    assert(!qiov || bytes == qiov->size);
1330    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1331    assert(!(flags & ~BDRV_REQ_MASK));
1332    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1333                                   align);
1334
1335    waited = wait_serialising_requests(req);
1336    assert(!waited || !req->serialising);
1337    assert(req->overlap_offset <= offset);
1338    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1339
1340    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
1341
1342    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1343        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
1344        qemu_iovec_is_zero(qiov)) {
1345        flags |= BDRV_REQ_ZERO_WRITE;
1346        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
1347            flags |= BDRV_REQ_MAY_UNMAP;
1348        }
1349    }
1350
1351    if (ret < 0) {
1352        /* Do nothing, write notifier decided to fail this request */
1353    } else if (flags & BDRV_REQ_ZERO_WRITE) {
1354        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
1355        ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
1356    } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
1357        ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov);
1358    } else if (bytes <= max_transfer) {
1359        bdrv_debug_event(bs, BLKDBG_PWRITEV);
1360        ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
1361    } else {
1362        bdrv_debug_event(bs, BLKDBG_PWRITEV);
1363        while (bytes_remaining) {
1364            int num = MIN(bytes_remaining, max_transfer);
1365            QEMUIOVector local_qiov;
1366            int local_flags = flags;
1367
1368            assert(num);
1369            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
1370                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1371                /* If FUA is going to be emulated by flush, we only
1372                 * need to flush on the last iteration */
1373                local_flags &= ~BDRV_REQ_FUA;
1374            }
1375            qemu_iovec_init(&local_qiov, qiov->niov);
1376            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
1377
1378            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
1379                                      num, &local_qiov, local_flags);
1380            qemu_iovec_destroy(&local_qiov);
1381            if (ret < 0) {
1382                break;
1383            }
1384            bytes_remaining -= num;
1385        }
1386    }
1387    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
1388
1389    ++bs->write_gen;
1390    bdrv_set_dirty(bs, start_sector, end_sector - start_sector);
1391
1392    if (bs->wr_highest_offset < offset + bytes) {
1393        bs->wr_highest_offset = offset + bytes;
1394    }
1395
1396    if (ret >= 0) {
1397        bs->total_sectors = MAX(bs->total_sectors, end_sector);
1398        ret = 0;
1399    }
1400
1401    return ret;
1402}
1403
1404static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
1405                                                int64_t offset,
1406                                                unsigned int bytes,
1407                                                BdrvRequestFlags flags,
1408                                                BdrvTrackedRequest *req)
1409{
1410    uint8_t *buf = NULL;
1411    QEMUIOVector local_qiov;
1412    struct iovec iov;
1413    uint64_t align = bs->bl.request_alignment;
1414    unsigned int head_padding_bytes, tail_padding_bytes;
1415    int ret = 0;
1416
1417    head_padding_bytes = offset & (align - 1);
1418    tail_padding_bytes = align - ((offset + bytes) & (align - 1));
1419
1420
1421    assert(flags & BDRV_REQ_ZERO_WRITE);
1422    if (head_padding_bytes || tail_padding_bytes) {
1423        buf = qemu_blockalign(bs, align);
1424        iov = (struct iovec) {
1425            .iov_base   = buf,
1426            .iov_len    = align,
1427        };
1428        qemu_iovec_init_external(&local_qiov, &iov, 1);
1429    }
1430    if (head_padding_bytes) {
1431        uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
1432
1433        /* RMW the unaligned part before head. */
1434        mark_request_serialising(req, align);
1435        wait_serialising_requests(req);
1436        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1437        ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
1438                                  align, &local_qiov, 0);
1439        if (ret < 0) {
1440            goto fail;
1441        }
1442        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1443
1444        memset(buf + head_padding_bytes, 0, zero_bytes);
1445        ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
1446                                   align, &local_qiov,
1447                                   flags & ~BDRV_REQ_ZERO_WRITE);
1448        if (ret < 0) {
1449            goto fail;
1450        }
1451        offset += zero_bytes;
1452        bytes -= zero_bytes;
1453    }
1454
1455    assert(!bytes || (offset & (align - 1)) == 0);
1456    if (bytes >= align) {
1457        /* Write the aligned part in the middle. */
1458        uint64_t aligned_bytes = bytes & ~(align - 1);
1459        ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, align,
1460                                   NULL, flags);
1461        if (ret < 0) {
1462            goto fail;
1463        }
1464        bytes -= aligned_bytes;
1465        offset += aligned_bytes;
1466    }
1467
1468    assert(!bytes || (offset & (align - 1)) == 0);
1469    if (bytes) {
1470        assert(align == tail_padding_bytes + bytes);
1471        /* RMW the unaligned part after tail. */
1472        mark_request_serialising(req, align);
1473        wait_serialising_requests(req);
1474        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1475        ret = bdrv_aligned_preadv(bs, req, offset, align,
1476                                  align, &local_qiov, 0);
1477        if (ret < 0) {
1478            goto fail;
1479        }
1480        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1481
1482        memset(buf, 0, bytes);
1483        ret = bdrv_aligned_pwritev(bs, req, offset, align, align,
1484                                   &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
1485    }
1486fail:
1487    qemu_vfree(buf);
1488    return ret;
1489
1490}
1491
1492/*
1493 * Handle a write request in coroutine context
1494 */
1495int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
1496    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1497    BdrvRequestFlags flags)
1498{
1499    BlockDriverState *bs = child->bs;
1500    BdrvTrackedRequest req;
1501    uint64_t align = bs->bl.request_alignment;
1502    uint8_t *head_buf = NULL;
1503    uint8_t *tail_buf = NULL;
1504    QEMUIOVector local_qiov;
1505    bool use_local_qiov = false;
1506    int ret;
1507
1508    if (!bs->drv) {
1509        return -ENOMEDIUM;
1510    }
1511    if (bs->read_only) {
1512        return -EPERM;
1513    }
1514    assert(!(bs->open_flags & BDRV_O_INACTIVE));
1515
1516    ret = bdrv_check_byte_request(bs, offset, bytes);
1517    if (ret < 0) {
1518        return ret;
1519    }
1520
1521    bdrv_inc_in_flight(bs);
1522    /*
1523     * Align write if necessary by performing a read-modify-write cycle.
1524     * Pad qiov with the read parts and be sure to have a tracked request not
1525     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
1526     */
1527    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
1528
1529    if (!qiov) {
1530        ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
1531        goto out;
1532    }
1533
1534    if (offset & (align - 1)) {
1535        QEMUIOVector head_qiov;
1536        struct iovec head_iov;
1537
1538        mark_request_serialising(&req, align);
1539        wait_serialising_requests(&req);
1540
1541        head_buf = qemu_blockalign(bs, align);
1542        head_iov = (struct iovec) {
1543            .iov_base   = head_buf,
1544            .iov_len    = align,
1545        };
1546        qemu_iovec_init_external(&head_qiov, &head_iov, 1);
1547
1548        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1549        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
1550                                  align, &head_qiov, 0);
1551        if (ret < 0) {
1552            goto fail;
1553        }
1554        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1555
1556        qemu_iovec_init(&local_qiov, qiov->niov + 2);
1557        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1558        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1559        use_local_qiov = true;
1560
1561        bytes += offset & (align - 1);
1562        offset = offset & ~(align - 1);
1563
1564        /* We have read the tail already if the request is smaller
1565         * than one aligned block.
1566         */
1567        if (bytes < align) {
1568            qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
1569            bytes = align;
1570        }
1571    }
1572
1573    if ((offset + bytes) & (align - 1)) {
1574        QEMUIOVector tail_qiov;
1575        struct iovec tail_iov;
1576        size_t tail_bytes;
1577        bool waited;
1578
1579        mark_request_serialising(&req, align);
1580        waited = wait_serialising_requests(&req);
1581        assert(!waited || !use_local_qiov);
1582
1583        tail_buf = qemu_blockalign(bs, align);
1584        tail_iov = (struct iovec) {
1585            .iov_base   = tail_buf,
1586            .iov_len    = align,
1587        };
1588        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
1589
1590        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1591        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
1592                                  align, &tail_qiov, 0);
1593        if (ret < 0) {
1594            goto fail;
1595        }
1596        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1597
1598        if (!use_local_qiov) {
1599            qemu_iovec_init(&local_qiov, qiov->niov + 1);
1600            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1601            use_local_qiov = true;
1602        }
1603
1604        tail_bytes = (offset + bytes) & (align - 1);
1605        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
1606
1607        bytes = ROUND_UP(bytes, align);
1608    }
1609
1610    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, align,
1611                               use_local_qiov ? &local_qiov : qiov,
1612                               flags);
1613
1614fail:
1615
1616    if (use_local_qiov) {
1617        qemu_iovec_destroy(&local_qiov);
1618    }
1619    qemu_vfree(head_buf);
1620    qemu_vfree(tail_buf);
1621out:
1622    tracked_request_end(&req);
1623    bdrv_dec_in_flight(bs);
1624    return ret;
1625}
1626
1627static int coroutine_fn bdrv_co_do_writev(BdrvChild *child,
1628    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1629    BdrvRequestFlags flags)
1630{
1631    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1632        return -EINVAL;
1633    }
1634
1635    return bdrv_co_pwritev(child, sector_num << BDRV_SECTOR_BITS,
1636                           nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1637}
1638
1639int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num,
1640    int nb_sectors, QEMUIOVector *qiov)
1641{
1642    trace_bdrv_co_writev(child->bs, sector_num, nb_sectors);
1643
1644    return bdrv_co_do_writev(child, sector_num, nb_sectors, qiov, 0);
1645}
1646
1647int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
1648                                       int count, BdrvRequestFlags flags)
1649{
1650    trace_bdrv_co_pwrite_zeroes(child->bs, offset, count, flags);
1651
1652    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
1653        flags &= ~BDRV_REQ_MAY_UNMAP;
1654    }
1655
1656    return bdrv_co_pwritev(child, offset, count, NULL,
1657                           BDRV_REQ_ZERO_WRITE | flags);
1658}
1659
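/*
 * Illustrative usage sketch (not used anywhere in this file): zero a byte
 * range from coroutine context.  As shown above, BDRV_REQ_MAY_UNMAP is
 * silently dropped when the node was opened without BDRV_O_UNMAP.  The
 * helper name and the 1 MiB range are made up.
 */
static int coroutine_fn example_zero_first_mib(BdrvChild *child)
{
    /* The request is fragmented internally against bl.max_pwrite_zeroes and
     * falls back to writing explicit zeroes if the driver has no efficient
     * zeroing support. */
    return bdrv_co_pwrite_zeroes(child, 0, 1 * 1024 * 1024,
                                 BDRV_REQ_MAY_UNMAP);
}
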
1660/*
1661 * Flush ALL BDSes regardless of whether they are reachable via a BlockBackend.
1662 */
1663int bdrv_flush_all(void)
1664{
1665    BdrvNextIterator it;
1666    BlockDriverState *bs = NULL;
1667    int result = 0;
1668
1669    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
1670        AioContext *aio_context = bdrv_get_aio_context(bs);
1671        int ret;
1672
1673        aio_context_acquire(aio_context);
1674        ret = bdrv_flush(bs);
1675        if (ret < 0 && !result) {
1676            result = ret;
1677        }
1678        aio_context_release(aio_context);
1679    }
1680
1681    return result;
1682}
1683
1684
1685typedef struct BdrvCoGetBlockStatusData {
1686    BlockDriverState *bs;
1687    BlockDriverState *base;
1688    BlockDriverState **file;
1689    int64_t sector_num;
1690    int nb_sectors;
1691    int *pnum;
1692    int64_t ret;
1693    bool done;
1694} BdrvCoGetBlockStatusData;
1695
1696/*
1697 * Returns the allocation status of the specified sectors.
1698 * Drivers not implementing the functionality are assumed to not support
1699 * backing files, hence all their sectors are reported as allocated.
1700 *
1701 * If 'sector_num' is beyond the end of the disk image the return value is 0
1702 * and 'pnum' is set to 0.
1703 *
1704 * 'pnum' is set to the number of sectors (including and immediately following
1705 * the specified sector) that are known to be in the same
1706 * allocated/unallocated state.
1707 *
1708 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
1709 * beyond the end of the disk image it will be clamped.
1710 *
1711 * If the returned value is positive and the BDRV_BLOCK_OFFSET_VALID bit is
1712 * set, 'file' points to the BDS in which the sector range is allocated.
1713 */
1714static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
1715                                                     int64_t sector_num,
1716                                                     int nb_sectors, int *pnum,
1717                                                     BlockDriverState **file)
1718{
1719    int64_t total_sectors;
1720    int64_t n;
1721    int64_t ret, ret2;
1722
1723    total_sectors = bdrv_nb_sectors(bs);
1724    if (total_sectors < 0) {
1725        return total_sectors;
1726    }
1727
1728    if (sector_num >= total_sectors) {
1729        *pnum = 0;
1730        return 0;
1731    }
1732
1733    n = total_sectors - sector_num;
1734    if (n < nb_sectors) {
1735        nb_sectors = n;
1736    }
1737
1738    if (!bs->drv->bdrv_co_get_block_status) {
1739        *pnum = nb_sectors;
1740        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
1741        if (bs->drv->protocol_name) {
1742            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
1743        }
1744        return ret;
1745    }
1746
1747    *file = NULL;
1748    bdrv_inc_in_flight(bs);
1749    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum,
1750                                            file);
1751    if (ret < 0) {
1752        *pnum = 0;
1753        goto out;
1754    }
1755
1756    if (ret & BDRV_BLOCK_RAW) {
1757        assert(ret & BDRV_BLOCK_OFFSET_VALID);
1758        ret = bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
1759                                    *pnum, pnum, file);
1760        goto out;
1761    }
1762
1763    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
1764        ret |= BDRV_BLOCK_ALLOCATED;
1765    } else {
1766        if (bdrv_unallocated_blocks_are_zero(bs)) {
1767            ret |= BDRV_BLOCK_ZERO;
1768        } else if (bs->backing) {
1769            BlockDriverState *bs2 = bs->backing->bs;
1770            int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
1771            if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
1772                ret |= BDRV_BLOCK_ZERO;
1773            }
1774        }
1775    }
1776
1777    if (*file && *file != bs &&
1778        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
1779        (ret & BDRV_BLOCK_OFFSET_VALID)) {
1780        BlockDriverState *file2;
1781        int file_pnum;
1782
1783        ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
1784                                        *pnum, &file_pnum, &file2);
1785        if (ret2 >= 0) {
1786            /* Ignore errors.  This is just providing extra information; it
1787             * is useful but not necessary.
1788             */
1789            if (!file_pnum) {
1790                /* !file_pnum indicates an offset at or beyond the EOF; it is
1791                 * perfectly valid for the format block driver to point to such
1792                 * offsets, so catch it and mark everything as zero */
1793                ret |= BDRV_BLOCK_ZERO;
1794            } else {
1795                /* Limit request to the range reported by the protocol driver */
1796                *pnum = file_pnum;
1797                ret |= (ret2 & BDRV_BLOCK_ZERO);
1798            }
1799        }
1800    }
1801
1802out:
1803    bdrv_dec_in_flight(bs);
1804    return ret;
1805}
1806
1807static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
1808        BlockDriverState *base,
1809        int64_t sector_num,
1810        int nb_sectors,
1811        int *pnum,
1812        BlockDriverState **file)
1813{
1814    BlockDriverState *p;
1815    int64_t ret = 0;
1816
1817    assert(bs != base);
1818    for (p = bs; p != base; p = backing_bs(p)) {
1819        ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file);
1820        if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
1821            break;
1822        }
1823        /* [sector_num, pnum] unallocated on this layer, which could be only
1824         * the first part of [sector_num, nb_sectors].  */
1825        nb_sectors = MIN(nb_sectors, *pnum);
1826    }
1827    return ret;
1828}
1829
1830/* Coroutine wrapper for bdrv_get_block_status_above() */
1831static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
1832{
1833    BdrvCoGetBlockStatusData *data = opaque;
1834
1835    data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
1836                                               data->sector_num,
1837                                               data->nb_sectors,
1838                                               data->pnum,
1839                                               data->file);
1840    data->done = true;
1841}
1842
1843/*
1844 * Synchronous wrapper around bdrv_co_get_block_status_above().
1845 *
1846 * See bdrv_co_get_block_status_above() for details.
1847 */
1848int64_t bdrv_get_block_status_above(BlockDriverState *bs,
1849                                    BlockDriverState *base,
1850                                    int64_t sector_num,
1851                                    int nb_sectors, int *pnum,
1852                                    BlockDriverState **file)
1853{
1854    Coroutine *co;
1855    BdrvCoGetBlockStatusData data = {
1856        .bs = bs,
1857        .base = base,
1858        .file = file,
1859        .sector_num = sector_num,
1860        .nb_sectors = nb_sectors,
1861        .pnum = pnum,
1862        .done = false,
1863    };
1864
1865    if (qemu_in_coroutine()) {
1866        /* Fast-path if already in coroutine context */
1867        bdrv_get_block_status_above_co_entry(&data);
1868    } else {
1869        co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry,
1870                                   &data);
1871        qemu_coroutine_enter(co);
1872        BDRV_POLL_WHILE(bs, !data.done);
1873    }
1874    return data.ret;
1875}
1876
1877int64_t bdrv_get_block_status(BlockDriverState *bs,
1878                              int64_t sector_num,
1879                              int nb_sectors, int *pnum,
1880                              BlockDriverState **file)
1881{
1882    return bdrv_get_block_status_above(bs, backing_bs(bs),
1883                                       sector_num, nb_sectors, pnum, file);
1884}
1885
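/*
 * Illustrative usage sketch (not used anywhere in this file): query and
 * decode the allocation status of a sector range, following the contract
 * documented above bdrv_co_get_block_status().  The helper name and the
 * 1024-sector window are made up.
 */
static void example_dump_block_status(BlockDriverState *bs, int64_t sector_num)
{
    BlockDriverState *file;
    int pnum;
    int64_t ret = bdrv_get_block_status(bs, sector_num, 1024, &pnum, &file);

    if (ret < 0) {
        error_report("block status query failed: %s", strerror((int)-ret));
        return;
    }

    error_report("%d sector(s): %s%s", pnum,
                 ret & BDRV_BLOCK_ALLOCATED ? "allocated" : "unallocated",
                 ret & BDRV_BLOCK_ZERO ? ", reads as zero" : "");

    if (ret & BDRV_BLOCK_OFFSET_VALID) {
        /* The host offset lives in the upper bits of the return value */
        error_report("mapped at offset %" PRId64 " in '%s'",
                     ret & BDRV_BLOCK_OFFSET_MASK,
                     file ? bdrv_get_node_name(file) : "?");
    }
}
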
1886int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
1887                                   int nb_sectors, int *pnum)
1888{
1889    BlockDriverState *file;
1890    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum,
1891                                        &file);
1892    if (ret < 0) {
1893        return ret;
1894    }
1895    return !!(ret & BDRV_BLOCK_ALLOCATED);
1896}
1897
1898/*
1899 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
1900 *
1901 * Return true if the given sector is allocated in any image from TOP down
1902 * to, but not including, BASE.  BASE can be NULL to check if the given
1903 * sector is allocated in any image of the chain.  Return false otherwise.
1904 *
1905 * 'pnum' is set to the number of sectors (including and immediately following
1906 *  the specified sector) that are known to be in the same
1907 *  allocated/unallocated state.
1908 *
1909 */
1910int bdrv_is_allocated_above(BlockDriverState *top,
1911                            BlockDriverState *base,
1912                            int64_t sector_num,
1913                            int nb_sectors, int *pnum)
1914{
1915    BlockDriverState *intermediate;
1916    int ret, n = nb_sectors;
1917
1918    intermediate = top;
1919    while (intermediate && intermediate != base) {
1920        int pnum_inter;
1921        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
1922                                &pnum_inter);
1923        if (ret < 0) {
1924            return ret;
1925        } else if (ret) {
1926            *pnum = pnum_inter;
1927            return 1;
1928        }
1929
1930        /*
1931         * [sector_num, nb_sectors] is unallocated on top but intermediate
1932         * might have
1933         *
1934         * [sector_num+x, nb_sectors] allocated.
1935         */
1936        if (n > pnum_inter &&
1937            (intermediate == top ||
1938             sector_num + pnum_inter < intermediate->total_sectors)) {
1939            n = pnum_inter;
1940        }
1941
1942        intermediate = backing_bs(intermediate);
1943    }
1944
1945    *pnum = n;
1946    return 0;
1947}
1948
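/*
 * Illustrative usage sketch (not used anywhere in this file): count how many
 * sectors of a range are allocated somewhere above BASE, the kind of scan a
 * commit/stream-style job performs.  The helper name is made up.
 */
static int64_t example_count_allocated_above(BlockDriverState *top,
                                             BlockDriverState *base,
                                             int64_t sector_num,
                                             int nb_sectors)
{
    int64_t allocated = 0;

    while (nb_sectors > 0) {
        int pnum;
        int ret = bdrv_is_allocated_above(top, base, sector_num, nb_sectors,
                                          &pnum);
        if (ret < 0) {
            return ret;
        }
        if (pnum == 0) {
            break;              /* at or beyond the end of the image */
        }
        if (ret) {
            allocated += pnum;  /* first pnum sectors come from above BASE */
        }
        sector_num += pnum;
        nb_sectors -= pnum;
    }

    return allocated;
}
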
1949typedef struct BdrvVmstateCo {
1950    BlockDriverState    *bs;
1951    QEMUIOVector        *qiov;
1952    int64_t             pos;
1953    bool                is_read;
1954    int                 ret;
1955} BdrvVmstateCo;
1956
1957static int coroutine_fn
1958bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
1959                   bool is_read)
1960{
1961    BlockDriver *drv = bs->drv;
1962
1963    if (!drv) {
1964        return -ENOMEDIUM;
1965    } else if (drv->bdrv_load_vmstate) {
1966        return is_read ? drv->bdrv_load_vmstate(bs, qiov, pos)
1967                       : drv->bdrv_save_vmstate(bs, qiov, pos);
1968    } else if (bs->file) {
1969        return bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
1970    }
1971
1972    return -ENOTSUP;
1973}
1974
1975static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
1976{
1977    BdrvVmstateCo *co = opaque;
1978    co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
1979}
1980
1981static inline int
1982bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
1983                bool is_read)
1984{
1985    if (qemu_in_coroutine()) {
1986        return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
1987    } else {
1988        BdrvVmstateCo data = {
1989            .bs         = bs,
1990            .qiov       = qiov,
1991            .pos        = pos,
1992            .is_read    = is_read,
1993            .ret        = -EINPROGRESS,
1994        };
1995        Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
1996
1997        qemu_coroutine_enter(co);
1998        while (data.ret == -EINPROGRESS) {
1999            aio_poll(bdrv_get_aio_context(bs), true);
2000        }
2001        return data.ret;
2002    }
2003}
2004
2005int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2006                      int64_t pos, int size)
2007{
2008    QEMUIOVector qiov;
2009    struct iovec iov = {
2010        .iov_base   = (void *) buf,
2011        .iov_len    = size,
2012    };
2013    int ret;
2014
2015    qemu_iovec_init_external(&qiov, &iov, 1);
2016
2017    ret = bdrv_writev_vmstate(bs, &qiov, pos);
2018    if (ret < 0) {
2019        return ret;
2020    }
2021
2022    return size;
2023}
2024
2025int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2026{
2027    return bdrv_rw_vmstate(bs, qiov, pos, false);
2028}
2029
2030int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2031                      int64_t pos, int size)
2032{
2033    QEMUIOVector qiov;
2034    struct iovec iov = {
2035        .iov_base   = buf,
2036        .iov_len    = size,
2037    };
2038    int ret;
2039
2040    qemu_iovec_init_external(&qiov, &iov, 1);
2041    ret = bdrv_readv_vmstate(bs, &qiov, pos);
2042    if (ret < 0) {
2043        return ret;
2044    }
2045
2046    return size;
2047}
2048
2049int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2050{
2051    return bdrv_rw_vmstate(bs, qiov, pos, true);
2052}
2053
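/*
 * Illustrative usage sketch (not used anywhere in this file): round-trip a
 * small buffer through the image's vmstate area.  This only works on formats
 * that implement the vmstate callbacks (e.g. qcow2); the helper name, buffer
 * contents and size are made up.
 */
static int example_vmstate_roundtrip(BlockDriverState *bs)
{
    uint8_t out[16] = "vmstate example";
    uint8_t in[16];
    int ret;

    ret = bdrv_save_vmstate(bs, out, 0, sizeof(out));
    if (ret < 0) {
        return ret;    /* e.g. -ENOTSUP for images without a vmstate area */
    }

    ret = bdrv_load_vmstate(bs, in, 0, sizeof(in));
    if (ret < 0) {
        return ret;
    }

    /* On success both helpers return the number of bytes transferred */
    return memcmp(in, out, sizeof(out)) == 0 ? 0 : -EIO;
}
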
2054/**************************************************************/
2055/* async I/Os */
2056
2057BlockAIOCB *bdrv_aio_readv(BdrvChild *child, int64_t sector_num,
2058                           QEMUIOVector *qiov, int nb_sectors,
2059                           BlockCompletionFunc *cb, void *opaque)
2060{
2061    trace_bdrv_aio_readv(child->bs, sector_num, nb_sectors, opaque);
2062
2063    assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
2064    return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
2065                                  0, cb, opaque, false);
2066}
2067
2068BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num,
2069                            QEMUIOVector *qiov, int nb_sectors,
2070                            BlockCompletionFunc *cb, void *opaque)
2071{
2072    trace_bdrv_aio_writev(child->bs, sector_num, nb_sectors, opaque);
2073
2074    assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
2075    return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
2076                                  0, cb, opaque, true);
2077}
2078
2079void bdrv_aio_cancel(BlockAIOCB *acb)
2080{
2081    qemu_aio_ref(acb);
2082    bdrv_aio_cancel_async(acb);
2083    while (acb->refcnt > 1) {
2084        if (acb->aiocb_info->get_aio_context) {
2085            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
2086        } else if (acb->bs) {
2087            aio_poll(bdrv_get_aio_context(acb->bs), true);
2088        } else {
2089            abort();
2090        }
2091    }
2092    qemu_aio_unref(acb);
2093}
2094
2095/* Asynchronous version of aio cancel. The caller is not blocked if the acb
2096 * implements cancel_async; otherwise we do nothing and let the request
2097 * complete normally. In either case the completion callback must be called. */
2098void bdrv_aio_cancel_async(BlockAIOCB *acb)
2099{
2100    if (acb->aiocb_info->cancel_async) {
2101        acb->aiocb_info->cancel_async(acb);
2102    }
2103}
2104
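/*
 * Illustrative usage sketch (not used anywhere in this file): submit an
 * asynchronous flush and then request cancellation.  Whether or not the
 * driver implements cancel_async, the completion callback runs exactly once.
 * Callback and helper names are made up; a cancelled request typically
 * completes with -ECANCELED.
 */
static void example_flush_done(void *opaque, int ret)
{
    bool *done = opaque;

    if (ret < 0 && ret != -ECANCELED) {
        error_report("flush failed: %s", strerror(-ret));
    }
    *done = true;
}

static void example_aio_flush_then_cancel(BlockDriverState *bs, bool *done)
{
    BlockAIOCB *acb = bdrv_aio_flush(bs, example_flush_done, done);

    /* Non-blocking: if the driver cannot cancel, the flush simply completes
     * normally; bdrv_aio_cancel() would instead poll until completion. */
    bdrv_aio_cancel_async(acb);
}
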
2105/**************************************************************/
2106/* async block device emulation */
2107
2108typedef struct BlockRequest {
2109    union {
2110        /* Used during read, write, trim */
2111        struct {
2112            int64_t offset;
2113            int bytes;
2114            int flags;
2115            QEMUIOVector *qiov;
2116        };
2117        /* Used during ioctl */
2118        struct {
2119            int req;
2120            void *buf;
2121        };
2122    };
2123    BlockCompletionFunc *cb;
2124    void *opaque;
2125
2126    int error;
2127} BlockRequest;
2128
2129typedef struct BlockAIOCBCoroutine {
2130    BlockAIOCB common;
2131    BdrvChild *child;
2132    BlockRequest req;
2133    bool is_write;
2134    bool need_bh;
2135    bool *done;
2136} BlockAIOCBCoroutine;
2137
2138static const AIOCBInfo bdrv_em_co_aiocb_info = {
2139    .aiocb_size         = sizeof(BlockAIOCBCoroutine),
2140};
2141
2142static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
2143{
2144    if (!acb->need_bh) {
2145        bdrv_dec_in_flight(acb->common.bs);
2146        acb->common.cb(acb->common.opaque, acb->req.error);
2147        qemu_aio_unref(acb);
2148    }
2149}
2150
2151static void bdrv_co_em_bh(void *opaque)
2152{
2153    BlockAIOCBCoroutine *acb = opaque;
2154
2155    assert(!acb->need_bh);
2156    bdrv_co_complete(acb);
2157}
2158
2159static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
2160{
2161    acb->need_bh = false;
2162    if (acb->req.error != -EINPROGRESS) {
2163        BlockDriverState *bs = acb->common.bs;
2164
2165        aio_bh_schedule_oneshot(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
2166    }
2167}
2168
2169/* Invoke bdrv_co_preadv/bdrv_co_pwritev */
2170static void coroutine_fn bdrv_co_do_rw(void *opaque)
2171{
2172    BlockAIOCBCoroutine *acb = opaque;
2173
2174    if (!acb->is_write) {
2175        acb->req.error = bdrv_co_preadv(acb->child, acb->req.offset,
2176            acb->req.qiov->size, acb->req.qiov, acb->req.flags);
2177    } else {
2178        acb->req.error = bdrv_co_pwritev(acb->child, acb->req.offset,
2179            acb->req.qiov->size, acb->req.qiov, acb->req.flags);
2180    }
2181
2182    bdrv_co_complete(acb);
2183}
2184
2185static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
2186                                          int64_t offset,
2187                                          QEMUIOVector *qiov,
2188                                          BdrvRequestFlags flags,
2189                                          BlockCompletionFunc *cb,
2190                                          void *opaque,
2191                                          bool is_write)
2192{
2193    Coroutine *co;
2194    BlockAIOCBCoroutine *acb;
2195
2196    /* Matched by bdrv_co_complete's bdrv_dec_in_flight.  */
2197    bdrv_inc_in_flight(child->bs);
2198
2199    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, child->bs, cb, opaque);
2200    acb->child = child;
2201    acb->need_bh = true;
2202    acb->req.error = -EINPROGRESS;
2203    acb->req.offset = offset;
2204    acb->req.qiov = qiov;
2205    acb->req.flags = flags;
2206    acb->is_write = is_write;
2207
2208    co = qemu_coroutine_create(bdrv_co_do_rw, acb);
2209    qemu_coroutine_enter(co);
2210
2211    bdrv_co_maybe_schedule_bh(acb);
2212    return &acb->common;
2213}
2214
2215static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
2216{
2217    BlockAIOCBCoroutine *acb = opaque;
2218    BlockDriverState *bs = acb->common.bs;
2219
2220    acb->req.error = bdrv_co_flush(bs);
2221    bdrv_co_complete(acb);
2222}
2223
2224BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2225        BlockCompletionFunc *cb, void *opaque)
2226{
2227    Coroutine *co;
2228    BlockAIOCBCoroutine *acb;
2229
2230    trace_bdrv_aio_flush(bs, opaque);
2231
2232    /* Matched by bdrv_co_complete's bdrv_dec_in_flight.  */
2233    bdrv_inc_in_flight(bs);
2234
2235    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2236    acb->need_bh = true;
2237    acb->req.error = -EINPROGRESS;
2238
2239    co = qemu_coroutine_create(bdrv_aio_flush_co_entry, acb);
2240    qemu_coroutine_enter(co);
2241
2242    bdrv_co_maybe_schedule_bh(acb);
2243    return &acb->common;
2244}
2245
2246void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
2247                   BlockCompletionFunc *cb, void *opaque)
2248{
2249    BlockAIOCB *acb;
2250
2251    acb = g_malloc(aiocb_info->aiocb_size);
2252    acb->aiocb_info = aiocb_info;
2253    acb->bs = bs;
2254    acb->cb = cb;
2255    acb->opaque = opaque;
2256    acb->refcnt = 1;
2257    return acb;
2258}
2259
2260void qemu_aio_ref(void *p)
2261{
2262    BlockAIOCB *acb = p;
2263    acb->refcnt++;
2264}
2265
2266void qemu_aio_unref(void *p)
2267{
2268    BlockAIOCB *acb = p;
2269    assert(acb->refcnt > 0);
2270    if (--acb->refcnt == 0) {
2271        g_free(acb);
2272    }
2273}
2274
2275/**************************************************************/
2276/* Coroutine block device emulation */
2277
2278typedef struct FlushCo {
2279    BlockDriverState *bs;
2280    int ret;
2281} FlushCo;
2282
2283
2284static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2285{
2286    FlushCo *rwco = opaque;
2287
2288    rwco->ret = bdrv_co_flush(rwco->bs);
2289}
2290
2291int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2292{
2293    int ret;
2294
2295    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
2296        bdrv_is_sg(bs)) {
2297        return 0;
2298    }
2299
2300    bdrv_inc_in_flight(bs);
2301
2302    int current_gen = bs->write_gen;
2303
2304    /* Wait until any previous flushes are completed */
2305    while (bs->active_flush_req) {
2306        qemu_co_queue_wait(&bs->flush_queue);
2307    }
2308
2309    bs->active_flush_req = true;
2310
2311    /* Write back all layers by calling one driver function */
2312    if (bs->drv->bdrv_co_flush) {
2313        ret = bs->drv->bdrv_co_flush(bs);
2314        goto out;
2315    }
2316
2317    /* Write back cached data to the OS even with cache=unsafe */
2318    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2319    if (bs->drv->bdrv_co_flush_to_os) {
2320        ret = bs->drv->bdrv_co_flush_to_os(bs);
2321        if (ret < 0) {
2322            goto out;
2323        }
2324    }
2325
2326    /* But don't actually force it to the disk with cache=unsafe */
2327    if (bs->open_flags & BDRV_O_NO_FLUSH) {
2328        goto flush_parent;
2329    }
2330
2331    /* Check if we really need to flush anything */
2332    if (bs->flushed_gen == current_gen) {
2333        goto flush_parent;
2334    }
2335
2336    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2337    if (bs->drv->bdrv_co_flush_to_disk) {
2338        ret = bs->drv->bdrv_co_flush_to_disk(bs);
2339    } else if (bs->drv->bdrv_aio_flush) {
2340        BlockAIOCB *acb;
2341        CoroutineIOCompletion co = {
2342            .coroutine = qemu_coroutine_self(),
2343        };
2344
2345        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2346        if (acb == NULL) {
2347            ret = -EIO;
2348        } else {
2349            qemu_coroutine_yield();
2350            ret = co.ret;
2351        }
2352    } else {
2353        /*
2354         * Some block drivers always operate in either writethrough or unsafe
2355         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
2356         * know how the server works (because the behaviour is hardcoded or
2357         * depends on server-side configuration), so we can't ensure that
2358         * everything is safe on disk. Returning an error doesn't work because
2359         * that would break guests even if the server operates in writethrough
2360         * mode.
2361         *
2362         * Let's hope the user knows what he's doing.
2363         */
2364        ret = 0;
2365    }
2366
2367    if (ret < 0) {
2368        goto out;
2369    }
2370
2371    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
2372     * in the case of cache=unsafe, so there are no useless flushes.
2373     */
2374flush_parent:
2375    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2376out:
2377    /* Notify any pending flushes that we have completed */
2378    if (ret == 0) {
2379        bs->flushed_gen = current_gen;
2380    }
2381    bs->active_flush_req = false;
2382    /* Return value is ignored - it's ok if wait queue is empty */
2383    qemu_co_queue_next(&bs->flush_queue);
2384
2385    bdrv_dec_in_flight(bs);
2386    return ret;
2387}
2388
2389int bdrv_flush(BlockDriverState *bs)
2390{
2391    Coroutine *co;
2392    FlushCo flush_co = {
2393        .bs = bs,
2394        .ret = NOT_DONE,
2395    };
2396
2397    if (qemu_in_coroutine()) {
2398        /* Fast-path if already in coroutine context */
2399        bdrv_flush_co_entry(&flush_co);
2400    } else {
2401        co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
2402        qemu_coroutine_enter(co);
2403        BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
2404    }
2405
2406    return flush_co.ret;
2407}
2408
2409typedef struct DiscardCo {
2410    BlockDriverState *bs;
2411    int64_t offset;
2412    int count;
2413    int ret;
2414} DiscardCo;
2415static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
2416{
2417    DiscardCo *rwco = opaque;
2418
2419    rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->count);
2420}
2421
2422int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
2423                                  int count)
2424{
2425    BdrvTrackedRequest req;
2426    int max_pdiscard, ret;
2427    int head, tail, align;
2428
2429    if (!bs->drv) {
2430        return -ENOMEDIUM;
2431    }
2432
2433    ret = bdrv_check_byte_request(bs, offset, count);
2434    if (ret < 0) {
2435        return ret;
2436    } else if (bs->read_only) {
2437        return -EPERM;
2438    }
2439    assert(!(bs->open_flags & BDRV_O_INACTIVE));
2440
2441    /* Do nothing if disabled.  */
2442    if (!(bs->open_flags & BDRV_O_UNMAP)) {
2443        return 0;
2444    }
2445
2446    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
2447        return 0;
2448    }
2449
2450    /* Discard is advisory, but some devices track and coalesce
2451     * unaligned requests, so we must pass everything down rather than
2452     * round here.  Still, most devices will just silently ignore
2453     * unaligned requests (by returning -ENOTSUP), so we must fragment
2454     * the request accordingly.  */
2455    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
2456    assert(align % bs->bl.request_alignment == 0);
2457    head = offset % align;
2458    tail = (offset + count) % align;
2459
2460    bdrv_inc_in_flight(bs);
2461    tracked_request_begin(&req, bs, offset, count, BDRV_TRACKED_DISCARD);
2462
2463    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
2464    if (ret < 0) {
2465        goto out;
2466    }
2467
2468    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
2469                                   align);
2470    assert(max_pdiscard >= bs->bl.request_alignment);
2471
2472    while (count > 0) {
2473        int ret;
2474        int num = count;
2475
2476        if (head) {
2477            /* Make small requests to get to alignment boundaries. */
2478            num = MIN(count, align - head);
2479            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
2480                num %= bs->bl.request_alignment;
2481            }
2482            head = (head + num) % align;
2483            assert(num < max_pdiscard);
2484        } else if (tail) {
2485            if (num > align) {
2486                /* Shorten the request to the last aligned cluster.  */
2487                num -= tail;
2488            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
2489                       tail > bs->bl.request_alignment) {
2490                tail %= bs->bl.request_alignment;
2491                num -= tail;
2492            }
2493        }
2494        /* limit request size */
2495        if (num > max_pdiscard) {
2496            num = max_pdiscard;
2497        }
2498
2499        if (bs->drv->bdrv_co_pdiscard) {
2500            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
2501        } else {
2502            BlockAIOCB *acb;
2503            CoroutineIOCompletion co = {
2504                .coroutine = qemu_coroutine_self(),
2505            };
2506
2507            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
2508                                             bdrv_co_io_em_complete, &co);
2509            if (acb == NULL) {
2510                ret = -EIO;
2511                goto out;
2512            } else {
2513                qemu_coroutine_yield();
2514                ret = co.ret;
2515            }
2516        }
2517        if (ret && ret != -ENOTSUP) {
2518            goto out;
2519        }
2520
2521        offset += num;
2522        count -= num;
2523    }
2524    ret = 0;
2525out:
2526    ++bs->write_gen;
2527    bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS,
2528                   req.bytes >> BDRV_SECTOR_BITS);
2529    tracked_request_end(&req);
2530    bdrv_dec_in_flight(bs);
2531    return ret;
2532}
2533
2534int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count)
2535{
2536    Coroutine *co;
2537    DiscardCo rwco = {
2538        .bs = bs,
2539        .offset = offset,
2540        .count = count,
2541        .ret = NOT_DONE,
2542    };
2543
2544    if (qemu_in_coroutine()) {
2545        /* Fast-path if already in coroutine context */
2546        bdrv_pdiscard_co_entry(&rwco);
2547    } else {
2548        co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
2549        qemu_coroutine_enter(co);
2550        BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
2551    }
2552
2553    return rwco.ret;
2554}
2555
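/*
 * Illustrative usage sketch (not used anywhere in this file): discard a byte
 * range.  Splitting against bl.pdiscard_alignment and bl.max_pdiscard is done
 * by bdrv_co_pdiscard() above, and -ENOTSUP from the driver is ignored
 * because discard is only advisory.  The helper name and the 1 MiB range are
 * made up.
 */
static int example_discard_first_mib(BlockDriverState *bs)
{
    /* Returns 0 without touching the image if the node was opened without
     * BDRV_O_UNMAP or the driver has no discard callback at all. */
    return bdrv_pdiscard(bs, 0, 1 * 1024 * 1024);
}
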
2556int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
2557{
2558    BlockDriver *drv = bs->drv;
2559    CoroutineIOCompletion co = {
2560        .coroutine = qemu_coroutine_self(),
2561    };
2562    BlockAIOCB *acb;
2563
2564    bdrv_inc_in_flight(bs);
2565    if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
2566        co.ret = -ENOTSUP;
2567        goto out;
2568    }
2569
2570    if (drv->bdrv_co_ioctl) {
2571        co.ret = drv->bdrv_co_ioctl(bs, req, buf);
2572    } else {
2573        acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
2574        if (!acb) {
2575            co.ret = -ENOTSUP;
2576            goto out;
2577        }
2578        qemu_coroutine_yield();
2579    }
2580out:
2581    bdrv_dec_in_flight(bs);
2582    return co.ret;
2583}
2584
2585void *qemu_blockalign(BlockDriverState *bs, size_t size)
2586{
2587    return qemu_memalign(bdrv_opt_mem_align(bs), size);
2588}
2589
2590void *qemu_blockalign0(BlockDriverState *bs, size_t size)
2591{
2592    return memset(qemu_blockalign(bs, size), 0, size);
2593}
2594
2595void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
2596{
2597    size_t align = bdrv_opt_mem_align(bs);
2598
2599    /* Ensure that NULL is never returned on success */
2600    assert(align > 0);
2601    if (size == 0) {
2602        size = align;
2603    }
2604
2605    return qemu_try_memalign(align, size);
2606}
2607
2608void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
2609{
2610    void *mem = qemu_try_blockalign(bs, size);
2611
2612    if (mem) {
2613        memset(mem, 0, size);
2614    }
2615
2616    return mem;
2617}
2618
2619/*
2620 * Check if all memory in this vector meets the minimum memory alignment.
2621 */
2622bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2623{
2624    int i;
2625    size_t alignment = bdrv_min_mem_align(bs);
2626
2627    for (i = 0; i < qiov->niov; i++) {
2628        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2629            return false;
2630        }
2631        if (qiov->iov[i].iov_len % alignment) {
2632            return false;
2633        }
2634    }
2635
2636    return true;
2637}
2638
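/*
 * Illustrative usage sketch (not used anywhere in this file): build a
 * single-element I/O vector from a bounce buffer allocated with
 * qemu_blockalign(), so that bdrv_qiov_is_aligned() accepts it.  This assumes
 * the optimal memory alignment is at least as strict as the minimum one,
 * which holds for the in-tree drivers.  Helper name and size are made up.
 */
static bool example_build_aligned_qiov(BlockDriverState *bs)
{
    QEMUIOVector qiov;
    struct iovec iov;
    size_t len = 64 * 1024;
    void *buf = qemu_blockalign(bs, len);
    bool ok;

    iov = (struct iovec) {
        .iov_base   = buf,
        .iov_len    = len,
    };
    qemu_iovec_init_external(&qiov, &iov, 1);

    ok = bdrv_qiov_is_aligned(bs, &qiov);
    qemu_vfree(buf);
    return ok;
}
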
2639void bdrv_add_before_write_notifier(BlockDriverState *bs,
2640                                    NotifierWithReturn *notifier)
2641{
2642    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
2643}
2644
2645void bdrv_io_plug(BlockDriverState *bs)
2646{
2647    BdrvChild *child;
2648
2649    QLIST_FOREACH(child, &bs->children, next) {
2650        bdrv_io_plug(child->bs);
2651    }
2652
2653    if (bs->io_plugged++ == 0 && bs->io_plug_disabled == 0) {
2654        BlockDriver *drv = bs->drv;
2655        if (drv && drv->bdrv_io_plug) {
2656            drv->bdrv_io_plug(bs);
2657        }
2658    }
2659}
2660
2661void bdrv_io_unplug(BlockDriverState *bs)
2662{
2663    BdrvChild *child;
2664
2665    assert(bs->io_plugged);
2666    if (--bs->io_plugged == 0 && bs->io_plug_disabled == 0) {
2667        BlockDriver *drv = bs->drv;
2668        if (drv && drv->bdrv_io_unplug) {
2669            drv->bdrv_io_unplug(bs);
2670        }
2671    }
2672
2673    QLIST_FOREACH(child, &bs->children, next) {
2674        bdrv_io_unplug(child->bs);
2675    }
2676}
2677
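/*
 * Illustrative usage sketch (not used anywhere in this file): batch several
 * requests between plug and unplug so that drivers with plug support (e.g.
 * the Linux AIO backend) can submit them to the host in one go.  The sector
 * numbers are made up, and each qiov must describe exactly
 * 16 * BDRV_SECTOR_SIZE bytes to satisfy bdrv_aio_readv()'s assertion.
 */
static void example_batched_reads(BdrvChild *child, QEMUIOVector *qiovs[3],
                                  BlockCompletionFunc *cb, void *opaque)
{
    int i;

    bdrv_io_plug(child->bs);
    for (i = 0; i < 3; i++) {
        /* Queued by the driver while the node is plugged */
        bdrv_aio_readv(child, i * 16, qiovs[i], 16, cb, opaque);
    }
    /* Unplugging submits the accumulated batch */
    bdrv_io_unplug(child->bs);
}
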
2678void bdrv_io_unplugged_begin(BlockDriverState *bs)
2679{
2680    BdrvChild *child;
2681
2682    if (bs->io_plug_disabled++ == 0 && bs->io_plugged > 0) {
2683        BlockDriver *drv = bs->drv;
2684        if (drv && drv->bdrv_io_unplug) {
2685            drv->bdrv_io_unplug(bs);
2686        }
2687    }
2688
2689    QLIST_FOREACH(child, &bs->children, next) {
2690        bdrv_io_unplugged_begin(child->bs);
2691    }
2692}
2693
2694void bdrv_io_unplugged_end(BlockDriverState *bs)
2695{
2696    BdrvChild *child;
2697
2698    assert(bs->io_plug_disabled);
2699    QLIST_FOREACH(child, &bs->children, next) {
2700        bdrv_io_unplugged_end(child->bs);
2701    }
2702
2703    if (--bs->io_plug_disabled == 0 && bs->io_plugged > 0) {
2704        BlockDriver *drv = bs->drv;
2705        if (drv && drv->bdrv_io_plug) {
2706            drv->bdrv_io_plug(bs);
2707        }
2708    }
2709}
2710