qemu/block/io.c
   1/*
   2 * Block layer I/O functions
   3 *
   4 * Copyright (c) 2003 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "trace.h"
  27#include "sysemu/block-backend.h"
  28#include "block/blockjob.h"
  29#include "block/block_int.h"
  30#include "qemu/cutils.h"
  31#include "qapi/error.h"
  32#include "qemu/error-report.h"
  33
  34#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
  35
  36static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
  37                                          int64_t offset,
  38                                          QEMUIOVector *qiov,
  39                                          BdrvRequestFlags flags,
  40                                          BlockCompletionFunc *cb,
  41                                          void *opaque,
  42                                          bool is_write);
  43static void coroutine_fn bdrv_co_do_rw(void *opaque);
  44static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
  45    int64_t offset, int count, BdrvRequestFlags flags);
  46
  47static void bdrv_parent_drained_begin(BlockDriverState *bs)
  48{
  49    BdrvChild *c;
  50
  51    QLIST_FOREACH(c, &bs->parents, next_parent) {
  52        if (c->role->drained_begin) {
  53            c->role->drained_begin(c);
  54        }
  55    }
  56}
  57
  58static void bdrv_parent_drained_end(BlockDriverState *bs)
  59{
  60    BdrvChild *c;
  61
  62    QLIST_FOREACH(c, &bs->parents, next_parent) {
  63        if (c->role->drained_end) {
  64            c->role->drained_end(c);
  65        }
  66    }
  67}
  68
  69static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
  70{
  71    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
  72    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
  73    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
  74                                 src->opt_mem_alignment);
  75    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
  76                                 src->min_mem_alignment);
  77    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
  78}
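/*
 * Note on the merge rules above: the maxima (max_transfer, max_iov) use
 * MIN_NON_ZERO so that a value of 0, meaning "no limit", in one source does
 * not hide a real limit in the other, while the alignment fields take the
 * larger of the two values, which satisfies both children as long as the
 * alignments are powers of two.
 */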
  79
  80void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
  81{
  82    BlockDriver *drv = bs->drv;
  83    Error *local_err = NULL;
  84
  85    memset(&bs->bl, 0, sizeof(bs->bl));
  86
  87    if (!drv) {
  88        return;
  89    }
  90
  91    /* Default alignment based on whether driver has byte interface */
  92    bs->bl.request_alignment = drv->bdrv_co_preadv ? 1 : 512;
  93
  94    /* Take some limits from the children as a default */
  95    if (bs->file) {
  96        bdrv_refresh_limits(bs->file->bs, &local_err);
  97        if (local_err) {
  98            error_propagate(errp, local_err);
  99            return;
 100        }
 101        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
 102    } else {
 103        bs->bl.min_mem_alignment = 512;
 104        bs->bl.opt_mem_alignment = getpagesize();
 105
 106        /* Safe default since most protocols use readv()/writev()/etc */
 107        bs->bl.max_iov = IOV_MAX;
 108    }
 109
 110    if (bs->backing) {
 111        bdrv_refresh_limits(bs->backing->bs, &local_err);
 112        if (local_err) {
 113            error_propagate(errp, local_err);
 114            return;
 115        }
 116        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
 117    }
 118
 119    /* Then let the driver override it */
 120    if (drv->bdrv_refresh_limits) {
 121        drv->bdrv_refresh_limits(bs, errp);
 122    }
 123}
 124
 125/**
 126 * The copy-on-read flag is actually a reference count so multiple users may
 127 * use the feature without worrying about clobbering its previous state.
 128 * Copy-on-read stays enabled until all users have called to disable it.
 129 */
 130void bdrv_enable_copy_on_read(BlockDriverState *bs)
 131{
 132    bs->copy_on_read++;
 133}
 134
 135void bdrv_disable_copy_on_read(BlockDriverState *bs)
 136{
 137    assert(bs->copy_on_read > 0);
 138    bs->copy_on_read--;
 139}
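/*
 * Illustrative pairing of the two helpers above (assumed caller code, not
 * taken from this file): a user that wants copy-on-read semantics brackets
 * its work with the enable/disable calls, and nesting is safe because the
 * flag is a reference count:
 *
 *     bdrv_enable_copy_on_read(bs);
 *     ... reads issued now may get BDRV_REQ_COPY_ON_READ treatment ...
 *     bdrv_disable_copy_on_read(bs);
 */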
 140
 141/* Check if any requests are in-flight (including throttled requests) */
 142bool bdrv_requests_pending(BlockDriverState *bs)
 143{
 144    BdrvChild *child;
 145
 146    if (!QLIST_EMPTY(&bs->tracked_requests)) {
 147        return true;
 148    }
 149
 150    QLIST_FOREACH(child, &bs->children, next) {
 151        if (bdrv_requests_pending(child->bs)) {
 152            return true;
 153        }
 154    }
 155
 156    return false;
 157}
 158
 159static void bdrv_drain_recurse(BlockDriverState *bs)
 160{
 161    BdrvChild *child;
 162
 163    if (bs->drv && bs->drv->bdrv_drain) {
 164        bs->drv->bdrv_drain(bs);
 165    }
 166    QLIST_FOREACH(child, &bs->children, next) {
 167        bdrv_drain_recurse(child->bs);
 168    }
 169}
 170
 171typedef struct {
 172    Coroutine *co;
 173    BlockDriverState *bs;
 174    QEMUBH *bh;
 175    bool done;
 176} BdrvCoDrainData;
 177
 178static void bdrv_drain_poll(BlockDriverState *bs)
 179{
 180    bool busy = true;
 181
 182    while (busy) {
 183        /* Keep iterating */
 184        busy = bdrv_requests_pending(bs);
 185        busy |= aio_poll(bdrv_get_aio_context(bs), busy);
 186    }
 187}
 188
 189static void bdrv_co_drain_bh_cb(void *opaque)
 190{
 191    BdrvCoDrainData *data = opaque;
 192    Coroutine *co = data->co;
 193
 194    qemu_bh_delete(data->bh);
 195    bdrv_drain_poll(data->bs);
 196    data->done = true;
 197    qemu_coroutine_enter(co);
 198}
 199
 200static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
 201{
 202    BdrvCoDrainData data;
 203
 204    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
 205     * other coroutines run if they were queued from
 206     * qemu_co_queue_run_restart(). */
 207
 208    assert(qemu_in_coroutine());
 209    data = (BdrvCoDrainData) {
 210        .co = qemu_coroutine_self(),
 211        .bs = bs,
 212        .done = false,
 213        .bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_drain_bh_cb, &data),
 214    };
 215    qemu_bh_schedule(data.bh);
 216
 217    qemu_coroutine_yield();
 218    /* If we are resumed from some other event (such as an aio completion or a
 219     * timer callback), it is a bug in the caller that should be fixed. */
 220    assert(data.done);
 221}
 222
 223void bdrv_drained_begin(BlockDriverState *bs)
 224{
 225    if (!bs->quiesce_counter++) {
 226        aio_disable_external(bdrv_get_aio_context(bs));
 227        bdrv_parent_drained_begin(bs);
 228    }
 229
 230    bdrv_io_unplugged_begin(bs);
 231    bdrv_drain_recurse(bs);
 232    if (qemu_in_coroutine()) {
 233        bdrv_co_yield_to_drain(bs);
 234    } else {
 235        bdrv_drain_poll(bs);
 236    }
 237    bdrv_io_unplugged_end(bs);
 238}
 239
 240void bdrv_drained_end(BlockDriverState *bs)
 241{
 242    assert(bs->quiesce_counter > 0);
 243    if (--bs->quiesce_counter > 0) {
 244        return;
 245    }
 246
 247    bdrv_parent_drained_end(bs);
 248    aio_enable_external(bdrv_get_aio_context(bs));
 249}
 250
 251/*
 252 * Wait for pending requests to complete on a single BlockDriverState subtree,
 253 * and suspend block driver's internal I/O until next request arrives.
 254 *
  255 * Note that unlike bdrv_drain_all(), the caller must hold the AioContext of
  256 * the BlockDriverState.
 257 *
 258 * Only this BlockDriverState's AioContext is run, so in-flight requests must
 259 * not depend on events in other AioContexts.  In that case, use
 260 * bdrv_drain_all() instead.
 261 */
 262void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
 263{
 264    assert(qemu_in_coroutine());
 265    bdrv_drained_begin(bs);
 266    bdrv_drained_end(bs);
 267}
 268
 269void bdrv_drain(BlockDriverState *bs)
 270{
 271    bdrv_drained_begin(bs);
 272    bdrv_drained_end(bs);
 273}
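/*
 * Sketch of how callers typically use the drained-section primitives above
 * (assumed caller code, not part of this file): wrap a critical section so
 * that no new parent requests are submitted while it runs.
 *
 *     bdrv_drained_begin(bs);
 *     ... graph manipulation or other work requiring a quiescent bs ...
 *     bdrv_drained_end(bs);
 *
 * bdrv_drain() is the degenerate case with an empty critical section.
 */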
 274
 275/*
 276 * Wait for pending requests to complete across all BlockDriverStates
 277 *
 278 * This function does not flush data to disk, use bdrv_flush_all() for that
 279 * after calling this function.
 280 */
 281void bdrv_drain_all(void)
 282{
 283    /* Always run first iteration so any pending completion BHs run */
 284    bool busy = true;
 285    BlockDriverState *bs;
 286    BdrvNextIterator it;
 287    BlockJob *job = NULL;
 288    GSList *aio_ctxs = NULL, *ctx;
 289
 290    while ((job = block_job_next(job))) {
 291        AioContext *aio_context = blk_get_aio_context(job->blk);
 292
 293        aio_context_acquire(aio_context);
 294        block_job_pause(job);
 295        aio_context_release(aio_context);
 296    }
 297
 298    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 299        AioContext *aio_context = bdrv_get_aio_context(bs);
 300
 301        aio_context_acquire(aio_context);
 302        bdrv_parent_drained_begin(bs);
 303        bdrv_io_unplugged_begin(bs);
 304        bdrv_drain_recurse(bs);
 305        aio_context_release(aio_context);
 306
 307        if (!g_slist_find(aio_ctxs, aio_context)) {
 308            aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
 309        }
 310    }
 311
 312    /* Note that completion of an asynchronous I/O operation can trigger any
 313     * number of other I/O operations on other devices---for example a
 314     * coroutine can submit an I/O request to another device in response to
 315     * request completion.  Therefore we must keep looping until there was no
 316     * more activity rather than simply draining each device independently.
 317     */
 318    while (busy) {
 319        busy = false;
 320
 321        for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
 322            AioContext *aio_context = ctx->data;
 323
 324            aio_context_acquire(aio_context);
 325            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 326                if (aio_context == bdrv_get_aio_context(bs)) {
 327                    if (bdrv_requests_pending(bs)) {
 328                        busy = true;
 329                        aio_poll(aio_context, busy);
 330                    }
 331                }
 332            }
 333            busy |= aio_poll(aio_context, false);
 334            aio_context_release(aio_context);
 335        }
 336    }
 337
 338    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 339        AioContext *aio_context = bdrv_get_aio_context(bs);
 340
 341        aio_context_acquire(aio_context);
 342        bdrv_io_unplugged_end(bs);
 343        bdrv_parent_drained_end(bs);
 344        aio_context_release(aio_context);
 345    }
 346    g_slist_free(aio_ctxs);
 347
 348    job = NULL;
 349    while ((job = block_job_next(job))) {
 350        AioContext *aio_context = blk_get_aio_context(job->blk);
 351
 352        aio_context_acquire(aio_context);
 353        block_job_resume(job);
 354        aio_context_release(aio_context);
 355    }
 356}
 357
 358/**
 359 * Remove an active request from the tracked requests list
 360 *
 361 * This function should be called when a tracked request is completing.
 362 */
 363static void tracked_request_end(BdrvTrackedRequest *req)
 364{
 365    if (req->serialising) {
 366        req->bs->serialising_in_flight--;
 367    }
 368
 369    QLIST_REMOVE(req, list);
 370    qemu_co_queue_restart_all(&req->wait_queue);
 371}
 372
 373/**
 374 * Add an active request to the tracked requests list
 375 */
 376static void tracked_request_begin(BdrvTrackedRequest *req,
 377                                  BlockDriverState *bs,
 378                                  int64_t offset,
 379                                  unsigned int bytes,
 380                                  enum BdrvTrackedRequestType type)
 381{
 382    *req = (BdrvTrackedRequest){
 383        .bs = bs,
 384        .offset         = offset,
 385        .bytes          = bytes,
 386        .type           = type,
 387        .co             = qemu_coroutine_self(),
 388        .serialising    = false,
 389        .overlap_offset = offset,
 390        .overlap_bytes  = bytes,
 391    };
 392
 393    qemu_co_queue_init(&req->wait_queue);
 394
 395    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
 396}
 397
 398static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
 399{
 400    int64_t overlap_offset = req->offset & ~(align - 1);
 401    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
 402                               - overlap_offset;
 403
 404    if (!req->serialising) {
 405        req->bs->serialising_in_flight++;
 406        req->serialising = true;
 407    }
 408
 409    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
 410    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
 411}
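/*
 * Example of the rounding above, assuming align = 4096: a request with
 * offset = 5000 and bytes = 100 is tracked with overlap_offset = 4096 and
 * overlap_bytes = 4096, so overlap checks happen at alignment granularity
 * rather than byte granularity.
 */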
 412
 413/**
 414 * Round a region to cluster boundaries (sector-based)
 415 */
 416void bdrv_round_sectors_to_clusters(BlockDriverState *bs,
 417                                    int64_t sector_num, int nb_sectors,
 418                                    int64_t *cluster_sector_num,
 419                                    int *cluster_nb_sectors)
 420{
 421    BlockDriverInfo bdi;
 422
 423    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
 424        *cluster_sector_num = sector_num;
 425        *cluster_nb_sectors = nb_sectors;
 426    } else {
 427        int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
 428        *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
 429        *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
 430                                            nb_sectors, c);
 431    }
 432}
 433
 434/**
 435 * Round a region to cluster boundaries
 436 */
 437void bdrv_round_to_clusters(BlockDriverState *bs,
 438                            int64_t offset, unsigned int bytes,
 439                            int64_t *cluster_offset,
 440                            unsigned int *cluster_bytes)
 441{
 442    BlockDriverInfo bdi;
 443
 444    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
 445        *cluster_offset = offset;
 446        *cluster_bytes = bytes;
 447    } else {
 448        int64_t c = bdi.cluster_size;
 449        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
 450        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
 451    }
 452}
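/*
 * Worked example for the byte-based variant above, assuming a 64 KiB
 * cluster size: offset = 65000 and bytes = 3000 yield cluster_offset = 0
 * and cluster_bytes = 131072, i.e. the region is widened to cover both
 * clusters it touches.
 */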
 453
 454static int bdrv_get_cluster_size(BlockDriverState *bs)
 455{
 456    BlockDriverInfo bdi;
 457    int ret;
 458
 459    ret = bdrv_get_info(bs, &bdi);
 460    if (ret < 0 || bdi.cluster_size == 0) {
 461        return bs->bl.request_alignment;
 462    } else {
 463        return bdi.cluster_size;
 464    }
 465}
 466
 467static bool tracked_request_overlaps(BdrvTrackedRequest *req,
 468                                     int64_t offset, unsigned int bytes)
 469{
 470    /*        aaaa   bbbb */
 471    if (offset >= req->overlap_offset + req->overlap_bytes) {
 472        return false;
 473    }
 474    /* bbbb   aaaa        */
 475    if (req->overlap_offset >= offset + bytes) {
 476        return false;
 477    }
 478    return true;
 479}
 480
 481static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
 482{
 483    BlockDriverState *bs = self->bs;
 484    BdrvTrackedRequest *req;
 485    bool retry;
 486    bool waited = false;
 487
 488    if (!bs->serialising_in_flight) {
 489        return false;
 490    }
 491
 492    do {
 493        retry = false;
 494        QLIST_FOREACH(req, &bs->tracked_requests, list) {
 495            if (req == self || (!req->serialising && !self->serialising)) {
 496                continue;
 497            }
 498            if (tracked_request_overlaps(req, self->overlap_offset,
 499                                         self->overlap_bytes))
 500            {
 501                /* Hitting this means there was a reentrant request, for
 502                 * example, a block driver issuing nested requests.  This must
 503                 * never happen since it means deadlock.
 504                 */
 505                assert(qemu_coroutine_self() != req->co);
 506
 507                /* If the request is already (indirectly) waiting for us, or
 508                 * will wait for us as soon as it wakes up, then just go on
 509                 * (instead of producing a deadlock in the former case). */
 510                if (!req->waiting_for) {
 511                    self->waiting_for = req;
 512                    qemu_co_queue_wait(&req->wait_queue);
 513                    self->waiting_for = NULL;
 514                    retry = true;
 515                    waited = true;
 516                    break;
 517                }
 518            }
 519        }
 520    } while (retry);
 521
 522    return waited;
 523}
 524
 525static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
 526                                   size_t size)
 527{
 528    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
 529        return -EIO;
 530    }
 531
 532    if (!bdrv_is_inserted(bs)) {
 533        return -ENOMEDIUM;
 534    }
 535
 536    if (offset < 0) {
 537        return -EIO;
 538    }
 539
 540    return 0;
 541}
 542
 543static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
 544                              int nb_sectors)
 545{
 546    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
 547        return -EIO;
 548    }
 549
 550    return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE,
 551                                   nb_sectors * BDRV_SECTOR_SIZE);
 552}
 553
 554typedef struct RwCo {
 555    BdrvChild *child;
 556    int64_t offset;
 557    QEMUIOVector *qiov;
 558    bool is_write;
 559    int ret;
 560    BdrvRequestFlags flags;
 561} RwCo;
 562
 563static void coroutine_fn bdrv_rw_co_entry(void *opaque)
 564{
 565    RwCo *rwco = opaque;
 566
 567    if (!rwco->is_write) {
 568        rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
 569                                   rwco->qiov->size, rwco->qiov,
 570                                   rwco->flags);
 571    } else {
 572        rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
 573                                    rwco->qiov->size, rwco->qiov,
 574                                    rwco->flags);
 575    }
 576}
 577
 578/*
 579 * Process a vectored synchronous request using coroutines
 580 */
 581static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
 582                        QEMUIOVector *qiov, bool is_write,
 583                        BdrvRequestFlags flags)
 584{
 585    Coroutine *co;
 586    RwCo rwco = {
 587        .child = child,
 588        .offset = offset,
 589        .qiov = qiov,
 590        .is_write = is_write,
 591        .ret = NOT_DONE,
 592        .flags = flags,
 593    };
 594
 595    if (qemu_in_coroutine()) {
 596        /* Fast-path if already in coroutine context */
 597        bdrv_rw_co_entry(&rwco);
 598    } else {
 599        AioContext *aio_context = bdrv_get_aio_context(child->bs);
 600
 601        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
 602        qemu_coroutine_enter(co);
 603        while (rwco.ret == NOT_DONE) {
 604            aio_poll(aio_context, true);
 605        }
 606    }
 607    return rwco.ret;
 608}
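/*
 * Note on the two paths above: in coroutine context the request has
 * completed by the time the direct call to bdrv_rw_co_entry() returns, so
 * rwco.ret is already set; otherwise a new coroutine is entered and the
 * caller loops in aio_poll() until the entry function stores a result other
 * than NOT_DONE.
 */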
 609
 610/*
 611 * Process a synchronous request using coroutines
 612 */
 613static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf,
 614                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
 615{
 616    QEMUIOVector qiov;
 617    struct iovec iov = {
 618        .iov_base = (void *)buf,
 619        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
 620    };
 621
 622    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
 623        return -EINVAL;
 624    }
 625
 626    qemu_iovec_init_external(&qiov, &iov, 1);
 627    return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS,
 628                        &qiov, is_write, flags);
 629}
 630
 631/* return < 0 if error. See bdrv_write() for the return codes */
 632int bdrv_read(BdrvChild *child, int64_t sector_num,
 633              uint8_t *buf, int nb_sectors)
 634{
 635    return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0);
 636}
 637
 638/* Return < 0 if error. Important errors are:
 639  -EIO         generic I/O error (may happen for all errors)
 640  -ENOMEDIUM   No media inserted.
 641  -EINVAL      Invalid sector number or nb_sectors
 642  -EACCES      Trying to write a read-only device
 643*/
 644int bdrv_write(BdrvChild *child, int64_t sector_num,
 645               const uint8_t *buf, int nb_sectors)
 646{
 647    return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
 648}
 649
 650int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
 651                       int count, BdrvRequestFlags flags)
 652{
 653    QEMUIOVector qiov;
 654    struct iovec iov = {
 655        .iov_base = NULL,
 656        .iov_len = count,
 657    };
 658
 659    qemu_iovec_init_external(&qiov, &iov, 1);
 660    return bdrv_prwv_co(child, offset, &qiov, true,
 661                        BDRV_REQ_ZERO_WRITE | flags);
 662}
 663
 664/*
 665 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 666 * The operation is sped up by checking the block status and only writing
 667 * zeroes to the device if they currently do not return zeroes. Optional
 668 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 669 * BDRV_REQ_FUA).
 670 *
 671 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 672 */
 673int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
 674{
 675    int64_t target_sectors, ret, nb_sectors, sector_num = 0;
 676    BlockDriverState *bs = child->bs;
 677    BlockDriverState *file;
 678    int n;
 679
 680    target_sectors = bdrv_nb_sectors(bs);
 681    if (target_sectors < 0) {
 682        return target_sectors;
 683    }
 684
 685    for (;;) {
 686        nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
 687        if (nb_sectors <= 0) {
 688            return 0;
 689        }
 690        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file);
 691        if (ret < 0) {
 692            error_report("error getting block status at sector %" PRId64 ": %s",
 693                         sector_num, strerror(-ret));
 694            return ret;
 695        }
 696        if (ret & BDRV_BLOCK_ZERO) {
 697            sector_num += n;
 698            continue;
 699        }
 700        ret = bdrv_pwrite_zeroes(child, sector_num << BDRV_SECTOR_BITS,
 701                                 n << BDRV_SECTOR_BITS, flags);
 702        if (ret < 0) {
 703            error_report("error writing zeroes at sector %" PRId64 ": %s",
 704                         sector_num, strerror(-ret));
 705            return ret;
 706        }
 707        sector_num += n;
 708    }
 709}
 710
 711int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
 712{
 713    int ret;
 714
 715    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
 716    if (ret < 0) {
 717        return ret;
 718    }
 719
 720    return qiov->size;
 721}
 722
 723int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
 724{
 725    QEMUIOVector qiov;
 726    struct iovec iov = {
 727        .iov_base = (void *)buf,
 728        .iov_len = bytes,
 729    };
 730
 731    if (bytes < 0) {
 732        return -EINVAL;
 733    }
 734
 735    qemu_iovec_init_external(&qiov, &iov, 1);
 736    return bdrv_preadv(child, offset, &qiov);
 737}
 738
 739int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
 740{
 741    int ret;
 742
 743    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
 744    if (ret < 0) {
 745        return ret;
 746    }
 747
 748    return qiov->size;
 749}
 750
 751int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
 752{
 753    QEMUIOVector qiov;
 754    struct iovec iov = {
 755        .iov_base   = (void *) buf,
 756        .iov_len    = bytes,
 757    };
 758
 759    if (bytes < 0) {
 760        return -EINVAL;
 761    }
 762
 763    qemu_iovec_init_external(&qiov, &iov, 1);
 764    return bdrv_pwritev(child, offset, &qiov);
 765}
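/*
 * Illustrative byte-level usage of bdrv_pread()/bdrv_pwrite() (assumed
 * caller code, not part of this file); both wrap the buffer in a
 * one-element QEMUIOVector and go through the same coroutine path as the
 * vectored helpers:
 *
 *     uint8_t buf[512];
 *     if (bdrv_pread(child, 0, buf, sizeof(buf)) < 0) {
 *         ... handle error ...
 *     }
 *     buf[0] ^= 1;
 *     if (bdrv_pwrite(child, 0, buf, sizeof(buf)) < 0) {
 *         ... handle error ...
 *     }
 */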
 766
 767/*
 768 * Writes to the file and ensures that no writes are reordered across this
 769 * request (acts as a barrier)
 770 *
 771 * Returns 0 on success, -errno in error cases.
 772 */
 773int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
 774                     const void *buf, int count)
 775{
 776    int ret;
 777
 778    ret = bdrv_pwrite(child, offset, buf, count);
 779    if (ret < 0) {
 780        return ret;
 781    }
 782
 783    ret = bdrv_flush(child->bs);
 784    if (ret < 0) {
 785        return ret;
 786    }
 787
 788    return 0;
 789}
 790
 791typedef struct CoroutineIOCompletion {
 792    Coroutine *coroutine;
 793    int ret;
 794} CoroutineIOCompletion;
 795
 796static void bdrv_co_io_em_complete(void *opaque, int ret)
 797{
 798    CoroutineIOCompletion *co = opaque;
 799
 800    co->ret = ret;
 801    qemu_coroutine_enter(co->coroutine);
 802}
 803
 804static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
 805                                           uint64_t offset, uint64_t bytes,
 806                                           QEMUIOVector *qiov, int flags)
 807{
 808    BlockDriver *drv = bs->drv;
 809    int64_t sector_num;
 810    unsigned int nb_sectors;
 811
 812    assert(!(flags & ~BDRV_REQ_MASK));
 813
 814    if (drv->bdrv_co_preadv) {
 815        return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
 816    }
 817
 818    sector_num = offset >> BDRV_SECTOR_BITS;
 819    nb_sectors = bytes >> BDRV_SECTOR_BITS;
 820
 821    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
 822    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 823    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
 824
 825    if (drv->bdrv_co_readv) {
 826        return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
 827    } else {
 828        BlockAIOCB *acb;
 829        CoroutineIOCompletion co = {
 830            .coroutine = qemu_coroutine_self(),
 831        };
 832
 833        acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
 834                                      bdrv_co_io_em_complete, &co);
 835        if (acb == NULL) {
 836            return -EIO;
 837        } else {
 838            qemu_coroutine_yield();
 839            return co.ret;
 840        }
 841    }
 842}
 843
 844static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
 845                                            uint64_t offset, uint64_t bytes,
 846                                            QEMUIOVector *qiov, int flags)
 847{
 848    BlockDriver *drv = bs->drv;
 849    int64_t sector_num;
 850    unsigned int nb_sectors;
 851    int ret;
 852
 853    assert(!(flags & ~BDRV_REQ_MASK));
 854
 855    if (drv->bdrv_co_pwritev) {
 856        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
 857                                   flags & bs->supported_write_flags);
 858        flags &= ~bs->supported_write_flags;
 859        goto emulate_flags;
 860    }
 861
 862    sector_num = offset >> BDRV_SECTOR_BITS;
 863    nb_sectors = bytes >> BDRV_SECTOR_BITS;
 864
 865    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
 866    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 867    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
 868
 869    if (drv->bdrv_co_writev_flags) {
 870        ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
 871                                        flags & bs->supported_write_flags);
 872        flags &= ~bs->supported_write_flags;
 873    } else if (drv->bdrv_co_writev) {
 874        assert(!bs->supported_write_flags);
 875        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
 876    } else {
 877        BlockAIOCB *acb;
 878        CoroutineIOCompletion co = {
 879            .coroutine = qemu_coroutine_self(),
 880        };
 881
 882        acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
 883                                       bdrv_co_io_em_complete, &co);
 884        if (acb == NULL) {
 885            ret = -EIO;
 886        } else {
 887            qemu_coroutine_yield();
 888            ret = co.ret;
 889        }
 890    }
 891
 892emulate_flags:
 893    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
 894        ret = bdrv_co_flush(bs);
 895    }
 896
 897    return ret;
 898}
 899
 900static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs,
 901        int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
 902{
 903    /* Perform I/O through a temporary buffer so that users who scribble over
 904     * their read buffer while the operation is in progress do not end up
 905     * modifying the image file.  This is critical for zero-copy guest I/O
 906     * where anything might happen inside guest memory.
 907     */
 908    void *bounce_buffer;
 909
 910    BlockDriver *drv = bs->drv;
 911    struct iovec iov;
 912    QEMUIOVector bounce_qiov;
 913    int64_t cluster_offset;
 914    unsigned int cluster_bytes;
 915    size_t skip_bytes;
 916    int ret;
 917
  918    /* Cover the entire cluster so that no additional backing file I/O is
  919     * required when allocating the cluster in the image file.
  920     */
 921    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
 922
 923    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
 924                                   cluster_offset, cluster_bytes);
 925
 926    iov.iov_len = cluster_bytes;
 927    iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
 928    if (bounce_buffer == NULL) {
 929        ret = -ENOMEM;
 930        goto err;
 931    }
 932
 933    qemu_iovec_init_external(&bounce_qiov, &iov, 1);
 934
 935    ret = bdrv_driver_preadv(bs, cluster_offset, cluster_bytes,
 936                             &bounce_qiov, 0);
 937    if (ret < 0) {
 938        goto err;
 939    }
 940
 941    if (drv->bdrv_co_pwrite_zeroes &&
 942        buffer_is_zero(bounce_buffer, iov.iov_len)) {
 943        /* FIXME: Should we (perhaps conditionally) be setting
 944         * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
 945         * that still correctly reads as zero? */
 946        ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, cluster_bytes, 0);
 947    } else {
  948        /* This does not change the data on the disk, so it is not necessary
 949         * to flush even in cache=writethrough mode.
 950         */
 951        ret = bdrv_driver_pwritev(bs, cluster_offset, cluster_bytes,
 952                                  &bounce_qiov, 0);
 953    }
 954
 955    if (ret < 0) {
 956        /* It might be okay to ignore write errors for guest requests.  If this
 957         * is a deliberate copy-on-read then we don't want to ignore the error.
 958         * Simply report it in all cases.
 959         */
 960        goto err;
 961    }
 962
 963    skip_bytes = offset - cluster_offset;
 964    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, bytes);
 965
 966err:
 967    qemu_vfree(bounce_buffer);
 968    return ret;
 969}
 970
 971/*
 972 * Forwards an already correctly aligned request to the BlockDriver. This
 973 * handles copy on read, zeroing after EOF, and fragmentation of large
 974 * reads; any other features must be implemented by the caller.
 975 */
 976static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs,
 977    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
 978    int64_t align, QEMUIOVector *qiov, int flags)
 979{
 980    int64_t total_bytes, max_bytes;
 981    int ret = 0;
 982    uint64_t bytes_remaining = bytes;
 983    int max_transfer;
 984
 985    assert(is_power_of_2(align));
 986    assert((offset & (align - 1)) == 0);
 987    assert((bytes & (align - 1)) == 0);
 988    assert(!qiov || bytes == qiov->size);
 989    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
 990    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
 991                                   align);
 992
 993    /* TODO: We would need a per-BDS .supported_read_flags and
 994     * potential fallback support, if we ever implement any read flags
 995     * to pass through to drivers.  For now, there aren't any
 996     * passthrough flags.  */
 997    assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));
 998
 999    /* Handle Copy on Read and associated serialisation */
1000    if (flags & BDRV_REQ_COPY_ON_READ) {
1001        /* If we touch the same cluster it counts as an overlap.  This
1002         * guarantees that allocating writes will be serialized and not race
1003         * with each other for the same cluster.  For example, in copy-on-read
1004         * it ensures that the CoR read and write operations are atomic and
1005         * guest writes cannot interleave between them. */
1006        mark_request_serialising(req, bdrv_get_cluster_size(bs));
1007    }
1008
1009    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
1010        wait_serialising_requests(req);
1011    }
1012
1013    if (flags & BDRV_REQ_COPY_ON_READ) {
1014        int64_t start_sector = offset >> BDRV_SECTOR_BITS;
1015        int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1016        unsigned int nb_sectors = end_sector - start_sector;
1017        int pnum;
1018
1019        ret = bdrv_is_allocated(bs, start_sector, nb_sectors, &pnum);
1020        if (ret < 0) {
1021            goto out;
1022        }
1023
1024        if (!ret || pnum != nb_sectors) {
1025            ret = bdrv_co_do_copy_on_readv(bs, offset, bytes, qiov);
1026            goto out;
1027        }
1028    }
1029
1030    /* Forward the request to the BlockDriver, possibly fragmenting it */
1031    total_bytes = bdrv_getlength(bs);
1032    if (total_bytes < 0) {
1033        ret = total_bytes;
1034        goto out;
1035    }
1036
1037    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
1038    if (bytes <= max_bytes && bytes <= max_transfer) {
1039        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
1040        goto out;
1041    }
1042
1043    while (bytes_remaining) {
1044        int num;
1045
1046        if (max_bytes) {
1047            QEMUIOVector local_qiov;
1048
1049            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
1050            assert(num);
1051            qemu_iovec_init(&local_qiov, qiov->niov);
1052            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
1053
1054            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
1055                                     num, &local_qiov, 0);
1056            max_bytes -= num;
1057            qemu_iovec_destroy(&local_qiov);
1058        } else {
1059            num = bytes_remaining;
1060            ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
1061                                    bytes_remaining);
1062        }
1063        if (ret < 0) {
1064            goto out;
1065        }
1066        bytes_remaining -= num;
1067    }
1068
1069out:
1070    return ret < 0 ? ret : 0;
1071}
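/*
 * Beyond-EOF example for the loop above (assuming max_transfer does not
 * further limit the request): with total_bytes = 1 MiB and an aligned read
 * of 64 KiB starting 4 KiB before EOF, max_bytes is 4 KiB, so the first
 * 4 KiB are read from the driver and the remaining 60 KiB of the qiov are
 * zero-filled via qemu_iovec_memset().
 */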
1072
1073/*
1074 * Handle a read request in coroutine context
1075 */
1076int coroutine_fn bdrv_co_preadv(BdrvChild *child,
1077    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1078    BdrvRequestFlags flags)
1079{
1080    BlockDriverState *bs = child->bs;
1081    BlockDriver *drv = bs->drv;
1082    BdrvTrackedRequest req;
1083
1084    uint64_t align = bs->bl.request_alignment;
1085    uint8_t *head_buf = NULL;
1086    uint8_t *tail_buf = NULL;
1087    QEMUIOVector local_qiov;
1088    bool use_local_qiov = false;
1089    int ret;
1090
1091    if (!drv) {
1092        return -ENOMEDIUM;
1093    }
1094
1095    ret = bdrv_check_byte_request(bs, offset, bytes);
1096    if (ret < 0) {
1097        return ret;
1098    }
1099
 1100    /* Don't do copy-on-read if we read data before a write operation */
1101    if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) {
1102        flags |= BDRV_REQ_COPY_ON_READ;
1103    }
1104
1105    /* Align read if necessary by padding qiov */
1106    if (offset & (align - 1)) {
1107        head_buf = qemu_blockalign(bs, align);
1108        qemu_iovec_init(&local_qiov, qiov->niov + 2);
1109        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1110        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1111        use_local_qiov = true;
1112
1113        bytes += offset & (align - 1);
1114        offset = offset & ~(align - 1);
1115    }
1116
1117    if ((offset + bytes) & (align - 1)) {
1118        if (!use_local_qiov) {
1119            qemu_iovec_init(&local_qiov, qiov->niov + 1);
1120            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1121            use_local_qiov = true;
1122        }
1123        tail_buf = qemu_blockalign(bs, align);
1124        qemu_iovec_add(&local_qiov, tail_buf,
1125                       align - ((offset + bytes) & (align - 1)));
1126
1127        bytes = ROUND_UP(bytes, align);
1128    }
1129
1130    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
1131    ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
1132                              use_local_qiov ? &local_qiov : qiov,
1133                              flags);
1134    tracked_request_end(&req);
1135
1136    if (use_local_qiov) {
1137        qemu_iovec_destroy(&local_qiov);
1138        qemu_vfree(head_buf);
1139        qemu_vfree(tail_buf);
1140    }
1141
1142    return ret;
1143}
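/*
 * Worked example of the read padding above, assuming
 * bs->bl.request_alignment = 512: a request with offset = 1000 and
 * bytes = 100 is turned into an aligned request with offset = 512 and
 * bytes = 1024; head_buf covers bytes [512, 1000), the caller's qiov covers
 * [1000, 1100), and tail_buf covers [1100, 1536).
 */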
1144
1145static int coroutine_fn bdrv_co_do_readv(BdrvChild *child,
1146    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1147    BdrvRequestFlags flags)
1148{
1149    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1150        return -EINVAL;
1151    }
1152
1153    return bdrv_co_preadv(child, sector_num << BDRV_SECTOR_BITS,
1154                          nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1155}
1156
1157int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num,
1158                               int nb_sectors, QEMUIOVector *qiov)
1159{
1160    trace_bdrv_co_readv(child->bs, sector_num, nb_sectors);
1161
1162    return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0);
1163}
1164
1165/* Maximum buffer for write zeroes fallback, in bytes */
1166#define MAX_WRITE_ZEROES_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
1167
1168static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
1169    int64_t offset, int count, BdrvRequestFlags flags)
1170{
1171    BlockDriver *drv = bs->drv;
1172    QEMUIOVector qiov;
1173    struct iovec iov = {0};
1174    int ret = 0;
1175    bool need_flush = false;
1176    int head = 0;
1177    int tail = 0;
1178
1179    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
1180    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
1181                        bs->bl.request_alignment);
1182
1183    assert(alignment % bs->bl.request_alignment == 0);
1184    head = offset % alignment;
1185    tail = (offset + count) % alignment;
1186    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
1187    assert(max_write_zeroes >= bs->bl.request_alignment);
1188
1189    while (count > 0 && !ret) {
1190        int num = count;
1191
1192        /* Align request.  Block drivers can expect the "bulk" of the request
1193         * to be aligned, and that unaligned requests do not cross cluster
1194         * boundaries.
1195         */
1196        if (head) {
 1197            /* Make a small request up to the first aligned boundary.  */
1198            num = MIN(count, alignment - head);
1199            head = 0;
1200        } else if (tail && num > alignment) {
 1201            /* Shorten the request to end at the last aligned boundary.  */
1202            num -= tail;
1203        }
1204
1205        /* limit request size */
1206        if (num > max_write_zeroes) {
1207            num = max_write_zeroes;
1208        }
1209
1210        ret = -ENOTSUP;
1211        /* First try the efficient write zeroes operation */
1212        if (drv->bdrv_co_pwrite_zeroes) {
1213            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1214                                             flags & bs->supported_zero_flags);
1215            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1216                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1217                need_flush = true;
1218            }
1219        } else {
1220            assert(!bs->supported_zero_flags);
1221        }
1222
1223        if (ret == -ENOTSUP) {
1224            /* Fall back to bounce buffer if write zeroes is unsupported */
1225            int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
1226                                            MAX_WRITE_ZEROES_BOUNCE_BUFFER);
1227            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1228
1229            if ((flags & BDRV_REQ_FUA) &&
1230                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
 1231                /* No need for bdrv_driver_pwritev() to do a fallback
1232                 * flush on each chunk; use just one at the end */
1233                write_flags &= ~BDRV_REQ_FUA;
1234                need_flush = true;
1235            }
1236            num = MIN(num, max_transfer);
1237            iov.iov_len = num;
1238            if (iov.iov_base == NULL) {
1239                iov.iov_base = qemu_try_blockalign(bs, num);
1240                if (iov.iov_base == NULL) {
1241                    ret = -ENOMEM;
1242                    goto fail;
1243                }
1244                memset(iov.iov_base, 0, num);
1245            }
1246            qemu_iovec_init_external(&qiov, &iov, 1);
1247
1248            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);
1249
 1250            /* Keep bounce buffer around if it is big enough for all
 1251             * future requests.
 1252             */
1253            if (num < max_transfer) {
1254                qemu_vfree(iov.iov_base);
1255                iov.iov_base = NULL;
1256            }
1257        }
1258
1259        offset += num;
1260        count -= num;
1261    }
1262
1263fail:
1264    if (ret == 0 && need_flush) {
1265        ret = bdrv_co_flush(bs);
1266    }
1267    qemu_vfree(iov.iov_base);
1268    return ret;
1269}
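/*
 * Worked example of the head/tail split above, assuming alignment = 4096
 * and no max_pwrite_zeroes limit: a request with offset = 5000 and
 * count = 10000 is issued as three pieces, [5000, 8192) up to the first
 * aligned boundary, [8192, 12288) as the aligned bulk, and [12288, 15000)
 * as the unaligned tail.
 */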
1270
1271/*
1272 * Forwards an already correctly aligned write request to the BlockDriver,
1273 * after possibly fragmenting it.
1274 */
1275static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs,
1276    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1277    int64_t align, QEMUIOVector *qiov, int flags)
1278{
1279    BlockDriver *drv = bs->drv;
1280    bool waited;
1281    int ret;
1282
1283    int64_t start_sector = offset >> BDRV_SECTOR_BITS;
1284    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1285    uint64_t bytes_remaining = bytes;
1286    int max_transfer;
1287
1288    assert(is_power_of_2(align));
1289    assert((offset & (align - 1)) == 0);
1290    assert((bytes & (align - 1)) == 0);
1291    assert(!qiov || bytes == qiov->size);
1292    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1293    assert(!(flags & ~BDRV_REQ_MASK));
1294    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1295                                   align);
1296
1297    waited = wait_serialising_requests(req);
1298    assert(!waited || !req->serialising);
1299    assert(req->overlap_offset <= offset);
1300    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1301
1302    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
1303
1304    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1305        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
1306        qemu_iovec_is_zero(qiov)) {
1307        flags |= BDRV_REQ_ZERO_WRITE;
1308        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
1309            flags |= BDRV_REQ_MAY_UNMAP;
1310        }
1311    }
1312
1313    if (ret < 0) {
1314        /* Do nothing, write notifier decided to fail this request */
1315    } else if (flags & BDRV_REQ_ZERO_WRITE) {
1316        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
1317        ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
1318    } else if (bytes <= max_transfer) {
1319        bdrv_debug_event(bs, BLKDBG_PWRITEV);
1320        ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
1321    } else {
1322        bdrv_debug_event(bs, BLKDBG_PWRITEV);
1323        while (bytes_remaining) {
1324            int num = MIN(bytes_remaining, max_transfer);
1325            QEMUIOVector local_qiov;
1326            int local_flags = flags;
1327
1328            assert(num);
1329            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
1330                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1331                /* If FUA is going to be emulated by flush, we only
1332                 * need to flush on the last iteration */
1333                local_flags &= ~BDRV_REQ_FUA;
1334            }
1335            qemu_iovec_init(&local_qiov, qiov->niov);
1336            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
1337
1338            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
1339                                      num, &local_qiov, local_flags);
1340            qemu_iovec_destroy(&local_qiov);
1341            if (ret < 0) {
1342                break;
1343            }
1344            bytes_remaining -= num;
1345        }
1346    }
1347    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
1348
1349    ++bs->write_gen;
1350    bdrv_set_dirty(bs, start_sector, end_sector - start_sector);
1351
1352    if (bs->wr_highest_offset < offset + bytes) {
1353        bs->wr_highest_offset = offset + bytes;
1354    }
1355
1356    if (ret >= 0) {
1357        bs->total_sectors = MAX(bs->total_sectors, end_sector);
1358        ret = 0;
1359    }
1360
1361    return ret;
1362}
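/*
 * Fragmentation example for the loop above, assuming max_transfer = 64 KiB:
 * an aligned 160 KiB write is forwarded as three driver calls of 64 KiB,
 * 64 KiB and 32 KiB, and when BDRV_REQ_FUA has to be emulated by a flush
 * only the final chunk keeps the flag.
 */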
1363
1364static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs,
1365                                                int64_t offset,
1366                                                unsigned int bytes,
1367                                                BdrvRequestFlags flags,
1368                                                BdrvTrackedRequest *req)
1369{
1370    uint8_t *buf = NULL;
1371    QEMUIOVector local_qiov;
1372    struct iovec iov;
1373    uint64_t align = bs->bl.request_alignment;
1374    unsigned int head_padding_bytes, tail_padding_bytes;
1375    int ret = 0;
1376
1377    head_padding_bytes = offset & (align - 1);
1378    tail_padding_bytes = align - ((offset + bytes) & (align - 1));
1379
1380
1381    assert(flags & BDRV_REQ_ZERO_WRITE);
1382    if (head_padding_bytes || tail_padding_bytes) {
1383        buf = qemu_blockalign(bs, align);
1384        iov = (struct iovec) {
1385            .iov_base   = buf,
1386            .iov_len    = align,
1387        };
1388        qemu_iovec_init_external(&local_qiov, &iov, 1);
1389    }
1390    if (head_padding_bytes) {
1391        uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
1392
1393        /* RMW the unaligned part before head. */
1394        mark_request_serialising(req, align);
1395        wait_serialising_requests(req);
1396        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1397        ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align,
1398                                  align, &local_qiov, 0);
1399        if (ret < 0) {
1400            goto fail;
1401        }
1402        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1403
1404        memset(buf + head_padding_bytes, 0, zero_bytes);
1405        ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align,
1406                                   align, &local_qiov,
1407                                   flags & ~BDRV_REQ_ZERO_WRITE);
1408        if (ret < 0) {
1409            goto fail;
1410        }
1411        offset += zero_bytes;
1412        bytes -= zero_bytes;
1413    }
1414
1415    assert(!bytes || (offset & (align - 1)) == 0);
1416    if (bytes >= align) {
1417        /* Write the aligned part in the middle. */
1418        uint64_t aligned_bytes = bytes & ~(align - 1);
1419        ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, align,
1420                                   NULL, flags);
1421        if (ret < 0) {
1422            goto fail;
1423        }
1424        bytes -= aligned_bytes;
1425        offset += aligned_bytes;
1426    }
1427
1428    assert(!bytes || (offset & (align - 1)) == 0);
1429    if (bytes) {
1430        assert(align == tail_padding_bytes + bytes);
1431        /* RMW the unaligned part after tail. */
1432        mark_request_serialising(req, align);
1433        wait_serialising_requests(req);
1434        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1435        ret = bdrv_aligned_preadv(bs, req, offset, align,
1436                                  align, &local_qiov, 0);
1437        if (ret < 0) {
1438            goto fail;
1439        }
1440        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1441
1442        memset(buf, 0, bytes);
1443        ret = bdrv_aligned_pwritev(bs, req, offset, align, align,
1444                                   &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
1445    }
1446fail:
1447    qemu_vfree(buf);
1448    return ret;
1449
1450}
1451
1452/*
1453 * Handle a write request in coroutine context
1454 */
1455int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
1456    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1457    BdrvRequestFlags flags)
1458{
1459    BlockDriverState *bs = child->bs;
1460    BdrvTrackedRequest req;
1461    uint64_t align = bs->bl.request_alignment;
1462    uint8_t *head_buf = NULL;
1463    uint8_t *tail_buf = NULL;
1464    QEMUIOVector local_qiov;
1465    bool use_local_qiov = false;
1466    int ret;
1467
1468    if (!bs->drv) {
1469        return -ENOMEDIUM;
1470    }
1471    if (bs->read_only) {
1472        return -EPERM;
1473    }
1474    assert(!(bs->open_flags & BDRV_O_INACTIVE));
1475
1476    ret = bdrv_check_byte_request(bs, offset, bytes);
1477    if (ret < 0) {
1478        return ret;
1479    }
1480
1481    /*
1482     * Align write if necessary by performing a read-modify-write cycle.
1483     * Pad qiov with the read parts and be sure to have a tracked request not
1484     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
1485     */
1486    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
1487
1488    if (!qiov) {
1489        ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
1490        goto out;
1491    }
1492
1493    if (offset & (align - 1)) {
1494        QEMUIOVector head_qiov;
1495        struct iovec head_iov;
1496
1497        mark_request_serialising(&req, align);
1498        wait_serialising_requests(&req);
1499
1500        head_buf = qemu_blockalign(bs, align);
1501        head_iov = (struct iovec) {
1502            .iov_base   = head_buf,
1503            .iov_len    = align,
1504        };
1505        qemu_iovec_init_external(&head_qiov, &head_iov, 1);
1506
1507        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1508        ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align,
1509                                  align, &head_qiov, 0);
1510        if (ret < 0) {
1511            goto fail;
1512        }
1513        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1514
1515        qemu_iovec_init(&local_qiov, qiov->niov + 2);
1516        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1517        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1518        use_local_qiov = true;
1519
1520        bytes += offset & (align - 1);
1521        offset = offset & ~(align - 1);
1522
1523        /* We have read the tail already if the request is smaller
1524         * than one aligned block.
1525         */
1526        if (bytes < align) {
1527            qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
1528            bytes = align;
1529        }
1530    }
1531
1532    if ((offset + bytes) & (align - 1)) {
1533        QEMUIOVector tail_qiov;
1534        struct iovec tail_iov;
1535        size_t tail_bytes;
1536        bool waited;
1537
1538        mark_request_serialising(&req, align);
1539        waited = wait_serialising_requests(&req);
1540        assert(!waited || !use_local_qiov);
1541
1542        tail_buf = qemu_blockalign(bs, align);
1543        tail_iov = (struct iovec) {
1544            .iov_base   = tail_buf,
1545            .iov_len    = align,
1546        };
1547        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
1548
1549        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1550        ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align,
1551                                  align, &tail_qiov, 0);
1552        if (ret < 0) {
1553            goto fail;
1554        }
1555        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1556
1557        if (!use_local_qiov) {
1558            qemu_iovec_init(&local_qiov, qiov->niov + 1);
1559            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1560            use_local_qiov = true;
1561        }
1562
1563        tail_bytes = (offset + bytes) & (align - 1);
1564        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
1565
1566        bytes = ROUND_UP(bytes, align);
1567    }
1568
1569    ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, align,
1570                               use_local_qiov ? &local_qiov : qiov,
1571                               flags);
1572
1573fail:
1574
1575    if (use_local_qiov) {
1576        qemu_iovec_destroy(&local_qiov);
1577    }
1578    qemu_vfree(head_buf);
1579    qemu_vfree(tail_buf);
1580out:
1581    tracked_request_end(&req);
1582    return ret;
1583}
1584
1585static int coroutine_fn bdrv_co_do_writev(BdrvChild *child,
1586    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1587    BdrvRequestFlags flags)
1588{
1589    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1590        return -EINVAL;
1591    }
1592
1593    return bdrv_co_pwritev(child, sector_num << BDRV_SECTOR_BITS,
1594                           nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1595}
1596
1597int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num,
1598    int nb_sectors, QEMUIOVector *qiov)
1599{
1600    trace_bdrv_co_writev(child->bs, sector_num, nb_sectors);
1601
1602    return bdrv_co_do_writev(child, sector_num, nb_sectors, qiov, 0);
1603}
1604
1605int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
1606                                       int count, BdrvRequestFlags flags)
1607{
1608    trace_bdrv_co_pwrite_zeroes(child->bs, offset, count, flags);
1609
1610    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
1611        flags &= ~BDRV_REQ_MAY_UNMAP;
1612    }
1613
1614    return bdrv_co_pwritev(child, offset, count, NULL,
1615                           BDRV_REQ_ZERO_WRITE | flags);
1616}
1617
1618typedef struct BdrvCoGetBlockStatusData {
1619    BlockDriverState *bs;
1620    BlockDriverState *base;
1621    BlockDriverState **file;
1622    int64_t sector_num;
1623    int nb_sectors;
1624    int *pnum;
1625    int64_t ret;
1626    bool done;
1627} BdrvCoGetBlockStatusData;
1628
1629/*
1630 * Returns the allocation status of the specified sectors.
1631 * Drivers not implementing the functionality are assumed to not support
1632 * backing files, hence all their sectors are reported as allocated.
1633 *
1634 * If 'sector_num' is beyond the end of the disk image the return value is 0
1635 * and 'pnum' is set to 0.
1636 *
1637 * 'pnum' is set to the number of sectors (including and immediately following
1638 * the specified sector) that are known to be in the same
1639 * allocated/unallocated state.
1640 *
1641 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
1642 * beyond the end of the disk image it will be clamped.
1643 *
1644 * If the returned value is positive and the BDRV_BLOCK_OFFSET_VALID bit is
1645 * set, 'file' points to the BDS in which the sector range is allocated.
1646 */
1647static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
1648                                                     int64_t sector_num,
1649                                                     int nb_sectors, int *pnum,
1650                                                     BlockDriverState **file)
1651{
1652    int64_t total_sectors;
1653    int64_t n;
1654    int64_t ret, ret2;
1655
1656    total_sectors = bdrv_nb_sectors(bs);
1657    if (total_sectors < 0) {
1658        return total_sectors;
1659    }
1660
1661    if (sector_num >= total_sectors) {
1662        *pnum = 0;
1663        return 0;
1664    }
1665
1666    n = total_sectors - sector_num;
1667    if (n < nb_sectors) {
1668        nb_sectors = n;
1669    }
1670
1671    if (!bs->drv->bdrv_co_get_block_status) {
1672        *pnum = nb_sectors;
1673        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
1674        if (bs->drv->protocol_name) {
1675            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
1676        }
1677        return ret;
1678    }
1679
1680    *file = NULL;
1681    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum,
1682                                            file);
1683    if (ret < 0) {
1684        *pnum = 0;
1685        return ret;
1686    }
1687
1688    if (ret & BDRV_BLOCK_RAW) {
1689        assert(ret & BDRV_BLOCK_OFFSET_VALID);
1690        return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS,
1691                                     *pnum, pnum, file);
1692    }
1693
1694    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
1695        ret |= BDRV_BLOCK_ALLOCATED;
1696    } else {
1697        if (bdrv_unallocated_blocks_are_zero(bs)) {
1698            ret |= BDRV_BLOCK_ZERO;
1699        } else if (bs->backing) {
1700            BlockDriverState *bs2 = bs->backing->bs;
1701            int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
1702            if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
1703                ret |= BDRV_BLOCK_ZERO;
1704            }
1705        }
1706    }
1707
1708    if (*file && *file != bs &&
1709        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
1710        (ret & BDRV_BLOCK_OFFSET_VALID)) {
1711        BlockDriverState *file2;
1712        int file_pnum;
1713
1714        ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
1715                                        *pnum, &file_pnum, &file2);
1716        if (ret2 >= 0) {
1717            /* Ignore errors.  This is just providing extra information, it
1718             * is useful but not necessary.
1719             */
1720            if (!file_pnum) {
1721                /* !file_pnum indicates an offset at or beyond the EOF; it is
1722                 * perfectly valid for the format block driver to point to such
1723                 * offsets, so catch it and mark everything as zero */
1724                ret |= BDRV_BLOCK_ZERO;
1725            } else {
1726                /* Limit request to the range reported by the protocol driver */
1727                *pnum = file_pnum;
1728                ret |= (ret2 & BDRV_BLOCK_ZERO);
1729            }
1730        }
1731    }
1732
1733    return ret;
1734}
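
/*
 * Illustrative sketch (not part of the original file): how a caller might
 * consume the status returned by bdrv_get_block_status().  The helper name is
 * hypothetical; the flag interpretation follows the comment above
 * bdrv_co_get_block_status().
 */
#if 0
static void example_inspect_status(BlockDriverState *bs, int64_t sector_num,
                                   int nb_sectors)
{
    BlockDriverState *file = NULL;
    int pnum = 0;
    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors,
                                        &pnum, &file);

    if (ret < 0) {
        return;                         /* e.g. -ENOMEDIUM */
    }
    /* The same status applies to 'pnum' sectors starting at sector_num */
    if (ret & BDRV_BLOCK_ZERO) {
        /* the range reads as zeroes */
    } else if (ret & BDRV_BLOCK_DATA) {
        /* data is present in this layer */
    }
    if ((ret & BDRV_BLOCK_OFFSET_VALID) && file) {
        int64_t host_offset = ret & BDRV_BLOCK_OFFSET_MASK;
        (void)host_offset;              /* where the data lives within 'file' */
    }
}
#endif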
1735
1736static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
1737        BlockDriverState *base,
1738        int64_t sector_num,
1739        int nb_sectors,
1740        int *pnum,
1741        BlockDriverState **file)
1742{
1743    BlockDriverState *p;
1744    int64_t ret = 0;
1745
1746    assert(bs != base);
1747    for (p = bs; p != base; p = backing_bs(p)) {
1748        ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file);
1749        if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) {
1750            break;
1751        }
1752        /* [sector_num, *pnum] is unallocated on this layer, which could be
1753         * only the first part of [sector_num, nb_sectors].  */
1754        nb_sectors = MIN(nb_sectors, *pnum);
1755    }
1756    return ret;
1757}
1758
1759/* Coroutine wrapper for bdrv_get_block_status_above() */
1760static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
1761{
1762    BdrvCoGetBlockStatusData *data = opaque;
1763
1764    data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
1765                                               data->sector_num,
1766                                               data->nb_sectors,
1767                                               data->pnum,
1768                                               data->file);
1769    data->done = true;
1770}
1771
1772/*
1773 * Synchronous wrapper around bdrv_co_get_block_status_above().
1774 *
1775 * See bdrv_co_get_block_status_above() for details.
1776 */
1777int64_t bdrv_get_block_status_above(BlockDriverState *bs,
1778                                    BlockDriverState *base,
1779                                    int64_t sector_num,
1780                                    int nb_sectors, int *pnum,
1781                                    BlockDriverState **file)
1782{
1783    Coroutine *co;
1784    BdrvCoGetBlockStatusData data = {
1785        .bs = bs,
1786        .base = base,
1787        .file = file,
1788        .sector_num = sector_num,
1789        .nb_sectors = nb_sectors,
1790        .pnum = pnum,
1791        .done = false,
1792    };
1793
1794    if (qemu_in_coroutine()) {
1795        /* Fast-path if already in coroutine context */
1796        bdrv_get_block_status_above_co_entry(&data);
1797    } else {
1798        AioContext *aio_context = bdrv_get_aio_context(bs);
1799
1800        co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry,
1801                                   &data);
1802        qemu_coroutine_enter(co);
1803        while (!data.done) {
1804            aio_poll(aio_context, true);
1805        }
1806    }
1807    return data.ret;
1808}
1809
1810int64_t bdrv_get_block_status(BlockDriverState *bs,
1811                              int64_t sector_num,
1812                              int nb_sectors, int *pnum,
1813                              BlockDriverState **file)
1814{
1815    return bdrv_get_block_status_above(bs, backing_bs(bs),
1816                                       sector_num, nb_sectors, pnum, file);
1817}
1818
1819int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num,
1820                                   int nb_sectors, int *pnum)
1821{
1822    BlockDriverState *file;
1823    int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum,
1824                                        &file);
1825    if (ret < 0) {
1826        return ret;
1827    }
1828    return !!(ret & BDRV_BLOCK_ALLOCATED);
1829}
1830
1831/*
1832 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
1833 *
1834 * Return true if the given sector is allocated in any image between
1835 * BASE and TOP (inclusive).  BASE can be NULL to check if the given
1836 * sector is allocated in any image of the chain.  Return false otherwise.
1837 *
1838 * 'pnum' is set to the number of sectors (including and immediately following
1839 * the specified sector) that are known to be in the same
1840 * allocated/unallocated state.
1841 */
1843int bdrv_is_allocated_above(BlockDriverState *top,
1844                            BlockDriverState *base,
1845                            int64_t sector_num,
1846                            int nb_sectors, int *pnum)
1847{
1848    BlockDriverState *intermediate;
1849    int ret, n = nb_sectors;
1850
1851    intermediate = top;
1852    while (intermediate && intermediate != base) {
1853        int pnum_inter;
1854        ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors,
1855                                &pnum_inter);
1856        if (ret < 0) {
1857            return ret;
1858        } else if (ret) {
1859            *pnum = pnum_inter;
1860            return 1;
1861        }
1862
1863        /*
1864         * [sector_num, nb_sectors] is unallocated on top, but an intermediate
1865         * image might have [sector_num+x, nb_sectors] allocated.
1868         */
1869        if (n > pnum_inter &&
1870            (intermediate == top ||
1871             sector_num + pnum_inter < intermediate->total_sectors)) {
1872            n = pnum_inter;
1873        }
1874
1875        intermediate = backing_bs(intermediate);
1876    }
1877
1878    *pnum = n;
1879    return 0;
1880}
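
/*
 * Illustrative sketch (not part of the original file): consuming *pnum from
 * bdrv_is_allocated_above() to walk a range chunk by chunk, the way
 * stream/commit-style code typically does.  The helper name and the caller's
 * bookkeeping are hypothetical.
 */
#if 0
static int64_t example_count_allocated(BlockDriverState *top,
                                       BlockDriverState *base,
                                       int64_t sector_num, int64_t total)
{
    int64_t allocated = 0;

    while (total > 0) {
        int pnum;
        int ret = bdrv_is_allocated_above(top, base, sector_num,
                                          MIN(total, BDRV_REQUEST_MAX_SECTORS),
                                          &pnum);
        if (ret < 0) {
            return ret;
        }
        if (pnum == 0) {
            break;                      /* sector_num is at or beyond EOF */
        }
        if (ret) {
            allocated += pnum;          /* allocated somewhere in [base, top] */
        }
        sector_num += pnum;
        total -= pnum;
    }
    return allocated;
}
#endif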
1881
1882int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num,
1883                          const uint8_t *buf, int nb_sectors)
1884{
1885    BlockDriver *drv = bs->drv;
1886    int ret;
1887
1888    if (!drv) {
1889        return -ENOMEDIUM;
1890    }
1891    if (!drv->bdrv_write_compressed) {
1892        return -ENOTSUP;
1893    }
1894    ret = bdrv_check_request(bs, sector_num, nb_sectors);
1895    if (ret < 0) {
1896        return ret;
1897    }
1898
1899    assert(QLIST_EMPTY(&bs->dirty_bitmaps));
1900
1901    return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors);
1902}
1903
1904typedef struct BdrvVmstateCo {
1905    BlockDriverState    *bs;
1906    QEMUIOVector        *qiov;
1907    int64_t             pos;
1908    bool                is_read;
1909    int                 ret;
1910} BdrvVmstateCo;
1911
1912static int coroutine_fn
1913bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
1914                   bool is_read)
1915{
1916    BlockDriver *drv = bs->drv;
1917
1918    if (!drv) {
1919        return -ENOMEDIUM;
1920    } else if (drv->bdrv_load_vmstate) {
1921        return is_read ? drv->bdrv_load_vmstate(bs, qiov, pos)
1922                       : drv->bdrv_save_vmstate(bs, qiov, pos);
1923    } else if (bs->file) {
1924        return bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
1925    }
1926
1927    return -ENOTSUP;
1928}
1929
1930static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
1931{
1932    BdrvVmstateCo *co = opaque;
1933    co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
1934}
1935
1936static inline int
1937bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
1938                bool is_read)
1939{
1940    if (qemu_in_coroutine()) {
1941        return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
1942    } else {
1943        BdrvVmstateCo data = {
1944            .bs         = bs,
1945            .qiov       = qiov,
1946            .pos        = pos,
1947            .is_read    = is_read,
1948            .ret        = -EINPROGRESS,
1949        };
1950        Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
1951
1952        qemu_coroutine_enter(co);
1953        while (data.ret == -EINPROGRESS) {
1954            aio_poll(bdrv_get_aio_context(bs), true);
1955        }
1956        return data.ret;
1957    }
1958}
1959
1960int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
1961                      int64_t pos, int size)
1962{
1963    QEMUIOVector qiov;
1964    struct iovec iov = {
1965        .iov_base   = (void *) buf,
1966        .iov_len    = size,
1967    };
1968    int ret;
1969
1970    qemu_iovec_init_external(&qiov, &iov, 1);
1971
1972    ret = bdrv_writev_vmstate(bs, &qiov, pos);
1973    if (ret < 0) {
1974        return ret;
1975    }
1976
1977    return size;
1978}
1979
1980int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
1981{
1982    return bdrv_rw_vmstate(bs, qiov, pos, false);
1983}
1984
1985int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
1986                      int64_t pos, int size)
1987{
1988    QEMUIOVector qiov;
1989    struct iovec iov = {
1990        .iov_base   = buf,
1991        .iov_len    = size,
1992    };
1993    int ret;
1994
1995    qemu_iovec_init_external(&qiov, &iov, 1);
1996    ret = bdrv_readv_vmstate(bs, &qiov, pos);
1997    if (ret < 0) {
1998        return ret;
1999    }
2000
2001    return size;
2002}
2003
2004int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2005{
2006    return bdrv_rw_vmstate(bs, qiov, pos, true);
2007}
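
/*
 * Illustrative sketch (not part of the original file): a save/load round trip
 * through the vmstate helpers above.  The buffer contents and offset are
 * arbitrary example values; -ENOTSUP is returned when no layer implements
 * vmstate access.
 */
#if 0
static int example_vmstate_roundtrip(BlockDriverState *bs)
{
    uint8_t out[16] = "vmstate example";
    uint8_t in[16];
    int ret;

    ret = bdrv_save_vmstate(bs, out, 0, sizeof(out));
    if (ret < 0) {
        return ret;
    }
    ret = bdrv_load_vmstate(bs, in, 0, sizeof(in));
    return ret < 0 ? ret : 0;
}
#endif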
2008
2009/**************************************************************/
2010/* async I/Os */
2011
2012BlockAIOCB *bdrv_aio_readv(BdrvChild *child, int64_t sector_num,
2013                           QEMUIOVector *qiov, int nb_sectors,
2014                           BlockCompletionFunc *cb, void *opaque)
2015{
2016    trace_bdrv_aio_readv(child->bs, sector_num, nb_sectors, opaque);
2017
2018    assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
2019    return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
2020                                  0, cb, opaque, false);
2021}
2022
2023BlockAIOCB *bdrv_aio_writev(BdrvChild *child, int64_t sector_num,
2024                            QEMUIOVector *qiov, int nb_sectors,
2025                            BlockCompletionFunc *cb, void *opaque)
2026{
2027    trace_bdrv_aio_writev(child->bs, sector_num, nb_sectors, opaque);
2028
2029    assert(nb_sectors << BDRV_SECTOR_BITS == qiov->size);
2030    return bdrv_co_aio_prw_vector(child, sector_num << BDRV_SECTOR_BITS, qiov,
2031                                  0, cb, opaque, true);
2032}
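
/*
 * Illustrative sketch (not part of the original file): submitting an
 * asynchronous one-sector write through bdrv_aio_writev().  The context
 * struct, callback and helper names are hypothetical; note that qiov->size
 * must match nb_sectors, as the assertion above requires, and that the qiov
 * and buffer must stay alive until the completion callback runs.
 */
#if 0
typedef struct ExampleWriteCtx {
    QEMUIOVector qiov;
    struct iovec iov;
    bool done;
} ExampleWriteCtx;

static void example_write_cb(void *opaque, int ret)
{
    ExampleWriteCtx *ctx = opaque;
    /* ret is 0 on success or a negative errno */
    ctx->done = true;
}

static void example_submit_write(BdrvChild *child, ExampleWriteCtx *ctx,
                                 void *buf)
{
    ctx->iov = (struct iovec) {
        .iov_base = buf,
        .iov_len  = BDRV_SECTOR_SIZE,
    };
    qemu_iovec_init_external(&ctx->qiov, &ctx->iov, 1);
    ctx->done = false;
    /* completion is reported through example_write_cb() */
    bdrv_aio_writev(child, 0, &ctx->qiov, 1, example_write_cb, ctx);
}
#endif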
2033
2034void bdrv_aio_cancel(BlockAIOCB *acb)
2035{
2036    qemu_aio_ref(acb);
2037    bdrv_aio_cancel_async(acb);
2038    while (acb->refcnt > 1) {
2039        if (acb->aiocb_info->get_aio_context) {
2040            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
2041        } else if (acb->bs) {
2042            aio_poll(bdrv_get_aio_context(acb->bs), true);
2043        } else {
2044            abort();
2045        }
2046    }
2047    qemu_aio_unref(acb);
2048}
2049
2050/* Async version of aio cancel. The caller is not blocked if the acb implements
2051 * cancel_async; otherwise we do nothing and let the request complete normally.
2052 * In either case the completion callback must be called. */
2053void bdrv_aio_cancel_async(BlockAIOCB *acb)
2054{
2055    if (acb->aiocb_info->cancel_async) {
2056        acb->aiocb_info->cancel_async(acb);
2057    }
2058}
2059
2060/**************************************************************/
2061/* async block device emulation */
2062
2063typedef struct BlockRequest {
2064    union {
2065        /* Used during read, write, trim */
2066        struct {
2067            int64_t offset;
2068            int bytes;
2069            int flags;
2070            QEMUIOVector *qiov;
2071        };
2072        /* Used during ioctl */
2073        struct {
2074            int req;
2075            void *buf;
2076        };
2077    };
2078    BlockCompletionFunc *cb;
2079    void *opaque;
2080
2081    int error;
2082} BlockRequest;
2083
2084typedef struct BlockAIOCBCoroutine {
2085    BlockAIOCB common;
2086    BdrvChild *child;
2087    BlockRequest req;
2088    bool is_write;
2089    bool need_bh;
2090    bool *done;
2091    QEMUBH* bh;
2092} BlockAIOCBCoroutine;
2093
2094static const AIOCBInfo bdrv_em_co_aiocb_info = {
2095    .aiocb_size         = sizeof(BlockAIOCBCoroutine),
2096};
2097
2098static void bdrv_co_complete(BlockAIOCBCoroutine *acb)
2099{
2100    if (!acb->need_bh) {
2101        acb->common.cb(acb->common.opaque, acb->req.error);
2102        qemu_aio_unref(acb);
2103    }
2104}
2105
2106static void bdrv_co_em_bh(void *opaque)
2107{
2108    BlockAIOCBCoroutine *acb = opaque;
2109
2110    assert(!acb->need_bh);
2111    qemu_bh_delete(acb->bh);
2112    bdrv_co_complete(acb);
2113}
2114
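/*
 * The emulated AIO completion callback must not run before the bdrv_aio_*()
 * function has handed the BlockAIOCB back to its caller.  While the request
 * is being submitted, need_bh is true, so bdrv_co_complete() does nothing;
 * once submission is done, this helper clears need_bh and, if the request
 * already completed synchronously, delivers the completion from a bottom
 * half instead.
 */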
2115static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb)
2116{
2117    acb->need_bh = false;
2118    if (acb->req.error != -EINPROGRESS) {
2119        BlockDriverState *bs = acb->common.bs;
2120
2121        acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb);
2122        qemu_bh_schedule(acb->bh);
2123    }
2124}
2125
2126/* Invoke bdrv_co_preadv/bdrv_co_pwritev */
2127static void coroutine_fn bdrv_co_do_rw(void *opaque)
2128{
2129    BlockAIOCBCoroutine *acb = opaque;
2130
2131    if (!acb->is_write) {
2132        acb->req.error = bdrv_co_preadv(acb->child, acb->req.offset,
2133            acb->req.qiov->size, acb->req.qiov, acb->req.flags);
2134    } else {
2135        acb->req.error = bdrv_co_pwritev(acb->child, acb->req.offset,
2136            acb->req.qiov->size, acb->req.qiov, acb->req.flags);
2137    }
2138
2139    bdrv_co_complete(acb);
2140}
2141
2142static BlockAIOCB *bdrv_co_aio_prw_vector(BdrvChild *child,
2143                                          int64_t offset,
2144                                          QEMUIOVector *qiov,
2145                                          BdrvRequestFlags flags,
2146                                          BlockCompletionFunc *cb,
2147                                          void *opaque,
2148                                          bool is_write)
2149{
2150    Coroutine *co;
2151    BlockAIOCBCoroutine *acb;
2152
2153    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, child->bs, cb, opaque);
2154    acb->child = child;
2155    acb->need_bh = true;
2156    acb->req.error = -EINPROGRESS;
2157    acb->req.offset = offset;
2158    acb->req.qiov = qiov;
2159    acb->req.flags = flags;
2160    acb->is_write = is_write;
2161
2162    co = qemu_coroutine_create(bdrv_co_do_rw, acb);
2163    qemu_coroutine_enter(co);
2164
2165    bdrv_co_maybe_schedule_bh(acb);
2166    return &acb->common;
2167}
2168
2169static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque)
2170{
2171    BlockAIOCBCoroutine *acb = opaque;
2172    BlockDriverState *bs = acb->common.bs;
2173
2174    acb->req.error = bdrv_co_flush(bs);
2175    bdrv_co_complete(acb);
2176}
2177
2178BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs,
2179        BlockCompletionFunc *cb, void *opaque)
2180{
2181    trace_bdrv_aio_flush(bs, opaque);
2182
2183    Coroutine *co;
2184    BlockAIOCBCoroutine *acb;
2185
2186    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2187    acb->need_bh = true;
2188    acb->req.error = -EINPROGRESS;
2189
2190    co = qemu_coroutine_create(bdrv_aio_flush_co_entry, acb);
2191    qemu_coroutine_enter(co);
2192
2193    bdrv_co_maybe_schedule_bh(acb);
2194    return &acb->common;
2195}
2196
2197static void coroutine_fn bdrv_aio_pdiscard_co_entry(void *opaque)
2198{
2199    BlockAIOCBCoroutine *acb = opaque;
2200    BlockDriverState *bs = acb->common.bs;
2201
2202    acb->req.error = bdrv_co_pdiscard(bs, acb->req.offset, acb->req.bytes);
2203    bdrv_co_complete(acb);
2204}
2205
2206BlockAIOCB *bdrv_aio_pdiscard(BlockDriverState *bs, int64_t offset, int count,
2207                              BlockCompletionFunc *cb, void *opaque)
2208{
2209    Coroutine *co;
2210    BlockAIOCBCoroutine *acb;
2211
2212    trace_bdrv_aio_pdiscard(bs, offset, count, opaque);
2213
2214    acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque);
2215    acb->need_bh = true;
2216    acb->req.error = -EINPROGRESS;
2217    acb->req.offset = offset;
2218    acb->req.bytes = count;
2219    co = qemu_coroutine_create(bdrv_aio_pdiscard_co_entry, acb);
2220    qemu_coroutine_enter(co);
2221
2222    bdrv_co_maybe_schedule_bh(acb);
2223    return &acb->common;
2224}
2225
2226void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
2227                   BlockCompletionFunc *cb, void *opaque)
2228{
2229    BlockAIOCB *acb;
2230
2231    acb = g_malloc(aiocb_info->aiocb_size);
2232    acb->aiocb_info = aiocb_info;
2233    acb->bs = bs;
2234    acb->cb = cb;
2235    acb->opaque = opaque;
2236    acb->refcnt = 1;
2237    return acb;
2238}
2239
2240void qemu_aio_ref(void *p)
2241{
2242    BlockAIOCB *acb = p;
2243    acb->refcnt++;
2244}
2245
2246void qemu_aio_unref(void *p)
2247{
2248    BlockAIOCB *acb = p;
2249    assert(acb->refcnt > 0);
2250    if (--acb->refcnt == 0) {
2251        g_free(acb);
2252    }
2253}
2254
2255/**************************************************************/
2256/* Coroutine block device emulation */
2257
2258typedef struct FlushCo {
2259    BlockDriverState *bs;
2260    int ret;
2261} FlushCo;
2262
2263
2264static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2265{
2266    FlushCo *rwco = opaque;
2267
2268    rwco->ret = bdrv_co_flush(rwco->bs);
2269}
2270
2271int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2272{
2273    int ret;
2274    BdrvTrackedRequest req;
2275
2276    if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
2277        bdrv_is_sg(bs)) {
2278        return 0;
2279    }
2280
2281    tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
2282
2283    int current_gen = bs->write_gen;
2284
2285    /* Wait until any previous flushes are completed */
2286    while (bs->active_flush_req != NULL) {
2287        qemu_co_queue_wait(&bs->flush_queue);
2288    }
2289
2290    bs->active_flush_req = &req;
2291
2292    /* Write back all layers by calling one driver function */
2293    if (bs->drv->bdrv_co_flush) {
2294        ret = bs->drv->bdrv_co_flush(bs);
2295        goto out;
2296    }
2297
2298    /* Write back cached data to the OS even with cache=unsafe */
2299    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2300    if (bs->drv->bdrv_co_flush_to_os) {
2301        ret = bs->drv->bdrv_co_flush_to_os(bs);
2302        if (ret < 0) {
2303            goto out;
2304        }
2305    }
2306
2307    /* But don't actually force it to the disk with cache=unsafe */
2308    if (bs->open_flags & BDRV_O_NO_FLUSH) {
2309        goto flush_parent;
2310    }
2311
2312    /* Check if we really need to flush anything */
2313    if (bs->flushed_gen == current_gen) {
2314        goto flush_parent;
2315    }
2316
2317    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2318    if (bs->drv->bdrv_co_flush_to_disk) {
2319        ret = bs->drv->bdrv_co_flush_to_disk(bs);
2320    } else if (bs->drv->bdrv_aio_flush) {
2321        BlockAIOCB *acb;
2322        CoroutineIOCompletion co = {
2323            .coroutine = qemu_coroutine_self(),
2324        };
2325
2326        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2327        if (acb == NULL) {
2328            ret = -EIO;
2329        } else {
2330            qemu_coroutine_yield();
2331            ret = co.ret;
2332        }
2333    } else {
2334        /*
2335         * Some block drivers always operate in either writethrough or unsafe
2336         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
2337         * know how the server works (because the behaviour is hardcoded or
2338         * depends on server-side configuration), so we can't ensure that
2339         * everything is safe on disk. Returning an error doesn't work because
2340         * that would break guests even if the server operates in writethrough
2341         * mode.
2342         *
2343         * Let's hope the user knows what he's doing.
2344         */
2345        ret = 0;
2346    }
2347
2348    if (ret < 0) {
2349        goto out;
2350    }
2351
2352    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
2353     * in the case of cache=unsafe, so there are no useless flushes.
2354     */
2355flush_parent:
2356    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2357out:
2358    /* Notify any pending flushes that we have completed */
2359    bs->flushed_gen = current_gen;
2360    bs->active_flush_req = NULL;
2361    /* Return value is ignored - it's ok if the wait queue is empty */
2362    qemu_co_queue_next(&bs->flush_queue);
2363
2364    tracked_request_end(&req);
2365    return ret;
2366}
2367
2368int bdrv_flush(BlockDriverState *bs)
2369{
2370    Coroutine *co;
2371    FlushCo flush_co = {
2372        .bs = bs,
2373        .ret = NOT_DONE,
2374    };
2375
2376    if (qemu_in_coroutine()) {
2377        /* Fast-path if already in coroutine context */
2378        bdrv_flush_co_entry(&flush_co);
2379    } else {
2380        AioContext *aio_context = bdrv_get_aio_context(bs);
2381
2382        co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
2383        qemu_coroutine_enter(co);
2384        while (flush_co.ret == NOT_DONE) {
2385            aio_poll(aio_context, true);
2386        }
2387    }
2388
2389    return flush_co.ret;
2390}
2391
2392typedef struct DiscardCo {
2393    BlockDriverState *bs;
2394    int64_t offset;
2395    int count;
2396    int ret;
2397} DiscardCo;
2398static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
2399{
2400    DiscardCo *rwco = opaque;
2401
2402    rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->count);
2403}
2404
2405int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
2406                                  int count)
2407{
2408    BdrvTrackedRequest req;
2409    int max_pdiscard, ret;
2410    int head, align;
2411
2412    if (!bs->drv) {
2413        return -ENOMEDIUM;
2414    }
2415
2416    ret = bdrv_check_byte_request(bs, offset, count);
2417    if (ret < 0) {
2418        return ret;
2419    } else if (bs->read_only) {
2420        return -EPERM;
2421    }
2422    assert(!(bs->open_flags & BDRV_O_INACTIVE));
2423
2424    /* Do nothing if disabled.  */
2425    if (!(bs->open_flags & BDRV_O_UNMAP)) {
2426        return 0;
2427    }
2428
2429    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
2430        return 0;
2431    }
2432
2433    /* Discard is advisory, so ignore any unaligned head or tail */
2434    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
2435    assert(align % bs->bl.request_alignment == 0);
2436    head = offset % align;
2437    if (head) {
2438        head = MIN(count, align - head);
2439        count -= head;
2440        offset += head;
2441    }
2442    count = QEMU_ALIGN_DOWN(count, align);
2443    if (!count) {
2444        return 0;
2445    }
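    /*
     * Worked example (illustrative values): with align = 4096, offset = 1000
     * and count = 10000, head = 1000, so 3096 bytes are skipped to reach the
     * aligned offset 4096; count becomes 6904 and is rounded down to 4096.
     * Only [4096, 8192) is discarded; the unaligned head and tail are simply
     * left alone, which is acceptable because discard is advisory.
     */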
2446
2447    tracked_request_begin(&req, bs, offset, count, BDRV_TRACKED_DISCARD);
2448
2449    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
2450    if (ret < 0) {
2451        goto out;
2452    }
2453
2454    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
2455                                   align);
2456    assert(max_pdiscard);
2457
2458    while (count > 0) {
2460        int num = MIN(count, max_pdiscard);
2461
2462        if (bs->drv->bdrv_co_pdiscard) {
2463            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
2464        } else {
2465            BlockAIOCB *acb;
2466            CoroutineIOCompletion co = {
2467                .coroutine = qemu_coroutine_self(),
2468            };
2469
2470            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
2471                                             bdrv_co_io_em_complete, &co);
2472            if (acb == NULL) {
2473                ret = -EIO;
2474                goto out;
2475            } else {
2476                qemu_coroutine_yield();
2477                ret = co.ret;
2478            }
2479        }
2480        if (ret && ret != -ENOTSUP) {
2481            goto out;
2482        }
2483
2484        offset += num;
2485        count -= num;
2486    }
2487    ret = 0;
2488out:
2489    ++bs->write_gen;
2490    bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS,
2491                   req.bytes >> BDRV_SECTOR_BITS);
2492    tracked_request_end(&req);
2493    return ret;
2494}
2495
2496int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int count)
2497{
2498    Coroutine *co;
2499    DiscardCo rwco = {
2500        .bs = bs,
2501        .offset = offset,
2502        .count = count,
2503        .ret = NOT_DONE,
2504    };
2505
2506    if (qemu_in_coroutine()) {
2507        /* Fast-path if already in coroutine context */
2508        bdrv_pdiscard_co_entry(&rwco);
2509    } else {
2510        AioContext *aio_context = bdrv_get_aio_context(bs);
2511
2512        co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
2513        qemu_coroutine_enter(co);
2514        while (rwco.ret == NOT_DONE) {
2515            aio_poll(aio_context, true);
2516        }
2517    }
2518
2519    return rwco.ret;
2520}
2521
2522static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
2523{
2524    BlockDriver *drv = bs->drv;
2525    BdrvTrackedRequest tracked_req;
2526    CoroutineIOCompletion co = {
2527        .coroutine = qemu_coroutine_self(),
2528    };
2529    BlockAIOCB *acb;
2530
2531    tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
2532    if (!drv || !drv->bdrv_aio_ioctl) {
2533        co.ret = -ENOTSUP;
2534        goto out;
2535    }
2536
2537    acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
2538    if (!acb) {
2539        co.ret = -ENOTSUP;
2540        goto out;
2541    }
2542    qemu_coroutine_yield();
2543out:
2544    tracked_request_end(&tracked_req);
2545    return co.ret;
2546}
2547
2548typedef struct {
2549    BlockDriverState *bs;
2550    int req;
2551    void *buf;
2552    int ret;
2553} BdrvIoctlCoData;
2554
2555static void coroutine_fn bdrv_co_ioctl_entry(void *opaque)
2556{
2557    BdrvIoctlCoData *data = opaque;
2558    data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf);
2559}
2560
2561/* Needed for the generic SCSI interface */
2562int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
2563{
2564    BdrvIoctlCoData data = {
2565        .bs = bs,
2566        .req = req,
2567        .buf = buf,
2568        .ret = -EINPROGRESS,
2569    };
2570
2571    if (qemu_in_coroutine()) {
2572        /* Fast-path if already in coroutine context */
2573        bdrv_co_ioctl_entry(&data);
2574    } else {
2575        Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry, &data);
2576
2577        qemu_coroutine_enter(co);
2578        while (data.ret == -EINPROGRESS) {
2579            aio_poll(bdrv_get_aio_context(bs), true);
2580        }
2581    }
2582    return data.ret;
2583}
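
/*
 * Illustrative sketch (not part of the original file): issuing a passthrough
 * ioctl via bdrv_ioctl().  SG_GET_VERSION_NUM comes from Linux's <scsi/sg.h>
 * (an assumption here); -ENOTSUP is returned when the driver has no
 * bdrv_aio_ioctl implementation.
 */
#if 0
static int example_sg_version(BlockDriverState *bs)
{
    int version = 0;
    int ret = bdrv_ioctl(bs, SG_GET_VERSION_NUM, &version);

    return ret < 0 ? ret : version;
}
#endif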
2584
2585static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque)
2586{
2587    BlockAIOCBCoroutine *acb = opaque;
2588    acb->req.error = bdrv_co_do_ioctl(acb->common.bs,
2589                                      acb->req.req, acb->req.buf);
2590    bdrv_co_complete(acb);
2591}
2592
2593BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
2594        unsigned long int req, void *buf,
2595        BlockCompletionFunc *cb, void *opaque)
2596{
2597    BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info,
2598                                            bs, cb, opaque);
2599    Coroutine *co;
2600
2601    acb->need_bh = true;
2602    acb->req.error = -EINPROGRESS;
2603    acb->req.req = req;
2604    acb->req.buf = buf;
2605    co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry, acb);
2606    qemu_coroutine_enter(co);
2607
2608    bdrv_co_maybe_schedule_bh(acb);
2609    return &acb->common;
2610}
2611
2612void *qemu_blockalign(BlockDriverState *bs, size_t size)
2613{
2614    return qemu_memalign(bdrv_opt_mem_align(bs), size);
2615}
2616
2617void *qemu_blockalign0(BlockDriverState *bs, size_t size)
2618{
2619    return memset(qemu_blockalign(bs, size), 0, size);
2620}
2621
2622void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
2623{
2624    size_t align = bdrv_opt_mem_align(bs);
2625
2626    /* Ensure that NULL is never returned on success */
2627    assert(align > 0);
2628    if (size == 0) {
2629        size = align;
2630    }
2631
2632    return qemu_try_memalign(align, size);
2633}
2634
2635void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
2636{
2637    void *mem = qemu_try_blockalign(bs, size);
2638
2639    if (mem) {
2640        memset(mem, 0, size);
2641    }
2642
2643    return mem;
2644}
2645
2646/*
2647 * Check if all memory in this vector meets the minimum memory alignment of bs.
2648 */
2649bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2650{
2651    int i;
2652    size_t alignment = bdrv_min_mem_align(bs);
2653
2654    for (i = 0; i < qiov->niov; i++) {
2655        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2656            return false;
2657        }
2658        if (qiov->iov[i].iov_len % alignment) {
2659            return false;
2660        }
2661    }
2662
2663    return true;
2664}
2665
2666void bdrv_add_before_write_notifier(BlockDriverState *bs,
2667                                    NotifierWithReturn *notifier)
2668{
2669    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
2670}
2671
2672void bdrv_io_plug(BlockDriverState *bs)
2673{
2674    BdrvChild *child;
2675
2676    QLIST_FOREACH(child, &bs->children, next) {
2677        bdrv_io_plug(child->bs);
2678    }
2679
2680    if (bs->io_plugged++ == 0 && bs->io_plug_disabled == 0) {
2681        BlockDriver *drv = bs->drv;
2682        if (drv && drv->bdrv_io_plug) {
2683            drv->bdrv_io_plug(bs);
2684        }
2685    }
2686}
2687
2688void bdrv_io_unplug(BlockDriverState *bs)
2689{
2690    BdrvChild *child;
2691
2692    assert(bs->io_plugged);
2693    if (--bs->io_plugged == 0 && bs->io_plug_disabled == 0) {
2694        BlockDriver *drv = bs->drv;
2695        if (drv && drv->bdrv_io_unplug) {
2696            drv->bdrv_io_unplug(bs);
2697        }
2698    }
2699
2700    QLIST_FOREACH(child, &bs->children, next) {
2701        bdrv_io_unplug(child->bs);
2702    }
2703}
2704
2705void bdrv_io_unplugged_begin(BlockDriverState *bs)
2706{
2707    BdrvChild *child;
2708
2709    if (bs->io_plug_disabled++ == 0 && bs->io_plugged > 0) {
2710        BlockDriver *drv = bs->drv;
2711        if (drv && drv->bdrv_io_unplug) {
2712            drv->bdrv_io_unplug(bs);
2713        }
2714    }
2715
2716    QLIST_FOREACH(child, &bs->children, next) {
2717        bdrv_io_unplugged_begin(child->bs);
2718    }
2719}
2720
2721void bdrv_io_unplugged_end(BlockDriverState *bs)
2722{
2723    BdrvChild *child;
2724
2725    assert(bs->io_plug_disabled);
2726    QLIST_FOREACH(child, &bs->children, next) {
2727        bdrv_io_unplugged_end(child->bs);
2728    }
2729
2730    if (--bs->io_plug_disabled == 0 && bs->io_plugged > 0) {
2731        BlockDriver *drv = bs->drv;
2732        if (drv && drv->bdrv_io_plug) {
2733            drv->bdrv_io_plug(bs);
2734        }
2735    }
2736}
2737