qemu/block/io.c
   1/*
   2 * Block layer I/O functions
   3 *
   4 * Copyright (c) 2003 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24
  25#include "qemu/osdep.h"
  26#include "trace.h"
  27#include "sysemu/block-backend.h"
  28#include "block/blockjob.h"
  29#include "block/blockjob_int.h"
  30#include "block/block_int.h"
  31#include "qemu/cutils.h"
  32#include "qapi/error.h"
  33#include "qemu/error-report.h"
  34
  35#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
  36
  37static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
  38    int64_t offset, int bytes, BdrvRequestFlags flags);
  39
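/* Notify all parents of @bs that draining of the node begins, using the
 * optional BdrvChildRole .drained_begin callback (bdrv_parent_drained_end()
 * below is the matching counterpart). */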
  40void bdrv_parent_drained_begin(BlockDriverState *bs)
  41{
  42    BdrvChild *c;
  43
  44    QLIST_FOREACH(c, &bs->parents, next_parent) {
  45        if (c->role->drained_begin) {
  46            c->role->drained_begin(c);
  47        }
  48    }
  49}
  50
  51void bdrv_parent_drained_end(BlockDriverState *bs)
  52{
  53    BdrvChild *c;
  54
  55    QLIST_FOREACH(c, &bs->parents, next_parent) {
  56        if (c->role->drained_end) {
  57            c->role->drained_end(c);
  58        }
  59    }
  60}
  61
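/* Fold a child's limits into @dst: caps such as max_transfer and max_iov take
 * the smaller non-zero value, while alignment and optimal transfer
 * requirements take the larger one. */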
  62static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
  63{
  64    dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
  65    dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
  66    dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
  67                                 src->opt_mem_alignment);
  68    dst->min_mem_alignment = MAX(dst->min_mem_alignment,
  69                                 src->min_mem_alignment);
  70    dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
  71}
  72
  73void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
  74{
  75    BlockDriver *drv = bs->drv;
  76    Error *local_err = NULL;
  77
  78    memset(&bs->bl, 0, sizeof(bs->bl));
  79
  80    if (!drv) {
  81        return;
  82    }
  83
  84    /* Default alignment based on whether driver has byte interface */
  85    bs->bl.request_alignment = drv->bdrv_co_preadv ? 1 : 512;
  86
  87    /* Take some limits from the children as a default */
  88    if (bs->file) {
  89        bdrv_refresh_limits(bs->file->bs, &local_err);
  90        if (local_err) {
  91            error_propagate(errp, local_err);
  92            return;
  93        }
  94        bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
  95    } else {
  96        bs->bl.min_mem_alignment = 512;
  97        bs->bl.opt_mem_alignment = getpagesize();
  98
  99        /* Safe default since most protocols use readv()/writev()/etc */
 100        bs->bl.max_iov = IOV_MAX;
 101    }
 102
 103    if (bs->backing) {
 104        bdrv_refresh_limits(bs->backing->bs, &local_err);
 105        if (local_err) {
 106            error_propagate(errp, local_err);
 107            return;
 108        }
 109        bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
 110    }
 111
 112    /* Then let the driver override it */
 113    if (drv->bdrv_refresh_limits) {
 114        drv->bdrv_refresh_limits(bs, errp);
 115    }
 116}
 117
 118/**
 119 * The copy-on-read flag is actually a reference count so multiple users may
 120 * use the feature without worrying about clobbering its previous state.
  121 * Copy-on-read stays enabled until every user has disabled it again.
 122 */
 123void bdrv_enable_copy_on_read(BlockDriverState *bs)
 124{
 125    atomic_inc(&bs->copy_on_read);
 126}
 127
 128void bdrv_disable_copy_on_read(BlockDriverState *bs)
 129{
 130    int old = atomic_fetch_dec(&bs->copy_on_read);
 131    assert(old >= 1);
 132}
 133
  134/* Check if any requests are in-flight in @bs or any of its children */
 135bool bdrv_requests_pending(BlockDriverState *bs)
 136{
 137    BdrvChild *child;
 138
 139    if (atomic_read(&bs->in_flight)) {
 140        return true;
 141    }
 142
 143    QLIST_FOREACH(child, &bs->children, next) {
 144        if (bdrv_requests_pending(child->bs)) {
 145            return true;
 146        }
 147    }
 148
 149    return false;
 150}
 151
 152typedef struct {
 153    Coroutine *co;
 154    BlockDriverState *bs;
 155    bool done;
 156} BdrvCoDrainData;
 157
 158static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
 159{
 160    BdrvCoDrainData *data = opaque;
 161    BlockDriverState *bs = data->bs;
 162
 163    bs->drv->bdrv_co_drain(bs);
 164
 165    /* Set data->done before reading bs->wakeup.  */
 166    atomic_mb_set(&data->done, true);
 167    bdrv_wakeup(bs);
 168}
 169
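/* Run the driver's .bdrv_co_drain callback, if it has one, in a coroutine and
 * poll until it has completed. */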
 170static void bdrv_drain_invoke(BlockDriverState *bs)
 171{
 172    BdrvCoDrainData data = { .bs = bs, .done = false };
 173
 174    if (!bs->drv || !bs->drv->bdrv_co_drain) {
 175        return;
 176    }
 177
 178    data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data);
 179    bdrv_coroutine_enter(bs, data.co);
 180    BDRV_POLL_WHILE(bs, !data.done);
 181}
 182
 183static bool bdrv_drain_recurse(BlockDriverState *bs)
 184{
 185    BdrvChild *child, *tmp;
 186    bool waited;
 187
 188    waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
 189
 190    /* Ensure any pending metadata writes are submitted to bs->file.  */
 191    bdrv_drain_invoke(bs);
 192
 193    QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
 194        BlockDriverState *bs = child->bs;
 195        bool in_main_loop =
 196            qemu_get_current_aio_context() == qemu_get_aio_context();
 197        assert(bs->refcnt > 0);
 198        if (in_main_loop) {
 199            /* In case the recursive bdrv_drain_recurse processes a
 200             * block_job_defer_to_main_loop BH and modifies the graph,
 201             * let's hold a reference to bs until we are done.
 202             *
 203             * IOThread doesn't have such a BH, and it is not safe to call
 204             * bdrv_unref without BQL, so skip doing it there.
 205             */
 206            bdrv_ref(bs);
 207        }
 208        waited |= bdrv_drain_recurse(bs);
 209        if (in_main_loop) {
 210            bdrv_unref(bs);
 211        }
 212    }
 213
 214    return waited;
 215}
 216
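/* Bottom half scheduled by bdrv_co_yield_to_drain(): perform the
 * drained_begin outside of coroutine context, then wake the waiting
 * coroutine. */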
 217static void bdrv_co_drain_bh_cb(void *opaque)
 218{
 219    BdrvCoDrainData *data = opaque;
 220    Coroutine *co = data->co;
 221    BlockDriverState *bs = data->bs;
 222
 223    bdrv_dec_in_flight(bs);
 224    bdrv_drained_begin(bs);
 225    data->done = true;
 226    aio_co_wake(co);
 227}
 228
 229static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs)
 230{
 231    BdrvCoDrainData data;
 232
 233    /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
 234     * other coroutines run if they were queued from
 235     * qemu_co_queue_run_restart(). */
 236
 237    assert(qemu_in_coroutine());
 238    data = (BdrvCoDrainData) {
 239        .co = qemu_coroutine_self(),
 240        .bs = bs,
 241        .done = false,
 242    };
 243    bdrv_inc_in_flight(bs);
 244    aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
 245                            bdrv_co_drain_bh_cb, &data);
 246
 247    qemu_coroutine_yield();
 248    /* If we are resumed from some other event (such as an aio completion or a
 249     * timer callback), it is a bug in the caller that should be fixed. */
 250    assert(data.done);
 251}
 252
 253void bdrv_drained_begin(BlockDriverState *bs)
 254{
 255    if (qemu_in_coroutine()) {
 256        bdrv_co_yield_to_drain(bs);
 257        return;
 258    }
 259
 260    if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
 261        aio_disable_external(bdrv_get_aio_context(bs));
 262        bdrv_parent_drained_begin(bs);
 263    }
 264
 265    bdrv_drain_recurse(bs);
 266}
 267
 268void bdrv_drained_end(BlockDriverState *bs)
 269{
 270    assert(bs->quiesce_counter > 0);
 271    if (atomic_fetch_dec(&bs->quiesce_counter) > 1) {
 272        return;
 273    }
 274
 275    bdrv_parent_drained_end(bs);
 276    aio_enable_external(bdrv_get_aio_context(bs));
 277}
 278
 279/*
 280 * Wait for pending requests to complete on a single BlockDriverState subtree,
  281 * and suspend the block driver's internal I/O until the next request arrives.
  282 *
  283 * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState's
  284 * AioContext.
  285 *
  286 * Only this BlockDriverState's AioContext is run, so in-flight requests must
  287 * not depend on events in other AioContexts.  If they do, use
  288 * bdrv_drain_all() instead.
 289 */
 290void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
 291{
 292    assert(qemu_in_coroutine());
 293    bdrv_drained_begin(bs);
 294    bdrv_drained_end(bs);
 295}
 296
 297void bdrv_drain(BlockDriverState *bs)
 298{
 299    bdrv_drained_begin(bs);
 300    bdrv_drained_end(bs);
 301}
 302
 303/*
 304 * Wait for pending requests to complete across all BlockDriverStates
 305 *
 306 * This function does not flush data to disk, use bdrv_flush_all() for that
 307 * after calling this function.
 308 *
 309 * This pauses all block jobs and disables external clients. It must
 310 * be paired with bdrv_drain_all_end().
 311 *
 312 * NOTE: no new block jobs or BlockDriverStates can be created between
 313 * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 314 */
 315void bdrv_drain_all_begin(void)
 316{
 317    /* Always run first iteration so any pending completion BHs run */
 318    bool waited = true;
 319    BlockDriverState *bs;
 320    BdrvNextIterator it;
 321    GSList *aio_ctxs = NULL, *ctx;
 322
 323    block_job_pause_all();
 324
 325    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 326        AioContext *aio_context = bdrv_get_aio_context(bs);
 327
 328        aio_context_acquire(aio_context);
 329        bdrv_parent_drained_begin(bs);
 330        aio_disable_external(aio_context);
 331        aio_context_release(aio_context);
 332
 333        if (!g_slist_find(aio_ctxs, aio_context)) {
 334            aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
 335        }
 336    }
 337
 338    /* Note that completion of an asynchronous I/O operation can trigger any
 339     * number of other I/O operations on other devices---for example a
 340     * coroutine can submit an I/O request to another device in response to
  341     * request completion.  Therefore we must keep looping until there is no
 342     * more activity rather than simply draining each device independently.
 343     */
 344    while (waited) {
 345        waited = false;
 346
 347        for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
 348            AioContext *aio_context = ctx->data;
 349
 350            aio_context_acquire(aio_context);
 351            for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 352                if (aio_context == bdrv_get_aio_context(bs)) {
 353                    waited |= bdrv_drain_recurse(bs);
 354                }
 355            }
 356            aio_context_release(aio_context);
 357        }
 358    }
 359
 360    g_slist_free(aio_ctxs);
 361}
 362
 363void bdrv_drain_all_end(void)
 364{
 365    BlockDriverState *bs;
 366    BdrvNextIterator it;
 367
 368    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 369        AioContext *aio_context = bdrv_get_aio_context(bs);
 370
 371        aio_context_acquire(aio_context);
 372        aio_enable_external(aio_context);
 373        bdrv_parent_drained_end(bs);
 374        aio_context_release(aio_context);
 375    }
 376
 377    block_job_resume_all();
 378}
 379
 380void bdrv_drain_all(void)
 381{
 382    bdrv_drain_all_begin();
 383    bdrv_drain_all_end();
 384}
 385
 386/**
 387 * Remove an active request from the tracked requests list
 388 *
 389 * This function should be called when a tracked request is completing.
 390 */
 391static void tracked_request_end(BdrvTrackedRequest *req)
 392{
 393    if (req->serialising) {
 394        atomic_dec(&req->bs->serialising_in_flight);
 395    }
 396
 397    qemu_co_mutex_lock(&req->bs->reqs_lock);
 398    QLIST_REMOVE(req, list);
 399    qemu_co_queue_restart_all(&req->wait_queue);
 400    qemu_co_mutex_unlock(&req->bs->reqs_lock);
 401}
 402
 403/**
 404 * Add an active request to the tracked requests list
 405 */
 406static void tracked_request_begin(BdrvTrackedRequest *req,
 407                                  BlockDriverState *bs,
 408                                  int64_t offset,
 409                                  unsigned int bytes,
 410                                  enum BdrvTrackedRequestType type)
 411{
 412    *req = (BdrvTrackedRequest){
 413        .bs = bs,
 414        .offset         = offset,
 415        .bytes          = bytes,
 416        .type           = type,
 417        .co             = qemu_coroutine_self(),
 418        .serialising    = false,
 419        .overlap_offset = offset,
 420        .overlap_bytes  = bytes,
 421    };
 422
 423    qemu_co_queue_init(&req->wait_queue);
 424
 425    qemu_co_mutex_lock(&bs->reqs_lock);
 426    QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
 427    qemu_co_mutex_unlock(&bs->reqs_lock);
 428}
 429
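/* Mark @req as serialising and widen its overlap window to @align boundaries
 * so that overlapping requests are forced to wait for it. */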
 430static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
 431{
 432    int64_t overlap_offset = req->offset & ~(align - 1);
 433    unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
 434                               - overlap_offset;
 435
 436    if (!req->serialising) {
 437        atomic_inc(&req->bs->serialising_in_flight);
 438        req->serialising = true;
 439    }
 440
 441    req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
 442    req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
 443}
 444
 445/**
 446 * Round a region to cluster boundaries
 447 */
 448void bdrv_round_to_clusters(BlockDriverState *bs,
 449                            int64_t offset, unsigned int bytes,
 450                            int64_t *cluster_offset,
 451                            unsigned int *cluster_bytes)
 452{
 453    BlockDriverInfo bdi;
 454
 455    if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
 456        *cluster_offset = offset;
 457        *cluster_bytes = bytes;
 458    } else {
 459        int64_t c = bdi.cluster_size;
 460        *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
 461        *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
 462    }
 463}
 464
 465static int bdrv_get_cluster_size(BlockDriverState *bs)
 466{
 467    BlockDriverInfo bdi;
 468    int ret;
 469
 470    ret = bdrv_get_info(bs, &bdi);
 471    if (ret < 0 || bdi.cluster_size == 0) {
 472        return bs->bl.request_alignment;
 473    } else {
 474        return bdi.cluster_size;
 475    }
 476}
 477
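/* Return true if the byte range [offset, offset + bytes) intersects the
 * overlap window of @req. */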
 478static bool tracked_request_overlaps(BdrvTrackedRequest *req,
 479                                     int64_t offset, unsigned int bytes)
 480{
 481    /*        aaaa   bbbb */
 482    if (offset >= req->overlap_offset + req->overlap_bytes) {
 483        return false;
 484    }
 485    /* bbbb   aaaa        */
 486    if (req->overlap_offset >= offset + bytes) {
 487        return false;
 488    }
 489    return true;
 490}
 491
 492void bdrv_inc_in_flight(BlockDriverState *bs)
 493{
 494    atomic_inc(&bs->in_flight);
 495}
 496
 497static void dummy_bh_cb(void *opaque)
 498{
 499}
 500
 501void bdrv_wakeup(BlockDriverState *bs)
 502{
 503    /* The barrier (or an atomic op) is in the caller.  */
 504    if (atomic_read(&bs->wakeup)) {
 505        aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
 506    }
 507}
 508
 509void bdrv_dec_in_flight(BlockDriverState *bs)
 510{
 511    atomic_dec(&bs->in_flight);
 512    bdrv_wakeup(bs);
 513}
 514
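/* Wait for all overlapping tracked requests where either side is marked
 * serialising.  Returns true if this request had to wait. */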
 515static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
 516{
 517    BlockDriverState *bs = self->bs;
 518    BdrvTrackedRequest *req;
 519    bool retry;
 520    bool waited = false;
 521
 522    if (!atomic_read(&bs->serialising_in_flight)) {
 523        return false;
 524    }
 525
 526    do {
 527        retry = false;
 528        qemu_co_mutex_lock(&bs->reqs_lock);
 529        QLIST_FOREACH(req, &bs->tracked_requests, list) {
 530            if (req == self || (!req->serialising && !self->serialising)) {
 531                continue;
 532            }
 533            if (tracked_request_overlaps(req, self->overlap_offset,
 534                                         self->overlap_bytes))
 535            {
 536                /* Hitting this means there was a reentrant request, for
 537                 * example, a block driver issuing nested requests.  This must
 538                 * never happen since it means deadlock.
 539                 */
 540                assert(qemu_coroutine_self() != req->co);
 541
 542                /* If the request is already (indirectly) waiting for us, or
 543                 * will wait for us as soon as it wakes up, then just go on
 544                 * (instead of producing a deadlock in the former case). */
 545                if (!req->waiting_for) {
 546                    self->waiting_for = req;
 547                    qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
 548                    self->waiting_for = NULL;
 549                    retry = true;
 550                    waited = true;
 551                    break;
 552                }
 553            }
 554        }
 555        qemu_co_mutex_unlock(&bs->reqs_lock);
 556    } while (retry);
 557
 558    return waited;
 559}
 560
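/* Basic sanity checks for a byte-based request: reject oversized requests,
 * negative offsets and requests against a drive with no medium inserted. */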
 561static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
 562                                   size_t size)
 563{
 564    if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
 565        return -EIO;
 566    }
 567
 568    if (!bdrv_is_inserted(bs)) {
 569        return -ENOMEDIUM;
 570    }
 571
 572    if (offset < 0) {
 573        return -EIO;
 574    }
 575
 576    return 0;
 577}
 578
 579typedef struct RwCo {
 580    BdrvChild *child;
 581    int64_t offset;
 582    QEMUIOVector *qiov;
 583    bool is_write;
 584    int ret;
 585    BdrvRequestFlags flags;
 586} RwCo;
 587
 588static void coroutine_fn bdrv_rw_co_entry(void *opaque)
 589{
 590    RwCo *rwco = opaque;
 591
 592    if (!rwco->is_write) {
 593        rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
 594                                   rwco->qiov->size, rwco->qiov,
 595                                   rwco->flags);
 596    } else {
 597        rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
 598                                    rwco->qiov->size, rwco->qiov,
 599                                    rwco->flags);
 600    }
 601}
 602
 603/*
 604 * Process a vectored synchronous request using coroutines
 605 */
 606static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
 607                        QEMUIOVector *qiov, bool is_write,
 608                        BdrvRequestFlags flags)
 609{
 610    Coroutine *co;
 611    RwCo rwco = {
 612        .child = child,
 613        .offset = offset,
 614        .qiov = qiov,
 615        .is_write = is_write,
 616        .ret = NOT_DONE,
 617        .flags = flags,
 618    };
 619
 620    if (qemu_in_coroutine()) {
 621        /* Fast-path if already in coroutine context */
 622        bdrv_rw_co_entry(&rwco);
 623    } else {
 624        co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
 625        bdrv_coroutine_enter(child->bs, co);
 626        BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
 627    }
 628    return rwco.ret;
 629}
 630
 631/*
 632 * Process a synchronous request using coroutines
 633 */
 634static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf,
 635                      int nb_sectors, bool is_write, BdrvRequestFlags flags)
 636{
 637    QEMUIOVector qiov;
 638    struct iovec iov = {
 639        .iov_base = (void *)buf,
 640        .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
 641    };
 642
 643    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
 644        return -EINVAL;
 645    }
 646
 647    qemu_iovec_init_external(&qiov, &iov, 1);
 648    return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS,
 649                        &qiov, is_write, flags);
 650}
 651
 652/* return < 0 if error. See bdrv_write() for the return codes */
 653int bdrv_read(BdrvChild *child, int64_t sector_num,
 654              uint8_t *buf, int nb_sectors)
 655{
 656    return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0);
 657}
 658
 659/* Return < 0 if error. Important errors are:
 660  -EIO         generic I/O error (may happen for all errors)
 661  -ENOMEDIUM   No media inserted.
 662  -EINVAL      Invalid sector number or nb_sectors
  663  -EACCES      Trying to write to a read-only device
 664*/
 665int bdrv_write(BdrvChild *child, int64_t sector_num,
 666               const uint8_t *buf, int nb_sectors)
 667{
 668    return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
 669}
 670
 671int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
 672                       int bytes, BdrvRequestFlags flags)
 673{
 674    QEMUIOVector qiov;
 675    struct iovec iov = {
 676        .iov_base = NULL,
 677        .iov_len = bytes,
 678    };
 679
 680    qemu_iovec_init_external(&qiov, &iov, 1);
 681    return bdrv_prwv_co(child, offset, &qiov, true,
 682                        BDRV_REQ_ZERO_WRITE | flags);
 683}
 684
 685/*
 686 * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 687 * The operation is sped up by checking the block status and only writing
 688 * zeroes to the device if they currently do not return zeroes. Optional
 689 * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 690 * BDRV_REQ_FUA).
 691 *
 692 * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 693 */
 694int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
 695{
 696    int64_t target_sectors, ret, nb_sectors, sector_num = 0;
 697    BlockDriverState *bs = child->bs;
 698    BlockDriverState *file;
 699    int n;
 700
 701    target_sectors = bdrv_nb_sectors(bs);
 702    if (target_sectors < 0) {
 703        return target_sectors;
 704    }
 705
 706    for (;;) {
 707        nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS);
 708        if (nb_sectors <= 0) {
 709            return 0;
 710        }
 711        ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file);
 712        if (ret < 0) {
 713            error_report("error getting block status at sector %" PRId64 ": %s",
 714                         sector_num, strerror(-ret));
 715            return ret;
 716        }
 717        if (ret & BDRV_BLOCK_ZERO) {
 718            sector_num += n;
 719            continue;
 720        }
 721        ret = bdrv_pwrite_zeroes(child, sector_num << BDRV_SECTOR_BITS,
 722                                 n << BDRV_SECTOR_BITS, flags);
 723        if (ret < 0) {
 724            error_report("error writing zeroes at sector %" PRId64 ": %s",
 725                         sector_num, strerror(-ret));
 726            return ret;
 727        }
 728        sector_num += n;
 729    }
 730}
 731
 732int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
 733{
 734    int ret;
 735
 736    ret = bdrv_prwv_co(child, offset, qiov, false, 0);
 737    if (ret < 0) {
 738        return ret;
 739    }
 740
 741    return qiov->size;
 742}
 743
 744int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
 745{
 746    QEMUIOVector qiov;
 747    struct iovec iov = {
 748        .iov_base = (void *)buf,
 749        .iov_len = bytes,
 750    };
 751
 752    if (bytes < 0) {
 753        return -EINVAL;
 754    }
 755
 756    qemu_iovec_init_external(&qiov, &iov, 1);
 757    return bdrv_preadv(child, offset, &qiov);
 758}
 759
 760int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
 761{
 762    int ret;
 763
 764    ret = bdrv_prwv_co(child, offset, qiov, true, 0);
 765    if (ret < 0) {
 766        return ret;
 767    }
 768
 769    return qiov->size;
 770}
 771
 772int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
 773{
 774    QEMUIOVector qiov;
 775    struct iovec iov = {
 776        .iov_base   = (void *) buf,
 777        .iov_len    = bytes,
 778    };
 779
 780    if (bytes < 0) {
 781        return -EINVAL;
 782    }
 783
 784    qemu_iovec_init_external(&qiov, &iov, 1);
 785    return bdrv_pwritev(child, offset, &qiov);
 786}
 787
 788/*
 789 * Writes to the file and ensures that no writes are reordered across this
 790 * request (acts as a barrier)
 791 *
 792 * Returns 0 on success, -errno in error cases.
 793 */
 794int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
 795                     const void *buf, int count)
 796{
 797    int ret;
 798
 799    ret = bdrv_pwrite(child, offset, buf, count);
 800    if (ret < 0) {
 801        return ret;
 802    }
 803
 804    ret = bdrv_flush(child->bs);
 805    if (ret < 0) {
 806        return ret;
 807    }
 808
 809    return 0;
 810}
 811
 812typedef struct CoroutineIOCompletion {
 813    Coroutine *coroutine;
 814    int ret;
 815} CoroutineIOCompletion;
 816
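/* Completion callback used when emulating coroutine I/O on top of the AIO
 * interface: record the return value and reenter the waiting coroutine. */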
 817static void bdrv_co_io_em_complete(void *opaque, int ret)
 818{
 819    CoroutineIOCompletion *co = opaque;
 820
 821    co->ret = ret;
 822    aio_co_wake(co->coroutine);
 823}
 824
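/* Issue a read to the driver, preferring the byte-based .bdrv_co_preadv
 * interface and falling back to the sector-based .bdrv_co_readv or
 * .bdrv_aio_readv interfaces. */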
 825static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
 826                                           uint64_t offset, uint64_t bytes,
 827                                           QEMUIOVector *qiov, int flags)
 828{
 829    BlockDriver *drv = bs->drv;
 830    int64_t sector_num;
 831    unsigned int nb_sectors;
 832
 833    assert(!(flags & ~BDRV_REQ_MASK));
 834
 835    if (drv->bdrv_co_preadv) {
 836        return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
 837    }
 838
 839    sector_num = offset >> BDRV_SECTOR_BITS;
 840    nb_sectors = bytes >> BDRV_SECTOR_BITS;
 841
 842    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
 843    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 844    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
 845
 846    if (drv->bdrv_co_readv) {
 847        return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
 848    } else {
 849        BlockAIOCB *acb;
 850        CoroutineIOCompletion co = {
 851            .coroutine = qemu_coroutine_self(),
 852        };
 853
 854        acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
 855                                      bdrv_co_io_em_complete, &co);
 856        if (acb == NULL) {
 857            return -EIO;
 858        } else {
 859            qemu_coroutine_yield();
 860            return co.ret;
 861        }
 862    }
 863}
 864
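/* Issue a write to the driver via the best available interface; flags the
 * driver does not support (in particular BDRV_REQ_FUA) are emulated here with
 * a flush after the write. */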
 865static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
 866                                            uint64_t offset, uint64_t bytes,
 867                                            QEMUIOVector *qiov, int flags)
 868{
 869    BlockDriver *drv = bs->drv;
 870    int64_t sector_num;
 871    unsigned int nb_sectors;
 872    int ret;
 873
 874    assert(!(flags & ~BDRV_REQ_MASK));
 875
 876    if (drv->bdrv_co_pwritev) {
 877        ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
 878                                   flags & bs->supported_write_flags);
 879        flags &= ~bs->supported_write_flags;
 880        goto emulate_flags;
 881    }
 882
 883    sector_num = offset >> BDRV_SECTOR_BITS;
 884    nb_sectors = bytes >> BDRV_SECTOR_BITS;
 885
 886    assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
 887    assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 888    assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
 889
 890    if (drv->bdrv_co_writev_flags) {
 891        ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
 892                                        flags & bs->supported_write_flags);
 893        flags &= ~bs->supported_write_flags;
 894    } else if (drv->bdrv_co_writev) {
 895        assert(!bs->supported_write_flags);
 896        ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
 897    } else {
 898        BlockAIOCB *acb;
 899        CoroutineIOCompletion co = {
 900            .coroutine = qemu_coroutine_self(),
 901        };
 902
 903        acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
 904                                       bdrv_co_io_em_complete, &co);
 905        if (acb == NULL) {
 906            ret = -EIO;
 907        } else {
 908            qemu_coroutine_yield();
 909            ret = co.ret;
 910        }
 911    }
 912
 913emulate_flags:
 914    if (ret == 0 && (flags & BDRV_REQ_FUA)) {
 915        ret = bdrv_co_flush(bs);
 916    }
 917
 918    return ret;
 919}
 920
 921static int coroutine_fn
 922bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
 923                               uint64_t bytes, QEMUIOVector *qiov)
 924{
 925    BlockDriver *drv = bs->drv;
 926
 927    if (!drv->bdrv_co_pwritev_compressed) {
 928        return -ENOTSUP;
 929    }
 930
 931    return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
 932}
 933
 934static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
 935        int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
 936{
 937    BlockDriverState *bs = child->bs;
 938
 939    /* Perform I/O through a temporary buffer so that users who scribble over
 940     * their read buffer while the operation is in progress do not end up
 941     * modifying the image file.  This is critical for zero-copy guest I/O
 942     * where anything might happen inside guest memory.
 943     */
 944    void *bounce_buffer;
 945
 946    BlockDriver *drv = bs->drv;
 947    struct iovec iov;
 948    QEMUIOVector bounce_qiov;
 949    int64_t cluster_offset;
 950    unsigned int cluster_bytes;
 951    size_t skip_bytes;
 952    int ret;
 953
 954    /* FIXME We cannot require callers to have write permissions when all they
 955     * are doing is a read request. If we did things right, write permissions
  956     * would be obtained anyway, but internally by the copy-on-read code. However,
  957     * as long as copy-on-read is implemented here rather than in a separate
  958     * filter driver, it doesn't have its own BdrvChild for which it could
  959     * request permissions. Therefore we have to bypass the permission
 960     * system for the moment. */
 961    // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
 962
  963    /* Cover the entire cluster so that no additional backing file I/O is
  964     * required when allocating the cluster in the image file.
 965     */
 966    bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
 967
 968    trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
 969                                   cluster_offset, cluster_bytes);
 970
 971    iov.iov_len = cluster_bytes;
 972    iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
 973    if (bounce_buffer == NULL) {
 974        ret = -ENOMEM;
 975        goto err;
 976    }
 977
 978    qemu_iovec_init_external(&bounce_qiov, &iov, 1);
 979
 980    ret = bdrv_driver_preadv(bs, cluster_offset, cluster_bytes,
 981                             &bounce_qiov, 0);
 982    if (ret < 0) {
 983        goto err;
 984    }
 985
 986    if (drv->bdrv_co_pwrite_zeroes &&
 987        buffer_is_zero(bounce_buffer, iov.iov_len)) {
 988        /* FIXME: Should we (perhaps conditionally) be setting
 989         * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
 990         * that still correctly reads as zero? */
 991        ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, cluster_bytes, 0);
 992    } else {
  993        /* This does not change the data on the disk, so it is not necessary
 994         * to flush even in cache=writethrough mode.
 995         */
 996        ret = bdrv_driver_pwritev(bs, cluster_offset, cluster_bytes,
 997                                  &bounce_qiov, 0);
 998    }
 999
1000    if (ret < 0) {
1001        /* It might be okay to ignore write errors for guest requests.  If this
1002         * is a deliberate copy-on-read then we don't want to ignore the error.
1003         * Simply report it in all cases.
1004         */
1005        goto err;
1006    }
1007
1008    skip_bytes = offset - cluster_offset;
1009    qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, bytes);
1010
1011err:
1012    qemu_vfree(bounce_buffer);
1013    return ret;
1014}
1015
1016/*
1017 * Forwards an already correctly aligned request to the BlockDriver. This
1018 * handles copy on read, zeroing after EOF, and fragmentation of large
1019 * reads; any other features must be implemented by the caller.
1020 */
1021static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
1022    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1023    int64_t align, QEMUIOVector *qiov, int flags)
1024{
1025    BlockDriverState *bs = child->bs;
1026    int64_t total_bytes, max_bytes;
1027    int ret = 0;
1028    uint64_t bytes_remaining = bytes;
1029    int max_transfer;
1030
1031    assert(is_power_of_2(align));
1032    assert((offset & (align - 1)) == 0);
1033    assert((bytes & (align - 1)) == 0);
1034    assert(!qiov || bytes == qiov->size);
1035    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1036    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1037                                   align);
1038
1039    /* TODO: We would need a per-BDS .supported_read_flags and
1040     * potential fallback support, if we ever implement any read flags
1041     * to pass through to drivers.  For now, there aren't any
1042     * passthrough flags.  */
1043    assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));
1044
1045    /* Handle Copy on Read and associated serialisation */
1046    if (flags & BDRV_REQ_COPY_ON_READ) {
1047        /* If we touch the same cluster it counts as an overlap.  This
1048         * guarantees that allocating writes will be serialized and not race
1049         * with each other for the same cluster.  For example, in copy-on-read
1050         * it ensures that the CoR read and write operations are atomic and
1051         * guest writes cannot interleave between them. */
1052        mark_request_serialising(req, bdrv_get_cluster_size(bs));
1053    }
1054
1055    if (!(flags & BDRV_REQ_NO_SERIALISING)) {
1056        wait_serialising_requests(req);
1057    }
1058
1059    if (flags & BDRV_REQ_COPY_ON_READ) {
1060        /* TODO: Simplify further once bdrv_is_allocated no longer
1061         * requires sector alignment */
1062        int64_t start = QEMU_ALIGN_DOWN(offset, BDRV_SECTOR_SIZE);
1063        int64_t end = QEMU_ALIGN_UP(offset + bytes, BDRV_SECTOR_SIZE);
1064        int64_t pnum;
1065
1066        ret = bdrv_is_allocated(bs, start, end - start, &pnum);
1067        if (ret < 0) {
1068            goto out;
1069        }
1070
1071        if (!ret || pnum != end - start) {
1072            ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
1073            goto out;
1074        }
1075    }
1076
1077    /* Forward the request to the BlockDriver, possibly fragmenting it */
1078    total_bytes = bdrv_getlength(bs);
1079    if (total_bytes < 0) {
1080        ret = total_bytes;
1081        goto out;
1082    }
1083
1084    max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
1085    if (bytes <= max_bytes && bytes <= max_transfer) {
1086        ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
1087        goto out;
1088    }
1089
1090    while (bytes_remaining) {
1091        int num;
1092
1093        if (max_bytes) {
1094            QEMUIOVector local_qiov;
1095
1096            num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
1097            assert(num);
1098            qemu_iovec_init(&local_qiov, qiov->niov);
1099            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
1100
1101            ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
1102                                     num, &local_qiov, 0);
1103            max_bytes -= num;
1104            qemu_iovec_destroy(&local_qiov);
1105        } else {
1106            num = bytes_remaining;
1107            ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
1108                                    bytes_remaining);
1109        }
1110        if (ret < 0) {
1111            goto out;
1112        }
1113        bytes_remaining -= num;
1114    }
1115
1116out:
1117    return ret < 0 ? ret : 0;
1118}
1119
1120/*
1121 * Handle a read request in coroutine context
1122 */
1123int coroutine_fn bdrv_co_preadv(BdrvChild *child,
1124    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1125    BdrvRequestFlags flags)
1126{
1127    BlockDriverState *bs = child->bs;
1128    BlockDriver *drv = bs->drv;
1129    BdrvTrackedRequest req;
1130
1131    uint64_t align = bs->bl.request_alignment;
1132    uint8_t *head_buf = NULL;
1133    uint8_t *tail_buf = NULL;
1134    QEMUIOVector local_qiov;
1135    bool use_local_qiov = false;
1136    int ret;
1137
1138    trace_bdrv_co_preadv(child->bs, offset, bytes, flags);
1139
1140    if (!drv) {
1141        return -ENOMEDIUM;
1142    }
1143
1144    ret = bdrv_check_byte_request(bs, offset, bytes);
1145    if (ret < 0) {
1146        return ret;
1147    }
1148
1149    bdrv_inc_in_flight(bs);
1150
1151    /* Don't do copy-on-read if we read data before write operation */
1152    if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
1153        flags |= BDRV_REQ_COPY_ON_READ;
1154    }
1155
1156    /* Align read if necessary by padding qiov */
1157    if (offset & (align - 1)) {
1158        head_buf = qemu_blockalign(bs, align);
1159        qemu_iovec_init(&local_qiov, qiov->niov + 2);
1160        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1161        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1162        use_local_qiov = true;
1163
1164        bytes += offset & (align - 1);
1165        offset = offset & ~(align - 1);
1166    }
1167
1168    if ((offset + bytes) & (align - 1)) {
1169        if (!use_local_qiov) {
1170            qemu_iovec_init(&local_qiov, qiov->niov + 1);
1171            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1172            use_local_qiov = true;
1173        }
1174        tail_buf = qemu_blockalign(bs, align);
1175        qemu_iovec_add(&local_qiov, tail_buf,
1176                       align - ((offset + bytes) & (align - 1)));
1177
1178        bytes = ROUND_UP(bytes, align);
1179    }
1180
1181    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
1182    ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
1183                              use_local_qiov ? &local_qiov : qiov,
1184                              flags);
1185    tracked_request_end(&req);
1186    bdrv_dec_in_flight(bs);
1187
1188    if (use_local_qiov) {
1189        qemu_iovec_destroy(&local_qiov);
1190        qemu_vfree(head_buf);
1191        qemu_vfree(tail_buf);
1192    }
1193
1194    return ret;
1195}
1196
1197static int coroutine_fn bdrv_co_do_readv(BdrvChild *child,
1198    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1199    BdrvRequestFlags flags)
1200{
1201    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1202        return -EINVAL;
1203    }
1204
1205    return bdrv_co_preadv(child, sector_num << BDRV_SECTOR_BITS,
1206                          nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1207}
1208
1209int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num,
1210                               int nb_sectors, QEMUIOVector *qiov)
1211{
1212    return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0);
1213}
1214
1215/* Maximum buffer for write zeroes fallback, in bytes */
1216#define MAX_WRITE_ZEROES_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)
1217
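/* Write zeroes to [offset, offset + bytes): unaligned head and tail pieces
 * are split off according to the driver's pwrite_zeroes alignment, the
 * efficient .bdrv_co_pwrite_zeroes callback is tried first, and a zeroed
 * bounce buffer is used as a fallback. */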
1218static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
1219    int64_t offset, int bytes, BdrvRequestFlags flags)
1220{
1221    BlockDriver *drv = bs->drv;
1222    QEMUIOVector qiov;
1223    struct iovec iov = {0};
1224    int ret = 0;
1225    bool need_flush = false;
1226    int head = 0;
1227    int tail = 0;
1228
1229    int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
1230    int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
1231                        bs->bl.request_alignment);
1232    int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
1233                                    MAX_WRITE_ZEROES_BOUNCE_BUFFER);
1234
1235    assert(alignment % bs->bl.request_alignment == 0);
1236    head = offset % alignment;
1237    tail = (offset + bytes) % alignment;
1238    max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
1239    assert(max_write_zeroes >= bs->bl.request_alignment);
1240
1241    while (bytes > 0 && !ret) {
1242        int num = bytes;
1243
1244        /* Align request.  Block drivers can expect the "bulk" of the request
1245         * to be aligned, and that unaligned requests do not cross cluster
1246         * boundaries.
1247         */
1248        if (head) {
1249            /* Make a small request up to the first aligned sector. For
1250             * convenience, limit this request to max_transfer even if
1251             * we don't need to fall back to writes.  */
1252            num = MIN(MIN(bytes, max_transfer), alignment - head);
1253            head = (head + num) % alignment;
1254            assert(num < max_write_zeroes);
1255        } else if (tail && num > alignment) {
1256            /* Shorten the request to the last aligned sector.  */
1257            num -= tail;
1258        }
1259
1260        /* limit request size */
1261        if (num > max_write_zeroes) {
1262            num = max_write_zeroes;
1263        }
1264
1265        ret = -ENOTSUP;
1266        /* First try the efficient write zeroes operation */
1267        if (drv->bdrv_co_pwrite_zeroes) {
1268            ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1269                                             flags & bs->supported_zero_flags);
1270            if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1271                !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1272                need_flush = true;
1273            }
1274        } else {
1275            assert(!bs->supported_zero_flags);
1276        }
1277
1278        if (ret == -ENOTSUP) {
1279            /* Fall back to bounce buffer if write zeroes is unsupported */
1280            BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1281
1282            if ((flags & BDRV_REQ_FUA) &&
1283                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
 1284                /* No need for bdrv_driver_pwritev() to do a fallback
1285                 * flush on each chunk; use just one at the end */
1286                write_flags &= ~BDRV_REQ_FUA;
1287                need_flush = true;
1288            }
1289            num = MIN(num, max_transfer);
1290            iov.iov_len = num;
1291            if (iov.iov_base == NULL) {
1292                iov.iov_base = qemu_try_blockalign(bs, num);
1293                if (iov.iov_base == NULL) {
1294                    ret = -ENOMEM;
1295                    goto fail;
1296                }
1297                memset(iov.iov_base, 0, num);
1298            }
1299            qemu_iovec_init_external(&qiov, &iov, 1);
1300
1301            ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);
1302
 1303            /* Keep the bounce buffer around if it is big enough for
 1304             * all future requests.
1305             */
1306            if (num < max_transfer) {
1307                qemu_vfree(iov.iov_base);
1308                iov.iov_base = NULL;
1309            }
1310        }
1311
1312        offset += num;
1313        bytes -= num;
1314    }
1315
1316fail:
1317    if (ret == 0 && need_flush) {
1318        ret = bdrv_co_flush(bs);
1319    }
1320    qemu_vfree(iov.iov_base);
1321    return ret;
1322}
1323
1324/*
1325 * Forwards an already correctly aligned write request to the BlockDriver,
1326 * after possibly fragmenting it.
1327 */
1328static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
1329    BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1330    int64_t align, QEMUIOVector *qiov, int flags)
1331{
1332    BlockDriverState *bs = child->bs;
1333    BlockDriver *drv = bs->drv;
1334    bool waited;
1335    int ret;
1336
1337    int64_t start_sector = offset >> BDRV_SECTOR_BITS;
1338    int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1339    uint64_t bytes_remaining = bytes;
1340    int max_transfer;
1341
1342    if (bdrv_has_readonly_bitmaps(bs)) {
1343        return -EPERM;
1344    }
1345
1346    assert(is_power_of_2(align));
1347    assert((offset & (align - 1)) == 0);
1348    assert((bytes & (align - 1)) == 0);
1349    assert(!qiov || bytes == qiov->size);
1350    assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1351    assert(!(flags & ~BDRV_REQ_MASK));
1352    max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1353                                   align);
1354
1355    waited = wait_serialising_requests(req);
1356    assert(!waited || !req->serialising);
1357    assert(req->overlap_offset <= offset);
1358    assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1359    assert(child->perm & BLK_PERM_WRITE);
1360    assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
1361
1362    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
1363
1364    if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1365        !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
1366        qemu_iovec_is_zero(qiov)) {
1367        flags |= BDRV_REQ_ZERO_WRITE;
1368        if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
1369            flags |= BDRV_REQ_MAY_UNMAP;
1370        }
1371    }
1372
1373    if (ret < 0) {
1374        /* Do nothing, write notifier decided to fail this request */
1375    } else if (flags & BDRV_REQ_ZERO_WRITE) {
1376        bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
1377        ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
1378    } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
1379        ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov);
1380    } else if (bytes <= max_transfer) {
1381        bdrv_debug_event(bs, BLKDBG_PWRITEV);
1382        ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
1383    } else {
1384        bdrv_debug_event(bs, BLKDBG_PWRITEV);
1385        while (bytes_remaining) {
1386            int num = MIN(bytes_remaining, max_transfer);
1387            QEMUIOVector local_qiov;
1388            int local_flags = flags;
1389
1390            assert(num);
1391            if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
1392                !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1393                /* If FUA is going to be emulated by flush, we only
1394                 * need to flush on the last iteration */
1395                local_flags &= ~BDRV_REQ_FUA;
1396            }
1397            qemu_iovec_init(&local_qiov, qiov->niov);
1398            qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
1399
1400            ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
1401                                      num, &local_qiov, local_flags);
1402            qemu_iovec_destroy(&local_qiov);
1403            if (ret < 0) {
1404                break;
1405            }
1406            bytes_remaining -= num;
1407        }
1408    }
1409    bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
1410
1411    atomic_inc(&bs->write_gen);
1412    bdrv_set_dirty(bs, start_sector, end_sector - start_sector);
1413
1414    stat64_max(&bs->wr_highest_offset, offset + bytes);
1415
1416    if (ret >= 0) {
1417        bs->total_sectors = MAX(bs->total_sectors, end_sector);
1418        ret = 0;
1419    }
1420
1421    return ret;
1422}
1423
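/* Zero out a possibly unaligned byte range: the unaligned head and tail are
 * handled with a read-modify-write cycle, the aligned middle with a plain
 * zero write. */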
1424static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
1425                                                int64_t offset,
1426                                                unsigned int bytes,
1427                                                BdrvRequestFlags flags,
1428                                                BdrvTrackedRequest *req)
1429{
1430    BlockDriverState *bs = child->bs;
1431    uint8_t *buf = NULL;
1432    QEMUIOVector local_qiov;
1433    struct iovec iov;
1434    uint64_t align = bs->bl.request_alignment;
1435    unsigned int head_padding_bytes, tail_padding_bytes;
1436    int ret = 0;
1437
1438    head_padding_bytes = offset & (align - 1);
1439    tail_padding_bytes = (align - (offset + bytes)) & (align - 1);
1440
1441
1442    assert(flags & BDRV_REQ_ZERO_WRITE);
1443    if (head_padding_bytes || tail_padding_bytes) {
1444        buf = qemu_blockalign(bs, align);
1445        iov = (struct iovec) {
1446            .iov_base   = buf,
1447            .iov_len    = align,
1448        };
1449        qemu_iovec_init_external(&local_qiov, &iov, 1);
1450    }
1451    if (head_padding_bytes) {
1452        uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
1453
1454        /* RMW the unaligned part before head. */
1455        mark_request_serialising(req, align);
1456        wait_serialising_requests(req);
1457        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1458        ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
1459                                  align, &local_qiov, 0);
1460        if (ret < 0) {
1461            goto fail;
1462        }
1463        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1464
1465        memset(buf + head_padding_bytes, 0, zero_bytes);
1466        ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
1467                                   align, &local_qiov,
1468                                   flags & ~BDRV_REQ_ZERO_WRITE);
1469        if (ret < 0) {
1470            goto fail;
1471        }
1472        offset += zero_bytes;
1473        bytes -= zero_bytes;
1474    }
1475
1476    assert(!bytes || (offset & (align - 1)) == 0);
1477    if (bytes >= align) {
1478        /* Write the aligned part in the middle. */
1479        uint64_t aligned_bytes = bytes & ~(align - 1);
1480        ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
1481                                   NULL, flags);
1482        if (ret < 0) {
1483            goto fail;
1484        }
1485        bytes -= aligned_bytes;
1486        offset += aligned_bytes;
1487    }
1488
1489    assert(!bytes || (offset & (align - 1)) == 0);
1490    if (bytes) {
1491        assert(align == tail_padding_bytes + bytes);
1492        /* RMW the unaligned part after tail. */
1493        mark_request_serialising(req, align);
1494        wait_serialising_requests(req);
1495        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1496        ret = bdrv_aligned_preadv(child, req, offset, align,
1497                                  align, &local_qiov, 0);
1498        if (ret < 0) {
1499            goto fail;
1500        }
1501        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1502
1503        memset(buf, 0, bytes);
1504        ret = bdrv_aligned_pwritev(child, req, offset, align, align,
1505                                   &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
1506    }
1507fail:
1508    qemu_vfree(buf);
1509    return ret;
1510
1511}
1512
1513/*
1514 * Handle a write request in coroutine context
1515 */
1516int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
1517    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1518    BdrvRequestFlags flags)
1519{
1520    BlockDriverState *bs = child->bs;
1521    BdrvTrackedRequest req;
1522    uint64_t align = bs->bl.request_alignment;
1523    uint8_t *head_buf = NULL;
1524    uint8_t *tail_buf = NULL;
1525    QEMUIOVector local_qiov;
1526    bool use_local_qiov = false;
1527    int ret;
1528
1529    trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);
1530
1531    if (!bs->drv) {
1532        return -ENOMEDIUM;
1533    }
1534    if (bs->read_only) {
1535        return -EPERM;
1536    }
1537    assert(!(bs->open_flags & BDRV_O_INACTIVE));
1538
1539    ret = bdrv_check_byte_request(bs, offset, bytes);
1540    if (ret < 0) {
1541        return ret;
1542    }
1543
1544    bdrv_inc_in_flight(bs);
1545    /*
1546     * Align write if necessary by performing a read-modify-write cycle.
1547     * Pad qiov with the read parts and be sure to have a tracked request not
1548     * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
1549     */
1550    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
1551
1552    if (!qiov) {
1553        ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
1554        goto out;
1555    }
1556
1557    if (offset & (align - 1)) {
1558        QEMUIOVector head_qiov;
1559        struct iovec head_iov;
1560
1561        mark_request_serialising(&req, align);
1562        wait_serialising_requests(&req);
1563
1564        head_buf = qemu_blockalign(bs, align);
1565        head_iov = (struct iovec) {
1566            .iov_base   = head_buf,
1567            .iov_len    = align,
1568        };
1569        qemu_iovec_init_external(&head_qiov, &head_iov, 1);
1570
1571        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1572        ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
1573                                  align, &head_qiov, 0);
1574        if (ret < 0) {
1575            goto fail;
1576        }
1577        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1578
1579        qemu_iovec_init(&local_qiov, qiov->niov + 2);
1580        qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1581        qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1582        use_local_qiov = true;
1583
1584        bytes += offset & (align - 1);
1585        offset = offset & ~(align - 1);
1586
1587        /* We have read the tail already if the request is smaller
1588         * than one aligned block.
1589         */
1590        if (bytes < align) {
1591            qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
1592            bytes = align;
1593        }
1594    }
1595
1596    if ((offset + bytes) & (align - 1)) {
1597        QEMUIOVector tail_qiov;
1598        struct iovec tail_iov;
1599        size_t tail_bytes;
1600        bool waited;
1601
1602        mark_request_serialising(&req, align);
1603        waited = wait_serialising_requests(&req);
1604        assert(!waited || !use_local_qiov);
1605
1606        tail_buf = qemu_blockalign(bs, align);
1607        tail_iov = (struct iovec) {
1608            .iov_base   = tail_buf,
1609            .iov_len    = align,
1610        };
1611        qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
1612
1613        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1614        ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
1615                                  align, align, &tail_qiov, 0);
1616        if (ret < 0) {
1617            goto fail;
1618        }
1619        bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1620
1621        if (!use_local_qiov) {
1622            qemu_iovec_init(&local_qiov, qiov->niov + 1);
1623            qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1624            use_local_qiov = true;
1625        }
1626
1627        tail_bytes = (offset + bytes) & (align - 1);
1628        qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
1629
1630        bytes = ROUND_UP(bytes, align);
1631    }
1632
1633    ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
1634                               use_local_qiov ? &local_qiov : qiov,
1635                               flags);
1636
1637fail:
1638
1639    if (use_local_qiov) {
1640        qemu_iovec_destroy(&local_qiov);
1641    }
1642    qemu_vfree(head_buf);
1643    qemu_vfree(tail_buf);
1644out:
1645    tracked_request_end(&req);
1646    bdrv_dec_in_flight(bs);
1647    return ret;
1648}
1649
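    /*
     * Sector-based write wrapper: converts sector_num/nb_sectors into a byte
     * request and rejects negative or oversized (> BDRV_REQUEST_MAX_SECTORS)
     * requests with -EINVAL.
     */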
1650static int coroutine_fn bdrv_co_do_writev(BdrvChild *child,
1651    int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1652    BdrvRequestFlags flags)
1653{
1654    if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1655        return -EINVAL;
1656    }
1657
1658    return bdrv_co_pwritev(child, sector_num << BDRV_SECTOR_BITS,
1659                           nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1660}
1661
1662int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num,
1663    int nb_sectors, QEMUIOVector *qiov)
1664{
1665    return bdrv_co_do_writev(child, sector_num, nb_sectors, qiov, 0);
1666}
1667
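    /*
     * Write zeroes to [offset, offset + bytes).  If the image was not opened
     * with BDRV_O_UNMAP, BDRV_REQ_MAY_UNMAP is stripped so that the range is
     * zeroed explicitly rather than by deallocating it.
     */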
1668int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
1669                                       int bytes, BdrvRequestFlags flags)
1670{
1671    trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
1672
1673    if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
1674        flags &= ~BDRV_REQ_MAY_UNMAP;
1675    }
1676
1677    return bdrv_co_pwritev(child, offset, bytes, NULL,
1678                           BDRV_REQ_ZERO_WRITE | flags);
1679}
1680
1681/*
1682 * Flush ALL BDSes, whether they are reachable via a BlockBackend or not.
1683 */
1684int bdrv_flush_all(void)
1685{
1686    BdrvNextIterator it;
1687    BlockDriverState *bs = NULL;
1688    int result = 0;
1689
1690    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
1691        AioContext *aio_context = bdrv_get_aio_context(bs);
1692        int ret;
1693
1694        aio_context_acquire(aio_context);
1695        ret = bdrv_flush(bs);
1696        if (ret < 0 && !result) {
1697            result = ret;
1698        }
1699        aio_context_release(aio_context);
1700    }
1701
1702    return result;
1703}
1704
1705
1706typedef struct BdrvCoGetBlockStatusData {
1707    BlockDriverState *bs;
1708    BlockDriverState *base;
1709    BlockDriverState **file;
1710    int64_t sector_num;
1711    int nb_sectors;
1712    int *pnum;
1713    int64_t ret;
1714    bool done;
1715} BdrvCoGetBlockStatusData;
1716
1717/*
1718 * Returns the allocation status of the specified sectors.
1719 * Drivers not implementing the functionality are assumed to not support
1720 * backing files, hence all their sectors are reported as allocated.
1721 *
1722 * If 'sector_num' is beyond the end of the disk image the return value is
1723 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
1724 *
1725 * 'pnum' is set to the number of sectors (including and immediately following
1726 * the specified sector) that are known to be in the same
1727 * allocated/unallocated state.
1728 *
1729 * 'nb_sectors' is the max value 'pnum' should be set to.  If nb_sectors goes
1730 * beyond the end of the disk image it will be clamped; if 'pnum' is set to
1731 * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
1732 *
1733 * If the returned value is positive and the BDRV_BLOCK_OFFSET_VALID bit is set,
1734 * 'file' points to the BDS in which the sector range is allocated.
1735 */
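    /*
     * For example (illustrative, not exhaustive): a cluster stored in this
     * layer is typically reported as BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED,
     * possibly with BDRV_BLOCK_OFFSET_VALID and the host offset in the upper
     * bits, while a hole that merely reads as zeroes is reported as
     * BDRV_BLOCK_ZERO without BDRV_BLOCK_ALLOCATED.
     */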
1736static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs,
1737                                                     int64_t sector_num,
1738                                                     int nb_sectors, int *pnum,
1739                                                     BlockDriverState **file)
1740{
1741    int64_t total_sectors;
1742    int64_t n;
1743    int64_t ret, ret2;
1744
1745    *file = NULL;
1746    total_sectors = bdrv_nb_sectors(bs);
1747    if (total_sectors < 0) {
1748        return total_sectors;
1749    }
1750
1751    if (sector_num >= total_sectors) {
1752        *pnum = 0;
1753        return BDRV_BLOCK_EOF;
1754    }
1755
1756    n = total_sectors - sector_num;
1757    if (n < nb_sectors) {
1758        nb_sectors = n;
1759    }
1760
1761    if (!bs->drv->bdrv_co_get_block_status) {
1762        *pnum = nb_sectors;
1763        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
1764        if (sector_num + nb_sectors == total_sectors) {
1765            ret |= BDRV_BLOCK_EOF;
1766        }
1767        if (bs->drv->protocol_name) {
1768            ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE);
1769            *file = bs;
1770        }
1771        return ret;
1772    }
1773
1774    bdrv_inc_in_flight(bs);
1775    ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum,
1776                                            file);
1777    if (ret < 0) {
1778        *pnum = 0;
1779        goto out;
1780    }
1781
1782    if (ret & BDRV_BLOCK_RAW) {
1783        assert(ret & BDRV_BLOCK_OFFSET_VALID && *file);
1784        ret = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
1785                                       *pnum, pnum, file);
1786        goto out;
1787    }
1788
1789    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
1790        ret |= BDRV_BLOCK_ALLOCATED;
1791    } else {
1792        if (bdrv_unallocated_blocks_are_zero(bs)) {
1793            ret |= BDRV_BLOCK_ZERO;
1794        } else if (bs->backing) {
1795            BlockDriverState *bs2 = bs->backing->bs;
1796            int64_t nb_sectors2 = bdrv_nb_sectors(bs2);
1797            if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) {
1798                ret |= BDRV_BLOCK_ZERO;
1799            }
1800        }
1801    }
1802
1803    if (*file && *file != bs &&
1804        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
1805        (ret & BDRV_BLOCK_OFFSET_VALID)) {
1806        BlockDriverState *file2;
1807        int file_pnum;
1808
1809        ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS,
1810                                        *pnum, &file_pnum, &file2);
1811        if (ret2 >= 0) {
1812            /* Ignore errors.  This just provides extra information; it
1813             * is useful but not necessary.
1814             */
1815            if (ret2 & BDRV_BLOCK_EOF &&
1816                (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
1817                /*
1818                 * It is valid for the format block driver to read
1819                 * beyond the end of the underlying file's current
1820                 * size; such areas read as zero.
1821                 */
1822                ret |= BDRV_BLOCK_ZERO;
1823            } else {
1824                /* Limit request to the range reported by the protocol driver */
1825                *pnum = file_pnum;
1826                ret |= (ret2 & BDRV_BLOCK_ZERO);
1827            }
1828        }
1829    }
1830
1831out:
1832    bdrv_dec_in_flight(bs);
1833    if (ret >= 0 && sector_num + *pnum == total_sectors) {
1834        ret |= BDRV_BLOCK_EOF;
1835    }
1836    return ret;
1837}
1838
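    /*
     * Like bdrv_co_get_block_status(), but walk the backing chain from 'bs'
     * down to (but not including) 'base', stopping at the first layer that
     * reports data or zeroes for the start of the range.
     */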
1839static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs,
1840        BlockDriverState *base,
1841        int64_t sector_num,
1842        int nb_sectors,
1843        int *pnum,
1844        BlockDriverState **file)
1845{
1846    BlockDriverState *p;
1847    int64_t ret = 0;
1848    bool first = true;
1849
1850    assert(bs != base);
1851    for (p = bs; p != base; p = backing_bs(p)) {
1852        ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file);
1853        if (ret < 0) {
1854            break;
1855        }
1856        if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
1857            /*
1858             * Reading beyond the end of the file continues to read
1859             * zeroes, but we can only widen the result to the
1860             * unallocated length we learned from an earlier
1861             * iteration.
1862             */
1863            *pnum = nb_sectors;
1864        }
1865        if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
1866            break;
1867        }
1868        /* [sector_num, sector_num + *pnum) is unallocated on this layer, which
1869         * could be only the first part of [sector_num, sector_num + nb_sectors).  */
1870        nb_sectors = MIN(nb_sectors, *pnum);
1871        first = false;
1872    }
1873    return ret;
1874}
1875
1876/* Coroutine wrapper for bdrv_get_block_status_above() */
1877static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque)
1878{
1879    BdrvCoGetBlockStatusData *data = opaque;
1880
1881    data->ret = bdrv_co_get_block_status_above(data->bs, data->base,
1882                                               data->sector_num,
1883                                               data->nb_sectors,
1884                                               data->pnum,
1885                                               data->file);
1886    data->done = true;
1887}
1888
1889/*
1890 * Synchronous wrapper around bdrv_co_get_block_status_above().
1891 *
1892 * See bdrv_co_get_block_status_above() for details.
1893 */
1894int64_t bdrv_get_block_status_above(BlockDriverState *bs,
1895                                    BlockDriverState *base,
1896                                    int64_t sector_num,
1897                                    int nb_sectors, int *pnum,
1898                                    BlockDriverState **file)
1899{
1900    Coroutine *co;
1901    BdrvCoGetBlockStatusData data = {
1902        .bs = bs,
1903        .base = base,
1904        .file = file,
1905        .sector_num = sector_num,
1906        .nb_sectors = nb_sectors,
1907        .pnum = pnum,
1908        .done = false,
1909    };
1910
1911    if (qemu_in_coroutine()) {
1912        /* Fast-path if already in coroutine context */
1913        bdrv_get_block_status_above_co_entry(&data);
1914    } else {
1915        co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry,
1916                                   &data);
1917        bdrv_coroutine_enter(bs, co);
1918        BDRV_POLL_WHILE(bs, !data.done);
1919    }
1920    return data.ret;
1921}
1922
1923int64_t bdrv_get_block_status(BlockDriverState *bs,
1924                              int64_t sector_num,
1925                              int nb_sectors, int *pnum,
1926                              BlockDriverState **file)
1927{
1928    return bdrv_get_block_status_above(bs, backing_bs(bs),
1929                                       sector_num, nb_sectors, pnum, file);
1930}
1931
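    /*
     * Byte-based convenience wrapper: returns 1 if the start of the range is
     * allocated in this layer (backing files are not considered), 0 if it is
     * not, or a negative errno on failure.  '*pnum' is set as described for
     * bdrv_get_block_status(), converted to bytes.  Note that offset and bytes
     * currently still have to be sector-aligned (see the asserts below).
     */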
1932int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
1933                                   int64_t bytes, int64_t *pnum)
1934{
1935    BlockDriverState *file;
1936    int64_t sector_num = offset >> BDRV_SECTOR_BITS;
1937    int nb_sectors = bytes >> BDRV_SECTOR_BITS;
1938    int64_t ret;
1939    int psectors;
1940
1941    assert(QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE));
1942    assert(QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE) && bytes < INT_MAX);
1943    ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &psectors,
1944                                &file);
1945    if (ret < 0) {
1946        return ret;
1947    }
1948    if (pnum) {
1949        *pnum = psectors * BDRV_SECTOR_SIZE;
1950    }
1951    return !!(ret & BDRV_BLOCK_ALLOCATED);
1952}
1953
1954/*
1955 * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
1956 *
1957 * Return 1 if (a prefix of) the given range is allocated in any image
1958 * between BASE (exclusive) and TOP (inclusive).  BASE can be NULL to check
1959 * whether the given offset is allocated in any image of the chain.  Return 0
1960 * otherwise, or a negative errno on failure.
1961 *
1962 * 'pnum' is set to the number of bytes (including and immediately
1963 * following the specified offset) that are known to be in the same
1964 * allocated/unallocated state.  Note that a subsequent call starting
1965 * at 'offset + *pnum' may return the same allocation status (in other
1966 * words, the result is not necessarily the maximum possible range);
1967 * but 'pnum' will only be 0 when end of file is reached.
1968 *
1969 */
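    /*
     * A typical caller walks a region in chunks and acts only on the allocated
     * parts, roughly like this (illustrative sketch only):
     *
     *     for (offset = 0; offset < size; offset += pnum) {
     *         ret = bdrv_is_allocated_above(top, base, offset, size - offset,
     *                                       &pnum);
     *         if (ret < 0) {
     *             return ret;
     *         }
     *         if (ret) {
     *             ... process the allocated range [offset, offset + pnum) ...
     *         }
     *     }
     */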
1970int bdrv_is_allocated_above(BlockDriverState *top,
1971                            BlockDriverState *base,
1972                            int64_t offset, int64_t bytes, int64_t *pnum)
1973{
1974    BlockDriverState *intermediate;
1975    int ret;
1976    int64_t n = bytes;
1977
1978    intermediate = top;
1979    while (intermediate && intermediate != base) {
1980        int64_t pnum_inter;
1981        int64_t size_inter;
1982
1983        ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
1984        if (ret < 0) {
1985            return ret;
1986        }
1987        if (ret) {
1988            *pnum = pnum_inter;
1989            return 1;
1990        }
1991
1992        size_inter = bdrv_getlength(intermediate);
1993        if (size_inter < 0) {
1994            return size_inter;
1995        }
1996        if (n > pnum_inter &&
1997            (intermediate == top || offset + pnum_inter < size_inter)) {
1998            n = pnum_inter;
1999        }
2000
2001        intermediate = backing_bs(intermediate);
2002    }
2003
2004    *pnum = n;
2005    return 0;
2006}
2007
2008typedef struct BdrvVmstateCo {
2009    BlockDriverState    *bs;
2010    QEMUIOVector        *qiov;
2011    int64_t             pos;
2012    bool                is_read;
2013    int                 ret;
2014} BdrvVmstateCo;
2015
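    /*
     * Read or write the VM state that is stored together with the disk data.
     * The request is handled by the first driver in the chain that implements
     * the vmstate callbacks (e.g. qcow2); otherwise it is forwarded to
     * bs->file.
     */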
2016static int coroutine_fn
2017bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
2018                   bool is_read)
2019{
2020    BlockDriver *drv = bs->drv;
2021    int ret = -ENOTSUP;
2022
2023    bdrv_inc_in_flight(bs);
2024
2025    if (!drv) {
2026        ret = -ENOMEDIUM;
2027    } else if (drv->bdrv_load_vmstate) {
2028        if (is_read) {
2029            ret = drv->bdrv_load_vmstate(bs, qiov, pos);
2030        } else {
2031            ret = drv->bdrv_save_vmstate(bs, qiov, pos);
2032        }
2033    } else if (bs->file) {
2034        ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
2035    }
2036
2037    bdrv_dec_in_flight(bs);
2038    return ret;
2039}
2040
2041static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
2042{
2043    BdrvVmstateCo *co = opaque;
2044    co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
2045}
2046
2047static inline int
2048bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
2049                bool is_read)
2050{
2051    if (qemu_in_coroutine()) {
2052        return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
2053    } else {
2054        BdrvVmstateCo data = {
2055            .bs         = bs,
2056            .qiov       = qiov,
2057            .pos        = pos,
2058            .is_read    = is_read,
2059            .ret        = -EINPROGRESS,
2060        };
2061        Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
2062
2063        bdrv_coroutine_enter(bs, co);
2064        BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
2065        return data.ret;
2066    }
2067}
2068
2069int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2070                      int64_t pos, int size)
2071{
2072    QEMUIOVector qiov;
2073    struct iovec iov = {
2074        .iov_base   = (void *) buf,
2075        .iov_len    = size,
2076    };
2077    int ret;
2078
2079    qemu_iovec_init_external(&qiov, &iov, 1);
2080
2081    ret = bdrv_writev_vmstate(bs, &qiov, pos);
2082    if (ret < 0) {
2083        return ret;
2084    }
2085
2086    return size;
2087}
2088
2089int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2090{
2091    return bdrv_rw_vmstate(bs, qiov, pos, false);
2092}
2093
2094int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2095                      int64_t pos, int size)
2096{
2097    QEMUIOVector qiov;
2098    struct iovec iov = {
2099        .iov_base   = buf,
2100        .iov_len    = size,
2101    };
2102    int ret;
2103
2104    qemu_iovec_init_external(&qiov, &iov, 1);
2105    ret = bdrv_readv_vmstate(bs, &qiov, pos);
2106    if (ret < 0) {
2107        return ret;
2108    }
2109
2110    return size;
2111}
2112
2113int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2114{
2115    return bdrv_rw_vmstate(bs, qiov, pos, true);
2116}
2117
2118/**************************************************************/
2119/* async I/Os */
2120
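    /*
     * Synchronous cancellation: request cancellation and then poll the
     * AioContext until the request has completed and its callback has run.
     */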
2121void bdrv_aio_cancel(BlockAIOCB *acb)
2122{
2123    qemu_aio_ref(acb);
2124    bdrv_aio_cancel_async(acb);
2125    while (acb->refcnt > 1) {
2126        if (acb->aiocb_info->get_aio_context) {
2127            aio_poll(acb->aiocb_info->get_aio_context(acb), true);
2128        } else if (acb->bs) {
2129            /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
2130             * assert that we're not using an I/O thread.  Thread-safe
2131             * code should use bdrv_aio_cancel_async exclusively.
2132             */
2133            assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
2134            aio_poll(bdrv_get_aio_context(acb->bs), true);
2135        } else {
2136            abort();
2137        }
2138    }
2139    qemu_aio_unref(acb);
2140}
2141
2142/* Async version of aio cancel. The caller is not blocked: if the acb implements
2143 * cancel_async we call it, otherwise we do nothing and let the request
2144 * complete normally. In either case the completion callback must be called. */
2145void bdrv_aio_cancel_async(BlockAIOCB *acb)
2146{
2147    if (acb->aiocb_info->cancel_async) {
2148        acb->aiocb_info->cancel_async(acb);
2149    }
2150}
2151
2152/**************************************************************/
2153/* Coroutine block device emulation */
2154
2155typedef struct FlushCo {
2156    BlockDriverState *bs;
2157    int ret;
2158} FlushCo;
2159
2160
2161static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2162{
2163    FlushCo *rwco = opaque;
2164
2165    rwco->ret = bdrv_co_flush(rwco->bs);
2166}
2167
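    /*
     * Flush 'bs'.  If the driver implements bdrv_co_flush, that single call
     * covers all layers.  Otherwise, cached data is first written back to the
     * OS (flush_to_os); then, unless BDRV_O_NO_FLUSH is set or nothing has
     * been written since the last successful flush (tracked via write_gen and
     * flushed_gen), it is forced to stable storage (flush_to_disk or
     * aio_flush); finally the underlying bs->file is flushed as well.
     */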
2168int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
2169{
2170    int current_gen;
2171    int ret = 0;
2172
2173    bdrv_inc_in_flight(bs);
2174
2175    if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
2176        bdrv_is_sg(bs)) {
2177        goto early_exit;
2178    }
2179
2180    qemu_co_mutex_lock(&bs->reqs_lock);
2181    current_gen = atomic_read(&bs->write_gen);
2182
2183    /* Wait until any previous flushes are completed */
2184    while (bs->active_flush_req) {
2185        qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
2186    }
2187
2188    /* Flushes reach this point in nondecreasing current_gen order.  */
2189    bs->active_flush_req = true;
2190    qemu_co_mutex_unlock(&bs->reqs_lock);
2191
2192    /* Write back all layers by calling one driver function */
2193    if (bs->drv->bdrv_co_flush) {
2194        ret = bs->drv->bdrv_co_flush(bs);
2195        goto out;
2196    }
2197
2198    /* Write back cached data to the OS even with cache=unsafe */
2199    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
2200    if (bs->drv->bdrv_co_flush_to_os) {
2201        ret = bs->drv->bdrv_co_flush_to_os(bs);
2202        if (ret < 0) {
2203            goto out;
2204        }
2205    }
2206
2207    /* But don't actually force it to the disk with cache=unsafe */
2208    if (bs->open_flags & BDRV_O_NO_FLUSH) {
2209        goto flush_parent;
2210    }
2211
2212    /* Check if we really need to flush anything */
2213    if (bs->flushed_gen == current_gen) {
2214        goto flush_parent;
2215    }
2216
2217    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
2218    if (bs->drv->bdrv_co_flush_to_disk) {
2219        ret = bs->drv->bdrv_co_flush_to_disk(bs);
2220    } else if (bs->drv->bdrv_aio_flush) {
2221        BlockAIOCB *acb;
2222        CoroutineIOCompletion co = {
2223            .coroutine = qemu_coroutine_self(),
2224        };
2225
2226        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
2227        if (acb == NULL) {
2228            ret = -EIO;
2229        } else {
2230            qemu_coroutine_yield();
2231            ret = co.ret;
2232        }
2233    } else {
2234        /*
2235         * Some block drivers always operate in either writethrough or unsafe
2236         * mode and therefore don't support bdrv_flush. Usually qemu doesn't
2237         * know how the server works (because the behaviour is hardcoded or
2238         * depends on server-side configuration), so we can't ensure that
2239         * everything is safe on disk. Returning an error doesn't work because
2240         * that would break guests even if the server operates in writethrough
2241         * mode.
2242         *
2243         * Let's hope the user knows what they're doing.
2244         */
2245        ret = 0;
2246    }
2247
2248    if (ret < 0) {
2249        goto out;
2250    }
2251
2252    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
2253     * in the case of cache=unsafe, so there are no useless flushes.
2254     */
2255flush_parent:
2256    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
2257out:
2258    /* Notify any pending flushes that we have completed */
2259    if (ret == 0) {
2260        bs->flushed_gen = current_gen;
2261    }
2262
2263    qemu_co_mutex_lock(&bs->reqs_lock);
2264    bs->active_flush_req = false;
2265    /* Return value is ignored - it's ok if wait queue is empty */
2266    qemu_co_queue_next(&bs->flush_queue);
2267    qemu_co_mutex_unlock(&bs->reqs_lock);
2268
2269early_exit:
2270    bdrv_dec_in_flight(bs);
2271    return ret;
2272}
2273
2274int bdrv_flush(BlockDriverState *bs)
2275{
2276    Coroutine *co;
2277    FlushCo flush_co = {
2278        .bs = bs,
2279        .ret = NOT_DONE,
2280    };
2281
2282    if (qemu_in_coroutine()) {
2283        /* Fast-path if already in coroutine context */
2284        bdrv_flush_co_entry(&flush_co);
2285    } else {
2286        co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
2287        bdrv_coroutine_enter(bs, co);
2288        BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
2289    }
2290
2291    return flush_co.ret;
2292}
2293
2294typedef struct DiscardCo {
2295    BlockDriverState *bs;
2296    int64_t offset;
2297    int bytes;
2298    int ret;
2299} DiscardCo;
2300static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
2301{
2302    DiscardCo *rwco = opaque;
2303
2304    rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->bytes);
2305}
2306
2307int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
2308                                  int bytes)
2309{
2310    BdrvTrackedRequest req;
2311    int max_pdiscard, ret;
2312    int head, tail, align;
2313
2314    if (!bs->drv) {
2315        return -ENOMEDIUM;
2316    }
2317
2318    if (bdrv_has_readonly_bitmaps(bs)) {
2319        return -EPERM;
2320    }
2321
2322    ret = bdrv_check_byte_request(bs, offset, bytes);
2323    if (ret < 0) {
2324        return ret;
2325    } else if (bs->read_only) {
2326        return -EPERM;
2327    }
2328    assert(!(bs->open_flags & BDRV_O_INACTIVE));
2329
2330    /* Do nothing if disabled.  */
2331    if (!(bs->open_flags & BDRV_O_UNMAP)) {
2332        return 0;
2333    }
2334
2335    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
2336        return 0;
2337    }
2338
2339    /* Discard is advisory, but some devices track and coalesce
2340     * unaligned requests, so we must pass everything down rather than
2341     * round here.  Still, most devices will just silently ignore
2342     * unaligned requests (by returning -ENOTSUP), so we must fragment
2343     * the request accordingly.  */
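        /*
         * For example (illustrative only): with align = 64k, a discard of
         * [4k, 260k) is issued as three pieces: the unaligned head [4k, 64k),
         * the aligned middle [64k, 256k) and the unaligned tail [256k, 260k),
         * assuming max_pdiscard does not force further splitting.
         */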
2344    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
2345    assert(align % bs->bl.request_alignment == 0);
2346    head = offset % align;
2347    tail = (offset + bytes) % align;
2348
2349    bdrv_inc_in_flight(bs);
2350    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);
2351
2352    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
2353    if (ret < 0) {
2354        goto out;
2355    }
2356
2357    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
2358                                   align);
2359    assert(max_pdiscard >= bs->bl.request_alignment);
2360
2361    while (bytes > 0) {
2362        int num = bytes;
2363
2364        if (head) {
2365            /* Make small requests to get to alignment boundaries. */
2366            num = MIN(bytes, align - head);
2367            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
2368                num %= bs->bl.request_alignment;
2369            }
2370            head = (head + num) % align;
2371            assert(num < max_pdiscard);
2372        } else if (tail) {
2373            if (num > align) {
2374                /* Shorten the request to the last aligned cluster.  */
2375                num -= tail;
2376            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
2377                       tail > bs->bl.request_alignment) {
2378                tail %= bs->bl.request_alignment;
2379                num -= tail;
2380            }
2381        }
2382        /* limit request size */
2383        if (num > max_pdiscard) {
2384            num = max_pdiscard;
2385        }
2386
2387        if (bs->drv->bdrv_co_pdiscard) {
2388            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
2389        } else {
2390            BlockAIOCB *acb;
2391            CoroutineIOCompletion co = {
2392                .coroutine = qemu_coroutine_self(),
2393            };
2394
2395            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
2396                                             bdrv_co_io_em_complete, &co);
2397            if (acb == NULL) {
2398                ret = -EIO;
2399                goto out;
2400            } else {
2401                qemu_coroutine_yield();
2402                ret = co.ret;
2403            }
2404        }
2405        if (ret && ret != -ENOTSUP) {
2406            goto out;
2407        }
2408
2409        offset += num;
2410        bytes -= num;
2411    }
2412    ret = 0;
2413out:
2414    atomic_inc(&bs->write_gen);
2415    bdrv_set_dirty(bs, req.offset >> BDRV_SECTOR_BITS,
2416                   req.bytes >> BDRV_SECTOR_BITS);
2417    tracked_request_end(&req);
2418    bdrv_dec_in_flight(bs);
2419    return ret;
2420}
2421
2422int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
2423{
2424    Coroutine *co;
2425    DiscardCo rwco = {
2426        .bs = bs,
2427        .offset = offset,
2428        .bytes = bytes,
2429        .ret = NOT_DONE,
2430    };
2431
2432    if (qemu_in_coroutine()) {
2433        /* Fast-path if already in coroutine context */
2434        bdrv_pdiscard_co_entry(&rwco);
2435    } else {
2436        co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
2437        bdrv_coroutine_enter(bs, co);
2438        BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
2439    }
2440
2441    return rwco.ret;
2442}
2443
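    /*
     * Pass a device-specific ioctl through to the driver, preferring the
     * coroutine callback and falling back to the AIO one; returns -ENOTSUP if
     * the driver implements neither.
     */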
2444int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
2445{
2446    BlockDriver *drv = bs->drv;
2447    CoroutineIOCompletion co = {
2448        .coroutine = qemu_coroutine_self(),
2449    };
2450    BlockAIOCB *acb;
2451
2452    bdrv_inc_in_flight(bs);
2453    if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
2454        co.ret = -ENOTSUP;
2455        goto out;
2456    }
2457
2458    if (drv->bdrv_co_ioctl) {
2459        co.ret = drv->bdrv_co_ioctl(bs, req, buf);
2460    } else {
2461        acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
2462        if (!acb) {
2463            co.ret = -ENOTSUP;
2464            goto out;
2465        }
2466        qemu_coroutine_yield();
2467    }
2468out:
2469    bdrv_dec_in_flight(bs);
2470    return co.ret;
2471}
2472
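    /*
     * Allocate a buffer aligned to the optimal memory alignment for I/O on
     * 'bs'.  This variant aborts on allocation failure; use
     * qemu_try_blockalign() to get NULL back instead.
     */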
2473void *qemu_blockalign(BlockDriverState *bs, size_t size)
2474{
2475    return qemu_memalign(bdrv_opt_mem_align(bs), size);
2476}
2477
2478void *qemu_blockalign0(BlockDriverState *bs, size_t size)
2479{
2480    return memset(qemu_blockalign(bs, size), 0, size);
2481}
2482
2483void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
2484{
2485    size_t align = bdrv_opt_mem_align(bs);
2486
2487    /* Ensure that NULL is never returned on success */
2488    assert(align > 0);
2489    if (size == 0) {
2490        size = align;
2491    }
2492
2493    return qemu_try_memalign(align, size);
2494}
2495
2496void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
2497{
2498    void *mem = qemu_try_blockalign(bs, size);
2499
2500    if (mem) {
2501        memset(mem, 0, size);
2502    }
2503
2504    return mem;
2505}
2506
2507/*
2508 * Check if all memory in this vector is aligned to bdrv_min_mem_align(bs).
2509 */
2510bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2511{
2512    int i;
2513    size_t alignment = bdrv_min_mem_align(bs);
2514
2515    for (i = 0; i < qiov->niov; i++) {
2516        if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2517            return false;
2518        }
2519        if (qiov->iov[i].iov_len % alignment) {
2520            return false;
2521        }
2522    }
2523
2524    return true;
2525}
2526
2527void bdrv_add_before_write_notifier(BlockDriverState *bs,
2528                                    NotifierWithReturn *notifier)
2529{
2530    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
2531}
2532
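    /*
     * io_plug/io_unplug give drivers a hint that several requests will be
     * submitted back to back, so they may batch submission.  Calls nest: the
     * driver callback runs only on the first plug and on the matching last
     * unplug, and the hint is propagated recursively to all children.
     */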
2533void bdrv_io_plug(BlockDriverState *bs)
2534{
2535    BdrvChild *child;
2536
2537    QLIST_FOREACH(child, &bs->children, next) {
2538        bdrv_io_plug(child->bs);
2539    }
2540
2541    if (atomic_fetch_inc(&bs->io_plugged) == 0) {
2542        BlockDriver *drv = bs->drv;
2543        if (drv && drv->bdrv_io_plug) {
2544            drv->bdrv_io_plug(bs);
2545        }
2546    }
2547}
2548
2549void bdrv_io_unplug(BlockDriverState *bs)
2550{
2551    BdrvChild *child;
2552
2553    assert(bs->io_plugged);
2554    if (atomic_fetch_dec(&bs->io_plugged) == 1) {
2555        BlockDriver *drv = bs->drv;
2556        if (drv && drv->bdrv_io_unplug) {
2557            drv->bdrv_io_unplug(bs);
2558        }
2559    }
2560
2561    QLIST_FOREACH(child, &bs->children, next) {
2562        bdrv_io_unplug(child->bs);
2563    }
2564}
2565