qemu/block/block-backend.c
   1/*
   2 * QEMU Block backends
   3 *
   4 * Copyright (C) 2014-2016 Red Hat, Inc.
   5 *
   6 * Authors:
   7 *  Markus Armbruster <armbru@redhat.com>,
   8 *
   9 * This work is licensed under the terms of the GNU LGPL, version 2.1
  10 * or later.  See the COPYING.LIB file in the top-level directory.
  11 */
  12
  13#include "qemu/osdep.h"
  14#include "sysemu/block-backend.h"
  15#include "block/block_int.h"
  16#include "block/blockjob.h"
  17#include "block/throttle-groups.h"
  18#include "hw/qdev-core.h"
  19#include "sysemu/blockdev.h"
  20#include "sysemu/runstate.h"
  21#include "sysemu/sysemu.h"
  22#include "sysemu/replay.h"
  23#include "qapi/error.h"
  24#include "qapi/qapi-events-block.h"
  25#include "qemu/id.h"
  26#include "qemu/main-loop.h"
  27#include "qemu/option.h"
  28#include "trace.h"
  29#include "migration/misc.h"
  30
  31/* Number of coroutines to reserve per attached device model */
  32#define COROUTINE_POOL_RESERVATION 64
  33
  34#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
  35
  36static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb);
  37
  38typedef struct BlockBackendAioNotifier {
  39    void (*attached_aio_context)(AioContext *new_context, void *opaque);
  40    void (*detach_aio_context)(void *opaque);
  41    void *opaque;
  42    QLIST_ENTRY(BlockBackendAioNotifier) list;
  43} BlockBackendAioNotifier;
  44
  45struct BlockBackend {
  46    char *name;
  47    int refcnt;
  48    BdrvChild *root;
  49    AioContext *ctx;
  50    DriveInfo *legacy_dinfo;    /* null unless created by drive_new() */
  51    QTAILQ_ENTRY(BlockBackend) link;         /* for block_backends */
  52    QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */
  53    BlockBackendPublic public;
  54
  55    DeviceState *dev;           /* attached device model, if any */
  56    const BlockDevOps *dev_ops;
  57    void *dev_opaque;
  58
  59    /* the block size for which the guest device expects atomicity */
  60    int guest_block_size;
  61
  62    /* If the BDS tree is removed, some of its options are stored here (which
  63     * can be used to restore those options in the new BDS on insert) */
  64    BlockBackendRootState root_state;
  65
  66    bool enable_write_cache;
  67
  68    /* I/O stats (display with "info blockstats"). */
  69    BlockAcctStats stats;
  70
  71    BlockdevOnError on_read_error, on_write_error;
  72    bool iostatus_enabled;
  73    BlockDeviceIoStatus iostatus;
  74
  75    uint64_t perm;
  76    uint64_t shared_perm;
  77    bool disable_perm;
  78
  79    bool allow_aio_context_change;
  80    bool allow_write_beyond_eof;
  81
  82    NotifierList remove_bs_notifiers, insert_bs_notifiers;
  83    QLIST_HEAD(, BlockBackendAioNotifier) aio_notifiers;
  84
  85    int quiesce_counter;
  86    CoQueue queued_requests;
  87    bool disable_request_queuing;
  88
  89    VMChangeStateEntry *vmsh;
  90    bool force_allow_inactivate;
  91
  92    /* Number of in-flight aio requests.  BlockDriverState also counts
  93     * in-flight requests but aio requests can exist even when blk->root is
  94     * NULL, so we cannot rely on its counter for that case.
  95     * Accessed with atomic ops.
  96     */
  97    unsigned int in_flight;
  98};
  99
 100typedef struct BlockBackendAIOCB {
 101    BlockAIOCB common;
 102    BlockBackend *blk;
 103    int ret;
 104} BlockBackendAIOCB;
 105
 106static const AIOCBInfo block_backend_aiocb_info = {
 107    .get_aio_context = blk_aiocb_get_aio_context,
 108    .aiocb_size = sizeof(BlockBackendAIOCB),
 109};
 110
 111static void drive_info_del(DriveInfo *dinfo);
 112static BlockBackend *bdrv_first_blk(BlockDriverState *bs);
 113
 114/* All BlockBackends */
 115static QTAILQ_HEAD(, BlockBackend) block_backends =
 116    QTAILQ_HEAD_INITIALIZER(block_backends);
 117
  118/* All BlockBackends referenced by the monitor, i.e. the ones iterated over
  119 * by blk_next() */
 120static QTAILQ_HEAD(, BlockBackend) monitor_block_backends =
 121    QTAILQ_HEAD_INITIALIZER(monitor_block_backends);
 122
 123static void blk_root_inherit_options(int *child_flags, QDict *child_options,
 124                                     int parent_flags, QDict *parent_options)
 125{
 126    /* We're not supposed to call this function for root nodes */
 127    abort();
 128}
 129static void blk_root_drained_begin(BdrvChild *child);
 130static bool blk_root_drained_poll(BdrvChild *child);
 131static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter);
 132
 133static void blk_root_change_media(BdrvChild *child, bool load);
 134static void blk_root_resize(BdrvChild *child);
 135
 136static bool blk_root_can_set_aio_ctx(BdrvChild *child, AioContext *ctx,
 137                                     GSList **ignore, Error **errp);
 138static void blk_root_set_aio_ctx(BdrvChild *child, AioContext *ctx,
 139                                 GSList **ignore);
 140
 141static char *blk_root_get_parent_desc(BdrvChild *child)
 142{
 143    BlockBackend *blk = child->opaque;
 144    char *dev_id;
 145
 146    if (blk->name) {
 147        return g_strdup(blk->name);
 148    }
 149
 150    dev_id = blk_get_attached_dev_id(blk);
 151    if (*dev_id) {
 152        return dev_id;
 153    } else {
 154        /* TODO Callback into the BB owner for something more detailed */
 155        g_free(dev_id);
 156        return g_strdup("a block device");
 157    }
 158}
 159
 160static const char *blk_root_get_name(BdrvChild *child)
 161{
 162    return blk_name(child->opaque);
 163}
 164
 165static void blk_vm_state_changed(void *opaque, int running, RunState state)
 166{
 167    Error *local_err = NULL;
 168    BlockBackend *blk = opaque;
 169
 170    if (state == RUN_STATE_INMIGRATE) {
 171        return;
 172    }
 173
 174    qemu_del_vm_change_state_handler(blk->vmsh);
 175    blk->vmsh = NULL;
 176    blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
 177    if (local_err) {
 178        error_report_err(local_err);
 179    }
 180}
 181
 182/*
 183 * Notifies the user of the BlockBackend that migration has completed. qdev
 184 * devices can tighten their permissions in response (specifically revoke
 185 * shared write permissions that we needed for storage migration).
 186 *
 187 * If an error is returned, the VM cannot be allowed to be resumed.
 188 */
 189static void blk_root_activate(BdrvChild *child, Error **errp)
 190{
 191    BlockBackend *blk = child->opaque;
 192    Error *local_err = NULL;
 193
 194    if (!blk->disable_perm) {
 195        return;
 196    }
 197
 198    blk->disable_perm = false;
 199
 200    blk_set_perm(blk, blk->perm, BLK_PERM_ALL, &local_err);
 201    if (local_err) {
 202        error_propagate(errp, local_err);
 203        blk->disable_perm = true;
 204        return;
 205    }
 206
 207    if (runstate_check(RUN_STATE_INMIGRATE)) {
  208        /* Activation can happen when the migration process is still active, for
 209         * example when nbd_server_add is called during non-shared storage
 210         * migration. Defer the shared_perm update to migration completion. */
 211        if (!blk->vmsh) {
 212            blk->vmsh = qemu_add_vm_change_state_handler(blk_vm_state_changed,
 213                                                         blk);
 214        }
 215        return;
 216    }
 217
 218    blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
 219    if (local_err) {
 220        error_propagate(errp, local_err);
 221        blk->disable_perm = true;
 222        return;
 223    }
 224}
 225
 226void blk_set_force_allow_inactivate(BlockBackend *blk)
 227{
 228    blk->force_allow_inactivate = true;
 229}
 230
 231static bool blk_can_inactivate(BlockBackend *blk)
 232{
 233    /* If it is a guest device, inactivate is ok. */
 234    if (blk->dev || blk_name(blk)[0]) {
 235        return true;
 236    }
 237
 238    /* Inactivating means no more writes to the image can be done,
 239     * even if those writes would be changes invisible to the
 240     * guest.  For block job BBs that satisfy this, we can just allow
 241     * it.  This is the case for mirror job source, which is required
 242     * by libvirt non-shared block migration. */
 243    if (!(blk->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED))) {
 244        return true;
 245    }
 246
 247    return blk->force_allow_inactivate;
 248}
 249
 250static int blk_root_inactivate(BdrvChild *child)
 251{
 252    BlockBackend *blk = child->opaque;
 253
 254    if (blk->disable_perm) {
 255        return 0;
 256    }
 257
 258    if (!blk_can_inactivate(blk)) {
 259        return -EPERM;
 260    }
 261
 262    blk->disable_perm = true;
 263    if (blk->root) {
 264        bdrv_child_try_set_perm(blk->root, 0, BLK_PERM_ALL, &error_abort);
 265    }
 266
 267    return 0;
 268}
 269
 270static void blk_root_attach(BdrvChild *child)
 271{
 272    BlockBackend *blk = child->opaque;
 273    BlockBackendAioNotifier *notifier;
 274
 275    trace_blk_root_attach(child, blk, child->bs);
 276
 277    QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
 278        bdrv_add_aio_context_notifier(child->bs,
 279                notifier->attached_aio_context,
 280                notifier->detach_aio_context,
 281                notifier->opaque);
 282    }
 283}
 284
 285static void blk_root_detach(BdrvChild *child)
 286{
 287    BlockBackend *blk = child->opaque;
 288    BlockBackendAioNotifier *notifier;
 289
 290    trace_blk_root_detach(child, blk, child->bs);
 291
 292    QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
 293        bdrv_remove_aio_context_notifier(child->bs,
 294                notifier->attached_aio_context,
 295                notifier->detach_aio_context,
 296                notifier->opaque);
 297    }
 298}
 299
 300static const BdrvChildRole child_root = {
 301    .inherit_options    = blk_root_inherit_options,
 302
 303    .change_media       = blk_root_change_media,
 304    .resize             = blk_root_resize,
 305    .get_name           = blk_root_get_name,
 306    .get_parent_desc    = blk_root_get_parent_desc,
 307
 308    .drained_begin      = blk_root_drained_begin,
 309    .drained_poll       = blk_root_drained_poll,
 310    .drained_end        = blk_root_drained_end,
 311
 312    .activate           = blk_root_activate,
 313    .inactivate         = blk_root_inactivate,
 314
 315    .attach             = blk_root_attach,
 316    .detach             = blk_root_detach,
 317
 318    .can_set_aio_ctx    = blk_root_can_set_aio_ctx,
 319    .set_aio_ctx        = blk_root_set_aio_ctx,
 320};
 321
 322/*
 323 * Create a new BlockBackend with a reference count of one.
 324 *
  325 * @perm is a bitmask of BLK_PERM_* constants which describes the permissions
 326 * to request for a block driver node that is attached to this BlockBackend.
 327 * @shared_perm is a bitmask which describes which permissions may be granted
 328 * to other users of the attached node.
 329 * Both sets of permissions can be changed later using blk_set_perm().
 330 *
 331 * Return the new BlockBackend on success, null on failure.
 332 */
 333BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm)
 334{
 335    BlockBackend *blk;
 336
 337    blk = g_new0(BlockBackend, 1);
 338    blk->refcnt = 1;
 339    blk->ctx = ctx;
 340    blk->perm = perm;
 341    blk->shared_perm = shared_perm;
 342    blk_set_enable_write_cache(blk, true);
 343
 344    blk->on_read_error = BLOCKDEV_ON_ERROR_REPORT;
 345    blk->on_write_error = BLOCKDEV_ON_ERROR_ENOSPC;
 346
 347    block_acct_init(&blk->stats);
 348
 349    qemu_co_queue_init(&blk->queued_requests);
 350    notifier_list_init(&blk->remove_bs_notifiers);
 351    notifier_list_init(&blk->insert_bs_notifiers);
 352    QLIST_INIT(&blk->aio_notifiers);
 353
 354    QTAILQ_INSERT_TAIL(&block_backends, blk, link);
 355    return blk;
 356}
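
/*
 * Illustrative sketch, not called from this file: a caller that wants a
 * backend with read/write access while sharing everything with other users
 * could do roughly the following (error handling elided):
 *
 *     BlockBackend *blk = blk_new(qemu_get_aio_context(),
 *                                 BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE,
 *                                 BLK_PERM_ALL);
 *     ...
 *     blk_unref(blk);
 */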
 357
 358/*
 359 * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
 360 * The new BlockBackend is in the main AioContext.
 361 *
 362 * Just as with bdrv_open(), after having called this function the reference to
 363 * @options belongs to the block layer (even on failure).
 364 *
 365 * TODO: Remove @filename and @flags; it should be possible to specify a whole
 366 * BDS tree just by specifying the @options QDict (or @reference,
 367 * alternatively). At the time of adding this function, this is not possible,
 368 * though, so callers of this function have to be able to specify @filename and
 369 * @flags.
 370 */
 371BlockBackend *blk_new_open(const char *filename, const char *reference,
 372                           QDict *options, int flags, Error **errp)
 373{
 374    BlockBackend *blk;
 375    BlockDriverState *bs;
 376    uint64_t perm = 0;
 377
 378    /* blk_new_open() is mainly used in .bdrv_create implementations and the
 379     * tools where sharing isn't a concern because the BDS stays private, so we
 380     * just request permission according to the flags.
 381     *
 382     * The exceptions are xen_disk and blockdev_init(); in these cases, the
 383     * caller of blk_new_open() doesn't make use of the permissions, but they
 384     * shouldn't hurt either. We can still share everything here because the
 385     * guest devices will add their own blockers if they can't share. */
 386    if ((flags & BDRV_O_NO_IO) == 0) {
 387        perm |= BLK_PERM_CONSISTENT_READ;
 388        if (flags & BDRV_O_RDWR) {
 389            perm |= BLK_PERM_WRITE;
 390        }
 391    }
 392    if (flags & BDRV_O_RESIZE) {
 393        perm |= BLK_PERM_RESIZE;
 394    }
 395
 396    blk = blk_new(qemu_get_aio_context(), perm, BLK_PERM_ALL);
 397    bs = bdrv_open(filename, reference, options, flags, errp);
 398    if (!bs) {
 399        blk_unref(blk);
 400        return NULL;
 401    }
 402
 403    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk->ctx,
 404                                       perm, BLK_PERM_ALL, blk, errp);
 405    if (!blk->root) {
 406        blk_unref(blk);
 407        return NULL;
 408    }
 409
 410    return blk;
 411}
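
/*
 * Illustrative sketch of a tools-style caller; the file name is made up and
 * error handling is minimal. Releasing the backend also drops the reference
 * to the BlockDriverState that was opened for it.
 *
 *     Error *local_err = NULL;
 *     BlockBackend *blk = blk_new_open("test.qcow2", NULL, NULL,
 *                                      BDRV_O_RDWR, &local_err);
 *     if (!blk) {
 *         error_report_err(local_err);
 *         return;
 *     }
 *     ...
 *     blk_unref(blk);
 */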
 412
 413static void blk_delete(BlockBackend *blk)
 414{
 415    assert(!blk->refcnt);
 416    assert(!blk->name);
 417    assert(!blk->dev);
 418    if (blk->public.throttle_group_member.throttle_state) {
 419        blk_io_limits_disable(blk);
 420    }
 421    if (blk->root) {
 422        blk_remove_bs(blk);
 423    }
 424    if (blk->vmsh) {
 425        qemu_del_vm_change_state_handler(blk->vmsh);
 426        blk->vmsh = NULL;
 427    }
 428    assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers));
 429    assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers));
 430    assert(QLIST_EMPTY(&blk->aio_notifiers));
 431    QTAILQ_REMOVE(&block_backends, blk, link);
 432    drive_info_del(blk->legacy_dinfo);
 433    block_acct_cleanup(&blk->stats);
 434    g_free(blk);
 435}
 436
 437static void drive_info_del(DriveInfo *dinfo)
 438{
 439    if (!dinfo) {
 440        return;
 441    }
 442    qemu_opts_del(dinfo->opts);
 443    g_free(dinfo);
 444}
 445
 446int blk_get_refcnt(BlockBackend *blk)
 447{
 448    return blk ? blk->refcnt : 0;
 449}
 450
 451/*
 452 * Increment @blk's reference count.
 453 * @blk must not be null.
 454 */
 455void blk_ref(BlockBackend *blk)
 456{
 457    assert(blk->refcnt > 0);
 458    blk->refcnt++;
 459}
 460
 461/*
 462 * Decrement @blk's reference count.
 463 * If this drops it to zero, destroy @blk.
 464 * For convenience, do nothing if @blk is null.
 465 */
 466void blk_unref(BlockBackend *blk)
 467{
 468    if (blk) {
 469        assert(blk->refcnt > 0);
 470        if (blk->refcnt > 1) {
 471            blk->refcnt--;
 472        } else {
 473            blk_drain(blk);
 474            /* blk_drain() cannot resurrect blk, nobody held a reference */
 475            assert(blk->refcnt == 1);
 476            blk->refcnt = 0;
 477            blk_delete(blk);
 478        }
 479    }
 480}
 481
 482/*
 483 * Behaves similarly to blk_next() but iterates over all BlockBackends, even the
 484 * ones which are hidden (i.e. are not referenced by the monitor).
 485 */
 486BlockBackend *blk_all_next(BlockBackend *blk)
 487{
 488    return blk ? QTAILQ_NEXT(blk, link)
 489               : QTAILQ_FIRST(&block_backends);
 490}
 491
 492void blk_remove_all_bs(void)
 493{
 494    BlockBackend *blk = NULL;
 495
 496    while ((blk = blk_all_next(blk)) != NULL) {
 497        AioContext *ctx = blk_get_aio_context(blk);
 498
 499        aio_context_acquire(ctx);
 500        if (blk->root) {
 501            blk_remove_bs(blk);
 502        }
 503        aio_context_release(ctx);
 504    }
 505}
 506
 507/*
 508 * Return the monitor-owned BlockBackend after @blk.
 509 * If @blk is null, return the first one.
 510 * Else, return @blk's next sibling, which may be null.
 511 *
 512 * To iterate over all BlockBackends, do
 513 * for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
 514 *     ...
 515 * }
 516 */
 517BlockBackend *blk_next(BlockBackend *blk)
 518{
 519    return blk ? QTAILQ_NEXT(blk, monitor_link)
 520               : QTAILQ_FIRST(&monitor_block_backends);
 521}
 522
 523/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
 524 * the monitor or attached to a BlockBackend */
 525BlockDriverState *bdrv_next(BdrvNextIterator *it)
 526{
 527    BlockDriverState *bs, *old_bs;
 528
 529    /* Must be called from the main loop */
 530    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 531
 532    /* First, return all root nodes of BlockBackends. In order to avoid
 533     * returning a BDS twice when multiple BBs refer to it, we only return it
 534     * if the BB is the first one in the parent list of the BDS. */
 535    if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
 536        BlockBackend *old_blk = it->blk;
 537
 538        old_bs = old_blk ? blk_bs(old_blk) : NULL;
 539
 540        do {
 541            it->blk = blk_all_next(it->blk);
 542            bs = it->blk ? blk_bs(it->blk) : NULL;
 543        } while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));
 544
 545        if (it->blk) {
 546            blk_ref(it->blk);
 547        }
 548        blk_unref(old_blk);
 549
 550        if (bs) {
 551            bdrv_ref(bs);
 552            bdrv_unref(old_bs);
 553            return bs;
 554        }
 555        it->phase = BDRV_NEXT_MONITOR_OWNED;
 556    } else {
 557        old_bs = it->bs;
 558    }
 559
 560    /* Then return the monitor-owned BDSes without a BB attached. Ignore all
 561     * BDSes that are attached to a BlockBackend here; they have been handled
 562     * by the above block already */
 563    do {
 564        it->bs = bdrv_next_monitor_owned(it->bs);
 565        bs = it->bs;
 566    } while (bs && bdrv_has_blk(bs));
 567
 568    if (bs) {
 569        bdrv_ref(bs);
 570    }
 571    bdrv_unref(old_bs);
 572
 573    return bs;
 574}
 575
 576static void bdrv_next_reset(BdrvNextIterator *it)
 577{
 578    *it = (BdrvNextIterator) {
 579        .phase = BDRV_NEXT_BACKEND_ROOTS,
 580    };
 581}
 582
 583BlockDriverState *bdrv_first(BdrvNextIterator *it)
 584{
 585    bdrv_next_reset(it);
 586    return bdrv_next(it);
 587}
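
/*
 * Common iteration pattern over the two functions above; callers that stop
 * before bdrv_next() returns NULL must call bdrv_next_cleanup() (see below)
 * to drop the references the iterator still holds:
 *
 *     BdrvNextIterator it;
 *     BlockDriverState *bs;
 *
 *     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 *         ...
 *     }
 */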
 588
 589/* Must be called when aborting a bdrv_next() iteration before
 590 * bdrv_next() returns NULL */
 591void bdrv_next_cleanup(BdrvNextIterator *it)
 592{
 593    /* Must be called from the main loop */
 594    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 595
 596    if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
 597        if (it->blk) {
 598            bdrv_unref(blk_bs(it->blk));
 599            blk_unref(it->blk);
 600        }
 601    } else {
 602        bdrv_unref(it->bs);
 603    }
 604
 605    bdrv_next_reset(it);
 606}
 607
 608/*
 609 * Add a BlockBackend into the list of backends referenced by the monitor, with
 610 * the given @name acting as the handle for the monitor.
 611 * Strictly for use by blockdev.c.
 612 *
 613 * @name must not be null or empty.
 614 *
 615 * Returns true on success and false on failure. In the latter case, an Error
 616 * object is returned through @errp.
 617 */
 618bool monitor_add_blk(BlockBackend *blk, const char *name, Error **errp)
 619{
 620    assert(!blk->name);
 621    assert(name && name[0]);
 622
 623    if (!id_wellformed(name)) {
 624        error_setg(errp, "Invalid device name");
 625        return false;
 626    }
 627    if (blk_by_name(name)) {
 628        error_setg(errp, "Device with id '%s' already exists", name);
 629        return false;
 630    }
 631    if (bdrv_find_node(name)) {
 632        error_setg(errp,
 633                   "Device name '%s' conflicts with an existing node name",
 634                   name);
 635        return false;
 636    }
 637
 638    blk->name = g_strdup(name);
 639    QTAILQ_INSERT_TAIL(&monitor_block_backends, blk, monitor_link);
 640    return true;
 641}
 642
 643/*
 644 * Remove a BlockBackend from the list of backends referenced by the monitor.
 645 * Strictly for use by blockdev.c.
 646 */
 647void monitor_remove_blk(BlockBackend *blk)
 648{
 649    if (!blk->name) {
 650        return;
 651    }
 652
 653    QTAILQ_REMOVE(&monitor_block_backends, blk, monitor_link);
 654    g_free(blk->name);
 655    blk->name = NULL;
 656}
 657
 658/*
 659 * Return @blk's name, a non-null string.
 660 * Returns an empty string iff @blk is not referenced by the monitor.
 661 */
 662const char *blk_name(const BlockBackend *blk)
 663{
 664    return blk->name ?: "";
 665}
 666
 667/*
 668 * Return the BlockBackend with name @name if it exists, else null.
 669 * @name must not be null.
 670 */
 671BlockBackend *blk_by_name(const char *name)
 672{
 673    BlockBackend *blk = NULL;
 674
 675    assert(name);
 676    while ((blk = blk_next(blk)) != NULL) {
 677        if (!strcmp(name, blk->name)) {
 678            return blk;
 679        }
 680    }
 681    return NULL;
 682}
 683
 684/*
 685 * Return the BlockDriverState attached to @blk if any, else null.
 686 */
 687BlockDriverState *blk_bs(BlockBackend *blk)
 688{
 689    return blk->root ? blk->root->bs : NULL;
 690}
 691
 692static BlockBackend *bdrv_first_blk(BlockDriverState *bs)
 693{
 694    BdrvChild *child;
 695    QLIST_FOREACH(child, &bs->parents, next_parent) {
 696        if (child->role == &child_root) {
 697            return child->opaque;
 698        }
 699    }
 700
 701    return NULL;
 702}
 703
 704/*
 705 * Returns true if @bs has an associated BlockBackend.
 706 */
 707bool bdrv_has_blk(BlockDriverState *bs)
 708{
 709    return bdrv_first_blk(bs) != NULL;
 710}
 711
 712/*
 713 * Returns true if @bs has only BlockBackends as parents.
 714 */
 715bool bdrv_is_root_node(BlockDriverState *bs)
 716{
 717    BdrvChild *c;
 718
 719    QLIST_FOREACH(c, &bs->parents, next_parent) {
 720        if (c->role != &child_root) {
 721            return false;
 722        }
 723    }
 724
 725    return true;
 726}
 727
 728/*
 729 * Return @blk's DriveInfo if any, else null.
 730 */
 731DriveInfo *blk_legacy_dinfo(BlockBackend *blk)
 732{
 733    return blk->legacy_dinfo;
 734}
 735
 736/*
 737 * Set @blk's DriveInfo to @dinfo, and return it.
 738 * @blk must not have a DriveInfo set already.
 739 * No other BlockBackend may have the same DriveInfo set.
 740 */
 741DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo)
 742{
 743    assert(!blk->legacy_dinfo);
 744    return blk->legacy_dinfo = dinfo;
 745}
 746
 747/*
 748 * Return the BlockBackend with DriveInfo @dinfo.
 749 * It must exist.
 750 */
 751BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
 752{
 753    BlockBackend *blk = NULL;
 754
 755    while ((blk = blk_next(blk)) != NULL) {
 756        if (blk->legacy_dinfo == dinfo) {
 757            return blk;
 758        }
 759    }
 760    abort();
 761}
 762
 763/*
 764 * Returns a pointer to the publicly accessible fields of @blk.
 765 */
 766BlockBackendPublic *blk_get_public(BlockBackend *blk)
 767{
 768    return &blk->public;
 769}
 770
 771/*
 772 * Returns a BlockBackend given the associated @public fields.
 773 */
 774BlockBackend *blk_by_public(BlockBackendPublic *public)
 775{
 776    return container_of(public, BlockBackend, public);
 777}
 778
 779/*
 780 * Disassociates the currently associated BlockDriverState from @blk.
 781 */
 782void blk_remove_bs(BlockBackend *blk)
 783{
 784    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
 785    BlockDriverState *bs;
 786
 787    notifier_list_notify(&blk->remove_bs_notifiers, blk);
 788    if (tgm->throttle_state) {
 789        bs = blk_bs(blk);
 790        bdrv_drained_begin(bs);
 791        throttle_group_detach_aio_context(tgm);
 792        throttle_group_attach_aio_context(tgm, qemu_get_aio_context());
 793        bdrv_drained_end(bs);
 794    }
 795
 796    blk_update_root_state(blk);
 797
 798    /* bdrv_root_unref_child() will cause blk->root to become stale and may
 799     * switch to a completion coroutine later on. Let's drain all I/O here
 800     * to avoid that and a potential QEMU crash.
 801     */
 802    blk_drain(blk);
 803    bdrv_root_unref_child(blk->root);
 804    blk->root = NULL;
 805}
 806
 807/*
 808 * Associates a new BlockDriverState with @blk.
 809 */
 810int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
 811{
 812    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
 813    bdrv_ref(bs);
 814    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk->ctx,
 815                                       blk->perm, blk->shared_perm, blk, errp);
 816    if (blk->root == NULL) {
 817        return -EPERM;
 818    }
 819
 820    notifier_list_notify(&blk->insert_bs_notifiers, blk);
 821    if (tgm->throttle_state) {
 822        throttle_group_detach_aio_context(tgm);
 823        throttle_group_attach_aio_context(tgm, bdrv_get_aio_context(bs));
 824    }
 825
 826    return 0;
 827}
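
/*
 * Sketch of swapping the medium under a backend, e.g. for removable media;
 * @new_bs stands for a node the caller already references, and error
 * handling is elided:
 *
 *     if (blk_bs(blk)) {
 *         blk_remove_bs(blk);
 *     }
 *     blk_insert_bs(blk, new_bs, errp);
 */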
 828
 829/*
 830 * Sets the permission bitmasks that the user of the BlockBackend needs.
 831 */
 832int blk_set_perm(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
 833                 Error **errp)
 834{
 835    int ret;
 836
 837    if (blk->root && !blk->disable_perm) {
 838        ret = bdrv_child_try_set_perm(blk->root, perm, shared_perm, errp);
 839        if (ret < 0) {
 840            return ret;
 841        }
 842    }
 843
 844    blk->perm = perm;
 845    blk->shared_perm = shared_perm;
 846
 847    return 0;
 848}
 849
 850void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm)
 851{
 852    *perm = blk->perm;
 853    *shared_perm = blk->shared_perm;
 854}
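
/*
 * Sketch of a backend user tightening its permissions, e.g. revoking a
 * shared write permission it no longer needs (the actual policy is up to
 * the device model):
 *
 *     uint64_t perm, shared;
 *     Error *local_err = NULL;
 *
 *     blk_get_perm(blk, &perm, &shared);
 *     if (blk_set_perm(blk, perm, shared & ~BLK_PERM_WRITE, &local_err) < 0) {
 *         error_report_err(local_err);
 *     }
 */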
 855
 856/*
 857 * Attach device model @dev to @blk.
 858 * Return 0 on success, -EBUSY when a device model is attached already.
 859 */
 860int blk_attach_dev(BlockBackend *blk, DeviceState *dev)
 861{
 862    if (blk->dev) {
 863        return -EBUSY;
 864    }
 865
 866    /* While migration is still incoming, we don't need to apply the
 867     * permissions of guest device BlockBackends. We might still have a block
 868     * job or NBD server writing to the image for storage migration. */
 869    if (runstate_check(RUN_STATE_INMIGRATE)) {
 870        blk->disable_perm = true;
 871    }
 872
 873    blk_ref(blk);
 874    blk->dev = dev;
 875    blk_iostatus_reset(blk);
 876
 877    return 0;
 878}
 879
 880/*
 881 * Detach device model @dev from @blk.
 882 * @dev must be currently attached to @blk.
 883 */
 884void blk_detach_dev(BlockBackend *blk, DeviceState *dev)
 885{
 886    assert(blk->dev == dev);
 887    blk->dev = NULL;
 888    blk->dev_ops = NULL;
 889    blk->dev_opaque = NULL;
 890    blk->guest_block_size = 512;
 891    blk_set_perm(blk, 0, BLK_PERM_ALL, &error_abort);
 892    blk_unref(blk);
 893}
 894
 895/*
 896 * Return the device model attached to @blk if any, else null.
 897 */
 898DeviceState *blk_get_attached_dev(BlockBackend *blk)
 899{
 900    return blk->dev;
 901}
 902
  903/* Return the qdev ID of the block device attached to the BlockBackend, or
  904 * the device's QOM path if no ID is assigned. */
 905char *blk_get_attached_dev_id(BlockBackend *blk)
 906{
 907    DeviceState *dev = blk->dev;
 908
 909    if (!dev) {
 910        return g_strdup("");
 911    } else if (dev->id) {
 912        return g_strdup(dev->id);
 913    }
 914
 915    return object_get_canonical_path(OBJECT(dev)) ?: g_strdup("");
 916}
 917
 918/*
 919 * Return the BlockBackend which has the device model @dev attached if it
 920 * exists, else null.
 921 *
 922 * @dev must not be null.
 923 */
 924BlockBackend *blk_by_dev(void *dev)
 925{
 926    BlockBackend *blk = NULL;
 927
 928    assert(dev != NULL);
 929    while ((blk = blk_all_next(blk)) != NULL) {
 930        if (blk->dev == dev) {
 931            return blk;
 932        }
 933    }
 934    return NULL;
 935}
 936
 937/*
 938 * Set @blk's device model callbacks to @ops.
 939 * @opaque is the opaque argument to pass to the callbacks.
 940 * This is for use by device models.
 941 */
 942void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
 943                     void *opaque)
 944{
 945    blk->dev_ops = ops;
 946    blk->dev_opaque = opaque;
 947
 948    /* Are we currently quiesced? Should we enforce this right now? */
 949    if (blk->quiesce_counter && ops->drained_begin) {
 950        ops->drained_begin(opaque);
 951    }
 952}
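
/*
 * Sketch of a device model hooking itself up to a backend. The callback
 * names are the ones this file invokes; the my_device_* helpers and @dev
 * are hypothetical, and the return value of blk_attach_dev() is ignored
 * here for brevity.
 *
 *     static const BlockDevOps my_device_block_ops = {
 *         .change_media_cb = my_device_change_media_cb,
 *         .resize_cb       = my_device_resize_cb,
 *         .drained_begin   = my_device_drained_begin,
 *         .drained_end     = my_device_drained_end,
 *     };
 *
 *     blk_attach_dev(blk, DEVICE(dev));
 *     blk_set_dev_ops(blk, &my_device_block_ops, dev);
 */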
 953
 954/*
 955 * Notify @blk's attached device model of media change.
 956 *
 957 * If @load is true, notify of media load. This action can fail, meaning that
 958 * the medium cannot be loaded. @errp is set then.
 959 *
 960 * If @load is false, notify of media eject. This can never fail.
 961 *
 962 * Also send DEVICE_TRAY_MOVED events as appropriate.
 963 */
 964void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp)
 965{
 966    if (blk->dev_ops && blk->dev_ops->change_media_cb) {
 967        bool tray_was_open, tray_is_open;
 968        Error *local_err = NULL;
 969
 970        tray_was_open = blk_dev_is_tray_open(blk);
 971        blk->dev_ops->change_media_cb(blk->dev_opaque, load, &local_err);
 972        if (local_err) {
 973            assert(load == true);
 974            error_propagate(errp, local_err);
 975            return;
 976        }
 977        tray_is_open = blk_dev_is_tray_open(blk);
 978
 979        if (tray_was_open != tray_is_open) {
 980            char *id = blk_get_attached_dev_id(blk);
 981            qapi_event_send_device_tray_moved(blk_name(blk), id, tray_is_open);
 982            g_free(id);
 983        }
 984    }
 985}
 986
 987static void blk_root_change_media(BdrvChild *child, bool load)
 988{
 989    blk_dev_change_media_cb(child->opaque, load, NULL);
 990}
 991
 992/*
 993 * Does @blk's attached device model have removable media?
 994 * %true if no device model is attached.
 995 */
 996bool blk_dev_has_removable_media(BlockBackend *blk)
 997{
 998    return !blk->dev || (blk->dev_ops && blk->dev_ops->change_media_cb);
 999}
1000
1001/*
1002 * Does @blk's attached device model have a tray?
1003 */
1004bool blk_dev_has_tray(BlockBackend *blk)
1005{
1006    return blk->dev_ops && blk->dev_ops->is_tray_open;
1007}
1008
1009/*
1010 * Notify @blk's attached device model of a media eject request.
1011 * If @force is true, the medium is about to be yanked out forcefully.
1012 */
1013void blk_dev_eject_request(BlockBackend *blk, bool force)
1014{
1015    if (blk->dev_ops && blk->dev_ops->eject_request_cb) {
1016        blk->dev_ops->eject_request_cb(blk->dev_opaque, force);
1017    }
1018}
1019
1020/*
1021 * Does @blk's attached device model have a tray, and is it open?
1022 */
1023bool blk_dev_is_tray_open(BlockBackend *blk)
1024{
1025    if (blk_dev_has_tray(blk)) {
1026        return blk->dev_ops->is_tray_open(blk->dev_opaque);
1027    }
1028    return false;
1029}
1030
1031/*
1032 * Does @blk's attached device model have the medium locked?
1033 * %false if the device model has no such lock.
1034 */
1035bool blk_dev_is_medium_locked(BlockBackend *blk)
1036{
1037    if (blk->dev_ops && blk->dev_ops->is_medium_locked) {
1038        return blk->dev_ops->is_medium_locked(blk->dev_opaque);
1039    }
1040    return false;
1041}
1042
1043/*
1044 * Notify @blk's attached device model of a backend size change.
1045 */
1046static void blk_root_resize(BdrvChild *child)
1047{
1048    BlockBackend *blk = child->opaque;
1049
1050    if (blk->dev_ops && blk->dev_ops->resize_cb) {
1051        blk->dev_ops->resize_cb(blk->dev_opaque);
1052    }
1053}
1054
1055void blk_iostatus_enable(BlockBackend *blk)
1056{
1057    blk->iostatus_enabled = true;
1058    blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
1059}
1060
1061/* The I/O status is only enabled if the drive explicitly
1062 * enables it _and_ the VM is configured to stop on errors */
1063bool blk_iostatus_is_enabled(const BlockBackend *blk)
1064{
1065    return (blk->iostatus_enabled &&
1066           (blk->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
1067            blk->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
1068            blk->on_read_error == BLOCKDEV_ON_ERROR_STOP));
1069}
1070
1071BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk)
1072{
1073    return blk->iostatus;
1074}
1075
1076void blk_iostatus_disable(BlockBackend *blk)
1077{
1078    blk->iostatus_enabled = false;
1079}
1080
1081void blk_iostatus_reset(BlockBackend *blk)
1082{
1083    if (blk_iostatus_is_enabled(blk)) {
1084        blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
1085    }
1086}
1087
1088void blk_iostatus_set_err(BlockBackend *blk, int error)
1089{
1090    assert(blk_iostatus_is_enabled(blk));
1091    if (blk->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
1092        blk->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
1093                                          BLOCK_DEVICE_IO_STATUS_FAILED;
1094    }
1095}
1096
1097void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow)
1098{
1099    blk->allow_write_beyond_eof = allow;
1100}
1101
1102void blk_set_allow_aio_context_change(BlockBackend *blk, bool allow)
1103{
1104    blk->allow_aio_context_change = allow;
1105}
1106
1107void blk_set_disable_request_queuing(BlockBackend *blk, bool disable)
1108{
1109    blk->disable_request_queuing = disable;
1110}
1111
1112static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
1113                                  size_t size)
1114{
1115    int64_t len;
1116
1117    if (size > INT_MAX) {
1118        return -EIO;
1119    }
1120
1121    if (!blk_is_available(blk)) {
1122        return -ENOMEDIUM;
1123    }
1124
1125    if (offset < 0) {
1126        return -EIO;
1127    }
1128
1129    if (!blk->allow_write_beyond_eof) {
1130        len = blk_getlength(blk);
1131        if (len < 0) {
1132            return len;
1133        }
1134
1135        if (offset > len || len - offset < size) {
1136            return -EIO;
1137        }
1138    }
1139
1140    return 0;
1141}
1142
1143/* To be called between exactly one pair of blk_inc/dec_in_flight() */
1144static void coroutine_fn blk_wait_while_drained(BlockBackend *blk)
1145{
1146    assert(blk->in_flight > 0);
1147
1148    if (blk->quiesce_counter && !blk->disable_request_queuing) {
1149        blk_dec_in_flight(blk);
1150        qemu_co_queue_wait(&blk->queued_requests, NULL);
1151        blk_inc_in_flight(blk);
1152    }
1153}
1154
1155/* To be called between exactly one pair of blk_inc/dec_in_flight() */
1156static int coroutine_fn
1157blk_do_preadv(BlockBackend *blk, int64_t offset, unsigned int bytes,
1158              QEMUIOVector *qiov, BdrvRequestFlags flags)
1159{
1160    int ret;
1161    BlockDriverState *bs;
1162
1163    blk_wait_while_drained(blk);
1164
1165    /* Call blk_bs() only after waiting, the graph may have changed */
1166    bs = blk_bs(blk);
1167    trace_blk_co_preadv(blk, bs, offset, bytes, flags);
1168
1169    ret = blk_check_byte_request(blk, offset, bytes);
1170    if (ret < 0) {
1171        return ret;
1172    }
1173
1174    bdrv_inc_in_flight(bs);
1175
1176    /* throttling disk I/O */
1177    if (blk->public.throttle_group_member.throttle_state) {
1178        throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
1179                bytes, false);
1180    }
1181
1182    ret = bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
1183    bdrv_dec_in_flight(bs);
1184    return ret;
1185}
1186
1187int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
1188                               unsigned int bytes, QEMUIOVector *qiov,
1189                               BdrvRequestFlags flags)
1190{
1191    int ret;
1192
1193    blk_inc_in_flight(blk);
1194    ret = blk_do_preadv(blk, offset, bytes, qiov, flags);
1195    blk_dec_in_flight(blk);
1196
1197    return ret;
1198}
1199
1200/* To be called between exactly one pair of blk_inc/dec_in_flight() */
1201static int coroutine_fn
1202blk_do_pwritev_part(BlockBackend *blk, int64_t offset, unsigned int bytes,
1203                    QEMUIOVector *qiov, size_t qiov_offset,
1204                    BdrvRequestFlags flags)
1205{
1206    int ret;
1207    BlockDriverState *bs;
1208
1209    blk_wait_while_drained(blk);
1210
1211    /* Call blk_bs() only after waiting, the graph may have changed */
1212    bs = blk_bs(blk);
1213    trace_blk_co_pwritev(blk, bs, offset, bytes, flags);
1214
1215    ret = blk_check_byte_request(blk, offset, bytes);
1216    if (ret < 0) {
1217        return ret;
1218    }
1219
1220    bdrv_inc_in_flight(bs);
1221    /* throttling disk I/O */
1222    if (blk->public.throttle_group_member.throttle_state) {
1223        throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
1224                bytes, true);
1225    }
1226
1227    if (!blk->enable_write_cache) {
1228        flags |= BDRV_REQ_FUA;
1229    }
1230
1231    ret = bdrv_co_pwritev_part(blk->root, offset, bytes, qiov, qiov_offset,
1232                               flags);
1233    bdrv_dec_in_flight(bs);
1234    return ret;
1235}
1236
1237int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset,
1238                                     unsigned int bytes,
1239                                     QEMUIOVector *qiov, size_t qiov_offset,
1240                                     BdrvRequestFlags flags)
1241{
1242    int ret;
1243
1244    blk_inc_in_flight(blk);
1245    ret = blk_do_pwritev_part(blk, offset, bytes, qiov, qiov_offset, flags);
1246    blk_dec_in_flight(blk);
1247
1248    return ret;
1249}
1250
1251int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
1252                                unsigned int bytes, QEMUIOVector *qiov,
1253                                BdrvRequestFlags flags)
1254{
1255    return blk_co_pwritev_part(blk, offset, bytes, qiov, 0, flags);
1256}
1257
1258typedef struct BlkRwCo {
1259    BlockBackend *blk;
1260    int64_t offset;
1261    void *iobuf;
1262    int ret;
1263    BdrvRequestFlags flags;
1264} BlkRwCo;
1265
1266static void blk_read_entry(void *opaque)
1267{
1268    BlkRwCo *rwco = opaque;
1269    QEMUIOVector *qiov = rwco->iobuf;
1270
1271    rwco->ret = blk_do_preadv(rwco->blk, rwco->offset, qiov->size,
1272                              qiov, rwco->flags);
1273    aio_wait_kick();
1274}
1275
1276static void blk_write_entry(void *opaque)
1277{
1278    BlkRwCo *rwco = opaque;
1279    QEMUIOVector *qiov = rwco->iobuf;
1280
1281    rwco->ret = blk_do_pwritev_part(rwco->blk, rwco->offset, qiov->size,
1282                                    qiov, 0, rwco->flags);
1283    aio_wait_kick();
1284}
1285
1286static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
1287                   int64_t bytes, CoroutineEntry co_entry,
1288                   BdrvRequestFlags flags)
1289{
1290    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
1291    BlkRwCo rwco = {
1292        .blk    = blk,
1293        .offset = offset,
1294        .iobuf  = &qiov,
1295        .flags  = flags,
1296        .ret    = NOT_DONE,
1297    };
1298
1299    blk_inc_in_flight(blk);
1300    if (qemu_in_coroutine()) {
1301        /* Fast-path if already in coroutine context */
1302        co_entry(&rwco);
1303    } else {
1304        Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
1305        bdrv_coroutine_enter(blk_bs(blk), co);
1306        BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
1307    }
1308    blk_dec_in_flight(blk);
1309
1310    return rwco.ret;
1311}
1312
1313int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
1314                      int bytes, BdrvRequestFlags flags)
1315{
1316    return blk_prw(blk, offset, NULL, bytes, blk_write_entry,
1317                   flags | BDRV_REQ_ZERO_WRITE);
1318}
1319
1320int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
1321{
1322    return bdrv_make_zero(blk->root, flags);
1323}
1324
1325void blk_inc_in_flight(BlockBackend *blk)
1326{
1327    atomic_inc(&blk->in_flight);
1328}
1329
1330void blk_dec_in_flight(BlockBackend *blk)
1331{
1332    atomic_dec(&blk->in_flight);
1333    aio_wait_kick();
1334}
1335
1336static void error_callback_bh(void *opaque)
1337{
1338    struct BlockBackendAIOCB *acb = opaque;
1339
1340    blk_dec_in_flight(acb->blk);
1341    acb->common.cb(acb->common.opaque, acb->ret);
1342    qemu_aio_unref(acb);
1343}
1344
1345BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
1346                                  BlockCompletionFunc *cb,
1347                                  void *opaque, int ret)
1348{
1349    struct BlockBackendAIOCB *acb;
1350
1351    blk_inc_in_flight(blk);
1352    acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque);
1353    acb->blk = blk;
1354    acb->ret = ret;
1355
1356    replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
1357                                     error_callback_bh, acb);
1358    return &acb->common;
1359}
1360
1361typedef struct BlkAioEmAIOCB {
1362    BlockAIOCB common;
1363    BlkRwCo rwco;
1364    int bytes;
1365    bool has_returned;
1366} BlkAioEmAIOCB;
1367
1368static const AIOCBInfo blk_aio_em_aiocb_info = {
1369    .aiocb_size         = sizeof(BlkAioEmAIOCB),
1370};
1371
1372static void blk_aio_complete(BlkAioEmAIOCB *acb)
1373{
1374    if (acb->has_returned) {
1375        acb->common.cb(acb->common.opaque, acb->rwco.ret);
1376        blk_dec_in_flight(acb->rwco.blk);
1377        qemu_aio_unref(acb);
1378    }
1379}
1380
1381static void blk_aio_complete_bh(void *opaque)
1382{
1383    BlkAioEmAIOCB *acb = opaque;
1384    assert(acb->has_returned);
1385    blk_aio_complete(acb);
1386}
1387
1388static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
1389                                void *iobuf, CoroutineEntry co_entry,
1390                                BdrvRequestFlags flags,
1391                                BlockCompletionFunc *cb, void *opaque)
1392{
1393    BlkAioEmAIOCB *acb;
1394    Coroutine *co;
1395
1396    blk_inc_in_flight(blk);
1397    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
1398    acb->rwco = (BlkRwCo) {
1399        .blk    = blk,
1400        .offset = offset,
1401        .iobuf  = iobuf,
1402        .flags  = flags,
1403        .ret    = NOT_DONE,
1404    };
1405    acb->bytes = bytes;
1406    acb->has_returned = false;
1407
1408    co = qemu_coroutine_create(co_entry, acb);
1409    bdrv_coroutine_enter(blk_bs(blk), co);
1410
1411    acb->has_returned = true;
1412    if (acb->rwco.ret != NOT_DONE) {
1413        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
1414                                         blk_aio_complete_bh, acb);
1415    }
1416
1417    return &acb->common;
1418}
1419
1420static void blk_aio_read_entry(void *opaque)
1421{
1422    BlkAioEmAIOCB *acb = opaque;
1423    BlkRwCo *rwco = &acb->rwco;
1424    QEMUIOVector *qiov = rwco->iobuf;
1425
1426    assert(qiov->size == acb->bytes);
1427    rwco->ret = blk_do_preadv(rwco->blk, rwco->offset, acb->bytes,
1428                              qiov, rwco->flags);
1429    blk_aio_complete(acb);
1430}
1431
1432static void blk_aio_write_entry(void *opaque)
1433{
1434    BlkAioEmAIOCB *acb = opaque;
1435    BlkRwCo *rwco = &acb->rwco;
1436    QEMUIOVector *qiov = rwco->iobuf;
1437
1438    assert(!qiov || qiov->size == acb->bytes);
1439    rwco->ret = blk_do_pwritev_part(rwco->blk, rwco->offset, acb->bytes,
1440                                    qiov, 0, rwco->flags);
1441    blk_aio_complete(acb);
1442}
1443
1444BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
1445                                  int count, BdrvRequestFlags flags,
1446                                  BlockCompletionFunc *cb, void *opaque)
1447{
1448    return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry,
1449                        flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
1450}
1451
1452int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count)
1453{
1454    int ret = blk_prw(blk, offset, buf, count, blk_read_entry, 0);
1455    if (ret < 0) {
1456        return ret;
1457    }
1458    return count;
1459}
1460
1461int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count,
1462               BdrvRequestFlags flags)
1463{
1464    int ret = blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
1465                      flags);
1466    if (ret < 0) {
1467        return ret;
1468    }
1469    return count;
1470}
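
/*
 * Sketch of the synchronous helpers above; offset and buffer size are
 * arbitrary. Both return the byte count on success and a negative errno
 * on failure:
 *
 *     uint8_t buf[512];
 *     int ret;
 *
 *     ret = blk_pread(blk, 0, buf, sizeof(buf));
 *     if (ret < 0) {
 *         return ret;
 *     }
 *     ...
 *     ret = blk_pwrite(blk, 0, buf, sizeof(buf), 0);
 */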
1471
1472int64_t blk_getlength(BlockBackend *blk)
1473{
1474    if (!blk_is_available(blk)) {
1475        return -ENOMEDIUM;
1476    }
1477
1478    return bdrv_getlength(blk_bs(blk));
1479}
1480
1481void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr)
1482{
1483    if (!blk_bs(blk)) {
1484        *nb_sectors_ptr = 0;
1485    } else {
1486        bdrv_get_geometry(blk_bs(blk), nb_sectors_ptr);
1487    }
1488}
1489
1490int64_t blk_nb_sectors(BlockBackend *blk)
1491{
1492    if (!blk_is_available(blk)) {
1493        return -ENOMEDIUM;
1494    }
1495
1496    return bdrv_nb_sectors(blk_bs(blk));
1497}
1498
1499BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset,
1500                           QEMUIOVector *qiov, BdrvRequestFlags flags,
1501                           BlockCompletionFunc *cb, void *opaque)
1502{
1503    return blk_aio_prwv(blk, offset, qiov->size, qiov,
1504                        blk_aio_read_entry, flags, cb, opaque);
1505}
1506
1507BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
1508                            QEMUIOVector *qiov, BdrvRequestFlags flags,
1509                            BlockCompletionFunc *cb, void *opaque)
1510{
1511    return blk_aio_prwv(blk, offset, qiov->size, qiov,
1512                        blk_aio_write_entry, flags, cb, opaque);
1513}
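
/*
 * Sketch of an asynchronous read using the helpers above. The completion
 * callback receives 0 or a negative errno; my_read_complete, buf, len,
 * offset and opaque are placeholders.
 *
 *     static void my_read_complete(void *opaque, int ret)
 *     {
 *         ...
 *     }
 *
 *     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, len);
 *     blk_aio_preadv(blk, offset, &qiov, 0, my_read_complete, opaque);
 */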
1514
1515void blk_aio_cancel(BlockAIOCB *acb)
1516{
1517    bdrv_aio_cancel(acb);
1518}
1519
1520void blk_aio_cancel_async(BlockAIOCB *acb)
1521{
1522    bdrv_aio_cancel_async(acb);
1523}
1524
1525/* To be called between exactly one pair of blk_inc/dec_in_flight() */
1526static int coroutine_fn
1527blk_do_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
1528{
1529    blk_wait_while_drained(blk);
1530
1531    if (!blk_is_available(blk)) {
1532        return -ENOMEDIUM;
1533    }
1534
1535    return bdrv_co_ioctl(blk_bs(blk), req, buf);
1536}
1537
1538static void blk_ioctl_entry(void *opaque)
1539{
1540    BlkRwCo *rwco = opaque;
1541    QEMUIOVector *qiov = rwco->iobuf;
1542
1543    rwco->ret = blk_do_ioctl(rwco->blk, rwco->offset, qiov->iov[0].iov_base);
1544    aio_wait_kick();
1545}
1546
1547int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
1548{
1549    return blk_prw(blk, req, buf, 0, blk_ioctl_entry, 0);
1550}
1551
1552static void blk_aio_ioctl_entry(void *opaque)
1553{
1554    BlkAioEmAIOCB *acb = opaque;
1555    BlkRwCo *rwco = &acb->rwco;
1556
1557    rwco->ret = blk_do_ioctl(rwco->blk, rwco->offset, rwco->iobuf);
1558
1559    blk_aio_complete(acb);
1560}
1561
1562BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
1563                          BlockCompletionFunc *cb, void *opaque)
1564{
1565    return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque);
1566}
1567
1568/* To be called between exactly one pair of blk_inc/dec_in_flight() */
1569static int coroutine_fn
1570blk_do_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
1571{
1572    int ret;
1573
1574    blk_wait_while_drained(blk);
1575
1576    ret = blk_check_byte_request(blk, offset, bytes);
1577    if (ret < 0) {
1578        return ret;
1579    }
1580
1581    return bdrv_co_pdiscard(blk->root, offset, bytes);
1582}
1583
1584static void blk_aio_pdiscard_entry(void *opaque)
1585{
1586    BlkAioEmAIOCB *acb = opaque;
1587    BlkRwCo *rwco = &acb->rwco;
1588
1589    rwco->ret = blk_do_pdiscard(rwco->blk, rwco->offset, acb->bytes);
1590    blk_aio_complete(acb);
1591}
1592
1593BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk,
1594                             int64_t offset, int bytes,
1595                             BlockCompletionFunc *cb, void *opaque)
1596{
1597    return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_pdiscard_entry, 0,
1598                        cb, opaque);
1599}
1600
1601int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
1602{
1603    int ret;
1604
1605    blk_inc_in_flight(blk);
1606    ret = blk_do_pdiscard(blk, offset, bytes);
1607    blk_dec_in_flight(blk);
1608
1609    return ret;
1610}
1611
1612static void blk_pdiscard_entry(void *opaque)
1613{
1614    BlkRwCo *rwco = opaque;
1615    QEMUIOVector *qiov = rwco->iobuf;
1616
1617    rwco->ret = blk_do_pdiscard(rwco->blk, rwco->offset, qiov->size);
1618    aio_wait_kick();
1619}
1620
1621int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
1622{
1623    return blk_prw(blk, offset, NULL, bytes, blk_pdiscard_entry, 0);
1624}
1625
1626/* To be called between exactly one pair of blk_inc/dec_in_flight() */
1627static int coroutine_fn blk_do_flush(BlockBackend *blk)
1628{
1629    blk_wait_while_drained(blk);
1630
1631    if (!blk_is_available(blk)) {
1632        return -ENOMEDIUM;
1633    }
1634
1635    return bdrv_co_flush(blk_bs(blk));
1636}
1637
1638static void blk_aio_flush_entry(void *opaque)
1639{
1640    BlkAioEmAIOCB *acb = opaque;
1641    BlkRwCo *rwco = &acb->rwco;
1642
1643    rwco->ret = blk_do_flush(rwco->blk);
1644    blk_aio_complete(acb);
1645}
1646
1647BlockAIOCB *blk_aio_flush(BlockBackend *blk,
1648                          BlockCompletionFunc *cb, void *opaque)
1649{
1650    return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque);
1651}
1652
1653int coroutine_fn blk_co_flush(BlockBackend *blk)
1654{
1655    int ret;
1656
1657    blk_inc_in_flight(blk);
1658    ret = blk_do_flush(blk);
1659    blk_dec_in_flight(blk);
1660
1661    return ret;
1662}
1663
1664static void blk_flush_entry(void *opaque)
1665{
1666    BlkRwCo *rwco = opaque;
1667    rwco->ret = blk_do_flush(rwco->blk);
1668    aio_wait_kick();
1669}
1670
1671int blk_flush(BlockBackend *blk)
1672{
1673    return blk_prw(blk, 0, NULL, 0, blk_flush_entry, 0);
1674}
1675
1676void blk_drain(BlockBackend *blk)
1677{
1678    BlockDriverState *bs = blk_bs(blk);
1679
1680    if (bs) {
1681        bdrv_drained_begin(bs);
1682    }
1683
1684    /* We may have -ENOMEDIUM completions in flight */
1685    AIO_WAIT_WHILE(blk_get_aio_context(blk),
1686                   atomic_mb_read(&blk->in_flight) > 0);
1687
1688    if (bs) {
1689        bdrv_drained_end(bs);
1690    }
1691}
1692
1693void blk_drain_all(void)
1694{
1695    BlockBackend *blk = NULL;
1696
1697    bdrv_drain_all_begin();
1698
1699    while ((blk = blk_all_next(blk)) != NULL) {
1700        AioContext *ctx = blk_get_aio_context(blk);
1701
1702        aio_context_acquire(ctx);
1703
1704        /* We may have -ENOMEDIUM completions in flight */
1705        AIO_WAIT_WHILE(ctx, atomic_mb_read(&blk->in_flight) > 0);
1706
1707        aio_context_release(ctx);
1708    }
1709
1710    bdrv_drain_all_end();
1711}
1712
1713void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error,
1714                      BlockdevOnError on_write_error)
1715{
1716    blk->on_read_error = on_read_error;
1717    blk->on_write_error = on_write_error;
1718}
1719
1720BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read)
1721{
1722    return is_read ? blk->on_read_error : blk->on_write_error;
1723}
1724
1725BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read,
1726                                      int error)
1727{
1728    BlockdevOnError on_err = blk_get_on_error(blk, is_read);
1729
1730    switch (on_err) {
1731    case BLOCKDEV_ON_ERROR_ENOSPC:
1732        return (error == ENOSPC) ?
1733               BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
1734    case BLOCKDEV_ON_ERROR_STOP:
1735        return BLOCK_ERROR_ACTION_STOP;
1736    case BLOCKDEV_ON_ERROR_REPORT:
1737        return BLOCK_ERROR_ACTION_REPORT;
1738    case BLOCKDEV_ON_ERROR_IGNORE:
1739        return BLOCK_ERROR_ACTION_IGNORE;
1740    case BLOCKDEV_ON_ERROR_AUTO:
1741    default:
1742        abort();
1743    }
1744}
1745
1746static void send_qmp_error_event(BlockBackend *blk,
1747                                 BlockErrorAction action,
1748                                 bool is_read, int error)
1749{
1750    IoOperationType optype;
1751    BlockDriverState *bs = blk_bs(blk);
1752
1753    optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
1754    qapi_event_send_block_io_error(blk_name(blk), !!bs,
1755                                   bs ? bdrv_get_node_name(bs) : NULL, optype,
1756                                   action, blk_iostatus_is_enabled(blk),
1757                                   error == ENOSPC, strerror(error));
1758}
1759
1760/* This is done by device models because, while the block layer knows
1761 * about the error, it does not know whether an operation comes from
1762 * the device or the block layer (from a job, for example).
1763 */
1764void blk_error_action(BlockBackend *blk, BlockErrorAction action,
1765                      bool is_read, int error)
1766{
1767    assert(error >= 0);
1768
1769    if (action == BLOCK_ERROR_ACTION_STOP) {
1770        /* First set the iostatus, so that "info block" returns an iostatus
1771         * that matches the events raised so far (an additional error iostatus
1772         * is fine, but not a lost one).
1773         */
1774        blk_iostatus_set_err(blk, error);
1775
1776        /* Then raise the request to stop the VM and the event.
1777         * qemu_system_vmstop_request_prepare has two effects.  First,
1778         * it ensures that the STOP event always comes after the
1779         * BLOCK_IO_ERROR event.  Second, it ensures that even if management
1780         * can observe the STOP event and do a "cont" before the STOP
1781         * event is issued, the VM will not stop.  In this case, vm_start()
1782         * also ensures that the STOP/RESUME pair of events is emitted.
1783         */
1784        qemu_system_vmstop_request_prepare();
1785        send_qmp_error_event(blk, action, is_read, error);
1786        qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
1787    } else {
1788        send_qmp_error_event(blk, action, is_read, error);
1789    }
1790}
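
/*
 * Roughly the pattern device models follow when a request fails; @ret is
 * the negative errno of the failed request, and the retry/completion
 * details are device specific:
 *
 *     BlockErrorAction action = blk_get_error_action(blk, is_read, -ret);
 *
 *     if (action == BLOCK_ERROR_ACTION_STOP) {
 *         ... queue the request so it can be retried later ...
 *     }
 *     blk_error_action(blk, action, is_read, -ret);
 *     if (action == BLOCK_ERROR_ACTION_REPORT) {
 *         ... complete the request with an error ...
 *     }
 */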
1791
1792bool blk_is_read_only(BlockBackend *blk)
1793{
1794    BlockDriverState *bs = blk_bs(blk);
1795
1796    if (bs) {
1797        return bdrv_is_read_only(bs);
1798    } else {
1799        return blk->root_state.read_only;
1800    }
1801}
1802
1803bool blk_is_sg(BlockBackend *blk)
1804{
1805    BlockDriverState *bs = blk_bs(blk);
1806
1807    if (!bs) {
1808        return false;
1809    }
1810
1811    return bdrv_is_sg(bs);
1812}
1813
1814bool blk_enable_write_cache(BlockBackend *blk)
1815{
1816    return blk->enable_write_cache;
1817}
1818
1819void blk_set_enable_write_cache(BlockBackend *blk, bool wce)
1820{
1821    blk->enable_write_cache = wce;
1822}
1823
1824void blk_invalidate_cache(BlockBackend *blk, Error **errp)
1825{
1826    BlockDriverState *bs = blk_bs(blk);
1827
1828    if (!bs) {
1829        error_setg(errp, "Device '%s' has no medium", blk->name);
1830        return;
1831    }
1832
1833    bdrv_invalidate_cache(bs, errp);
1834}
1835
1836bool blk_is_inserted(BlockBackend *blk)
1837{
1838    BlockDriverState *bs = blk_bs(blk);
1839
1840    return bs && bdrv_is_inserted(bs);
1841}
1842
1843bool blk_is_available(BlockBackend *blk)
1844{
1845    return blk_is_inserted(blk) && !blk_dev_is_tray_open(blk);
1846}
1847
1848void blk_lock_medium(BlockBackend *blk, bool locked)
1849{
1850    BlockDriverState *bs = blk_bs(blk);
1851
1852    if (bs) {
1853        bdrv_lock_medium(bs, locked);
1854    }
1855}
1856
1857void blk_eject(BlockBackend *blk, bool eject_flag)
1858{
1859    BlockDriverState *bs = blk_bs(blk);
1860    char *id;
1861
1862    if (bs) {
1863        bdrv_eject(bs, eject_flag);
1864    }
1865
1866    /* Whether or not we ejected on the backend,
1867     * the frontend experienced a tray event. */
1868    id = blk_get_attached_dev_id(blk);
1869    qapi_event_send_device_tray_moved(blk_name(blk), id,
1870                                      eject_flag);
1871    g_free(id);
1872}
1873
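/* Return the BDRV_O_* flags of the attached node, or the flags saved in the
 * root state when no node is attached. */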
1874int blk_get_flags(BlockBackend *blk)
1875{
1876    BlockDriverState *bs = blk_bs(blk);
1877
1878    if (bs) {
1879        return bdrv_get_flags(bs);
1880    } else {
1881        return blk->root_state.open_flags;
1882    }
1883}
1884
1885/* Returns the minimum request alignment, in bytes; guaranteed nonzero */
1886uint32_t blk_get_request_alignment(BlockBackend *blk)
1887{
1888    BlockDriverState *bs = blk_bs(blk);
1889    return bs ? bs->bl.request_alignment : BDRV_SECTOR_SIZE;
1890}
1891
1892/* Returns the maximum transfer length, in bytes; guaranteed nonzero */
1893uint32_t blk_get_max_transfer(BlockBackend *blk)
1894{
1895    BlockDriverState *bs = blk_bs(blk);
1896    uint32_t max = 0;
1897
1898    if (bs) {
1899        max = bs->bl.max_transfer;
1900    }
1901    return MIN_NON_ZERO(max, INT_MAX);
1902}
1903
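/* Maximum number of iovec elements per request; only valid while a root node
 * is attached. */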
1904int blk_get_max_iov(BlockBackend *blk)
1905{
1906    return blk->root->bs->bl.max_iov;
1907}
1908
1909void blk_set_guest_block_size(BlockBackend *blk, int align)
1910{
1911    blk->guest_block_size = align;
1912}
1913
1914void *blk_try_blockalign(BlockBackend *blk, size_t size)
1915{
1916    return qemu_try_blockalign(blk ? blk_bs(blk) : NULL, size);
1917}
1918
1919void *blk_blockalign(BlockBackend *blk, size_t size)
1920{
1921    return qemu_blockalign(blk ? blk_bs(blk) : NULL, size);
1922}
1923
1924bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
1925{
1926    BlockDriverState *bs = blk_bs(blk);
1927
1928    if (!bs) {
1929        return false;
1930    }
1931
1932    return bdrv_op_is_blocked(bs, op, errp);
1933}
1934
1935void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason)
1936{
1937    BlockDriverState *bs = blk_bs(blk);
1938
1939    if (bs) {
1940        bdrv_op_unblock(bs, op, reason);
1941    }
1942}
1943
1944void blk_op_block_all(BlockBackend *blk, Error *reason)
1945{
1946    BlockDriverState *bs = blk_bs(blk);
1947
1948    if (bs) {
1949        bdrv_op_block_all(bs, reason);
1950    }
1951}
1952
1953void blk_op_unblock_all(BlockBackend *blk, Error *reason)
1954{
1955    BlockDriverState *bs = blk_bs(blk);
1956
1957    if (bs) {
1958        bdrv_op_unblock_all(bs, reason);
1959    }
1960}
1961
1962AioContext *blk_get_aio_context(BlockBackend *blk)
1963{
1964    BlockDriverState *bs = blk_bs(blk);
1965
1966    if (bs) {
1967        AioContext *ctx = bdrv_get_aio_context(blk_bs(blk));
1968        assert(ctx == blk->ctx);
1969    }
1970
1971    return blk->ctx;
1972}
1973
1974static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
1975{
1976    BlockBackendAIOCB *blk_acb = DO_UPCAST(BlockBackendAIOCB, common, acb);
1977    return blk_get_aio_context(blk_acb->blk);
1978}
1979
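/* Move @blk to @new_context.  If @update_root_node is true, the attached node
 * is moved as well; the throttle group member, if any, is re-attached to the
 * new context while the node is drained. */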
1980static int blk_do_set_aio_context(BlockBackend *blk, AioContext *new_context,
1981                                  bool update_root_node, Error **errp)
1982{
1983    BlockDriverState *bs = blk_bs(blk);
1984    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
1985    int ret;
1986
1987    if (bs) {
1988        if (update_root_node) {
1989            ret = bdrv_child_try_set_aio_context(bs, new_context, blk->root,
1990                                                 errp);
1991            if (ret < 0) {
1992                return ret;
1993            }
1994        }
1995        if (tgm->throttle_state) {
1996            bdrv_drained_begin(bs);
1997            throttle_group_detach_aio_context(tgm);
1998            throttle_group_attach_aio_context(tgm, new_context);
1999            bdrv_drained_end(bs);
2000        }
2001    }
2002
2003    blk->ctx = new_context;
2004    return 0;
2005}
2006
2007int blk_set_aio_context(BlockBackend *blk, AioContext *new_context,
2008                        Error **errp)
2009{
2010    return blk_do_set_aio_context(blk, new_context, true, errp);
2011}
2012
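/* BdrvChild callback: may the root node move @blk to a different AioContext?
 * Allowed when the change was explicitly permitted, or when this BlockBackend
 * has a name but no attached device model, i.e. it was created manually and
 * is not in use. */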
2013static bool blk_root_can_set_aio_ctx(BdrvChild *child, AioContext *ctx,
2014                                     GSList **ignore, Error **errp)
2015{
2016    BlockBackend *blk = child->opaque;
2017
2018    if (blk->allow_aio_context_change) {
2019        return true;
2020    }
2021
2022    /* Only manually created BlockBackends that are not attached to anything
2023     * can change their AioContext without updating their user. */
2024    if (!blk->name || blk->dev) {
2025        /* TODO Add BB name/QOM path */
2026        error_setg(errp, "Cannot change iothread of active block backend");
2027        return false;
2028    }
2029
2030    return true;
2031}
2032
2033static void blk_root_set_aio_ctx(BdrvChild *child, AioContext *ctx,
2034                                 GSList **ignore)
2035{
2036    BlockBackend *blk = child->opaque;
2037    blk_do_set_aio_context(blk, ctx, false, &error_abort);
2038}
2039
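/* Register callbacks to run when the AioContext of the root node changes.
 * The notifier is recorded on the BlockBackend in addition to the attached
 * node, so that it can follow the backend across root node changes. */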
2040void blk_add_aio_context_notifier(BlockBackend *blk,
2041        void (*attached_aio_context)(AioContext *new_context, void *opaque),
2042        void (*detach_aio_context)(void *opaque), void *opaque)
2043{
2044    BlockBackendAioNotifier *notifier;
2045    BlockDriverState *bs = blk_bs(blk);
2046
2047    notifier = g_new(BlockBackendAioNotifier, 1);
2048    notifier->attached_aio_context = attached_aio_context;
2049    notifier->detach_aio_context = detach_aio_context;
2050    notifier->opaque = opaque;
2051    QLIST_INSERT_HEAD(&blk->aio_notifiers, notifier, list);
2052
2053    if (bs) {
2054        bdrv_add_aio_context_notifier(bs, attached_aio_context,
2055                                      detach_aio_context, opaque);
2056    }
2057}
2058
2059void blk_remove_aio_context_notifier(BlockBackend *blk,
2060                                     void (*attached_aio_context)(AioContext *,
2061                                                                  void *),
2062                                     void (*detach_aio_context)(void *),
2063                                     void *opaque)
2064{
2065    BlockBackendAioNotifier *notifier;
2066    BlockDriverState *bs = blk_bs(blk);
2067
2068    if (bs) {
2069        bdrv_remove_aio_context_notifier(bs, attached_aio_context,
2070                                         detach_aio_context, opaque);
2071    }
2072
2073    QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
2074        if (notifier->attached_aio_context == attached_aio_context &&
2075            notifier->detach_aio_context == detach_aio_context &&
2076            notifier->opaque == opaque) {
2077            QLIST_REMOVE(notifier, list);
2078            g_free(notifier);
2079            return;
2080        }
2081    }
2082
2083    abort();
2084}
2085
2086void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify)
2087{
2088    notifier_list_add(&blk->remove_bs_notifiers, notify);
2089}
2090
2091void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify)
2092{
2093    notifier_list_add(&blk->insert_bs_notifiers, notify);
2094}
2095
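/* Hint to the attached node that a burst of requests is about to be
 * submitted (plug) or has been submitted (unplug), so the driver can batch
 * their submission. */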
2096void blk_io_plug(BlockBackend *blk)
2097{
2098    BlockDriverState *bs = blk_bs(blk);
2099
2100    if (bs) {
2101        bdrv_io_plug(bs);
2102    }
2103}
2104
2105void blk_io_unplug(BlockBackend *blk)
2106{
2107    BlockDriverState *bs = blk_bs(blk);
2108
2109    if (bs) {
2110        bdrv_io_unplug(bs);
2111    }
2112}
2113
2114BlockAcctStats *blk_get_stats(BlockBackend *blk)
2115{
2116    return &blk->stats;
2117}
2118
2119void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
2120                  BlockCompletionFunc *cb, void *opaque)
2121{
2122    return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
2123}
2124
2125int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
2126                                      int bytes, BdrvRequestFlags flags)
2127{
2128    return blk_co_pwritev(blk, offset, bytes, NULL,
2129                          flags | BDRV_REQ_ZERO_WRITE);
2130}
2131
2132int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf,
2133                          int count)
2134{
2135    return blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
2136                   BDRV_REQ_WRITE_COMPRESSED);
2137}
2138
2139int blk_truncate(BlockBackend *blk, int64_t offset, bool exact,
2140                 PreallocMode prealloc, Error **errp)
2141{
2142    if (!blk_is_available(blk)) {
2143        error_setg(errp, "No medium inserted");
2144        return -ENOMEDIUM;
2145    }
2146
2147    return bdrv_truncate(blk->root, offset, exact, prealloc, errp);
2148}
2149
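/* Write VM state to the attached node at @pos.  If the write cache is
 * disabled and the full @size was written, flush so the state is stable on
 * return. */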
2150int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
2151                     int64_t pos, int size)
2152{
2153    int ret;
2154
2155    if (!blk_is_available(blk)) {
2156        return -ENOMEDIUM;
2157    }
2158
2159    ret = bdrv_save_vmstate(blk_bs(blk), buf, pos, size);
2160    if (ret < 0) {
2161        return ret;
2162    }
2163
2164    if (ret == size && !blk->enable_write_cache) {
2165        ret = bdrv_flush(blk_bs(blk));
2166    }
2167
2168    return ret < 0 ? ret : size;
2169}
2170
2171int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size)
2172{
2173    if (!blk_is_available(blk)) {
2174        return -ENOMEDIUM;
2175    }
2176
2177    return bdrv_load_vmstate(blk_bs(blk), buf, pos, size);
2178}
2179
2180int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz)
2181{
2182    if (!blk_is_available(blk)) {
2183        return -ENOMEDIUM;
2184    }
2185
2186    return bdrv_probe_blocksizes(blk_bs(blk), bsz);
2187}
2188
2189int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo)
2190{
2191    if (!blk_is_available(blk)) {
2192        return -ENOMEDIUM;
2193    }
2194
2195    return bdrv_probe_geometry(blk_bs(blk), geo);
2196}
2197
2198/*
2199 * Updates the BlockBackendRootState object with data from the currently
2200 * attached BlockDriverState.
2201 */
2202void blk_update_root_state(BlockBackend *blk)
2203{
2204    assert(blk->root);
2205
2206    blk->root_state.open_flags    = blk->root->bs->open_flags;
2207    blk->root_state.read_only     = blk->root->bs->read_only;
2208    blk->root_state.detect_zeroes = blk->root->bs->detect_zeroes;
2209}
2210
2211/*
2212 * Returns the detect-zeroes setting to be used for bdrv_open() of a
2213 * BlockDriverState which is supposed to inherit the root state.
2214 */
2215bool blk_get_detect_zeroes_from_root_state(BlockBackend *blk)
2216{
2217    return blk->root_state.detect_zeroes;
2218}
2219
2220/*
2221 * Returns the flags to be used for bdrv_open() of a BlockDriverState which is
2222 * supposed to inherit the root state.
2223 */
2224int blk_get_open_flags_from_root_state(BlockBackend *blk)
2225{
2226    int bs_flags;
2227
2228    bs_flags = blk->root_state.read_only ? 0 : BDRV_O_RDWR;
2229    bs_flags |= blk->root_state.open_flags & ~BDRV_O_RDWR;
2230
2231    return bs_flags;
2232}
2233
2234BlockBackendRootState *blk_get_root_state(BlockBackend *blk)
2235{
2236    return &blk->root_state;
2237}
2238
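/* For every inserted BlockBackend whose root node has a backing file, commit
 * the root node's changes into that backing file.  Returns the first error
 * encountered, or 0. */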
2239int blk_commit_all(void)
2240{
2241    BlockBackend *blk = NULL;
2242
2243    while ((blk = blk_all_next(blk)) != NULL) {
2244        AioContext *aio_context = blk_get_aio_context(blk);
2245
2246        aio_context_acquire(aio_context);
2247        if (blk_is_inserted(blk) && blk->root->bs->backing) {
2248            int ret = bdrv_commit(blk->root->bs);
2249            if (ret < 0) {
2250                aio_context_release(aio_context);
2251                return ret;
2252            }
2253        }
2254        aio_context_release(aio_context);
2255    }
2256    return 0;
2257}
2258
2259
2260/* throttling disk I/O limits */
2261void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
2262{
2263    throttle_group_config(&blk->public.throttle_group_member, cfg);
2264}
2265
2266void blk_io_limits_disable(BlockBackend *blk)
2267{
2268    BlockDriverState *bs = blk_bs(blk);
2269    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2270    assert(tgm->throttle_state);
2271    if (bs) {
2272        bdrv_drained_begin(bs);
2273    }
2274    throttle_group_unregister_tgm(tgm);
2275    if (bs) {
2276        bdrv_drained_end(bs);
2277    }
2278}
2279
2280/* Should be called before blk_set_io_limits() if a limit is set */
2281void blk_io_limits_enable(BlockBackend *blk, const char *group)
2282{
2283    assert(!blk->public.throttle_group_member.throttle_state);
2284    throttle_group_register_tgm(&blk->public.throttle_group_member,
2285                                group, blk_get_aio_context(blk));
2286}
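
/* Illustrative sketch of enabling throttling on a backend; the group name
 * "group0" and the 100 IOPS limit are made-up example values:
 *
 *     ThrottleConfig cfg;
 *
 *     throttle_config_init(&cfg);
 *     cfg.buckets[THROTTLE_OPS_TOTAL].avg = 100;
 *     blk_io_limits_enable(blk, "group0");
 *     blk_set_io_limits(blk, &cfg);
 */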
2287
2288void blk_io_limits_update_group(BlockBackend *blk, const char *group)
2289{
2290    /* this BB is not part of any group */
2291    if (!blk->public.throttle_group_member.throttle_state) {
2292        return;
2293    }
2294
2295    /* this BB is part of the same group as the one we want */
2296    if (!g_strcmp0(throttle_group_get_name(&blk->public.throttle_group_member),
2297                group)) {
2298        return;
2299    }
2300
2301    /* need to change the group this BlockBackend belongs to */
2302    blk_io_limits_disable(blk);
2303    blk_io_limits_enable(blk, group);
2304}
2305
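/* BdrvChild callbacks for drained sections on the root node: on the first
 * (outermost) begin the device model is notified and throttling is lifted so
 * pending requests can complete; on the matching end the device model is
 * notified again and requests queued while quiesced are resumed. */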
2306static void blk_root_drained_begin(BdrvChild *child)
2307{
2308    BlockBackend *blk = child->opaque;
2309
2310    if (++blk->quiesce_counter == 1) {
2311        if (blk->dev_ops && blk->dev_ops->drained_begin) {
2312            blk->dev_ops->drained_begin(blk->dev_opaque);
2313        }
2314    }
2315
2316    /* Note that blk->root may not be accessible here yet if we are just
2317     * attaching to a BlockDriverState that is drained. Use child instead. */
2318
2319    if (atomic_fetch_inc(&blk->public.throttle_group_member.io_limits_disabled) == 0) {
2320        throttle_group_restart_tgm(&blk->public.throttle_group_member);
2321    }
2322}
2323
2324static bool blk_root_drained_poll(BdrvChild *child)
2325{
2326    BlockBackend *blk = child->opaque;
2327    assert(blk->quiesce_counter);
2328    return !!blk->in_flight;
2329}
2330
2331static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter)
2332{
2333    BlockBackend *blk = child->opaque;
2334    assert(blk->quiesce_counter);
2335
2336    assert(blk->public.throttle_group_member.io_limits_disabled);
2337    atomic_dec(&blk->public.throttle_group_member.io_limits_disabled);
2338
2339    if (--blk->quiesce_counter == 0) {
2340        if (blk->dev_ops && blk->dev_ops->drained_end) {
2341            blk->dev_ops->drained_end(blk->dev_opaque);
2342        }
2343        while (qemu_co_enter_next(&blk->queued_requests, NULL)) {
2344            /* Resume all queued requests */
2345        }
2346    }
2347}
2348
2349void blk_register_buf(BlockBackend *blk, void *host, size_t size)
2350{
2351    bdrv_register_buf(blk_bs(blk), host, size);
2352}
2353
2354void blk_unregister_buf(BlockBackend *blk, void *host)
2355{
2356    bdrv_unregister_buf(blk_bs(blk), host);
2357}
2358
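/* Offload a copy of @bytes from @blk_in/@off_in to @blk_out/@off_out.  Both
 * ranges are checked against their backend before the request is forwarded
 * to bdrv_co_copy_range(). */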
2359int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in,
2360                                   BlockBackend *blk_out, int64_t off_out,
2361                                   int bytes, BdrvRequestFlags read_flags,
2362                                   BdrvRequestFlags write_flags)
2363{
2364    int r;
2365    r = blk_check_byte_request(blk_in, off_in, bytes);
2366    if (r) {
2367        return r;
2368    }
2369    r = blk_check_byte_request(blk_out, off_out, bytes);
2370    if (r) {
2371        return r;
2372    }
2373    return bdrv_co_copy_range(blk_in->root, off_in,
2374                              blk_out->root, off_out,
2375                              bytes, read_flags, write_flags);
2376}
2377
2378const BdrvChild *blk_root(BlockBackend *blk)
2379{
2380    return blk->root;
2381}
2382