qemu/block/block-backend.c
   1/*
   2 * QEMU Block backends
   3 *
   4 * Copyright (C) 2014-2016 Red Hat, Inc.
   5 *
   6 * Authors:
   7 *  Markus Armbruster <armbru@redhat.com>,
   8 *
   9 * This work is licensed under the terms of the GNU LGPL, version 2.1
  10 * or later.  See the COPYING.LIB file in the top-level directory.
  11 */
  12
  13#include "qemu/osdep.h"
  14#include "sysemu/block-backend.h"
  15#include "block/block_int.h"
  16#include "block/blockjob.h"
  17#include "block/throttle-groups.h"
  18#include "hw/qdev-core.h"
  19#include "sysemu/blockdev.h"
  20#include "sysemu/runstate.h"
  21#include "sysemu/sysemu.h"
  22#include "sysemu/replay.h"
  23#include "qapi/error.h"
  24#include "qapi/qapi-events-block.h"
  25#include "qemu/id.h"
  26#include "qemu/main-loop.h"
  27#include "qemu/option.h"
  28#include "trace.h"
  29#include "migration/misc.h"
  30
  31/* Number of coroutines to reserve per attached device model */
  32#define COROUTINE_POOL_RESERVATION 64
  33
  34#define NOT_DONE 0x7fffffff /* used while an emulated sync operation is in progress */
  35
  36static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb);
  37
  38typedef struct BlockBackendAioNotifier {
  39    void (*attached_aio_context)(AioContext *new_context, void *opaque);
  40    void (*detach_aio_context)(void *opaque);
  41    void *opaque;
  42    QLIST_ENTRY(BlockBackendAioNotifier) list;
  43} BlockBackendAioNotifier;
  44
  45struct BlockBackend {
  46    char *name;
  47    int refcnt;
  48    BdrvChild *root;
  49    AioContext *ctx;
  50    DriveInfo *legacy_dinfo;    /* null unless created by drive_new() */
  51    QTAILQ_ENTRY(BlockBackend) link;         /* for block_backends */
  52    QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */
  53    BlockBackendPublic public;
  54
  55    DeviceState *dev;           /* attached device model, if any */
  56    const BlockDevOps *dev_ops;
  57    void *dev_opaque;
  58
  59    /* the block size for which the guest device expects atomicity */
  60    int guest_block_size;
  61
  62    /* If the BDS tree is removed, some of its options are stored here (which
  63     * can be used to restore those options in the new BDS on insert) */
  64    BlockBackendRootState root_state;
  65
  66    bool enable_write_cache;
  67
  68    /* I/O stats (display with "info blockstats"). */
  69    BlockAcctStats stats;
  70
  71    BlockdevOnError on_read_error, on_write_error;
  72    bool iostatus_enabled;
  73    BlockDeviceIoStatus iostatus;
  74
  75    uint64_t perm;
  76    uint64_t shared_perm;
  77    bool disable_perm;
  78
  79    bool allow_aio_context_change;
  80    bool allow_write_beyond_eof;
  81
  82    NotifierList remove_bs_notifiers, insert_bs_notifiers;
  83    QLIST_HEAD(, BlockBackendAioNotifier) aio_notifiers;
  84
  85    int quiesce_counter;
  86    CoQueue queued_requests;
  87    bool disable_request_queuing;
  88
  89    VMChangeStateEntry *vmsh;
  90    bool force_allow_inactivate;
  91
  92    /* Number of in-flight aio requests.  BlockDriverState also counts
  93     * in-flight requests but aio requests can exist even when blk->root is
  94     * NULL, so we cannot rely on its counter for that case.
  95     * Accessed with atomic ops.
  96     */
  97    unsigned int in_flight;
  98};
  99
 100typedef struct BlockBackendAIOCB {
 101    BlockAIOCB common;
 102    BlockBackend *blk;
 103    int ret;
 104} BlockBackendAIOCB;
 105
 106static const AIOCBInfo block_backend_aiocb_info = {
 107    .get_aio_context = blk_aiocb_get_aio_context,
 108    .aiocb_size = sizeof(BlockBackendAIOCB),
 109};
 110
 111static void drive_info_del(DriveInfo *dinfo);
 112static BlockBackend *bdrv_first_blk(BlockDriverState *bs);
 113
 114/* All BlockBackends */
 115static QTAILQ_HEAD(, BlockBackend) block_backends =
 116    QTAILQ_HEAD_INITIALIZER(block_backends);
 117
 118/* All BlockBackends referenced by the monitor and which are iterated through by
 119 * blk_next() */
 120static QTAILQ_HEAD(, BlockBackend) monitor_block_backends =
 121    QTAILQ_HEAD_INITIALIZER(monitor_block_backends);
 122
 123static void blk_root_inherit_options(int *child_flags, QDict *child_options,
 124                                     int parent_flags, QDict *parent_options)
 125{
 126    /* We're not supposed to call this function for root nodes */
 127    abort();
 128}
 129static void blk_root_drained_begin(BdrvChild *child);
 130static bool blk_root_drained_poll(BdrvChild *child);
 131static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter);
 132
 133static void blk_root_change_media(BdrvChild *child, bool load);
 134static void blk_root_resize(BdrvChild *child);
 135
 136static bool blk_root_can_set_aio_ctx(BdrvChild *child, AioContext *ctx,
 137                                     GSList **ignore, Error **errp);
 138static void blk_root_set_aio_ctx(BdrvChild *child, AioContext *ctx,
 139                                 GSList **ignore);
 140
 141static char *blk_root_get_parent_desc(BdrvChild *child)
 142{
 143    BlockBackend *blk = child->opaque;
 144    char *dev_id;
 145
 146    if (blk->name) {
 147        return g_strdup(blk->name);
 148    }
 149
 150    dev_id = blk_get_attached_dev_id(blk);
 151    if (*dev_id) {
 152        return dev_id;
 153    } else {
 154        /* TODO Callback into the BB owner for something more detailed */
 155        g_free(dev_id);
 156        return g_strdup("a block device");
 157    }
 158}
 159
 160static const char *blk_root_get_name(BdrvChild *child)
 161{
 162    return blk_name(child->opaque);
 163}
 164
 165static void blk_vm_state_changed(void *opaque, int running, RunState state)
 166{
 167    Error *local_err = NULL;
 168    BlockBackend *blk = opaque;
 169
 170    if (state == RUN_STATE_INMIGRATE) {
 171        return;
 172    }
 173
 174    qemu_del_vm_change_state_handler(blk->vmsh);
 175    blk->vmsh = NULL;
 176    blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
 177    if (local_err) {
 178        error_report_err(local_err);
 179    }
 180}
 181
 182/*
 183 * Notifies the user of the BlockBackend that migration has completed. qdev
 184 * devices can tighten their permissions in response (specifically revoke
 185 * shared write permissions that we needed for storage migration).
 186 *
 187 * If an error is returned, the VM cannot be allowed to be resumed.
 188 */
 189static void blk_root_activate(BdrvChild *child, Error **errp)
 190{
 191    BlockBackend *blk = child->opaque;
 192    Error *local_err = NULL;
 193
 194    if (!blk->disable_perm) {
 195        return;
 196    }
 197
 198    blk->disable_perm = false;
 199
 200    blk_set_perm(blk, blk->perm, BLK_PERM_ALL, &local_err);
 201    if (local_err) {
 202        error_propagate(errp, local_err);
 203        blk->disable_perm = true;
 204        return;
 205    }
 206
 207    if (runstate_check(RUN_STATE_INMIGRATE)) {
 208        /* Activation can happen when the migration process is still active, for
 209         * example when nbd_server_add is called during non-shared storage
 210         * migration. Defer the shared_perm update to migration completion. */
 211        if (!blk->vmsh) {
 212            blk->vmsh = qemu_add_vm_change_state_handler(blk_vm_state_changed,
 213                                                         blk);
 214        }
 215        return;
 216    }
 217
 218    blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
 219    if (local_err) {
 220        error_propagate(errp, local_err);
 221        blk->disable_perm = true;
 222        return;
 223    }
 224}
 225
 226void blk_set_force_allow_inactivate(BlockBackend *blk)
 227{
 228    blk->force_allow_inactivate = true;
 229}
 230
 231static bool blk_can_inactivate(BlockBackend *blk)
 232{
 233    /* If it is a guest device, inactivate is ok. */
 234    if (blk->dev || blk_name(blk)[0]) {
 235        return true;
 236    }
 237
 238    /* Inactivating means no more writes to the image can be done,
 239     * even if those writes would be changes invisible to the
 240     * guest.  For block job BBs that satisfy this, we can just allow
 241     * it.  This is the case for the mirror job source, which is required
 242     * by libvirt non-shared block migration. */
 243    if (!(blk->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED))) {
 244        return true;
 245    }
 246
 247    return blk->force_allow_inactivate;
 248}
 249
 250static int blk_root_inactivate(BdrvChild *child)
 251{
 252    BlockBackend *blk = child->opaque;
 253
 254    if (blk->disable_perm) {
 255        return 0;
 256    }
 257
 258    if (!blk_can_inactivate(blk)) {
 259        return -EPERM;
 260    }
 261
 262    blk->disable_perm = true;
 263    if (blk->root) {
 264        bdrv_child_try_set_perm(blk->root, 0, BLK_PERM_ALL, &error_abort);
 265    }
 266
 267    return 0;
 268}
 269
 270static void blk_root_attach(BdrvChild *child)
 271{
 272    BlockBackend *blk = child->opaque;
 273    BlockBackendAioNotifier *notifier;
 274
 275    trace_blk_root_attach(child, blk, child->bs);
 276
 277    QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
 278        bdrv_add_aio_context_notifier(child->bs,
 279                notifier->attached_aio_context,
 280                notifier->detach_aio_context,
 281                notifier->opaque);
 282    }
 283}
 284
 285static void blk_root_detach(BdrvChild *child)
 286{
 287    BlockBackend *blk = child->opaque;
 288    BlockBackendAioNotifier *notifier;
 289
 290    trace_blk_root_detach(child, blk, child->bs);
 291
 292    QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
 293        bdrv_remove_aio_context_notifier(child->bs,
 294                notifier->attached_aio_context,
 295                notifier->detach_aio_context,
 296                notifier->opaque);
 297    }
 298}
 299
 300static const BdrvChildRole child_root = {
 301    .inherit_options    = blk_root_inherit_options,
 302
 303    .change_media       = blk_root_change_media,
 304    .resize             = blk_root_resize,
 305    .get_name           = blk_root_get_name,
 306    .get_parent_desc    = blk_root_get_parent_desc,
 307
 308    .drained_begin      = blk_root_drained_begin,
 309    .drained_poll       = blk_root_drained_poll,
 310    .drained_end        = blk_root_drained_end,
 311
 312    .activate           = blk_root_activate,
 313    .inactivate         = blk_root_inactivate,
 314
 315    .attach             = blk_root_attach,
 316    .detach             = blk_root_detach,
 317
 318    .can_set_aio_ctx    = blk_root_can_set_aio_ctx,
 319    .set_aio_ctx        = blk_root_set_aio_ctx,
 320};
 321
 322/*
 323 * Create a new BlockBackend with a reference count of one.
 324 *
 325 * @perm is a bitmask of BLK_PERM_* constants which describes the permissions
 326 * to request for a block driver node that is attached to this BlockBackend.
 327 * @shared_perm is a bitmask which describes which permissions may be granted
 328 * to other users of the attached node.
 329 * Both sets of permissions can be changed later using blk_set_perm().
 330 *
 331 * Return the new BlockBackend on success, null on failure.
 332 */
 333BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm)
 334{
 335    BlockBackend *blk;
 336
 337    blk = g_new0(BlockBackend, 1);
 338    blk->refcnt = 1;
 339    blk->ctx = ctx;
 340    blk->perm = perm;
 341    blk->shared_perm = shared_perm;
 342    blk_set_enable_write_cache(blk, true);
 343
 344    blk->on_read_error = BLOCKDEV_ON_ERROR_REPORT;
 345    blk->on_write_error = BLOCKDEV_ON_ERROR_ENOSPC;
 346
 347    block_acct_init(&blk->stats);
 348
 349    qemu_co_queue_init(&blk->queued_requests);
 350    notifier_list_init(&blk->remove_bs_notifiers);
 351    notifier_list_init(&blk->insert_bs_notifiers);
 352    QLIST_INIT(&blk->aio_notifiers);
 353
 354    QTAILQ_INSERT_TAIL(&block_backends, blk, link);
 355    return blk;
 356}
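/*
 * Typical usage (an illustrative sketch, not taken from this file; the
 * permission choice depends on the caller):
 *
 *     blk = blk_new(qemu_get_aio_context(),
 *                   BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE,
 *                   BLK_PERM_ALL);
 *
 * A root node can then be attached with blk_insert_bs(), or opened together
 * with the backend via blk_new_open().
 */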
 357
 358/*
 359 * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
 360 * The new BlockBackend is in the main AioContext.
 361 *
 362 * Just as with bdrv_open(), after having called this function the reference to
 363 * @options belongs to the block layer (even on failure).
 364 *
 365 * TODO: Remove @filename and @flags; it should be possible to specify a whole
 366 * BDS tree just by specifying the @options QDict (or @reference,
 367 * alternatively). At the time of adding this function, this is not possible,
 368 * though, so callers of this function have to be able to specify @filename and
 369 * @flags.
 370 */
 371BlockBackend *blk_new_open(const char *filename, const char *reference,
 372                           QDict *options, int flags, Error **errp)
 373{
 374    BlockBackend *blk;
 375    BlockDriverState *bs;
 376    uint64_t perm = 0;
 377
 378    /* blk_new_open() is mainly used in .bdrv_create implementations and the
 379     * tools where sharing isn't a concern because the BDS stays private, so we
 380     * just request permission according to the flags.
 381     *
 382     * The exceptions are xen_disk and blockdev_init(); in these cases, the
 383     * caller of blk_new_open() doesn't make use of the permissions, but they
 384     * shouldn't hurt either. We can still share everything here because the
 385     * guest devices will add their own blockers if they can't share. */
 386    if ((flags & BDRV_O_NO_IO) == 0) {
 387        perm |= BLK_PERM_CONSISTENT_READ;
 388        if (flags & BDRV_O_RDWR) {
 389            perm |= BLK_PERM_WRITE;
 390        }
 391    }
 392    if (flags & BDRV_O_RESIZE) {
 393        perm |= BLK_PERM_RESIZE;
 394    }
 395
 396    blk = blk_new(qemu_get_aio_context(), perm, BLK_PERM_ALL);
 397    bs = bdrv_open(filename, reference, options, flags, errp);
 398    if (!bs) {
 399        blk_unref(blk);
 400        return NULL;
 401    }
 402
 403    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk->ctx,
 404                                       perm, BLK_PERM_ALL, blk, errp);
 405    if (!blk->root) {
 406        blk_unref(blk);
 407        return NULL;
 408    }
 409
 410    return blk;
 411}
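/*
 * Sketch of a tool-style caller (the filename and flags are placeholders;
 * real callers may also pass an @options QDict or a @reference):
 *
 *     blk = blk_new_open("disk.img", NULL, NULL, BDRV_O_RDWR, &local_err);
 *     if (!blk) {
 *         error_report_err(local_err);
 *     }
 */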
 412
 413static void blk_delete(BlockBackend *blk)
 414{
 415    assert(!blk->refcnt);
 416    assert(!blk->name);
 417    assert(!blk->dev);
 418    if (blk->public.throttle_group_member.throttle_state) {
 419        blk_io_limits_disable(blk);
 420    }
 421    if (blk->root) {
 422        blk_remove_bs(blk);
 423    }
 424    if (blk->vmsh) {
 425        qemu_del_vm_change_state_handler(blk->vmsh);
 426        blk->vmsh = NULL;
 427    }
 428    assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers));
 429    assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers));
 430    assert(QLIST_EMPTY(&blk->aio_notifiers));
 431    QTAILQ_REMOVE(&block_backends, blk, link);
 432    drive_info_del(blk->legacy_dinfo);
 433    block_acct_cleanup(&blk->stats);
 434    g_free(blk);
 435}
 436
 437static void drive_info_del(DriveInfo *dinfo)
 438{
 439    if (!dinfo) {
 440        return;
 441    }
 442    qemu_opts_del(dinfo->opts);
 443    g_free(dinfo);
 444}
 445
 446int blk_get_refcnt(BlockBackend *blk)
 447{
 448    return blk ? blk->refcnt : 0;
 449}
 450
 451/*
 452 * Increment @blk's reference count.
 453 * @blk must not be null.
 454 */
 455void blk_ref(BlockBackend *blk)
 456{
 457    assert(blk->refcnt > 0);
 458    blk->refcnt++;
 459}
 460
 461/*
 462 * Decrement @blk's reference count.
 463 * If this drops it to zero, destroy @blk.
 464 * For convenience, do nothing if @blk is null.
 465 */
 466void blk_unref(BlockBackend *blk)
 467{
 468    if (blk) {
 469        assert(blk->refcnt > 0);
 470        if (blk->refcnt > 1) {
 471            blk->refcnt--;
 472        } else {
 473            blk_drain(blk);
 474            /* blk_drain() cannot resurrect blk, nobody held a reference */
 475            assert(blk->refcnt == 1);
 476            blk->refcnt = 0;
 477            blk_delete(blk);
 478        }
 479    }
 480}
 481
 482/*
 483 * Behaves similarly to blk_next() but iterates over all BlockBackends, even the
 484 * ones which are hidden (i.e. are not referenced by the monitor).
 485 */
 486BlockBackend *blk_all_next(BlockBackend *blk)
 487{
 488    return blk ? QTAILQ_NEXT(blk, link)
 489               : QTAILQ_FIRST(&block_backends);
 490}
 491
 492void blk_remove_all_bs(void)
 493{
 494    BlockBackend *blk = NULL;
 495
 496    while ((blk = blk_all_next(blk)) != NULL) {
 497        AioContext *ctx = blk_get_aio_context(blk);
 498
 499        aio_context_acquire(ctx);
 500        if (blk->root) {
 501            blk_remove_bs(blk);
 502        }
 503        aio_context_release(ctx);
 504    }
 505}
 506
 507/*
 508 * Return the monitor-owned BlockBackend after @blk.
 509 * If @blk is null, return the first one.
 510 * Else, return @blk's next sibling, which may be null.
 511 *
 512 * To iterate over all BlockBackends, do
 513 * for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
 514 *     ...
 515 * }
 516 */
 517BlockBackend *blk_next(BlockBackend *blk)
 518{
 519    return blk ? QTAILQ_NEXT(blk, monitor_link)
 520               : QTAILQ_FIRST(&monitor_block_backends);
 521}
 522
 523/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
 524 * the monitor or attached to a BlockBackend */
 525BlockDriverState *bdrv_next(BdrvNextIterator *it)
 526{
 527    BlockDriverState *bs, *old_bs;
 528
 529    /* Must be called from the main loop */
 530    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 531
 532    /* First, return all root nodes of BlockBackends. In order to avoid
 533     * returning a BDS twice when multiple BBs refer to it, we only return it
 534     * if the BB is the first one in the parent list of the BDS. */
 535    if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
 536        BlockBackend *old_blk = it->blk;
 537
 538        old_bs = old_blk ? blk_bs(old_blk) : NULL;
 539
 540        do {
 541            it->blk = blk_all_next(it->blk);
 542            bs = it->blk ? blk_bs(it->blk) : NULL;
 543        } while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));
 544
 545        if (it->blk) {
 546            blk_ref(it->blk);
 547        }
 548        blk_unref(old_blk);
 549
 550        if (bs) {
 551            bdrv_ref(bs);
 552            bdrv_unref(old_bs);
 553            return bs;
 554        }
 555        it->phase = BDRV_NEXT_MONITOR_OWNED;
 556    } else {
 557        old_bs = it->bs;
 558    }
 559
 560    /* Then return the monitor-owned BDSes without a BB attached. Ignore all
 561     * BDSes that are attached to a BlockBackend here; they have been handled
 562     * by the above block already */
 563    do {
 564        it->bs = bdrv_next_monitor_owned(it->bs);
 565        bs = it->bs;
 566    } while (bs && bdrv_has_blk(bs));
 567
 568    if (bs) {
 569        bdrv_ref(bs);
 570    }
 571    bdrv_unref(old_bs);
 572
 573    return bs;
 574}
 575
 576static void bdrv_next_reset(BdrvNextIterator *it)
 577{
 578    *it = (BdrvNextIterator) {
 579        .phase = BDRV_NEXT_BACKEND_ROOTS,
 580    };
 581}
 582
 583BlockDriverState *bdrv_first(BdrvNextIterator *it)
 584{
 585    bdrv_next_reset(it);
 586    return bdrv_next(it);
 587}
 588
 589/* Must be called when aborting a bdrv_next() iteration before
 590 * bdrv_next() returns NULL */
 591void bdrv_next_cleanup(BdrvNextIterator *it)
 592{
 593    /* Must be called from the main loop */
 594    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 595
 596    if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
 597        if (it->blk) {
 598            bdrv_unref(blk_bs(it->blk));
 599            blk_unref(it->blk);
 600        }
 601    } else {
 602        bdrv_unref(it->bs);
 603    }
 604
 605    bdrv_next_reset(it);
 606}
 607
 608/*
 609 * Add a BlockBackend into the list of backends referenced by the monitor, with
 610 * the given @name acting as the handle for the monitor.
 611 * Strictly for use by blockdev.c.
 612 *
 613 * @name must not be null or empty.
 614 *
 615 * Returns true on success and false on failure. In the latter case, an Error
 616 * object is returned through @errp.
 617 */
 618bool monitor_add_blk(BlockBackend *blk, const char *name, Error **errp)
 619{
 620    assert(!blk->name);
 621    assert(name && name[0]);
 622
 623    if (!id_wellformed(name)) {
 624        error_setg(errp, "Invalid device name");
 625        return false;
 626    }
 627    if (blk_by_name(name)) {
 628        error_setg(errp, "Device with id '%s' already exists", name);
 629        return false;
 630    }
 631    if (bdrv_find_node(name)) {
 632        error_setg(errp,
 633                   "Device name '%s' conflicts with an existing node name",
 634                   name);
 635        return false;
 636    }
 637
 638    blk->name = g_strdup(name);
 639    QTAILQ_INSERT_TAIL(&monitor_block_backends, blk, monitor_link);
 640    return true;
 641}
 642
 643/*
 644 * Remove a BlockBackend from the list of backends referenced by the monitor.
 645 * Strictly for use by blockdev.c.
 646 */
 647void monitor_remove_blk(BlockBackend *blk)
 648{
 649    if (!blk->name) {
 650        return;
 651    }
 652
 653    QTAILQ_REMOVE(&monitor_block_backends, blk, monitor_link);
 654    g_free(blk->name);
 655    blk->name = NULL;
 656}
 657
 658/*
 659 * Return @blk's name, a non-null string.
 660 * Returns an empty string iff @blk is not referenced by the monitor.
 661 */
 662const char *blk_name(const BlockBackend *blk)
 663{
 664    return blk->name ?: "";
 665}
 666
 667/*
 668 * Return the BlockBackend with name @name if it exists, else null.
 669 * @name must not be null.
 670 */
 671BlockBackend *blk_by_name(const char *name)
 672{
 673    BlockBackend *blk = NULL;
 674
 675    assert(name);
 676    while ((blk = blk_next(blk)) != NULL) {
 677        if (!strcmp(name, blk->name)) {
 678            return blk;
 679        }
 680    }
 681    return NULL;
 682}
 683
 684/*
 685 * Return the BlockDriverState attached to @blk if any, else null.
 686 */
 687BlockDriverState *blk_bs(BlockBackend *blk)
 688{
 689    return blk->root ? blk->root->bs : NULL;
 690}
 691
 692static BlockBackend *bdrv_first_blk(BlockDriverState *bs)
 693{
 694    BdrvChild *child;
 695    QLIST_FOREACH(child, &bs->parents, next_parent) {
 696        if (child->role == &child_root) {
 697            return child->opaque;
 698        }
 699    }
 700
 701    return NULL;
 702}
 703
 704/*
 705 * Returns true if @bs has an associated BlockBackend.
 706 */
 707bool bdrv_has_blk(BlockDriverState *bs)
 708{
 709    return bdrv_first_blk(bs) != NULL;
 710}
 711
 712/*
 713 * Returns true if @bs has only BlockBackends as parents.
 714 */
 715bool bdrv_is_root_node(BlockDriverState *bs)
 716{
 717    BdrvChild *c;
 718
 719    QLIST_FOREACH(c, &bs->parents, next_parent) {
 720        if (c->role != &child_root) {
 721            return false;
 722        }
 723    }
 724
 725    return true;
 726}
 727
 728/*
 729 * Return @blk's DriveInfo if any, else null.
 730 */
 731DriveInfo *blk_legacy_dinfo(BlockBackend *blk)
 732{
 733    return blk->legacy_dinfo;
 734}
 735
 736/*
 737 * Set @blk's DriveInfo to @dinfo, and return it.
 738 * @blk must not have a DriveInfo set already.
 739 * No other BlockBackend may have the same DriveInfo set.
 740 */
 741DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo)
 742{
 743    assert(!blk->legacy_dinfo);
 744    return blk->legacy_dinfo = dinfo;
 745}
 746
 747/*
 748 * Return the BlockBackend with DriveInfo @dinfo.
 749 * It must exist.
 750 */
 751BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
 752{
 753    BlockBackend *blk = NULL;
 754
 755    while ((blk = blk_next(blk)) != NULL) {
 756        if (blk->legacy_dinfo == dinfo) {
 757            return blk;
 758        }
 759    }
 760    abort();
 761}
 762
 763/*
 764 * Returns a pointer to the publicly accessible fields of @blk.
 765 */
 766BlockBackendPublic *blk_get_public(BlockBackend *blk)
 767{
 768    return &blk->public;
 769}
 770
 771/*
 772 * Returns a BlockBackend given the associated @public fields.
 773 */
 774BlockBackend *blk_by_public(BlockBackendPublic *public)
 775{
 776    return container_of(public, BlockBackend, public);
 777}
 778
 779/*
 780 * Disassociates the currently associated BlockDriverState from @blk.
 781 */
 782void blk_remove_bs(BlockBackend *blk)
 783{
 784    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
 785    BlockDriverState *bs;
 786
 787    notifier_list_notify(&blk->remove_bs_notifiers, blk);
 788    if (tgm->throttle_state) {
 789        bs = blk_bs(blk);
 790        bdrv_drained_begin(bs);
 791        throttle_group_detach_aio_context(tgm);
 792        throttle_group_attach_aio_context(tgm, qemu_get_aio_context());
 793        bdrv_drained_end(bs);
 794    }
 795
 796    blk_update_root_state(blk);
 797
 798    /* bdrv_root_unref_child() will cause blk->root to become stale and may
 799     * switch to a completion coroutine later on. Let's drain all I/O here
 800     * to avoid that and a potential QEMU crash.
 801     */
 802    blk_drain(blk);
 803    bdrv_root_unref_child(blk->root);
 804    blk->root = NULL;
 805}
 806
 807/*
 808 * Associates a new BlockDriverState with @blk.
 809 */
 810int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
 811{
 812    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
 813    bdrv_ref(bs);
 814    blk->root = bdrv_root_attach_child(bs, "root", &child_root, blk->ctx,
 815                                       blk->perm, blk->shared_perm, blk, errp);
 816    if (blk->root == NULL) {
 817        return -EPERM;
 818    }
 819
 820    notifier_list_notify(&blk->insert_bs_notifiers, blk);
 821    if (tgm->throttle_state) {
 822        throttle_group_detach_aio_context(tgm);
 823        throttle_group_attach_aio_context(tgm, bdrv_get_aio_context(bs));
 824    }
 825
 826    return 0;
 827}
 828
 829/*
 830 * Sets the permission bitmasks that the user of the BlockBackend needs.
 831 */
 832int blk_set_perm(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
 833                 Error **errp)
 834{
 835    int ret;
 836
 837    if (blk->root && !blk->disable_perm) {
 838        ret = bdrv_child_try_set_perm(blk->root, perm, shared_perm, errp);
 839        if (ret < 0) {
 840            return ret;
 841        }
 842    }
 843
 844    blk->perm = perm;
 845    blk->shared_perm = shared_perm;
 846
 847    return 0;
 848}
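/*
 * For example (sketch only; the exact masks depend on the device), a guest
 * device that no longer needs to share writes with a storage migration job
 * could tighten its masks like this:
 *
 *     blk_set_perm(blk, BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE,
 *                  BLK_PERM_ALL & ~BLK_PERM_WRITE, &local_err);
 */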
 849
 850void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm)
 851{
 852    *perm = blk->perm;
 853    *shared_perm = blk->shared_perm;
 854}
 855
 856/*
 857 * Attach device model @dev to @blk.
 858 * Return 0 on success, -EBUSY when a device model is attached already.
 859 */
 860int blk_attach_dev(BlockBackend *blk, DeviceState *dev)
 861{
 862    if (blk->dev) {
 863        return -EBUSY;
 864    }
 865
 866    /* While migration is still incoming, we don't need to apply the
 867     * permissions of guest device BlockBackends. We might still have a block
 868     * job or NBD server writing to the image for storage migration. */
 869    if (runstate_check(RUN_STATE_INMIGRATE)) {
 870        blk->disable_perm = true;
 871    }
 872
 873    blk_ref(blk);
 874    blk->dev = dev;
 875    blk_iostatus_reset(blk);
 876
 877    return 0;
 878}
 879
 880/*
 881 * Detach device model @dev from @blk.
 882 * @dev must be currently attached to @blk.
 883 */
 884void blk_detach_dev(BlockBackend *blk, DeviceState *dev)
 885{
 886    assert(blk->dev == dev);
 887    blk->dev = NULL;
 888    blk->dev_ops = NULL;
 889    blk->dev_opaque = NULL;
 890    blk->guest_block_size = 512;
 891    blk_set_perm(blk, 0, BLK_PERM_ALL, &error_abort);
 892    blk_unref(blk);
 893}
 894
 895/*
 896 * Return the device model attached to @blk if any, else null.
 897 */
 898DeviceState *blk_get_attached_dev(BlockBackend *blk)
 899{
 900    return blk->dev;
 901}
 902
 903/* Return the qdev ID, or the QOM path if no ID is assigned, of the block
 904 * device attached to the BlockBackend. */
 905char *blk_get_attached_dev_id(BlockBackend *blk)
 906{
 907    DeviceState *dev = blk->dev;
 908
 909    if (!dev) {
 910        return g_strdup("");
 911    } else if (dev->id) {
 912        return g_strdup(dev->id);
 913    }
 914
 915    return object_get_canonical_path(OBJECT(dev)) ?: g_strdup("");
 916}
 917
 918/*
 919 * Return the BlockBackend which has the device model @dev attached if it
 920 * exists, else null.
 921 *
 922 * @dev must not be null.
 923 */
 924BlockBackend *blk_by_dev(void *dev)
 925{
 926    BlockBackend *blk = NULL;
 927
 928    assert(dev != NULL);
 929    while ((blk = blk_all_next(blk)) != NULL) {
 930        if (blk->dev == dev) {
 931            return blk;
 932        }
 933    }
 934    return NULL;
 935}
 936
 937/*
 938 * Set @blk's device model callbacks to @ops.
 939 * @opaque is the opaque argument to pass to the callbacks.
 940 * This is for use by device models.
 941 */
 942void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
 943                     void *opaque)
 944{
 945    blk->dev_ops = ops;
 946    blk->dev_opaque = opaque;
 947
 948    /* Are we currently quiesced? Should we enforce this right now? */
 949    if (blk->quiesce_counter && ops->drained_begin) {
 950        ops->drained_begin(opaque);
 951    }
 952}
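/*
 * A device model would typically register a static ops table (sketch; the
 * callbacks shown are placeholders and only a subset of BlockDevOps):
 *
 *     static const BlockDevOps my_block_ops = {
 *         .resize_cb     = my_resize_cb,
 *         .drained_begin = my_drained_begin,
 *         .drained_end   = my_drained_end,
 *     };
 *
 *     blk_set_dev_ops(blk, &my_block_ops, dev);
 */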
 953
 954/*
 955 * Notify @blk's attached device model of media change.
 956 *
 957 * If @load is true, notify of media load. This action can fail, meaning that
 958 * the medium cannot be loaded. @errp is set then.
 959 *
 960 * If @load is false, notify of media eject. This can never fail.
 961 *
 962 * Also send DEVICE_TRAY_MOVED events as appropriate.
 963 */
 964void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp)
 965{
 966    if (blk->dev_ops && blk->dev_ops->change_media_cb) {
 967        bool tray_was_open, tray_is_open;
 968        Error *local_err = NULL;
 969
 970        tray_was_open = blk_dev_is_tray_open(blk);
 971        blk->dev_ops->change_media_cb(blk->dev_opaque, load, &local_err);
 972        if (local_err) {
 973            assert(load == true);
 974            error_propagate(errp, local_err);
 975            return;
 976        }
 977        tray_is_open = blk_dev_is_tray_open(blk);
 978
 979        if (tray_was_open != tray_is_open) {
 980            char *id = blk_get_attached_dev_id(blk);
 981            qapi_event_send_device_tray_moved(blk_name(blk), id, tray_is_open);
 982            g_free(id);
 983        }
 984    }
 985}
 986
 987static void blk_root_change_media(BdrvChild *child, bool load)
 988{
 989    blk_dev_change_media_cb(child->opaque, load, NULL);
 990}
 991
 992/*
 993 * Does @blk's attached device model have removable media?
 994 * %true if no device model is attached.
 995 */
 996bool blk_dev_has_removable_media(BlockBackend *blk)
 997{
 998    return !blk->dev || (blk->dev_ops && blk->dev_ops->change_media_cb);
 999}
1000
1001/*
1002 * Does @blk's attached device model have a tray?
1003 */
1004bool blk_dev_has_tray(BlockBackend *blk)
1005{
1006    return blk->dev_ops && blk->dev_ops->is_tray_open;
1007}
1008
1009/*
1010 * Notify @blk's attached device model of a media eject request.
1011 * If @force is true, the medium is about to be yanked out forcefully.
1012 */
1013void blk_dev_eject_request(BlockBackend *blk, bool force)
1014{
1015    if (blk->dev_ops && blk->dev_ops->eject_request_cb) {
1016        blk->dev_ops->eject_request_cb(blk->dev_opaque, force);
1017    }
1018}
1019
1020/*
1021 * Does @blk's attached device model have a tray, and is it open?
1022 */
1023bool blk_dev_is_tray_open(BlockBackend *blk)
1024{
1025    if (blk_dev_has_tray(blk)) {
1026        return blk->dev_ops->is_tray_open(blk->dev_opaque);
1027    }
1028    return false;
1029}
1030
1031/*
1032 * Does @blk's attached device model have the medium locked?
1033 * %false if the device model has no such lock.
1034 */
1035bool blk_dev_is_medium_locked(BlockBackend *blk)
1036{
1037    if (blk->dev_ops && blk->dev_ops->is_medium_locked) {
1038        return blk->dev_ops->is_medium_locked(blk->dev_opaque);
1039    }
1040    return false;
1041}
1042
1043/*
1044 * Notify @blk's attached device model of a backend size change.
1045 */
1046static void blk_root_resize(BdrvChild *child)
1047{
1048    BlockBackend *blk = child->opaque;
1049
1050    if (blk->dev_ops && blk->dev_ops->resize_cb) {
1051        blk->dev_ops->resize_cb(blk->dev_opaque);
1052    }
1053}
1054
1055void blk_iostatus_enable(BlockBackend *blk)
1056{
1057    blk->iostatus_enabled = true;
1058    blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
1059}
1060
1061/* The I/O status is only enabled if the drive explicitly
1062 * enables it _and_ the VM is configured to stop on errors */
1063bool blk_iostatus_is_enabled(const BlockBackend *blk)
1064{
1065    return (blk->iostatus_enabled &&
1066           (blk->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
1067            blk->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
1068            blk->on_read_error == BLOCKDEV_ON_ERROR_STOP));
1069}
1070
1071BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk)
1072{
1073    return blk->iostatus;
1074}
1075
1076void blk_iostatus_disable(BlockBackend *blk)
1077{
1078    blk->iostatus_enabled = false;
1079}
1080
1081void blk_iostatus_reset(BlockBackend *blk)
1082{
1083    if (blk_iostatus_is_enabled(blk)) {
1084        blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
1085    }
1086}
1087
1088void blk_iostatus_set_err(BlockBackend *blk, int error)
1089{
1090    assert(blk_iostatus_is_enabled(blk));
1091    if (blk->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
1092        blk->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
1093                                          BLOCK_DEVICE_IO_STATUS_FAILED;
1094    }
1095}
1096
1097void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow)
1098{
1099    blk->allow_write_beyond_eof = allow;
1100}
1101
1102void blk_set_allow_aio_context_change(BlockBackend *blk, bool allow)
1103{
1104    blk->allow_aio_context_change = allow;
1105}
1106
1107void blk_set_disable_request_queuing(BlockBackend *blk, bool disable)
1108{
1109    blk->disable_request_queuing = disable;
1110}
1111
1112static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
1113                                  size_t size)
1114{
1115    int64_t len;
1116
1117    if (size > INT_MAX) {
1118        return -EIO;
1119    }
1120
1121    if (!blk_is_available(blk)) {
1122        return -ENOMEDIUM;
1123    }
1124
1125    if (offset < 0) {
1126        return -EIO;
1127    }
1128
1129    if (!blk->allow_write_beyond_eof) {
1130        len = blk_getlength(blk);
1131        if (len < 0) {
1132            return len;
1133        }
1134
1135        if (offset > len || len - offset < size) {
1136            return -EIO;
1137        }
1138    }
1139
1140    return 0;
1141}
1142
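/*
 * Queue the calling coroutine while the BlockBackend is quiesced, unless
 * request queuing has been disabled.  Queued requests are resumed once
 * draining ends.
 */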
1143static void coroutine_fn blk_wait_while_drained(BlockBackend *blk)
1144{
1145    if (blk->quiesce_counter && !blk->disable_request_queuing) {
1146        qemu_co_queue_wait(&blk->queued_requests, NULL);
1147    }
1148}
1149
1150int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
1151                               unsigned int bytes, QEMUIOVector *qiov,
1152                               BdrvRequestFlags flags)
1153{
1154    int ret;
1155    BlockDriverState *bs;
1156
1157    blk_wait_while_drained(blk);
1158
1159    /* Call blk_bs() only after waiting; the graph may have changed */
1160    bs = blk_bs(blk);
1161    trace_blk_co_preadv(blk, bs, offset, bytes, flags);
1162
1163    ret = blk_check_byte_request(blk, offset, bytes);
1164    if (ret < 0) {
1165        return ret;
1166    }
1167
1168    bdrv_inc_in_flight(bs);
1169
1170    /* throttling disk I/O */
1171    if (blk->public.throttle_group_member.throttle_state) {
1172        throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
1173                bytes, false);
1174    }
1175
1176    ret = bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
1177    bdrv_dec_in_flight(bs);
1178    return ret;
1179}
1180
1181int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset,
1182                                     unsigned int bytes,
1183                                     QEMUIOVector *qiov, size_t qiov_offset,
1184                                     BdrvRequestFlags flags)
1185{
1186    int ret;
1187    BlockDriverState *bs;
1188
1189    blk_wait_while_drained(blk);
1190
1191    /* Call blk_bs() only after waiting; the graph may have changed */
1192    bs = blk_bs(blk);
1193    trace_blk_co_pwritev(blk, bs, offset, bytes, flags);
1194
1195    ret = blk_check_byte_request(blk, offset, bytes);
1196    if (ret < 0) {
1197        return ret;
1198    }
1199
1200    bdrv_inc_in_flight(bs);
1201    /* throttling disk I/O */
1202    if (blk->public.throttle_group_member.throttle_state) {
1203        throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
1204                bytes, true);
1205    }
1206
1207    if (!blk->enable_write_cache) {
1208        flags |= BDRV_REQ_FUA;
1209    }
1210
1211    ret = bdrv_co_pwritev_part(blk->root, offset, bytes, qiov, qiov_offset,
1212                               flags);
1213    bdrv_dec_in_flight(bs);
1214    return ret;
1215}
1216
1217int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
1218                                unsigned int bytes, QEMUIOVector *qiov,
1219                                BdrvRequestFlags flags)
1220{
1221    return blk_co_pwritev_part(blk, offset, bytes, qiov, 0, flags);
1222}
1223
1224typedef struct BlkRwCo {
1225    BlockBackend *blk;
1226    int64_t offset;
1227    void *iobuf;
1228    int ret;
1229    BdrvRequestFlags flags;
1230} BlkRwCo;
1231
1232static void blk_read_entry(void *opaque)
1233{
1234    BlkRwCo *rwco = opaque;
1235    QEMUIOVector *qiov = rwco->iobuf;
1236
1237    rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, qiov->size,
1238                              qiov, rwco->flags);
1239    aio_wait_kick();
1240}
1241
1242static void blk_write_entry(void *opaque)
1243{
1244    BlkRwCo *rwco = opaque;
1245    QEMUIOVector *qiov = rwco->iobuf;
1246
1247    rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, qiov->size,
1248                               qiov, rwco->flags);
1249    aio_wait_kick();
1250}
1251
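/*
 * Synchronous wrapper: run @co_entry either directly (if already in
 * coroutine context) or in a new coroutine, then poll until it has
 * replaced the NOT_DONE marker in rwco.ret with a real return value.
 */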
1252static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
1253                   int64_t bytes, CoroutineEntry co_entry,
1254                   BdrvRequestFlags flags)
1255{
1256    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
1257    BlkRwCo rwco = {
1258        .blk    = blk,
1259        .offset = offset,
1260        .iobuf  = &qiov,
1261        .flags  = flags,
1262        .ret    = NOT_DONE,
1263    };
1264
1265    if (qemu_in_coroutine()) {
1266        /* Fast-path if already in coroutine context */
1267        co_entry(&rwco);
1268    } else {
1269        Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
1270        bdrv_coroutine_enter(blk_bs(blk), co);
1271        BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
1272    }
1273
1274    return rwco.ret;
1275}
1276
1277int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
1278                      int bytes, BdrvRequestFlags flags)
1279{
1280    return blk_prw(blk, offset, NULL, bytes, blk_write_entry,
1281                   flags | BDRV_REQ_ZERO_WRITE);
1282}
1283
1284int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
1285{
1286    return bdrv_make_zero(blk->root, flags);
1287}
1288
1289void blk_inc_in_flight(BlockBackend *blk)
1290{
1291    atomic_inc(&blk->in_flight);
1292}
1293
1294void blk_dec_in_flight(BlockBackend *blk)
1295{
1296    atomic_dec(&blk->in_flight);
1297    aio_wait_kick();
1298}
1299
1300static void error_callback_bh(void *opaque)
1301{
1302    struct BlockBackendAIOCB *acb = opaque;
1303
1304    blk_dec_in_flight(acb->blk);
1305    acb->common.cb(acb->common.opaque, acb->ret);
1306    qemu_aio_unref(acb);
1307}
1308
1309BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
1310                                  BlockCompletionFunc *cb,
1311                                  void *opaque, int ret)
1312{
1313    struct BlockBackendAIOCB *acb;
1314
1315    blk_inc_in_flight(blk);
1316    acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque);
1317    acb->blk = blk;
1318    acb->ret = ret;
1319
1320    replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
1321                                     error_callback_bh, acb);
1322    return &acb->common;
1323}
1324
1325typedef struct BlkAioEmAIOCB {
1326    BlockAIOCB common;
1327    BlkRwCo rwco;
1328    int bytes;
1329    bool has_returned;
1330} BlkAioEmAIOCB;
1331
1332static const AIOCBInfo blk_aio_em_aiocb_info = {
1333    .aiocb_size         = sizeof(BlkAioEmAIOCB),
1334};
1335
1336static void blk_aio_complete(BlkAioEmAIOCB *acb)
1337{
1338    if (acb->has_returned) {
1339        acb->common.cb(acb->common.opaque, acb->rwco.ret);
1340        blk_dec_in_flight(acb->rwco.blk);
1341        qemu_aio_unref(acb);
1342    }
1343}
1344
1345static void blk_aio_complete_bh(void *opaque)
1346{
1347    BlkAioEmAIOCB *acb = opaque;
1348    assert(acb->has_returned);
1349    blk_aio_complete(acb);
1350}
1351
1352static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
1353                                void *iobuf, CoroutineEntry co_entry,
1354                                BdrvRequestFlags flags,
1355                                BlockCompletionFunc *cb, void *opaque)
1356{
1357    BlkAioEmAIOCB *acb;
1358    Coroutine *co;
1359
1360    blk_inc_in_flight(blk);
1361    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
1362    acb->rwco = (BlkRwCo) {
1363        .blk    = blk,
1364        .offset = offset,
1365        .iobuf  = iobuf,
1366        .flags  = flags,
1367        .ret    = NOT_DONE,
1368    };
1369    acb->bytes = bytes;
1370    acb->has_returned = false;
1371
1372    co = qemu_coroutine_create(co_entry, acb);
1373    bdrv_coroutine_enter(blk_bs(blk), co);
1374
1375    acb->has_returned = true;
1376    if (acb->rwco.ret != NOT_DONE) {
1377        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
1378                                         blk_aio_complete_bh, acb);
1379    }
1380
1381    return &acb->common;
1382}
1383
1384static void blk_aio_read_entry(void *opaque)
1385{
1386    BlkAioEmAIOCB *acb = opaque;
1387    BlkRwCo *rwco = &acb->rwco;
1388    QEMUIOVector *qiov = rwco->iobuf;
1389
1390    if (rwco->blk->quiesce_counter) {
1391        blk_dec_in_flight(rwco->blk);
1392        blk_wait_while_drained(rwco->blk);
1393        blk_inc_in_flight(rwco->blk);
1394    }
1395
1396    assert(qiov->size == acb->bytes);
1397    rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, acb->bytes,
1398                              qiov, rwco->flags);
1399    blk_aio_complete(acb);
1400}
1401
1402static void blk_aio_write_entry(void *opaque)
1403{
1404    BlkAioEmAIOCB *acb = opaque;
1405    BlkRwCo *rwco = &acb->rwco;
1406    QEMUIOVector *qiov = rwco->iobuf;
1407
1408    if (rwco->blk->quiesce_counter) {
1409        blk_dec_in_flight(rwco->blk);
1410        blk_wait_while_drained(rwco->blk);
1411        blk_inc_in_flight(rwco->blk);
1412    }
1413
1414    assert(!qiov || qiov->size == acb->bytes);
1415    rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, acb->bytes,
1416                               qiov, rwco->flags);
1417    blk_aio_complete(acb);
1418}
1419
1420BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
1421                                  int count, BdrvRequestFlags flags,
1422                                  BlockCompletionFunc *cb, void *opaque)
1423{
1424    return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry,
1425                        flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
1426}
1427
1428int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count)
1429{
1430    int ret = blk_prw(blk, offset, buf, count, blk_read_entry, 0);
1431    if (ret < 0) {
1432        return ret;
1433    }
1434    return count;
1435}
1436
1437int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count,
1438               BdrvRequestFlags flags)
1439{
1440    int ret = blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
1441                      flags);
1442    if (ret < 0) {
1443        return ret;
1444    }
1445    return count;
1446}
1447
1448int64_t blk_getlength(BlockBackend *blk)
1449{
1450    if (!blk_is_available(blk)) {
1451        return -ENOMEDIUM;
1452    }
1453
1454    return bdrv_getlength(blk_bs(blk));
1455}
1456
1457void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr)
1458{
1459    if (!blk_bs(blk)) {
1460        *nb_sectors_ptr = 0;
1461    } else {
1462        bdrv_get_geometry(blk_bs(blk), nb_sectors_ptr);
1463    }
1464}
1465
1466int64_t blk_nb_sectors(BlockBackend *blk)
1467{
1468    if (!blk_is_available(blk)) {
1469        return -ENOMEDIUM;
1470    }
1471
1472    return bdrv_nb_sectors(blk_bs(blk));
1473}
1474
1475BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset,
1476                           QEMUIOVector *qiov, BdrvRequestFlags flags,
1477                           BlockCompletionFunc *cb, void *opaque)
1478{
1479    return blk_aio_prwv(blk, offset, qiov->size, qiov,
1480                        blk_aio_read_entry, flags, cb, opaque);
1481}
1482
1483BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
1484                            QEMUIOVector *qiov, BdrvRequestFlags flags,
1485                            BlockCompletionFunc *cb, void *opaque)
1486{
1487    return blk_aio_prwv(blk, offset, qiov->size, qiov,
1488                        blk_aio_write_entry, flags, cb, opaque);
1489}
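/*
 * Illustrative async usage (buffer, length, callback and opaque pointer are
 * placeholders); the completion callback runs in the BlockBackend's
 * AioContext:
 *
 *     QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, len);
 *
 *     blk_aio_preadv(blk, offset, &qiov, 0, my_read_cb, my_opaque);
 */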
1490
1491static void blk_aio_flush_entry(void *opaque)
1492{
1493    BlkAioEmAIOCB *acb = opaque;
1494    BlkRwCo *rwco = &acb->rwco;
1495
1496    rwco->ret = blk_co_flush(rwco->blk);
1497    blk_aio_complete(acb);
1498}
1499
1500BlockAIOCB *blk_aio_flush(BlockBackend *blk,
1501                          BlockCompletionFunc *cb, void *opaque)
1502{
1503    return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque);
1504}
1505
1506static void blk_aio_pdiscard_entry(void *opaque)
1507{
1508    BlkAioEmAIOCB *acb = opaque;
1509    BlkRwCo *rwco = &acb->rwco;
1510
1511    rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, acb->bytes);
1512    blk_aio_complete(acb);
1513}
1514
1515BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk,
1516                             int64_t offset, int bytes,
1517                             BlockCompletionFunc *cb, void *opaque)
1518{
1519    return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_pdiscard_entry, 0,
1520                        cb, opaque);
1521}
1522
1523void blk_aio_cancel(BlockAIOCB *acb)
1524{
1525    bdrv_aio_cancel(acb);
1526}
1527
1528void blk_aio_cancel_async(BlockAIOCB *acb)
1529{
1530    bdrv_aio_cancel_async(acb);
1531}
1532
1533int blk_co_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
1534{
1535    blk_wait_while_drained(blk);
1536
1537    if (!blk_is_available(blk)) {
1538        return -ENOMEDIUM;
1539    }
1540
1541    return bdrv_co_ioctl(blk_bs(blk), req, buf);
1542}
1543
1544static void blk_ioctl_entry(void *opaque)
1545{
1546    BlkRwCo *rwco = opaque;
1547    QEMUIOVector *qiov = rwco->iobuf;
1548
1549    rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
1550                             qiov->iov[0].iov_base);
1551    aio_wait_kick();
1552}
1553
1554int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
1555{
1556    return blk_prw(blk, req, buf, 0, blk_ioctl_entry, 0);
1557}
1558
1559static void blk_aio_ioctl_entry(void *opaque)
1560{
1561    BlkAioEmAIOCB *acb = opaque;
1562    BlkRwCo *rwco = &acb->rwco;
1563
1564    rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset, rwco->iobuf);
1565
1566    blk_aio_complete(acb);
1567}
1568
1569BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
1570                          BlockCompletionFunc *cb, void *opaque)
1571{
1572    return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque);
1573}
1574
1575int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
1576{
1577    int ret;
1578
1579    blk_wait_while_drained(blk);
1580
1581    ret = blk_check_byte_request(blk, offset, bytes);
1582    if (ret < 0) {
1583        return ret;
1584    }
1585
1586    return bdrv_co_pdiscard(blk->root, offset, bytes);
1587}
1588
1589int blk_co_flush(BlockBackend *blk)
1590{
1591    blk_wait_while_drained(blk);
1592
1593    if (!blk_is_available(blk)) {
1594        return -ENOMEDIUM;
1595    }
1596
1597    return bdrv_co_flush(blk_bs(blk));
1598}
1599
1600static void blk_flush_entry(void *opaque)
1601{
1602    BlkRwCo *rwco = opaque;
1603    rwco->ret = blk_co_flush(rwco->blk);
1604    aio_wait_kick();
1605}
1606
1607int blk_flush(BlockBackend *blk)
1608{
1609    return blk_prw(blk, 0, NULL, 0, blk_flush_entry, 0);
1610}
1611
1612void blk_drain(BlockBackend *blk)
1613{
1614    BlockDriverState *bs = blk_bs(blk);
1615
1616    if (bs) {
1617        bdrv_drained_begin(bs);
1618    }
1619
1620    /* We may have -ENOMEDIUM completions in flight */
1621    AIO_WAIT_WHILE(blk_get_aio_context(blk),
1622                   atomic_mb_read(&blk->in_flight) > 0);
1623
1624    if (bs) {
1625        bdrv_drained_end(bs);
1626    }
1627}
1628
1629void blk_drain_all(void)
1630{
1631    BlockBackend *blk = NULL;
1632
1633    bdrv_drain_all_begin();
1634
1635    while ((blk = blk_all_next(blk)) != NULL) {
1636        AioContext *ctx = blk_get_aio_context(blk);
1637
1638        aio_context_acquire(ctx);
1639
1640        /* We may have -ENOMEDIUM completions in flight */
1641        AIO_WAIT_WHILE(ctx, atomic_mb_read(&blk->in_flight) > 0);
1642
1643        aio_context_release(ctx);
1644    }
1645
1646    bdrv_drain_all_end();
1647}
1648
1649void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error,
1650                      BlockdevOnError on_write_error)
1651{
1652    blk->on_read_error = on_read_error;
1653    blk->on_write_error = on_write_error;
1654}
1655
1656BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read)
1657{
1658    return is_read ? blk->on_read_error : blk->on_write_error;
1659}
1660
1661BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read,
1662                                      int error)
1663{
1664    BlockdevOnError on_err = blk_get_on_error(blk, is_read);
1665
1666    switch (on_err) {
1667    case BLOCKDEV_ON_ERROR_ENOSPC:
1668        return (error == ENOSPC) ?
1669               BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
1670    case BLOCKDEV_ON_ERROR_STOP:
1671        return BLOCK_ERROR_ACTION_STOP;
1672    case BLOCKDEV_ON_ERROR_REPORT:
1673        return BLOCK_ERROR_ACTION_REPORT;
1674    case BLOCKDEV_ON_ERROR_IGNORE:
1675        return BLOCK_ERROR_ACTION_IGNORE;
1676    case BLOCKDEV_ON_ERROR_AUTO:
1677    default:
1678        abort();
1679    }
1680}
1681
1682static void send_qmp_error_event(BlockBackend *blk,
1683                                 BlockErrorAction action,
1684                                 bool is_read, int error)
1685{
1686    IoOperationType optype;
1687    BlockDriverState *bs = blk_bs(blk);
1688
1689    optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
1690    qapi_event_send_block_io_error(blk_name(blk), !!bs,
1691                                   bs ? bdrv_get_node_name(bs) : NULL, optype,
1692                                   action, blk_iostatus_is_enabled(blk),
1693                                   error == ENOSPC, strerror(error));
1694}
1695
1696/* This is done by device models because, while the block layer knows
1697 * about the error, it does not know whether an operation comes from
1698 * the device or the block layer (from a job, for example).
1699 */
1700void blk_error_action(BlockBackend *blk, BlockErrorAction action,
1701                      bool is_read, int error)
1702{
1703    assert(error >= 0);
1704
1705    if (action == BLOCK_ERROR_ACTION_STOP) {
1706        /* First set the iostatus, so that "info block" returns an iostatus
1707         * that matches the events raised so far (an additional error iostatus
1708         * is fine, but not a lost one).
1709         */
1710        blk_iostatus_set_err(blk, error);
1711
1712        /* Then raise the request to stop the VM and the event.
1713         * qemu_system_vmstop_request_prepare has two effects.  First,
1714         * it ensures that the STOP event always comes after the
1715         * BLOCK_IO_ERROR event.  Second, it ensures that even if management
1716         * can observe the STOP event and do a "cont" before the STOP
1717         * event is issued, the VM will not stop.  In this case, vm_start()
1718         * also ensures that the STOP/RESUME pair of events is emitted.
1719         */
1720        qemu_system_vmstop_request_prepare();
1721        send_qmp_error_event(blk, action, is_read, error);
1722        qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
1723    } else {
1724        send_qmp_error_event(blk, action, is_read, error);
1725    }
1726}
1727
1728bool blk_is_read_only(BlockBackend *blk)
1729{
1730    BlockDriverState *bs = blk_bs(blk);
1731
1732    if (bs) {
1733        return bdrv_is_read_only(bs);
1734    } else {
1735        return blk->root_state.read_only;
1736    }
1737}
1738
1739bool blk_is_sg(BlockBackend *blk)
1740{
1741    BlockDriverState *bs = blk_bs(blk);
1742
1743    if (!bs) {
1744        return false;
1745    }
1746
1747    return bdrv_is_sg(bs);
1748}
1749
1750bool blk_enable_write_cache(BlockBackend *blk)
1751{
1752    return blk->enable_write_cache;
1753}
1754
1755void blk_set_enable_write_cache(BlockBackend *blk, bool wce)
1756{
1757    blk->enable_write_cache = wce;
1758}
1759
1760void blk_invalidate_cache(BlockBackend *blk, Error **errp)
1761{
1762    BlockDriverState *bs = blk_bs(blk);
1763
1764    if (!bs) {
1765        error_setg(errp, "Device '%s' has no medium", blk->name);
1766        return;
1767    }
1768
1769    bdrv_invalidate_cache(bs, errp);
1770}
1771
1772bool blk_is_inserted(BlockBackend *blk)
1773{
1774    BlockDriverState *bs = blk_bs(blk);
1775
1776    return bs && bdrv_is_inserted(bs);
1777}
1778
1779bool blk_is_available(BlockBackend *blk)
1780{
1781    return blk_is_inserted(blk) && !blk_dev_is_tray_open(blk);
1782}
1783
1784void blk_lock_medium(BlockBackend *blk, bool locked)
1785{
1786    BlockDriverState *bs = blk_bs(blk);
1787
1788    if (bs) {
1789        bdrv_lock_medium(bs, locked);
1790    }
1791}
1792
1793void blk_eject(BlockBackend *blk, bool eject_flag)
1794{
1795    BlockDriverState *bs = blk_bs(blk);
1796    char *id;
1797
1798    if (bs) {
1799        bdrv_eject(bs, eject_flag);
1800    }
1801
1802    /* Whether or not we ejected on the backend,
1803     * the frontend experienced a tray event. */
1804    id = blk_get_attached_dev_id(blk);
1805    qapi_event_send_device_tray_moved(blk_name(blk), id,
1806                                      eject_flag);
1807    g_free(id);
1808}
1809
1810int blk_get_flags(BlockBackend *blk)
1811{
1812    BlockDriverState *bs = blk_bs(blk);
1813
1814    if (bs) {
1815        return bdrv_get_flags(bs);
1816    } else {
1817        return blk->root_state.open_flags;
1818    }
1819}
1820
1821/* Returns the minimum request alignment, in bytes; guaranteed nonzero */
1822uint32_t blk_get_request_alignment(BlockBackend *blk)
1823{
1824    BlockDriverState *bs = blk_bs(blk);
1825    return bs ? bs->bl.request_alignment : BDRV_SECTOR_SIZE;
1826}
1827
1828/* Returns the maximum transfer length, in bytes; guaranteed nonzero */
1829uint32_t blk_get_max_transfer(BlockBackend *blk)
1830{
1831    BlockDriverState *bs = blk_bs(blk);
1832    uint32_t max = 0;
1833
1834    if (bs) {
1835        max = bs->bl.max_transfer;
1836    }
1837    return MIN_NON_ZERO(max, INT_MAX);
1838}
1839
1840int blk_get_max_iov(BlockBackend *blk)
1841{
1842    return blk->root->bs->bl.max_iov;
1843}
1844
1845void blk_set_guest_block_size(BlockBackend *blk, int align)
1846{
1847    blk->guest_block_size = align;
1848}
1849
1850void *blk_try_blockalign(BlockBackend *blk, size_t size)
1851{
1852    return qemu_try_blockalign(blk ? blk_bs(blk) : NULL, size);
1853}
1854
1855void *blk_blockalign(BlockBackend *blk, size_t size)
1856{
1857    return qemu_blockalign(blk ? blk_bs(blk) : NULL, size);
1858}
1859
1860bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
1861{
1862    BlockDriverState *bs = blk_bs(blk);
1863
1864    if (!bs) {
1865        return false;
1866    }
1867
1868    return bdrv_op_is_blocked(bs, op, errp);
1869}
1870
1871void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason)
1872{
1873    BlockDriverState *bs = blk_bs(blk);
1874
1875    if (bs) {
1876        bdrv_op_unblock(bs, op, reason);
1877    }
1878}
1879
1880void blk_op_block_all(BlockBackend *blk, Error *reason)
1881{
1882    BlockDriverState *bs = blk_bs(blk);
1883
1884    if (bs) {
1885        bdrv_op_block_all(bs, reason);
1886    }
1887}
1888
1889void blk_op_unblock_all(BlockBackend *blk, Error *reason)
1890{
1891    BlockDriverState *bs = blk_bs(blk);
1892
1893    if (bs) {
1894        bdrv_op_unblock_all(bs, reason);
1895    }
1896}
1897
1898AioContext *blk_get_aio_context(BlockBackend *blk)
1899{
1900    BlockDriverState *bs = blk_bs(blk);
1901
1902    if (bs) {
1903        AioContext *ctx = bdrv_get_aio_context(bs);
1904        assert(ctx == blk->ctx);
1905    }
1906
1907    return blk->ctx;
1908}
1909
1910static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
1911{
1912    BlockBackendAIOCB *blk_acb = DO_UPCAST(BlockBackendAIOCB, common, acb);
1913    return blk_get_aio_context(blk_acb->blk);
1914}
1915
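/*
 * Moves @blk (and, if @update_root_node is true, its root node) to
 * @new_context.  If a node is attached and the backend is throttled, the
 * throttle group member is re-attached to the new context under drain.
 * blk->ctx is updated last.
 */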
1916static int blk_do_set_aio_context(BlockBackend *blk, AioContext *new_context,
1917                                  bool update_root_node, Error **errp)
1918{
1919    BlockDriverState *bs = blk_bs(blk);
1920    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
1921    int ret;
1922
1923    if (bs) {
1924        if (update_root_node) {
1925            ret = bdrv_child_try_set_aio_context(bs, new_context, blk->root,
1926                                                 errp);
1927            if (ret < 0) {
1928                return ret;
1929            }
1930        }
1931        if (tgm->throttle_state) {
1932            bdrv_drained_begin(bs);
1933            throttle_group_detach_aio_context(tgm);
1934            throttle_group_attach_aio_context(tgm, new_context);
1935            bdrv_drained_end(bs);
1936        }
1937    }
1938
1939    blk->ctx = new_context;
1940    return 0;
1941}
1942
1943int blk_set_aio_context(BlockBackend *blk, AioContext *new_context,
1944                        Error **errp)
1945{
1946    return blk_do_set_aio_context(blk, new_context, true, errp);
1947}
1948
1949static bool blk_root_can_set_aio_ctx(BdrvChild *child, AioContext *ctx,
1950                                     GSList **ignore, Error **errp)
1951{
1952    BlockBackend *blk = child->opaque;
1953
1954    if (blk->allow_aio_context_change) {
1955        return true;
1956    }
1957
1958    /* Only manually created BlockBackends that are not attached to anything
1959     * can change their AioContext without updating their user. */
1960    if (!blk->name || blk->dev) {
1961        /* TODO Add BB name/QOM path */
1962        error_setg(errp, "Cannot change iothread of active block backend");
1963        return false;
1964    }
1965
1966    return true;
1967}
1968
1969static void blk_root_set_aio_ctx(BdrvChild *child, AioContext *ctx,
1970                                 GSList **ignore)
1971{
1972    BlockBackend *blk = child->opaque;
1973    blk_do_set_aio_context(blk, ctx, false, &error_abort);
1974}
1975
1976void blk_add_aio_context_notifier(BlockBackend *blk,
1977        void (*attached_aio_context)(AioContext *new_context, void *opaque),
1978        void (*detach_aio_context)(void *opaque), void *opaque)
1979{
1980    BlockBackendAioNotifier *notifier;
1981    BlockDriverState *bs = blk_bs(blk);
1982
1983    notifier = g_new(BlockBackendAioNotifier, 1);
1984    notifier->attached_aio_context = attached_aio_context;
1985    notifier->detach_aio_context = detach_aio_context;
1986    notifier->opaque = opaque;
1987    QLIST_INSERT_HEAD(&blk->aio_notifiers, notifier, list);
1988
1989    if (bs) {
1990        bdrv_add_aio_context_notifier(bs, attached_aio_context,
1991                                      detach_aio_context, opaque);
1992    }
1993}
1994
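/*
 * Removes a notifier added with blk_add_aio_context_notifier().  The
 * notifier must have been registered before; otherwise this aborts.
 */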
1995void blk_remove_aio_context_notifier(BlockBackend *blk,
1996                                     void (*attached_aio_context)(AioContext *,
1997                                                                  void *),
1998                                     void (*detach_aio_context)(void *),
1999                                     void *opaque)
2000{
2001    BlockBackendAioNotifier *notifier;
2002    BlockDriverState *bs = blk_bs(blk);
2003
2004    if (bs) {
2005        bdrv_remove_aio_context_notifier(bs, attached_aio_context,
2006                                         detach_aio_context, opaque);
2007    }
2008
2009    QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
2010        if (notifier->attached_aio_context == attached_aio_context &&
2011            notifier->detach_aio_context == detach_aio_context &&
2012            notifier->opaque == opaque) {
2013            QLIST_REMOVE(notifier, list);
2014            g_free(notifier);
2015            return;
2016        }
2017    }
2018
2019    abort();
2020}
2021
2022void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify)
2023{
2024    notifier_list_add(&blk->remove_bs_notifiers, notify);
2025}
2026
2027void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify)
2028{
2029    notifier_list_add(&blk->insert_bs_notifiers, notify);
2030}
2031
2032void blk_io_plug(BlockBackend *blk)
2033{
2034    BlockDriverState *bs = blk_bs(blk);
2035
2036    if (bs) {
2037        bdrv_io_plug(bs);
2038    }
2039}
2040
2041void blk_io_unplug(BlockBackend *blk)
2042{
2043    BlockDriverState *bs = blk_bs(blk);
2044
2045    if (bs) {
2046        bdrv_io_unplug(bs);
2047    }
2048}
2049
2050BlockAcctStats *blk_get_stats(BlockBackend *blk)
2051{
2052    return &blk->stats;
2053}
2054
2055void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
2056                  BlockCompletionFunc *cb, void *opaque)
2057{
2058    return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
2059}
2060
2061int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
2062                                      int bytes, BdrvRequestFlags flags)
2063{
2064    return blk_co_pwritev(blk, offset, bytes, NULL,
2065                          flags | BDRV_REQ_ZERO_WRITE);
2066}
2067
2068int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf,
2069                          int count)
2070{
2071    return blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
2072                   BDRV_REQ_WRITE_COMPRESSED);
2073}
2074
2075int blk_truncate(BlockBackend *blk, int64_t offset, bool exact,
2076                 PreallocMode prealloc, Error **errp)
2077{
2078    if (!blk_is_available(blk)) {
2079        error_setg(errp, "No medium inserted");
2080        return -ENOMEDIUM;
2081    }
2082
2083    return bdrv_truncate(blk->root, offset, exact, prealloc, errp);
2084}
2085
2086static void blk_pdiscard_entry(void *opaque)
2087{
2088    BlkRwCo *rwco = opaque;
2089    QEMUIOVector *qiov = rwco->iobuf;
2090
2091    rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, qiov->size);
2092    aio_wait_kick();
2093}
2094
2095int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
2096{
2097    return blk_prw(blk, offset, NULL, bytes, blk_pdiscard_entry, 0);
2098}
2099
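/*
 * Writes VM state data at @pos.  If the backend is in writethrough mode
 * (write cache disabled) and the whole buffer was written, the result is
 * flushed so the state reaches stable storage.  Returns @size on success,
 * negative errno on failure.
 */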
2100int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
2101                     int64_t pos, int size)
2102{
2103    int ret;
2104
2105    if (!blk_is_available(blk)) {
2106        return -ENOMEDIUM;
2107    }
2108
2109    ret = bdrv_save_vmstate(blk_bs(blk), buf, pos, size);
2110    if (ret < 0) {
2111        return ret;
2112    }
2113
2114    if (ret == size && !blk->enable_write_cache) {
2115        ret = bdrv_flush(blk_bs(blk));
2116    }
2117
2118    return ret < 0 ? ret : size;
2119}
2120
2121int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size)
2122{
2123    if (!blk_is_available(blk)) {
2124        return -ENOMEDIUM;
2125    }
2126
2127    return bdrv_load_vmstate(blk_bs(blk), buf, pos, size);
2128}
2129
2130int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz)
2131{
2132    if (!blk_is_available(blk)) {
2133        return -ENOMEDIUM;
2134    }
2135
2136    return bdrv_probe_blocksizes(blk_bs(blk), bsz);
2137}
2138
2139int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo)
2140{
2141    if (!blk_is_available(blk)) {
2142        return -ENOMEDIUM;
2143    }
2144
2145    return bdrv_probe_geometry(blk_bs(blk), geo);
2146}
2147
2148/*
2149 * Updates the BlockBackendRootState object with data from the currently
2150 * attached BlockDriverState.
2151 */
2152void blk_update_root_state(BlockBackend *blk)
2153{
2154    assert(blk->root);
2155
2156    blk->root_state.open_flags    = blk->root->bs->open_flags;
2157    blk->root_state.read_only     = blk->root->bs->read_only;
2158    blk->root_state.detect_zeroes = blk->root->bs->detect_zeroes;
2159}
2160
2161/*
2162 * Returns the detect-zeroes setting to be used for bdrv_open() of a
2163 * BlockDriverState which is supposed to inherit the root state.
2164 */
2165bool blk_get_detect_zeroes_from_root_state(BlockBackend *blk)
2166{
2167    return blk->root_state.detect_zeroes;
2168}
2169
2170/*
2171 * Returns the flags to be used for bdrv_open() of a BlockDriverState which is
2172 * supposed to inherit the root state.
2173 */
2174int blk_get_open_flags_from_root_state(BlockBackend *blk)
2175{
2176    int bs_flags;
2177
2178    bs_flags = blk->root_state.read_only ? 0 : BDRV_O_RDWR;
2179    bs_flags |= blk->root_state.open_flags & ~BDRV_O_RDWR;
2180
2181    return bs_flags;
2182}
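
/*
 * Illustrative sketch (not part of this file): how a medium-change path
 * might use the root-state helpers above when replacing the BDS tree.
 * "filename" and "new_opts" are hypothetical.
 *
 *     blk_update_root_state(blk);             // snapshot the old root's options
 *     blk_remove_bs(blk);                     // detach the old tree
 *     bs = bdrv_open(filename, NULL, new_opts,
 *                    blk_get_open_flags_from_root_state(blk), errp);
 *     // ... then attach bs to blk again
 */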
2183
2184BlockBackendRootState *blk_get_root_state(BlockBackend *blk)
2185{
2186    return &blk->root_state;
2187}
2188
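/*
 * Commits the root node of every inserted BlockBackend that has a backing
 * file into that backing file.  Stops at the first error and returns it,
 * otherwise returns 0.
 */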
2189int blk_commit_all(void)
2190{
2191    BlockBackend *blk = NULL;
2192
2193    while ((blk = blk_all_next(blk)) != NULL) {
2194        AioContext *aio_context = blk_get_aio_context(blk);
2195
2196        aio_context_acquire(aio_context);
2197        if (blk_is_inserted(blk) && blk->root->bs->backing) {
2198            int ret = bdrv_commit(blk->root->bs);
2199            if (ret < 0) {
2200                aio_context_release(aio_context);
2201                return ret;
2202            }
2203        }
2204        aio_context_release(aio_context);
2205    }
2206    return 0;
2207}
2208
2209
2210/* throttling disk I/O limits */
2211void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
2212{
2213    throttle_group_config(&blk->public.throttle_group_member, cfg);
2214}
2215
2216void blk_io_limits_disable(BlockBackend *blk)
2217{
2218    BlockDriverState *bs = blk_bs(blk);
2219    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2220    assert(tgm->throttle_state);
2221    if (bs) {
2222        bdrv_drained_begin(bs);
2223    }
2224    throttle_group_unregister_tgm(tgm);
2225    if (bs) {
2226        bdrv_drained_end(bs);
2227    }
2228}
2229
2230/* This should be called before blk_set_io_limits() if a limit is set */
2231void blk_io_limits_enable(BlockBackend *blk, const char *group)
2232{
2233    assert(!blk->public.throttle_group_member.throttle_state);
2234    throttle_group_register_tgm(&blk->public.throttle_group_member,
2235                                group, blk_get_aio_context(blk));
2236}
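
/*
 * Illustrative sketch (not part of this file): typical use of the two calls
 * above.  The group name and the limit values are made up for the example.
 *
 *     ThrottleConfig cfg;
 *
 *     throttle_config_init(&cfg);
 *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024; // 10 MB/s
 *
 *     blk_io_limits_enable(blk, "example-group");
 *     blk_set_io_limits(blk, &cfg);
 */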
2237
2238void blk_io_limits_update_group(BlockBackend *blk, const char *group)
2239{
2240    /* this BB is not part of any group */
2241    if (!blk->public.throttle_group_member.throttle_state) {
2242        return;
2243    }
2244
2245    /* this BB is already a part of the group we want */
2246    if (!g_strcmp0(throttle_group_get_name(&blk->public.throttle_group_member),
2247                group)) {
2248        return;
2249    }
2250
2251    /* need to change the group this blk belongs to */
2252    blk_io_limits_disable(blk);
2253    blk_io_limits_enable(blk, group);
2254}
2255
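/*
 * Drain callbacks for the root child: drained_begin lifts the I/O limits so
 * throttled requests can drain and, on the first nesting level, notifies the
 * attached device model; drained_end restores throttling and, when the last
 * drained section ends, notifies the device model and resumes requests that
 * were queued while quiescing.
 */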
2256static void blk_root_drained_begin(BdrvChild *child)
2257{
2258    BlockBackend *blk = child->opaque;
2259
2260    if (++blk->quiesce_counter == 1) {
2261        if (blk->dev_ops && blk->dev_ops->drained_begin) {
2262            blk->dev_ops->drained_begin(blk->dev_opaque);
2263        }
2264    }
2265
2266    /* Note that blk->root may not be accessible here yet if we are just
2267     * attaching to a BlockDriverState that is drained. Use child instead. */
2268
2269    if (atomic_fetch_inc(&blk->public.throttle_group_member.io_limits_disabled) == 0) {
2270        throttle_group_restart_tgm(&blk->public.throttle_group_member);
2271    }
2272}
2273
2274static bool blk_root_drained_poll(BdrvChild *child)
2275{
2276    BlockBackend *blk = child->opaque;
2277    assert(blk->quiesce_counter);
2278    return !!blk->in_flight;
2279}
2280
2281static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter)
2282{
2283    BlockBackend *blk = child->opaque;
2284    assert(blk->quiesce_counter);
2285
2286    assert(blk->public.throttle_group_member.io_limits_disabled);
2287    atomic_dec(&blk->public.throttle_group_member.io_limits_disabled);
2288
2289    if (--blk->quiesce_counter == 0) {
2290        if (blk->dev_ops && blk->dev_ops->drained_end) {
2291            blk->dev_ops->drained_end(blk->dev_opaque);
2292        }
2293        while (qemu_co_enter_next(&blk->queued_requests, NULL)) {
2294            /* Resume all queued requests */
2295        }
2296    }
2297}
2298
2299void blk_register_buf(BlockBackend *blk, void *host, size_t size)
2300{
2301    bdrv_register_buf(blk_bs(blk), host, size);
2302}
2303
2304void blk_unregister_buf(BlockBackend *blk, void *host)
2305{
2306    bdrv_unregister_buf(blk_bs(blk), host);
2307}
2308
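/*
 * Checks both byte ranges against their respective backends before
 * delegating the copy to bdrv_co_copy_range().
 */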
2309int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in,
2310                                   BlockBackend *blk_out, int64_t off_out,
2311                                   int bytes, BdrvRequestFlags read_flags,
2312                                   BdrvRequestFlags write_flags)
2313{
2314    int r;
2315    r = blk_check_byte_request(blk_in, off_in, bytes);
2316    if (r) {
2317        return r;
2318    }
2319    r = blk_check_byte_request(blk_out, off_out, bytes);
2320    if (r) {
2321        return r;
2322    }
2323    return bdrv_co_copy_range(blk_in->root, off_in,
2324                              blk_out->root, off_out,
2325                              bytes, read_flags, write_flags);
2326}
2327
2328const BdrvChild *blk_root(BlockBackend *blk)
2329{
2330    return blk->root;
2331}
2332