qemu/block/block-backend.c
/*
 * QEMU Block backends
 *
 * Copyright (C) 2014-2016 Red Hat, Inc.
 *
 * Authors:
 *  Markus Armbruster <armbru@redhat.com>,
 *
 * This work is licensed under the terms of the GNU LGPL, version 2.1
 * or later.  See the COPYING.LIB file in the top-level directory.
 */

#include "qemu/osdep.h"
#include "sysemu/block-backend.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "block/throttle-groups.h"
#include "hw/qdev-core.h"
#include "sysemu/blockdev.h"
#include "sysemu/runstate.h"
#include "sysemu/replay.h"
#include "qapi/error.h"
#include "qapi/qapi-events-block.h"
#include "qemu/id.h"
#include "qemu/main-loop.h"
#include "qemu/option.h"
#include "trace.h"
#include "migration/misc.h"

/* Number of coroutines to reserve per attached device model */
#define COROUTINE_POOL_RESERVATION 64

#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb);

typedef struct BlockBackendAioNotifier {
    void (*attached_aio_context)(AioContext *new_context, void *opaque);
    void (*detach_aio_context)(void *opaque);
    void *opaque;
    QLIST_ENTRY(BlockBackendAioNotifier) list;
} BlockBackendAioNotifier;

struct BlockBackend {
    char *name;
    int refcnt;
    BdrvChild *root;
    AioContext *ctx;
    DriveInfo *legacy_dinfo;    /* null unless created by drive_new() */
    QTAILQ_ENTRY(BlockBackend) link;         /* for block_backends */
    QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */
    BlockBackendPublic public;

    DeviceState *dev;           /* attached device model, if any */
    const BlockDevOps *dev_ops;
    void *dev_opaque;

    /* the block size for which the guest device expects atomicity */
    int guest_block_size;

    /* If the BDS tree is removed, some of its options are stored here (which
     * can be used to restore those options in the new BDS on insert) */
    BlockBackendRootState root_state;

    bool enable_write_cache;

    /* I/O stats (display with "info blockstats"). */
    BlockAcctStats stats;

    BlockdevOnError on_read_error, on_write_error;
    bool iostatus_enabled;
    BlockDeviceIoStatus iostatus;

    uint64_t perm;
    uint64_t shared_perm;
    bool disable_perm;

    bool allow_aio_context_change;
    bool allow_write_beyond_eof;

    NotifierList remove_bs_notifiers, insert_bs_notifiers;
    QLIST_HEAD(, BlockBackendAioNotifier) aio_notifiers;

    int quiesce_counter;
    CoQueue queued_requests;
    bool disable_request_queuing;

    VMChangeStateEntry *vmsh;
    bool force_allow_inactivate;

    /* Number of in-flight aio requests.  BlockDriverState also counts
     * in-flight requests but aio requests can exist even when blk->root is
     * NULL, so we cannot rely on its counter for that case.
     * Accessed with atomic ops.
     */
    unsigned int in_flight;
};

typedef struct BlockBackendAIOCB {
    BlockAIOCB common;
    BlockBackend *blk;
    int ret;
} BlockBackendAIOCB;

static const AIOCBInfo block_backend_aiocb_info = {
    .get_aio_context = blk_aiocb_get_aio_context,
    .aiocb_size = sizeof(BlockBackendAIOCB),
};

static void drive_info_del(DriveInfo *dinfo);
static BlockBackend *bdrv_first_blk(BlockDriverState *bs);

/* All BlockBackends */
static QTAILQ_HEAD(, BlockBackend) block_backends =
    QTAILQ_HEAD_INITIALIZER(block_backends);

/* All BlockBackends referenced by the monitor and which are iterated through by
 * blk_next() */
static QTAILQ_HEAD(, BlockBackend) monitor_block_backends =
    QTAILQ_HEAD_INITIALIZER(monitor_block_backends);

static void blk_root_inherit_options(BdrvChildRole role, bool parent_is_format,
                                     int *child_flags, QDict *child_options,
                                     int parent_flags, QDict *parent_options)
{
    /* We're not supposed to call this function for root nodes */
    abort();
}
static void blk_root_drained_begin(BdrvChild *child);
static bool blk_root_drained_poll(BdrvChild *child);
static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter);

static void blk_root_change_media(BdrvChild *child, bool load);
static void blk_root_resize(BdrvChild *child);

static bool blk_root_can_set_aio_ctx(BdrvChild *child, AioContext *ctx,
                                     GSList **ignore, Error **errp);
static void blk_root_set_aio_ctx(BdrvChild *child, AioContext *ctx,
                                 GSList **ignore);

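/*
 * Build a human-readable description of @child's BlockBackend for use in
 * error messages: the monitor name if it has one, otherwise the attached
 * device's ID or QOM path.
 */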
static char *blk_root_get_parent_desc(BdrvChild *child)
{
    BlockBackend *blk = child->opaque;
    g_autofree char *dev_id = NULL;

    if (blk->name) {
        return g_strdup_printf("block device '%s'", blk->name);
    }

    dev_id = blk_get_attached_dev_id(blk);
    if (*dev_id) {
        return g_strdup_printf("block device '%s'", dev_id);
    } else {
        /* TODO Callback into the BB owner for something more detailed */
        return g_strdup("an unnamed block device");
    }
}

static const char *blk_root_get_name(BdrvChild *child)
{
    return blk_name(child->opaque);
}

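/*
 * VM state change handler installed by blk_root_activate() while migration is
 * still in progress: once the VM leaves RUN_STATE_INMIGRATE, apply the
 * BlockBackend's real shared permissions and drop the handler again.
 */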
static void blk_vm_state_changed(void *opaque, bool running, RunState state)
{
    Error *local_err = NULL;
    BlockBackend *blk = opaque;

    if (state == RUN_STATE_INMIGRATE) {
        return;
    }

    qemu_del_vm_change_state_handler(blk->vmsh);
    blk->vmsh = NULL;
    blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
    if (local_err) {
        error_report_err(local_err);
    }
}

/*
 * Notifies the user of the BlockBackend that migration has completed. qdev
 * devices can tighten their permissions in response (specifically revoke
 * shared write permissions that we needed for storage migration).
 *
 * If an error is returned, the VM cannot be allowed to be resumed.
 */
static void blk_root_activate(BdrvChild *child, Error **errp)
{
    BlockBackend *blk = child->opaque;
    Error *local_err = NULL;

    if (!blk->disable_perm) {
        return;
    }

    blk->disable_perm = false;

    blk_set_perm(blk, blk->perm, BLK_PERM_ALL, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        blk->disable_perm = true;
        return;
    }

    if (runstate_check(RUN_STATE_INMIGRATE)) {
        /* Activation can happen when migration process is still active, for
         * example when nbd_server_add is called during non-shared storage
         * migration. Defer the shared_perm update to migration completion. */
        if (!blk->vmsh) {
            blk->vmsh = qemu_add_vm_change_state_handler(blk_vm_state_changed,
                                                         blk);
        }
        return;
    }

    blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        blk->disable_perm = true;
        return;
    }
}

void blk_set_force_allow_inactivate(BlockBackend *blk)
{
    blk->force_allow_inactivate = true;
}

static bool blk_can_inactivate(BlockBackend *blk)
{
    /* If it is a guest device, inactivate is ok. */
    if (blk->dev || blk_name(blk)[0]) {
        return true;
    }

    /* Inactivating means no more writes to the image can be done,
     * even if those writes would be changes invisible to the
     * guest.  For block job BBs that satisfy this, we can just allow
     * it.  This is the case for mirror job source, which is required
     * by libvirt non-shared block migration. */
    if (!(blk->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED))) {
        return true;
    }

    return blk->force_allow_inactivate;
}

static int blk_root_inactivate(BdrvChild *child)
{
    BlockBackend *blk = child->opaque;

    if (blk->disable_perm) {
        return 0;
    }

    if (!blk_can_inactivate(blk)) {
        return -EPERM;
    }

    blk->disable_perm = true;
    if (blk->root) {
        bdrv_child_try_set_perm(blk->root, 0, BLK_PERM_ALL, &error_abort);
    }

    return 0;
}

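/*
 * A BlockDriverState has been attached to blk->root: register all of the
 * BlockBackend's AioContext notifiers with the new node.
 */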
static void blk_root_attach(BdrvChild *child)
{
    BlockBackend *blk = child->opaque;
    BlockBackendAioNotifier *notifier;

    trace_blk_root_attach(child, blk, child->bs);

    QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
        bdrv_add_aio_context_notifier(child->bs,
                notifier->attached_aio_context,
                notifier->detach_aio_context,
                notifier->opaque);
    }
}

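/*
 * blk->root is about to be detached: unregister the BlockBackend's
 * AioContext notifiers from the node again.
 */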
static void blk_root_detach(BdrvChild *child)
{
    BlockBackend *blk = child->opaque;
    BlockBackendAioNotifier *notifier;

    trace_blk_root_detach(child, blk, child->bs);

    QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
        bdrv_remove_aio_context_notifier(child->bs,
                notifier->attached_aio_context,
                notifier->detach_aio_context,
                notifier->opaque);
    }
}

static AioContext *blk_root_get_parent_aio_context(BdrvChild *c)
{
    BlockBackend *blk = c->opaque;

    return blk_get_aio_context(blk);
}

static const BdrvChildClass child_root = {
    .inherit_options    = blk_root_inherit_options,

    .change_media       = blk_root_change_media,
    .resize             = blk_root_resize,
    .get_name           = blk_root_get_name,
    .get_parent_desc    = blk_root_get_parent_desc,

    .drained_begin      = blk_root_drained_begin,
    .drained_poll       = blk_root_drained_poll,
    .drained_end        = blk_root_drained_end,

    .activate           = blk_root_activate,
    .inactivate         = blk_root_inactivate,

    .attach             = blk_root_attach,
    .detach             = blk_root_detach,

    .can_set_aio_ctx    = blk_root_can_set_aio_ctx,
    .set_aio_ctx        = blk_root_set_aio_ctx,

    .get_parent_aio_context = blk_root_get_parent_aio_context,
};

/*
 * Create a new BlockBackend with a reference count of one.
 *
 * @perm is a bitmask of BLK_PERM_* constants which describes the permissions
 * to request for a block driver node that is attached to this BlockBackend.
 * @shared_perm is a bitmask which describes which permissions may be granted
 * to other users of the attached node.
 * Both sets of permissions can be changed later using blk_set_perm().
 *
 * Return the new BlockBackend on success, null on failure.
 */
BlockBackend *blk_new(AioContext *ctx, uint64_t perm, uint64_t shared_perm)
{
    BlockBackend *blk;

    blk = g_new0(BlockBackend, 1);
    blk->refcnt = 1;
    blk->ctx = ctx;
    blk->perm = perm;
    blk->shared_perm = shared_perm;
    blk_set_enable_write_cache(blk, true);

    blk->on_read_error = BLOCKDEV_ON_ERROR_REPORT;
    blk->on_write_error = BLOCKDEV_ON_ERROR_ENOSPC;

    block_acct_init(&blk->stats);

    qemu_co_queue_init(&blk->queued_requests);
    notifier_list_init(&blk->remove_bs_notifiers);
    notifier_list_init(&blk->insert_bs_notifiers);
    QLIST_INIT(&blk->aio_notifiers);

    QTAILQ_INSERT_TAIL(&block_backends, blk, link);
    return blk;
}

/*
 * Create a new BlockBackend connected to an existing BlockDriverState.
 *
 * @perm is a bitmask of BLK_PERM_* constants which describes the
 * permissions to request for @bs that is attached to this
 * BlockBackend.  @shared_perm is a bitmask which describes which
 * permissions may be granted to other users of the attached node.
 * Both sets of permissions can be changed later using blk_set_perm().
 *
 * Return the new BlockBackend on success, null on failure.
 */
BlockBackend *blk_new_with_bs(BlockDriverState *bs, uint64_t perm,
                              uint64_t shared_perm, Error **errp)
{
    BlockBackend *blk = blk_new(bdrv_get_aio_context(bs), perm, shared_perm);

    if (blk_insert_bs(blk, bs, errp) < 0) {
        blk_unref(blk);
        return NULL;
    }
    return blk;
}

/*
 * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
 * The new BlockBackend is in the main AioContext.
 *
 * Just as with bdrv_open(), after having called this function the reference to
 * @options belongs to the block layer (even on failure).
 *
 * TODO: Remove @filename and @flags; it should be possible to specify a whole
 * BDS tree just by specifying the @options QDict (or @reference,
 * alternatively). At the time of adding this function, this is not possible,
 * though, so callers of this function have to be able to specify @filename and
 * @flags.
 */
BlockBackend *blk_new_open(const char *filename, const char *reference,
                           QDict *options, int flags, Error **errp)
{
    BlockBackend *blk;
    BlockDriverState *bs;
    uint64_t perm = 0;
    uint64_t shared = BLK_PERM_ALL;

    /*
     * blk_new_open() is mainly used in .bdrv_create implementations and the
     * tools where sharing isn't a major concern because the BDS stays private
     * and the file is generally not supposed to be used by a second process,
     * so we just request permission according to the flags.
     *
     * The exceptions are xen_disk and blockdev_init(); in these cases, the
     * caller of blk_new_open() doesn't make use of the permissions, but they
     * shouldn't hurt either. We can still share everything here because the
     * guest devices will add their own blockers if they can't share.
     */
    if ((flags & BDRV_O_NO_IO) == 0) {
        perm |= BLK_PERM_CONSISTENT_READ;
        if (flags & BDRV_O_RDWR) {
            perm |= BLK_PERM_WRITE;
        }
    }
    if (flags & BDRV_O_RESIZE) {
        perm |= BLK_PERM_RESIZE;
    }
    if (flags & BDRV_O_NO_SHARE) {
        shared = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
    }

    blk = blk_new(qemu_get_aio_context(), perm, shared);
    bs = bdrv_open(filename, reference, options, flags, errp);
    if (!bs) {
        blk_unref(blk);
        return NULL;
    }

    blk->root = bdrv_root_attach_child(bs, "root", &child_root,
                                       BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
                                       perm, shared, blk, errp);
    if (!blk->root) {
        blk_unref(blk);
        return NULL;
    }

    return blk;
}

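/* Frees @blk after the last reference has been dropped (see blk_unref()). */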
static void blk_delete(BlockBackend *blk)
{
    assert(!blk->refcnt);
    assert(!blk->name);
    assert(!blk->dev);
    if (blk->public.throttle_group_member.throttle_state) {
        blk_io_limits_disable(blk);
    }
    if (blk->root) {
        blk_remove_bs(blk);
    }
    if (blk->vmsh) {
        qemu_del_vm_change_state_handler(blk->vmsh);
        blk->vmsh = NULL;
    }
    assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers));
    assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers));
    assert(QLIST_EMPTY(&blk->aio_notifiers));
    QTAILQ_REMOVE(&block_backends, blk, link);
    drive_info_del(blk->legacy_dinfo);
    block_acct_cleanup(&blk->stats);
    g_free(blk);
}

static void drive_info_del(DriveInfo *dinfo)
{
    if (!dinfo) {
        return;
    }
    qemu_opts_del(dinfo->opts);
    g_free(dinfo);
}

int blk_get_refcnt(BlockBackend *blk)
{
    return blk ? blk->refcnt : 0;
}

/*
 * Increment @blk's reference count.
 * @blk must not be null.
 */
void blk_ref(BlockBackend *blk)
{
    assert(blk->refcnt > 0);
    blk->refcnt++;
}

/*
 * Decrement @blk's reference count.
 * If this drops it to zero, destroy @blk.
 * For convenience, do nothing if @blk is null.
 */
void blk_unref(BlockBackend *blk)
{
    if (blk) {
        assert(blk->refcnt > 0);
        if (blk->refcnt > 1) {
            blk->refcnt--;
        } else {
            blk_drain(blk);
            /* blk_drain() cannot resurrect blk, nobody held a reference */
            assert(blk->refcnt == 1);
            blk->refcnt = 0;
            blk_delete(blk);
        }
    }
}

/*
 * Behaves similarly to blk_next() but iterates over all BlockBackends, even the
 * ones which are hidden (i.e. are not referenced by the monitor).
 */
BlockBackend *blk_all_next(BlockBackend *blk)
{
    return blk ? QTAILQ_NEXT(blk, link)
               : QTAILQ_FIRST(&block_backends);
}

void blk_remove_all_bs(void)
{
    BlockBackend *blk = NULL;

    while ((blk = blk_all_next(blk)) != NULL) {
        AioContext *ctx = blk_get_aio_context(blk);

        aio_context_acquire(ctx);
        if (blk->root) {
            blk_remove_bs(blk);
        }
        aio_context_release(ctx);
    }
}

/*
 * Return the monitor-owned BlockBackend after @blk.
 * If @blk is null, return the first one.
 * Else, return @blk's next sibling, which may be null.
 *
 * To iterate over all BlockBackends, do
 * for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
 *     ...
 * }
 */
BlockBackend *blk_next(BlockBackend *blk)
{
    return blk ? QTAILQ_NEXT(blk, monitor_link)
               : QTAILQ_FIRST(&monitor_block_backends);
}

/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
 * the monitor or attached to a BlockBackend */
BlockDriverState *bdrv_next(BdrvNextIterator *it)
{
    BlockDriverState *bs, *old_bs;

    /* Must be called from the main loop */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());

    /* First, return all root nodes of BlockBackends. In order to avoid
     * returning a BDS twice when multiple BBs refer to it, we only return it
     * if the BB is the first one in the parent list of the BDS. */
    if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
        BlockBackend *old_blk = it->blk;

        old_bs = old_blk ? blk_bs(old_blk) : NULL;

        do {
            it->blk = blk_all_next(it->blk);
            bs = it->blk ? blk_bs(it->blk) : NULL;
        } while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));

        if (it->blk) {
            blk_ref(it->blk);
        }
        blk_unref(old_blk);

        if (bs) {
            bdrv_ref(bs);
            bdrv_unref(old_bs);
            return bs;
        }
        it->phase = BDRV_NEXT_MONITOR_OWNED;
    } else {
        old_bs = it->bs;
    }

    /* Then return the monitor-owned BDSes without a BB attached. Ignore all
     * BDSes that are attached to a BlockBackend here; they have been handled
     * by the above block already */
    do {
        it->bs = bdrv_next_monitor_owned(it->bs);
        bs = it->bs;
    } while (bs && bdrv_has_blk(bs));

    if (bs) {
        bdrv_ref(bs);
    }
    bdrv_unref(old_bs);

    return bs;
}

static void bdrv_next_reset(BdrvNextIterator *it)
{
    *it = (BdrvNextIterator) {
        .phase = BDRV_NEXT_BACKEND_ROOTS,
    };
}

BlockDriverState *bdrv_first(BdrvNextIterator *it)
{
    bdrv_next_reset(it);
    return bdrv_next(it);
}

/* Must be called when aborting a bdrv_next() iteration before
 * bdrv_next() returns NULL */
void bdrv_next_cleanup(BdrvNextIterator *it)
{
    /* Must be called from the main loop */
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());

    if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
        if (it->blk) {
            bdrv_unref(blk_bs(it->blk));
            blk_unref(it->blk);
        }
    } else {
        bdrv_unref(it->bs);
    }

    bdrv_next_reset(it);
}

/*
 * Add a BlockBackend into the list of backends referenced by the monitor, with
 * the given @name acting as the handle for the monitor.
 * Strictly for use by blockdev.c.
 *
 * @name must not be null or empty.
 *
 * Returns true on success and false on failure. In the latter case, an Error
 * object is returned through @errp.
 */
bool monitor_add_blk(BlockBackend *blk, const char *name, Error **errp)
{
    assert(!blk->name);
    assert(name && name[0]);

    if (!id_wellformed(name)) {
        error_setg(errp, "Invalid device name");
        return false;
    }
    if (blk_by_name(name)) {
        error_setg(errp, "Device with id '%s' already exists", name);
        return false;
    }
    if (bdrv_find_node(name)) {
        error_setg(errp,
                   "Device name '%s' conflicts with an existing node name",
                   name);
        return false;
    }

    blk->name = g_strdup(name);
    QTAILQ_INSERT_TAIL(&monitor_block_backends, blk, monitor_link);
    return true;
}

/*
 * Remove a BlockBackend from the list of backends referenced by the monitor.
 * Strictly for use by blockdev.c.
 */
void monitor_remove_blk(BlockBackend *blk)
{
    if (!blk->name) {
        return;
    }

    QTAILQ_REMOVE(&monitor_block_backends, blk, monitor_link);
    g_free(blk->name);
    blk->name = NULL;
}

/*
 * Return @blk's name, a non-null string.
 * Returns an empty string iff @blk is not referenced by the monitor.
 */
const char *blk_name(const BlockBackend *blk)
{
    return blk->name ?: "";
}

/*
 * Return the BlockBackend with name @name if it exists, else null.
 * @name must not be null.
 */
BlockBackend *blk_by_name(const char *name)
{
    BlockBackend *blk = NULL;

    assert(name);
    while ((blk = blk_next(blk)) != NULL) {
        if (!strcmp(name, blk->name)) {
            return blk;
        }
    }
    return NULL;
}

/*
 * Return the BlockDriverState attached to @blk if any, else null.
 */
BlockDriverState *blk_bs(BlockBackend *blk)
{
    return blk->root ? blk->root->bs : NULL;
}

static BlockBackend *bdrv_first_blk(BlockDriverState *bs)
{
    BdrvChild *child;
    QLIST_FOREACH(child, &bs->parents, next_parent) {
        if (child->klass == &child_root) {
            return child->opaque;
        }
    }

    return NULL;
}

/*
 * Returns true if @bs has an associated BlockBackend.
 */
bool bdrv_has_blk(BlockDriverState *bs)
{
    return bdrv_first_blk(bs) != NULL;
}

/*
 * Returns true if @bs has only BlockBackends as parents.
 */
bool bdrv_is_root_node(BlockDriverState *bs)
{
    BdrvChild *c;

    QLIST_FOREACH(c, &bs->parents, next_parent) {
        if (c->klass != &child_root) {
            return false;
        }
    }

    return true;
}

/*
 * Return @blk's DriveInfo if any, else null.
 */
DriveInfo *blk_legacy_dinfo(BlockBackend *blk)
{
    return blk->legacy_dinfo;
}

/*
 * Set @blk's DriveInfo to @dinfo, and return it.
 * @blk must not have a DriveInfo set already.
 * No other BlockBackend may have the same DriveInfo set.
 */
DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo)
{
    assert(!blk->legacy_dinfo);
    return blk->legacy_dinfo = dinfo;
}

/*
 * Return the BlockBackend with DriveInfo @dinfo.
 * It must exist.
 */
BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
{
    BlockBackend *blk = NULL;

    while ((blk = blk_next(blk)) != NULL) {
        if (blk->legacy_dinfo == dinfo) {
            return blk;
        }
    }
    abort();
}

/*
 * Returns a pointer to the publicly accessible fields of @blk.
 */
BlockBackendPublic *blk_get_public(BlockBackend *blk)
{
    return &blk->public;
}

/*
 * Returns a BlockBackend given the associated @public fields.
 */
BlockBackend *blk_by_public(BlockBackendPublic *public)
{
    return container_of(public, BlockBackend, public);
}

/*
 * Disassociates the currently associated BlockDriverState from @blk.
 */
void blk_remove_bs(BlockBackend *blk)
{
    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
    BlockDriverState *bs;
    BdrvChild *root;

    notifier_list_notify(&blk->remove_bs_notifiers, blk);
    if (tgm->throttle_state) {
        bs = blk_bs(blk);
        bdrv_drained_begin(bs);
        throttle_group_detach_aio_context(tgm);
        throttle_group_attach_aio_context(tgm, qemu_get_aio_context());
        bdrv_drained_end(bs);
    }

    blk_update_root_state(blk);

    /* bdrv_root_unref_child() will cause blk->root to become stale and may
     * switch to a completion coroutine later on. Let's drain all I/O here
     * to avoid that and a potential QEMU crash.
     */
    blk_drain(blk);
    root = blk->root;
    blk->root = NULL;
    bdrv_root_unref_child(root);
}

/*
 * Associates a new BlockDriverState with @blk.
 */
int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
{
    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
    bdrv_ref(bs);
    blk->root = bdrv_root_attach_child(bs, "root", &child_root,
                                       BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
                                       blk->perm, blk->shared_perm,
                                       blk, errp);
    if (blk->root == NULL) {
        return -EPERM;
    }

    notifier_list_notify(&blk->insert_bs_notifiers, blk);
    if (tgm->throttle_state) {
        throttle_group_detach_aio_context(tgm);
        throttle_group_attach_aio_context(tgm, bdrv_get_aio_context(bs));
    }

    return 0;
}

/*
 * Sets the permission bitmasks that the user of the BlockBackend needs.
 */
int blk_set_perm(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
                 Error **errp)
{
    int ret;

    if (blk->root && !blk->disable_perm) {
        ret = bdrv_child_try_set_perm(blk->root, perm, shared_perm, errp);
        if (ret < 0) {
            return ret;
        }
    }

    blk->perm = perm;
    blk->shared_perm = shared_perm;

    return 0;
}

void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm)
{
    *perm = blk->perm;
    *shared_perm = blk->shared_perm;
}

/*
 * Attach device model @dev to @blk.
 * Return 0 on success, -EBUSY when a device model is attached already.
 */
int blk_attach_dev(BlockBackend *blk, DeviceState *dev)
{
    if (blk->dev) {
        return -EBUSY;
    }

    /* While migration is still incoming, we don't need to apply the
     * permissions of guest device BlockBackends. We might still have a block
     * job or NBD server writing to the image for storage migration. */
    if (runstate_check(RUN_STATE_INMIGRATE)) {
        blk->disable_perm = true;
    }

    blk_ref(blk);
    blk->dev = dev;
    blk_iostatus_reset(blk);

    return 0;
}

/*
 * Detach device model @dev from @blk.
 * @dev must be currently attached to @blk.
 */
void blk_detach_dev(BlockBackend *blk, DeviceState *dev)
{
    assert(blk->dev == dev);
    blk->dev = NULL;
    blk->dev_ops = NULL;
    blk->dev_opaque = NULL;
    blk->guest_block_size = 512;
    blk_set_perm(blk, 0, BLK_PERM_ALL, &error_abort);
    blk_unref(blk);
}

/*
 * Return the device model attached to @blk if any, else null.
 */
DeviceState *blk_get_attached_dev(BlockBackend *blk)
{
    return blk->dev;
}

/* Return the qdev ID, or if no ID is assigned the QOM path, of the block
 * device attached to the BlockBackend. */
char *blk_get_attached_dev_id(BlockBackend *blk)
{
    DeviceState *dev = blk->dev;

    if (!dev) {
        return g_strdup("");
    } else if (dev->id) {
        return g_strdup(dev->id);
    }

    return object_get_canonical_path(OBJECT(dev)) ?: g_strdup("");
}

/*
 * Return the BlockBackend which has the device model @dev attached if it
 * exists, else null.
 *
 * @dev must not be null.
 */
BlockBackend *blk_by_dev(void *dev)
{
    BlockBackend *blk = NULL;

    assert(dev != NULL);
    while ((blk = blk_all_next(blk)) != NULL) {
        if (blk->dev == dev) {
            return blk;
        }
    }
    return NULL;
}

/*
 * Set @blk's device model callbacks to @ops.
 * @opaque is the opaque argument to pass to the callbacks.
 * This is for use by device models.
 */
void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
                     void *opaque)
{
    blk->dev_ops = ops;
    blk->dev_opaque = opaque;

    /* Are we currently quiesced? Should we enforce this right now? */
    if (blk->quiesce_counter && ops->drained_begin) {
        ops->drained_begin(opaque);
    }
}

/*
 * Notify @blk's attached device model of media change.
 *
 * If @load is true, notify of media load. This action can fail, meaning that
 * the medium cannot be loaded. @errp is set then.
 *
 * If @load is false, notify of media eject. This can never fail.
 *
 * Also send DEVICE_TRAY_MOVED events as appropriate.
 */
void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp)
{
    if (blk->dev_ops && blk->dev_ops->change_media_cb) {
        bool tray_was_open, tray_is_open;
        Error *local_err = NULL;

        tray_was_open = blk_dev_is_tray_open(blk);
        blk->dev_ops->change_media_cb(blk->dev_opaque, load, &local_err);
        if (local_err) {
            assert(load == true);
            error_propagate(errp, local_err);
            return;
        }
        tray_is_open = blk_dev_is_tray_open(blk);

        if (tray_was_open != tray_is_open) {
            char *id = blk_get_attached_dev_id(blk);
            qapi_event_send_device_tray_moved(blk_name(blk), id, tray_is_open);
            g_free(id);
        }
    }
}

static void blk_root_change_media(BdrvChild *child, bool load)
{
    blk_dev_change_media_cb(child->opaque, load, NULL);
}

/*
 * Does @blk's attached device model have removable media?
 * %true if no device model is attached.
 */
bool blk_dev_has_removable_media(BlockBackend *blk)
{
    return !blk->dev || (blk->dev_ops && blk->dev_ops->change_media_cb);
}

/*
 * Does @blk's attached device model have a tray?
 */
bool blk_dev_has_tray(BlockBackend *blk)
{
    return blk->dev_ops && blk->dev_ops->is_tray_open;
}

/*
 * Notify @blk's attached device model of a media eject request.
 * If @force is true, the medium is about to be yanked out forcefully.
 */
void blk_dev_eject_request(BlockBackend *blk, bool force)
{
    if (blk->dev_ops && blk->dev_ops->eject_request_cb) {
        blk->dev_ops->eject_request_cb(blk->dev_opaque, force);
    }
}

/*
 * Does @blk's attached device model have a tray, and is it open?
 */
bool blk_dev_is_tray_open(BlockBackend *blk)
{
    if (blk_dev_has_tray(blk)) {
        return blk->dev_ops->is_tray_open(blk->dev_opaque);
    }
    return false;
}

/*
 * Does @blk's attached device model have the medium locked?
 * %false if the device model has no such lock.
 */
bool blk_dev_is_medium_locked(BlockBackend *blk)
{
    if (blk->dev_ops && blk->dev_ops->is_medium_locked) {
        return blk->dev_ops->is_medium_locked(blk->dev_opaque);
    }
    return false;
}

/*
 * Notify @blk's attached device model of a backend size change.
 */
static void blk_root_resize(BdrvChild *child)
{
    BlockBackend *blk = child->opaque;

    if (blk->dev_ops && blk->dev_ops->resize_cb) {
        blk->dev_ops->resize_cb(blk->dev_opaque);
    }
}

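/* Enable iostatus tracking for @blk and reset its status to OK. */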
void blk_iostatus_enable(BlockBackend *blk)
{
    blk->iostatus_enabled = true;
    blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
}

/* The I/O status is only enabled if the drive explicitly
 * enables it _and_ the VM is configured to stop on errors */
bool blk_iostatus_is_enabled(const BlockBackend *blk)
{
    return (blk->iostatus_enabled &&
           (blk->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
            blk->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
            blk->on_read_error == BLOCKDEV_ON_ERROR_STOP));
}

BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk)
{
    return blk->iostatus;
}

void blk_iostatus_disable(BlockBackend *blk)
{
    blk->iostatus_enabled = false;
}

void blk_iostatus_reset(BlockBackend *blk)
{
    if (blk_iostatus_is_enabled(blk)) {
        blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
    }
}

void blk_iostatus_set_err(BlockBackend *blk, int error)
{
    assert(blk_iostatus_is_enabled(blk));
    if (blk->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
        blk->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
                                          BLOCK_DEVICE_IO_STATUS_FAILED;
    }
}

void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow)
{
    blk->allow_write_beyond_eof = allow;
}

void blk_set_allow_aio_context_change(BlockBackend *blk, bool allow)
{
    blk->allow_aio_context_change = allow;
}

void blk_set_disable_request_queuing(BlockBackend *blk, bool disable)
{
    blk->disable_request_queuing = disable;
}

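/*
 * Validate an I/O request of @size bytes at @offset against @blk: a medium
 * must be inserted, @offset must not be negative, @size must not exceed
 * INT_MAX and, unless writing beyond EOF is explicitly allowed, the range
 * must lie within the current length of the image.
 */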
static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
                                  size_t size)
{
    int64_t len;

    if (size > INT_MAX) {
        return -EIO;
    }

    if (!blk_is_available(blk)) {
        return -ENOMEDIUM;
    }

    if (offset < 0) {
        return -EIO;
    }

    if (!blk->allow_write_beyond_eof) {
        len = blk_getlength(blk);
        if (len < 0) {
            return len;
        }

        if (offset > len || len - offset < size) {
            return -EIO;
        }
    }

    return 0;
}

/* To be called between exactly one pair of blk_inc/dec_in_flight() */
static void coroutine_fn blk_wait_while_drained(BlockBackend *blk)
{
    assert(blk->in_flight > 0);

    if (blk->quiesce_counter && !blk->disable_request_queuing) {
        blk_dec_in_flight(blk);
        qemu_co_queue_wait(&blk->queued_requests, NULL);
        blk_inc_in_flight(blk);
    }
}

/* To be called between exactly one pair of blk_inc/dec_in_flight() */
static int coroutine_fn
blk_do_preadv(BlockBackend *blk, int64_t offset, unsigned int bytes,
              QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    int ret;
    BlockDriverState *bs;

    blk_wait_while_drained(blk);

    /* Call blk_bs() only after waiting, the graph may have changed */
    bs = blk_bs(blk);
    trace_blk_co_preadv(blk, bs, offset, bytes, flags);

    ret = blk_check_byte_request(blk, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    bdrv_inc_in_flight(bs);

    /* throttling disk I/O */
    if (blk->public.throttle_group_member.throttle_state) {
        throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
                bytes, false);
    }

    ret = bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
    bdrv_dec_in_flight(bs);
    return ret;
}

int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
                               unsigned int bytes, QEMUIOVector *qiov,
                               BdrvRequestFlags flags)
{
    int ret;

    blk_inc_in_flight(blk);
    ret = blk_do_preadv(blk, offset, bytes, qiov, flags);
    blk_dec_in_flight(blk);

    return ret;
}

/* To be called between exactly one pair of blk_inc/dec_in_flight() */
static int coroutine_fn
blk_do_pwritev_part(BlockBackend *blk, int64_t offset, unsigned int bytes,
                    QEMUIOVector *qiov, size_t qiov_offset,
                    BdrvRequestFlags flags)
{
    int ret;
    BlockDriverState *bs;

    blk_wait_while_drained(blk);

    /* Call blk_bs() only after waiting, the graph may have changed */
    bs = blk_bs(blk);
    trace_blk_co_pwritev(blk, bs, offset, bytes, flags);

    ret = blk_check_byte_request(blk, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    bdrv_inc_in_flight(bs);
    /* throttling disk I/O */
    if (blk->public.throttle_group_member.throttle_state) {
        throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
                bytes, true);
    }

    if (!blk->enable_write_cache) {
        flags |= BDRV_REQ_FUA;
    }

    ret = bdrv_co_pwritev_part(blk->root, offset, bytes, qiov, qiov_offset,
                               flags);
    bdrv_dec_in_flight(bs);
    return ret;
}

int coroutine_fn blk_co_pwritev_part(BlockBackend *blk, int64_t offset,
                                     unsigned int bytes,
                                     QEMUIOVector *qiov, size_t qiov_offset,
                                     BdrvRequestFlags flags)
{
    int ret;

    blk_inc_in_flight(blk);
    ret = blk_do_pwritev_part(blk, offset, bytes, qiov, qiov_offset, flags);
    blk_dec_in_flight(blk);

    return ret;
}

int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
                                unsigned int bytes, QEMUIOVector *qiov,
                                BdrvRequestFlags flags)
{
    return blk_co_pwritev_part(blk, offset, bytes, qiov, 0, flags);
}

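/*
 * Argument/result bundle passed to the coroutine entry points used by the
 * synchronous (blk_prw()) and asynchronous (blk_aio_prwv()) emulation
 * helpers below.
 */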
typedef struct BlkRwCo {
    BlockBackend *blk;
    int64_t offset;
    void *iobuf;
    int ret;
    BdrvRequestFlags flags;
} BlkRwCo;

static void blk_read_entry(void *opaque)
{
    BlkRwCo *rwco = opaque;
    QEMUIOVector *qiov = rwco->iobuf;

    rwco->ret = blk_do_preadv(rwco->blk, rwco->offset, qiov->size,
                              qiov, rwco->flags);
    aio_wait_kick();
}

static void blk_write_entry(void *opaque)
{
    BlkRwCo *rwco = opaque;
    QEMUIOVector *qiov = rwco->iobuf;

    rwco->ret = blk_do_pwritev_part(rwco->blk, rwco->offset, qiov->size,
                                    qiov, 0, rwco->flags);
    aio_wait_kick();
}

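/*
 * Emulate a synchronous request: take the fast path if we are already in
 * coroutine context, otherwise spawn a coroutine for @co_entry and poll the
 * AioContext until it has set rwco.ret (i.e. it is no longer NOT_DONE).
 */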
static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
                   int64_t bytes, CoroutineEntry co_entry,
                   BdrvRequestFlags flags)
{
    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
    BlkRwCo rwco = {
        .blk    = blk,
        .offset = offset,
        .iobuf  = &qiov,
        .flags  = flags,
        .ret    = NOT_DONE,
    };

    blk_inc_in_flight(blk);
    if (qemu_in_coroutine()) {
        /* Fast-path if already in coroutine context */
        co_entry(&rwco);
    } else {
        Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
        bdrv_coroutine_enter(blk_bs(blk), co);
        BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
    }
    blk_dec_in_flight(blk);

    return rwco.ret;
}

int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
                      int bytes, BdrvRequestFlags flags)
{
    return blk_prw(blk, offset, NULL, bytes, blk_write_entry,
                   flags | BDRV_REQ_ZERO_WRITE);
}

int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
{
    return bdrv_make_zero(blk->root, flags);
}

void blk_inc_in_flight(BlockBackend *blk)
{
    qatomic_inc(&blk->in_flight);
}

void blk_dec_in_flight(BlockBackend *blk)
{
    qatomic_dec(&blk->in_flight);
    aio_wait_kick();
}

static void error_callback_bh(void *opaque)
{
    struct BlockBackendAIOCB *acb = opaque;

    blk_dec_in_flight(acb->blk);
    acb->common.cb(acb->common.opaque, acb->ret);
    qemu_aio_unref(acb);
}

BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
                                  BlockCompletionFunc *cb,
                                  void *opaque, int ret)
{
    struct BlockBackendAIOCB *acb;

    blk_inc_in_flight(blk);
    acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque);
    acb->blk = blk;
    acb->ret = ret;

    replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
                                     error_callback_bh, acb);
    return &acb->common;
}

typedef struct BlkAioEmAIOCB {
    BlockAIOCB common;
    BlkRwCo rwco;
    int bytes;
    bool has_returned;
} BlkAioEmAIOCB;

static AioContext *blk_aio_em_aiocb_get_aio_context(BlockAIOCB *acb_)
{
    BlkAioEmAIOCB *acb = container_of(acb_, BlkAioEmAIOCB, common);

    return blk_get_aio_context(acb->rwco.blk);
}

static const AIOCBInfo blk_aio_em_aiocb_info = {
    .aiocb_size         = sizeof(BlkAioEmAIOCB),
    .get_aio_context    = blk_aio_em_aiocb_get_aio_context,
};

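/*
 * Invoke the completion callback and release the AIOCB, but only once
 * blk_aio_prwv() has finished setting it up (acb->has_returned is true).
 */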
static void blk_aio_complete(BlkAioEmAIOCB *acb)
{
    if (acb->has_returned) {
        acb->common.cb(acb->common.opaque, acb->rwco.ret);
        blk_dec_in_flight(acb->rwco.blk);
        qemu_aio_unref(acb);
    }
}

static void blk_aio_complete_bh(void *opaque)
{
    BlkAioEmAIOCB *acb = opaque;
    assert(acb->has_returned);
    blk_aio_complete(acb);
}

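/*
 * Start @co_entry in a coroutine and return an AIOCB for it.  If the
 * coroutine already finished by the time it yields back to us, completion is
 * deferred to a bottom half so that the callback never runs before the AIOCB
 * has been returned to the caller.
 */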
static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
                                void *iobuf, CoroutineEntry co_entry,
                                BdrvRequestFlags flags,
                                BlockCompletionFunc *cb, void *opaque)
{
    BlkAioEmAIOCB *acb;
    Coroutine *co;

    blk_inc_in_flight(blk);
    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
    acb->rwco = (BlkRwCo) {
        .blk    = blk,
        .offset = offset,
        .iobuf  = iobuf,
        .flags  = flags,
        .ret    = NOT_DONE,
    };
    acb->bytes = bytes;
    acb->has_returned = false;

    co = qemu_coroutine_create(co_entry, acb);
    bdrv_coroutine_enter(blk_bs(blk), co);

    acb->has_returned = true;
    if (acb->rwco.ret != NOT_DONE) {
        replay_bh_schedule_oneshot_event(blk_get_aio_context(blk),
                                         blk_aio_complete_bh, acb);
    }

    return &acb->common;
}

static void blk_aio_read_entry(void *opaque)
{
    BlkAioEmAIOCB *acb = opaque;
    BlkRwCo *rwco = &acb->rwco;
    QEMUIOVector *qiov = rwco->iobuf;

    assert(qiov->size == acb->bytes);
    rwco->ret = blk_do_preadv(rwco->blk, rwco->offset, acb->bytes,
                              qiov, rwco->flags);
    blk_aio_complete(acb);
}

static void blk_aio_write_entry(void *opaque)
{
    BlkAioEmAIOCB *acb = opaque;
    BlkRwCo *rwco = &acb->rwco;
    QEMUIOVector *qiov = rwco->iobuf;

    assert(!qiov || qiov->size == acb->bytes);
    rwco->ret = blk_do_pwritev_part(rwco->blk, rwco->offset, acb->bytes,
                                    qiov, 0, rwco->flags);
    blk_aio_complete(acb);
}

BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
                                  int count, BdrvRequestFlags flags,
                                  BlockCompletionFunc *cb, void *opaque)
{
    return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry,
                        flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
}

int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count)
{
    int ret = blk_prw(blk, offset, buf, count, blk_read_entry, 0);
    if (ret < 0) {
        return ret;
    }
    return count;
}

int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count,
               BdrvRequestFlags flags)
{
    int ret = blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
                      flags);
    if (ret < 0) {
        return ret;
    }
    return count;
}

int64_t blk_getlength(BlockBackend *blk)
{
    if (!blk_is_available(blk)) {
        return -ENOMEDIUM;
    }

    return bdrv_getlength(blk_bs(blk));
}

void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr)
{
    if (!blk_bs(blk)) {
        *nb_sectors_ptr = 0;
    } else {
        bdrv_get_geometry(blk_bs(blk), nb_sectors_ptr);
    }
}

int64_t blk_nb_sectors(BlockBackend *blk)
{
    if (!blk_is_available(blk)) {
        return -ENOMEDIUM;
    }

    return bdrv_nb_sectors(blk_bs(blk));
}

BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset,
                           QEMUIOVector *qiov, BdrvRequestFlags flags,
                           BlockCompletionFunc *cb, void *opaque)
{
    return blk_aio_prwv(blk, offset, qiov->size, qiov,
                        blk_aio_read_entry, flags, cb, opaque);
}

BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
                            QEMUIOVector *qiov, BdrvRequestFlags flags,
                            BlockCompletionFunc *cb, void *opaque)
{
    return blk_aio_prwv(blk, offset, qiov->size, qiov,
                        blk_aio_write_entry, flags, cb, opaque);
}

void blk_aio_cancel(BlockAIOCB *acb)
{
    bdrv_aio_cancel(acb);
}

void blk_aio_cancel_async(BlockAIOCB *acb)
{
    bdrv_aio_cancel_async(acb);
}

/* To be called between exactly one pair of blk_inc/dec_in_flight() */
static int coroutine_fn
blk_do_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
{
    blk_wait_while_drained(blk);

    if (!blk_is_available(blk)) {
        return -ENOMEDIUM;
    }

    return bdrv_co_ioctl(blk_bs(blk), req, buf);
}

static void blk_ioctl_entry(void *opaque)
{
    BlkRwCo *rwco = opaque;
    QEMUIOVector *qiov = rwco->iobuf;

    rwco->ret = blk_do_ioctl(rwco->blk, rwco->offset, qiov->iov[0].iov_base);
    aio_wait_kick();
}

int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
{
    return blk_prw(blk, req, buf, 0, blk_ioctl_entry, 0);
}

static void blk_aio_ioctl_entry(void *opaque)
{
    BlkAioEmAIOCB *acb = opaque;
    BlkRwCo *rwco = &acb->rwco;

    rwco->ret = blk_do_ioctl(rwco->blk, rwco->offset, rwco->iobuf);

    blk_aio_complete(acb);
}

BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
                          BlockCompletionFunc *cb, void *opaque)
{
    return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque);
}

/* To be called between exactly one pair of blk_inc/dec_in_flight() */
static int coroutine_fn
blk_do_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
{
    int ret;

    blk_wait_while_drained(blk);

    ret = blk_check_byte_request(blk, offset, bytes);
    if (ret < 0) {
        return ret;
    }

    return bdrv_co_pdiscard(blk->root, offset, bytes);
}

static void blk_aio_pdiscard_entry(void *opaque)
{
    BlkAioEmAIOCB *acb = opaque;
    BlkRwCo *rwco = &acb->rwco;

    rwco->ret = blk_do_pdiscard(rwco->blk, rwco->offset, acb->bytes);
    blk_aio_complete(acb);
}

BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk,
                             int64_t offset, int bytes,
                             BlockCompletionFunc *cb, void *opaque)
{
    return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_pdiscard_entry, 0,
                        cb, opaque);
}

int coroutine_fn blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
{
    int ret;

    blk_inc_in_flight(blk);
    ret = blk_do_pdiscard(blk, offset, bytes);
    blk_dec_in_flight(blk);

    return ret;
}

static void blk_pdiscard_entry(void *opaque)
{
    BlkRwCo *rwco = opaque;
    QEMUIOVector *qiov = rwco->iobuf;

    rwco->ret = blk_do_pdiscard(rwco->blk, rwco->offset, qiov->size);
    aio_wait_kick();
}

int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
{
    return blk_prw(blk, offset, NULL, bytes, blk_pdiscard_entry, 0);
}

/* To be called between exactly one pair of blk_inc/dec_in_flight() */
static int coroutine_fn blk_do_flush(BlockBackend *blk)
{
    blk_wait_while_drained(blk);

    if (!blk_is_available(blk)) {
        return -ENOMEDIUM;
    }

    return bdrv_co_flush(blk_bs(blk));
}

static void blk_aio_flush_entry(void *opaque)
{
    BlkAioEmAIOCB *acb = opaque;
    BlkRwCo *rwco = &acb->rwco;

    rwco->ret = blk_do_flush(rwco->blk);
    blk_aio_complete(acb);
}

BlockAIOCB *blk_aio_flush(BlockBackend *blk,
                          BlockCompletionFunc *cb, void *opaque)
{
    return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque);
}

int coroutine_fn blk_co_flush(BlockBackend *blk)
{
    int ret;

    blk_inc_in_flight(blk);
    ret = blk_do_flush(blk);
    blk_dec_in_flight(blk);

    return ret;
}

static void blk_flush_entry(void *opaque)
{
    BlkRwCo *rwco = opaque;
    rwco->ret = blk_do_flush(rwco->blk);
    aio_wait_kick();
}

int blk_flush(BlockBackend *blk)
{
    return blk_prw(blk, 0, NULL, 0, blk_flush_entry, 0);
}

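/*
 * Quiesce @blk's root node (if any) and wait until all of the BlockBackend's
 * own in-flight requests, including -ENOMEDIUM completions that never reach
 * a node, have completed.
 */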
void blk_drain(BlockBackend *blk)
{
    BlockDriverState *bs = blk_bs(blk);

    if (bs) {
        bdrv_drained_begin(bs);
    }

    /* We may have -ENOMEDIUM completions in flight */
    AIO_WAIT_WHILE(blk_get_aio_context(blk),
                   qatomic_mb_read(&blk->in_flight) > 0);

    if (bs) {
        bdrv_drained_end(bs);
    }
}

void blk_drain_all(void)
{
    BlockBackend *blk = NULL;

    bdrv_drain_all_begin();

    while ((blk = blk_all_next(blk)) != NULL) {
        AioContext *ctx = blk_get_aio_context(blk);

        aio_context_acquire(ctx);

        /* We may have -ENOMEDIUM completions in flight */
        AIO_WAIT_WHILE(ctx, qatomic_mb_read(&blk->in_flight) > 0);

        aio_context_release(ctx);
    }

    bdrv_drain_all_end();
}

void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error,
                      BlockdevOnError on_write_error)
{
    blk->on_read_error = on_read_error;
    blk->on_write_error = on_write_error;
}

BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read)
{
    return is_read ? blk->on_read_error : blk->on_write_error;
}

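/*
 * Map the configured rerror/werror policy and a request's errno value to the
 * action (report, ignore or stop) the device model should take.
 */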
BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read,
                                      int error)
{
    BlockdevOnError on_err = blk_get_on_error(blk, is_read);

    switch (on_err) {
    case BLOCKDEV_ON_ERROR_ENOSPC:
        return (error == ENOSPC) ?
               BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_STOP:
        return BLOCK_ERROR_ACTION_STOP;
    case BLOCKDEV_ON_ERROR_REPORT:
        return BLOCK_ERROR_ACTION_REPORT;
    case BLOCKDEV_ON_ERROR_IGNORE:
        return BLOCK_ERROR_ACTION_IGNORE;
    case BLOCKDEV_ON_ERROR_AUTO:
    default:
        abort();
    }
}

static void send_qmp_error_event(BlockBackend *blk,
                                 BlockErrorAction action,
                                 bool is_read, int error)
{
    IoOperationType optype;
    BlockDriverState *bs = blk_bs(blk);

    optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
    qapi_event_send_block_io_error(blk_name(blk), !!bs,
                                   bs ? bdrv_get_node_name(bs) : NULL, optype,
                                   action, blk_iostatus_is_enabled(blk),
                                   error == ENOSPC, strerror(error));
}

/* This is done by device models because, while the block layer knows
 * about the error, it does not know whether an operation comes from
 * the device or the block layer (from a job, for example).
 */
void blk_error_action(BlockBackend *blk, BlockErrorAction action,
                      bool is_read, int error)
{
    assert(error >= 0);

    if (action == BLOCK_ERROR_ACTION_STOP) {
        /* First set the iostatus, so that "info block" returns an iostatus
         * that matches the events raised so far (an additional error iostatus
         * is fine, but not a lost one).
         */
        blk_iostatus_set_err(blk, error);

        /* Then raise the request to stop the VM and the event.
         * qemu_system_vmstop_request_prepare has two effects.  First,
         * it ensures that the STOP event always comes after the
         * BLOCK_IO_ERROR event.  Second, it ensures that even if management
         * can observe the STOP event and do a "cont" before the STOP
         * event is issued, the VM will not stop.  In this case, vm_start()
         * also ensures that the STOP/RESUME pair of events is emitted.
         */
        qemu_system_vmstop_request_prepare();
        send_qmp_error_event(blk, action, is_read, error);
        qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
    } else {
        send_qmp_error_event(blk, action, is_read, error);
    }
}

/*
 * Returns true if the BlockBackend can support taking write permissions
 * (because its root node is not read-only).
 */
bool blk_supports_write_perm(BlockBackend *blk)
{
    BlockDriverState *bs = blk_bs(blk);

    if (bs) {
        return !bdrv_is_read_only(bs);
    } else {
        return blk->root_state.open_flags & BDRV_O_RDWR;
    }
}

/*
 * Returns true if the BlockBackend can be written to in its current
1860 * configuration (i.e. if write permissions have been requested)
1861 */
1862bool blk_is_writable(BlockBackend *blk)
1863{
1864    return blk->perm & BLK_PERM_WRITE;
1865}
1866
1867bool blk_is_sg(BlockBackend *blk)
1868{
1869    BlockDriverState *bs = blk_bs(blk);
1870
1871    if (!bs) {
1872        return false;
1873    }
1874
1875    return bdrv_is_sg(bs);
1876}
1877
1878bool blk_enable_write_cache(BlockBackend *blk)
1879{
1880    return blk->enable_write_cache;
1881}
1882
1883void blk_set_enable_write_cache(BlockBackend *blk, bool wce)
1884{
1885    blk->enable_write_cache = wce;
1886}
1887
1888void blk_invalidate_cache(BlockBackend *blk, Error **errp)
1889{
1890    BlockDriverState *bs = blk_bs(blk);
1891
1892    if (!bs) {
1893        error_setg(errp, "Device '%s' has no medium", blk->name);
1894        return;
1895    }
1896
1897    bdrv_invalidate_cache(bs, errp);
1898}
1899
1900bool blk_is_inserted(BlockBackend *blk)
1901{
1902    BlockDriverState *bs = blk_bs(blk);
1903
1904    return bs && bdrv_is_inserted(bs);
1905}
1906
1907bool blk_is_available(BlockBackend *blk)
1908{
1909    return blk_is_inserted(blk) && !blk_dev_is_tray_open(blk);
1910}
1911
1912void blk_lock_medium(BlockBackend *blk, bool locked)
1913{
1914    BlockDriverState *bs = blk_bs(blk);
1915
1916    if (bs) {
1917        bdrv_lock_medium(bs, locked);
1918    }
1919}
1920
1921void blk_eject(BlockBackend *blk, bool eject_flag)
1922{
1923    BlockDriverState *bs = blk_bs(blk);
1924    char *id;
1925
1926    if (bs) {
1927        bdrv_eject(bs, eject_flag);
1928    }
1929
1930    /* Whether or not we ejected on the backend,
1931     * the frontend experienced a tray event. */
1932    id = blk_get_attached_dev_id(blk);
1933    qapi_event_send_device_tray_moved(blk_name(blk), id,
1934                                      eject_flag);
1935    g_free(id);
1936}
1937
1938int blk_get_flags(BlockBackend *blk)
1939{
1940    BlockDriverState *bs = blk_bs(blk);
1941
1942    if (bs) {
1943        return bdrv_get_flags(bs);
1944    } else {
1945        return blk->root_state.open_flags;
1946    }
1947}
1948
1949/* Returns the minimum request alignment, in bytes; guaranteed nonzero */
1950uint32_t blk_get_request_alignment(BlockBackend *blk)
1951{
1952    BlockDriverState *bs = blk_bs(blk);
1953    return bs ? bs->bl.request_alignment : BDRV_SECTOR_SIZE;
1954}
1955
1956/* Returns the maximum hardware transfer length, in bytes; guaranteed nonzero */
1957uint64_t blk_get_max_hw_transfer(BlockBackend *blk)
1958{
1959    BlockDriverState *bs = blk_bs(blk);
1960    uint64_t max = INT_MAX;
1961
1962    if (bs) {
1963        max = MIN_NON_ZERO(max, bs->bl.max_hw_transfer);
1964        max = MIN_NON_ZERO(max, bs->bl.max_transfer);
1965    }
1966    return ROUND_DOWN(max, blk_get_request_alignment(blk));
1967}
1968
1969/* Returns the maximum transfer length, in bytes; guaranteed nonzero */
1970uint32_t blk_get_max_transfer(BlockBackend *blk)
1971{
1972    BlockDriverState *bs = blk_bs(blk);
1973    uint32_t max = INT_MAX;
1974
1975    if (bs) {
1976        max = MIN_NON_ZERO(max, bs->bl.max_transfer);
1977    }
1978    return ROUND_DOWN(max, blk_get_request_alignment(blk));
1979}
1980
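/*
 * Descriptive comment added for clarity: both blk_get_max_hw_iov() and
 * blk_get_max_iov() dereference blk->root unconditionally, so they must
 * only be called while a root node is attached.
 */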
1981int blk_get_max_hw_iov(BlockBackend *blk)
1982{
1983    return MIN_NON_ZERO(blk->root->bs->bl.max_hw_iov,
1984                        blk->root->bs->bl.max_iov);
1985}
1986
1987int blk_get_max_iov(BlockBackend *blk)
1988{
1989    return blk->root->bs->bl.max_iov;
1990}
1991
1992void blk_set_guest_block_size(BlockBackend *blk, int align)
1993{
1994    blk->guest_block_size = align;
1995}
1996
1997void *blk_try_blockalign(BlockBackend *blk, size_t size)
1998{
1999    return qemu_try_blockalign(blk ? blk_bs(blk) : NULL, size);
2000}
2001
2002void *blk_blockalign(BlockBackend *blk, size_t size)
2003{
2004    return qemu_blockalign(blk ? blk_bs(blk) : NULL, size);
2005}
2006
2007bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
2008{
2009    BlockDriverState *bs = blk_bs(blk);
2010
2011    if (!bs) {
2012        return false;
2013    }
2014
2015    return bdrv_op_is_blocked(bs, op, errp);
2016}
2017
2018void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason)
2019{
2020    BlockDriverState *bs = blk_bs(blk);
2021
2022    if (bs) {
2023        bdrv_op_unblock(bs, op, reason);
2024    }
2025}
2026
2027void blk_op_block_all(BlockBackend *blk, Error *reason)
2028{
2029    BlockDriverState *bs = blk_bs(blk);
2030
2031    if (bs) {
2032        bdrv_op_block_all(bs, reason);
2033    }
2034}
2035
2036void blk_op_unblock_all(BlockBackend *blk, Error *reason)
2037{
2038    BlockDriverState *bs = blk_bs(blk);
2039
2040    if (bs) {
2041        bdrv_op_unblock_all(bs, reason);
2042    }
2043}
2044
2045AioContext *blk_get_aio_context(BlockBackend *blk)
2046{
2047    BlockDriverState *bs = blk_bs(blk);
2048
2049    if (bs) {
2050        AioContext *ctx = bdrv_get_aio_context(blk_bs(blk));
2051        assert(ctx == blk->ctx);
2052    }
2053
2054    return blk->ctx;
2055}
2056
2057static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
2058{
2059    BlockBackendAIOCB *blk_acb = DO_UPCAST(BlockBackendAIOCB, common, acb);
2060    return blk_get_aio_context(blk_acb->blk);
2061}
2062
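/*
 * Descriptive comment added for clarity: move blk (and, if update_root_node
 * is true, its root node) to new_context.  If the BlockBackend is throttled,
 * its ThrottleGroupMember is detached from the old context and re-attached
 * to the new one while the node is drained.
 */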
2063static int blk_do_set_aio_context(BlockBackend *blk, AioContext *new_context,
2064                                  bool update_root_node, Error **errp)
2065{
2066    BlockDriverState *bs = blk_bs(blk);
2067    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2068    int ret;
2069
2070    if (bs) {
2071        if (update_root_node) {
2072            ret = bdrv_child_try_set_aio_context(bs, new_context, blk->root,
2073                                                 errp);
2074            if (ret < 0) {
2075                return ret;
2076            }
2077        }
2078        if (tgm->throttle_state) {
2079            bdrv_drained_begin(bs);
2080            throttle_group_detach_aio_context(tgm);
2081            throttle_group_attach_aio_context(tgm, new_context);
2082            bdrv_drained_end(bs);
2083        }
2084    }
2085
2086    blk->ctx = new_context;
2087    return 0;
2088}
2089
2090int blk_set_aio_context(BlockBackend *blk, AioContext *new_context,
2091                        Error **errp)
2092{
2093    return blk_do_set_aio_context(blk, new_context, true, errp);
2094}
2095
2096static bool blk_root_can_set_aio_ctx(BdrvChild *child, AioContext *ctx,
2097                                     GSList **ignore, Error **errp)
2098{
2099    BlockBackend *blk = child->opaque;
2100
2101    if (blk->allow_aio_context_change) {
2102        return true;
2103    }
2104
2105    /* Only manually created BlockBackends that are not attached to anything
2106     * can change their AioContext without updating their user. */
2107    if (!blk->name || blk->dev) {
2108        /* TODO Add BB name/QOM path */
2109        error_setg(errp, "Cannot change iothread of active block backend");
2110        return false;
2111    }
2112
2113    return true;
2114}
2115
2116static void blk_root_set_aio_ctx(BdrvChild *child, AioContext *ctx,
2117                                 GSList **ignore)
2118{
2119    BlockBackend *blk = child->opaque;
2120    blk_do_set_aio_context(blk, ctx, false, &error_abort);
2121}
2122
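/*
 * Descriptive comment added for clarity: register a pair of callbacks to be
 * run when the AioContext of blk's root node is attached or detached.  The
 * notifier is recorded in blk->aio_notifiers and, if a root node is
 * currently attached, also registered with it directly.
 */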
2123void blk_add_aio_context_notifier(BlockBackend *blk,
2124        void (*attached_aio_context)(AioContext *new_context, void *opaque),
2125        void (*detach_aio_context)(void *opaque), void *opaque)
2126{
2127    BlockBackendAioNotifier *notifier;
2128    BlockDriverState *bs = blk_bs(blk);
2129
2130    notifier = g_new(BlockBackendAioNotifier, 1);
2131    notifier->attached_aio_context = attached_aio_context;
2132    notifier->detach_aio_context = detach_aio_context;
2133    notifier->opaque = opaque;
2134    QLIST_INSERT_HEAD(&blk->aio_notifiers, notifier, list);
2135
2136    if (bs) {
2137        bdrv_add_aio_context_notifier(bs, attached_aio_context,
2138                                      detach_aio_context, opaque);
2139    }
2140}
2141
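/*
 * Descriptive comment added for clarity: remove a notifier previously added
 * with blk_add_aio_context_notifier().  Aborts if no matching entry exists.
 */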
2142void blk_remove_aio_context_notifier(BlockBackend *blk,
2143                                     void (*attached_aio_context)(AioContext *,
2144                                                                  void *),
2145                                     void (*detach_aio_context)(void *),
2146                                     void *opaque)
2147{
2148    BlockBackendAioNotifier *notifier;
2149    BlockDriverState *bs = blk_bs(blk);
2150
2151    if (bs) {
2152        bdrv_remove_aio_context_notifier(bs, attached_aio_context,
2153                                         detach_aio_context, opaque);
2154    }
2155
2156    QLIST_FOREACH(notifier, &blk->aio_notifiers, list) {
2157        if (notifier->attached_aio_context == attached_aio_context &&
2158            notifier->detach_aio_context == detach_aio_context &&
2159            notifier->opaque == opaque) {
2160            QLIST_REMOVE(notifier, list);
2161            g_free(notifier);
2162            return;
2163        }
2164    }
2165
2166    abort();
2167}
2168
2169void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify)
2170{
2171    notifier_list_add(&blk->remove_bs_notifiers, notify);
2172}
2173
2174void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify)
2175{
2176    notifier_list_add(&blk->insert_bs_notifiers, notify);
2177}
2178
2179void blk_io_plug(BlockBackend *blk)
2180{
2181    BlockDriverState *bs = blk_bs(blk);
2182
2183    if (bs) {
2184        bdrv_io_plug(bs);
2185    }
2186}
2187
2188void blk_io_unplug(BlockBackend *blk)
2189{
2190    BlockDriverState *bs = blk_bs(blk);
2191
2192    if (bs) {
2193        bdrv_io_unplug(bs);
2194    }
2195}
2196
2197BlockAcctStats *blk_get_stats(BlockBackend *blk)
2198{
2199    return &blk->stats;
2200}
2201
2202void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
2203                  BlockCompletionFunc *cb, void *opaque)
2204{
2205    return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
2206}
2207
2208int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
2209                                      int bytes, BdrvRequestFlags flags)
2210{
2211    return blk_co_pwritev(blk, offset, bytes, NULL,
2212                          flags | BDRV_REQ_ZERO_WRITE);
2213}
2214
2215int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf,
2216                          int count)
2217{
2218    return blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
2219                   BDRV_REQ_WRITE_COMPRESSED);
2220}
2221
2222int blk_truncate(BlockBackend *blk, int64_t offset, bool exact,
2223                 PreallocMode prealloc, BdrvRequestFlags flags, Error **errp)
2224{
2225    if (!blk_is_available(blk)) {
2226        error_setg(errp, "No medium inserted");
2227        return -ENOMEDIUM;
2228    }
2229
2230    return bdrv_truncate(blk->root, offset, exact, prealloc, flags, errp);
2231}
2232
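/*
 * Descriptive comment added for clarity: write VM state into the vmstate
 * area of blk's root node.  If the write cache is disabled, flush after a
 * complete write so the data reaches the underlying storage.
 */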
2233int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
2234                     int64_t pos, int size)
2235{
2236    int ret;
2237
2238    if (!blk_is_available(blk)) {
2239        return -ENOMEDIUM;
2240    }
2241
2242    ret = bdrv_save_vmstate(blk_bs(blk), buf, pos, size);
2243    if (ret < 0) {
2244        return ret;
2245    }
2246
2247    if (ret == size && !blk->enable_write_cache) {
2248        ret = bdrv_flush(blk_bs(blk));
2249    }
2250
2251    return ret < 0 ? ret : size;
2252}
2253
2254int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size)
2255{
2256    if (!blk_is_available(blk)) {
2257        return -ENOMEDIUM;
2258    }
2259
2260    return bdrv_load_vmstate(blk_bs(blk), buf, pos, size);
2261}
2262
2263int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz)
2264{
2265    if (!blk_is_available(blk)) {
2266        return -ENOMEDIUM;
2267    }
2268
2269    return bdrv_probe_blocksizes(blk_bs(blk), bsz);
2270}
2271
2272int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo)
2273{
2274    if (!blk_is_available(blk)) {
2275        return -ENOMEDIUM;
2276    }
2277
2278    return bdrv_probe_geometry(blk_bs(blk), geo);
2279}
2280
2281/*
2282 * Updates the BlockBackendRootState object with data from the currently
2283 * attached BlockDriverState.
2284 */
2285void blk_update_root_state(BlockBackend *blk)
2286{
2287    assert(blk->root);
2288
2289    blk->root_state.open_flags    = blk->root->bs->open_flags;
2290    blk->root_state.detect_zeroes = blk->root->bs->detect_zeroes;
2291}
2292
2293/*
2294 * Returns the detect-zeroes setting to be used for bdrv_open() of a
2295 * BlockDriverState which is supposed to inherit the root state.
2296 */
2297bool blk_get_detect_zeroes_from_root_state(BlockBackend *blk)
2298{
2299    return blk->root_state.detect_zeroes;
2300}
2301
2302/*
2303 * Returns the flags to be used for bdrv_open() of a BlockDriverState which is
2304 * supposed to inherit the root state.
2305 */
2306int blk_get_open_flags_from_root_state(BlockBackend *blk)
2307{
2308    return blk->root_state.open_flags;
2309}
2310
2311BlockBackendRootState *blk_get_root_state(BlockBackend *blk)
2312{
2313    return &blk->root_state;
2314}
2315
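/*
 * Descriptive comment added for clarity: for every BlockBackend with a
 * medium inserted whose unfiltered root node has a backing (COW) child,
 * commit the data into the backing file.  Stops at the first failing
 * commit and returns its error.
 */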
2316int blk_commit_all(void)
2317{
2318    BlockBackend *blk = NULL;
2319
2320    while ((blk = blk_all_next(blk)) != NULL) {
2321        AioContext *aio_context = blk_get_aio_context(blk);
2322        BlockDriverState *unfiltered_bs = bdrv_skip_filters(blk_bs(blk));
2323
2324        aio_context_acquire(aio_context);
2325        if (blk_is_inserted(blk) && bdrv_cow_child(unfiltered_bs)) {
2326            int ret;
2327
2328            ret = bdrv_commit(unfiltered_bs);
2329            if (ret < 0) {
2330                aio_context_release(aio_context);
2331                return ret;
2332            }
2333        }
2334        aio_context_release(aio_context);
2335    }
2336    return 0;
2337}
2338
2339
2340/* throttling disk I/O limits */
2341void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
2342{
2343    throttle_group_config(&blk->public.throttle_group_member, cfg);
2344}
2345
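/*
 * Descriptive comment added for clarity: detach blk from its throttle
 * group.  The root node is drained around the unregistration, which
 * restarts any requests still waiting for throttling (see
 * blk_root_drained_begin()).
 */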
2346void blk_io_limits_disable(BlockBackend *blk)
2347{
2348    BlockDriverState *bs = blk_bs(blk);
2349    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2350    assert(tgm->throttle_state);
2351    if (bs) {
2352        bdrv_drained_begin(bs);
2353    }
2354    throttle_group_unregister_tgm(tgm);
2355    if (bs) {
2356        bdrv_drained_end(bs);
2357    }
2358}
2359
2360/* This must be called before blk_set_io_limits() if a limit is to be set */
2361void blk_io_limits_enable(BlockBackend *blk, const char *group)
2362{
2363    assert(!blk->public.throttle_group_member.throttle_state);
2364    throttle_group_register_tgm(&blk->public.throttle_group_member,
2365                                group, blk_get_aio_context(blk));
2366}
2367
2368void blk_io_limits_update_group(BlockBackend *blk, const char *group)
2369{
2370    /* this BB is not part of any group */
2371    if (!blk->public.throttle_group_member.throttle_state) {
2372        return;
2373    }
2374
2375    /* this BB is part of the same group as the one we want */
2376    if (!g_strcmp0(throttle_group_get_name(&blk->public.throttle_group_member),
2377                group)) {
2378        return;
2379    }
2380
2381    /* need to change the group this BlockBackend belongs to */
2382    blk_io_limits_disable(blk);
2383    blk_io_limits_enable(blk, group);
2384}
2385
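/*
 * Descriptive comment added for clarity: the first drained_begin for a
 * BlockBackend notifies the attached device model and disables I/O
 * throttling so that queued requests can complete; nested drained sections
 * only increment the counters.
 */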
2386static void blk_root_drained_begin(BdrvChild *child)
2387{
2388    BlockBackend *blk = child->opaque;
2389    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2390
2391    if (++blk->quiesce_counter == 1) {
2392        if (blk->dev_ops && blk->dev_ops->drained_begin) {
2393            blk->dev_ops->drained_begin(blk->dev_opaque);
2394        }
2395    }
2396
2397    /* Note that blk->root may not be accessible here yet if we are just
2398     * attaching to a BlockDriverState that is drained. Use child instead. */
2399
2400    if (qatomic_fetch_inc(&tgm->io_limits_disabled) == 0) {
2401        throttle_group_restart_tgm(tgm);
2402    }
2403}
2404
2405static bool blk_root_drained_poll(BdrvChild *child)
2406{
2407    BlockBackend *blk = child->opaque;
2408    bool busy = false;
2409    assert(blk->quiesce_counter);
2410
2411    if (blk->dev_ops && blk->dev_ops->drained_poll) {
2412        busy = blk->dev_ops->drained_poll(blk->dev_opaque);
2413    }
2414    return busy || !!blk->in_flight;
2415}
2416
2417static void blk_root_drained_end(BdrvChild *child, int *drained_end_counter)
2418{
2419    BlockBackend *blk = child->opaque;
2420    assert(blk->quiesce_counter);
2421
2422    assert(blk->public.throttle_group_member.io_limits_disabled);
2423    qatomic_dec(&blk->public.throttle_group_member.io_limits_disabled);
2424
2425    if (--blk->quiesce_counter == 0) {
2426        if (blk->dev_ops && blk->dev_ops->drained_end) {
2427            blk->dev_ops->drained_end(blk->dev_opaque);
2428        }
2429        while (qemu_co_enter_next(&blk->queued_requests, NULL)) {
2430            /* Resume all queued requests */
2431        }
2432    }
2433}
2434
2435void blk_register_buf(BlockBackend *blk, void *host, size_t size)
2436{
2437    bdrv_register_buf(blk_bs(blk), host, size);
2438}
2439
2440void blk_unregister_buf(BlockBackend *blk, void *host)
2441{
2442    bdrv_unregister_buf(blk_bs(blk), host);
2443}
2444
2445int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in,
2446                                   BlockBackend *blk_out, int64_t off_out,
2447                                   int bytes, BdrvRequestFlags read_flags,
2448                                   BdrvRequestFlags write_flags)
2449{
2450    int r;
2451    r = blk_check_byte_request(blk_in, off_in, bytes);
2452    if (r) {
2453        return r;
2454    }
2455    r = blk_check_byte_request(blk_out, off_out, bytes);
2456    if (r) {
2457        return r;
2458    }
2459    return bdrv_co_copy_range(blk_in->root, off_in,
2460                              blk_out->root, off_out,
2461                              bytes, read_flags, write_flags);
2462}
2463
2464const BdrvChild *blk_root(BlockBackend *blk)
2465{
2466    return blk->root;
2467}
2468
2469int blk_make_empty(BlockBackend *blk, Error **errp)
2470{
2471    if (!blk_is_available(blk)) {
2472        error_setg(errp, "No medium inserted");
2473        return -ENOMEDIUM;
2474    }
2475
2476    return bdrv_make_empty(blk->root, errp);
2477}
2478