qemu/block/block-backend.c
   1/*
   2 * QEMU Block backends
   3 *
   4 * Copyright (C) 2014-2016 Red Hat, Inc.
   5 *
   6 * Authors:
   7 *  Markus Armbruster <armbru@redhat.com>,
   8 *
   9 * This work is licensed under the terms of the GNU LGPL, version 2.1
  10 * or later.  See the COPYING.LIB file in the top-level directory.
  11 */
  12
  13#include "qemu/osdep.h"
  14#include "sysemu/block-backend.h"
  15#include "block/block_int.h"
  16#include "block/blockjob.h"
  17#include "block/throttle-groups.h"
  18#include "sysemu/blockdev.h"
  19#include "sysemu/sysemu.h"
  20#include "qapi-event.h"
  21#include "qemu/id.h"
  22#include "trace.h"
  23#include "migration/misc.h"
  24
  25/* Number of coroutines to reserve per attached device model */
  26#define COROUTINE_POOL_RESERVATION 64
  27
  28#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
  29
  30static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb);
  31
  32struct BlockBackend {
  33    char *name;
  34    int refcnt;
  35    BdrvChild *root;
  36    DriveInfo *legacy_dinfo;    /* null unless created by drive_new() */
  37    QTAILQ_ENTRY(BlockBackend) link;         /* for block_backends */
  38    QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */
  39    BlockBackendPublic public;
  40
  41    void *dev;                  /* attached device model, if any */
  42    bool legacy_dev;            /* true if dev is not a DeviceState */
  43    /* TODO change to DeviceState when all users are qdevified */
  44    const BlockDevOps *dev_ops;
  45    void *dev_opaque;
  46
  47    /* the block size for which the guest device expects atomicity */
  48    int guest_block_size;
  49
  50    /* If the BDS tree is removed, some of its options are stored here (which
  51     * can be used to restore those options in the new BDS on insert) */
  52    BlockBackendRootState root_state;
  53
  54    bool enable_write_cache;
  55
  56    /* I/O stats (display with "info blockstats"). */
  57    BlockAcctStats stats;
  58
  59    BlockdevOnError on_read_error, on_write_error;
  60    bool iostatus_enabled;
  61    BlockDeviceIoStatus iostatus;
  62
  63    uint64_t perm;
  64    uint64_t shared_perm;
  65    bool disable_perm;
  66
  67    bool allow_write_beyond_eof;
  68
  69    NotifierList remove_bs_notifiers, insert_bs_notifiers;
  70
  71    int quiesce_counter;
  72    VMChangeStateEntry *vmsh;
  73    bool force_allow_inactivate;
  74};
  75
  76typedef struct BlockBackendAIOCB {
  77    BlockAIOCB common;
  78    BlockBackend *blk;
  79    int ret;
  80} BlockBackendAIOCB;
  81
  82static const AIOCBInfo block_backend_aiocb_info = {
  83    .get_aio_context = blk_aiocb_get_aio_context,
  84    .aiocb_size = sizeof(BlockBackendAIOCB),
  85};
  86
  87static void drive_info_del(DriveInfo *dinfo);
  88static BlockBackend *bdrv_first_blk(BlockDriverState *bs);
  89
  90/* All BlockBackends */
  91static QTAILQ_HEAD(, BlockBackend) block_backends =
  92    QTAILQ_HEAD_INITIALIZER(block_backends);
  93
  94/* All BlockBackends referenced by the monitor and which are iterated through by
  95 * blk_next() */
  96static QTAILQ_HEAD(, BlockBackend) monitor_block_backends =
  97    QTAILQ_HEAD_INITIALIZER(monitor_block_backends);
  98
  99static void blk_root_inherit_options(int *child_flags, QDict *child_options,
 100                                     int parent_flags, QDict *parent_options)
 101{
 102    /* We're not supposed to call this function for root nodes */
 103    abort();
 104}
 105static void blk_root_drained_begin(BdrvChild *child);
 106static void blk_root_drained_end(BdrvChild *child);
 107
 108static void blk_root_change_media(BdrvChild *child, bool load);
 109static void blk_root_resize(BdrvChild *child);
 110
 111static char *blk_root_get_parent_desc(BdrvChild *child)
 112{
 113    BlockBackend *blk = child->opaque;
 114    char *dev_id;
 115
 116    if (blk->name) {
 117        return g_strdup(blk->name);
 118    }
 119
 120    dev_id = blk_get_attached_dev_id(blk);
 121    if (*dev_id) {
 122        return dev_id;
 123    } else {
 124        /* TODO Callback into the BB owner for something more detailed */
 125        g_free(dev_id);
 126        return g_strdup("a block device");
 127    }
 128}
 129
 130static const char *blk_root_get_name(BdrvChild *child)
 131{
 132    return blk_name(child->opaque);
 133}
 134
 135static void blk_vm_state_changed(void *opaque, int running, RunState state)
 136{
 137    Error *local_err = NULL;
 138    BlockBackend *blk = opaque;
 139
 140    if (state == RUN_STATE_INMIGRATE) {
 141        return;
 142    }
 143
 144    qemu_del_vm_change_state_handler(blk->vmsh);
 145    blk->vmsh = NULL;
 146    blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
 147    if (local_err) {
 148        error_report_err(local_err);
 149    }
 150}
 151
 152/*
 153 * Notifies the user of the BlockBackend that migration has completed. qdev
 154 * devices can tighten their permissions in response (specifically revoke
 155 * shared write permissions that we needed for storage migration).
 156 *
 157 * If an error is returned, the VM cannot be allowed to be resumed.
 158 */
 159static void blk_root_activate(BdrvChild *child, Error **errp)
 160{
 161    BlockBackend *blk = child->opaque;
 162    Error *local_err = NULL;
 163
 164    if (!blk->disable_perm) {
 165        return;
 166    }
 167
 168    blk->disable_perm = false;
 169
 170    blk_set_perm(blk, blk->perm, BLK_PERM_ALL, &local_err);
 171    if (local_err) {
 172        error_propagate(errp, local_err);
 173        blk->disable_perm = true;
 174        return;
 175    }
 176
 177    if (runstate_check(RUN_STATE_INMIGRATE)) {
  178        /* Activation can happen when the migration process is still active, for
 179         * example when nbd_server_add is called during non-shared storage
 180         * migration. Defer the shared_perm update to migration completion. */
 181        if (!blk->vmsh) {
 182            blk->vmsh = qemu_add_vm_change_state_handler(blk_vm_state_changed,
 183                                                         blk);
 184        }
 185        return;
 186    }
 187
 188    blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
 189    if (local_err) {
 190        error_propagate(errp, local_err);
 191        blk->disable_perm = true;
 192        return;
 193    }
 194}
 195
 196void blk_set_force_allow_inactivate(BlockBackend *blk)
 197{
 198    blk->force_allow_inactivate = true;
 199}
 200
 201static bool blk_can_inactivate(BlockBackend *blk)
 202{
 203    /* If it is a guest device, inactivate is ok. */
 204    if (blk->dev || blk_name(blk)[0]) {
 205        return true;
 206    }
 207
 208    /* Inactivating means no more writes to the image can be done,
 209     * even if those writes would be changes invisible to the
 210     * guest.  For block job BBs that satisfy this, we can just allow
  211     * it.  This is the case for the mirror job source, which is required
  212     * by libvirt's non-shared block migration. */
 213    if (!(blk->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED))) {
 214        return true;
 215    }
 216
 217    return blk->force_allow_inactivate;
 218}
 219
 220static int blk_root_inactivate(BdrvChild *child)
 221{
 222    BlockBackend *blk = child->opaque;
 223
 224    if (blk->disable_perm) {
 225        return 0;
 226    }
 227
 228    if (!blk_can_inactivate(blk)) {
 229        return -EPERM;
 230    }
 231
 232    blk->disable_perm = true;
 233    if (blk->root) {
 234        bdrv_child_try_set_perm(blk->root, 0, BLK_PERM_ALL, &error_abort);
 235    }
 236
 237    return 0;
 238}
 239
 240static const BdrvChildRole child_root = {
 241    .inherit_options    = blk_root_inherit_options,
 242
 243    .change_media       = blk_root_change_media,
 244    .resize             = blk_root_resize,
 245    .get_name           = blk_root_get_name,
 246    .get_parent_desc    = blk_root_get_parent_desc,
 247
 248    .drained_begin      = blk_root_drained_begin,
 249    .drained_end        = blk_root_drained_end,
 250
 251    .activate           = blk_root_activate,
 252    .inactivate         = blk_root_inactivate,
 253};
 254
 255/*
 256 * Create a new BlockBackend with a reference count of one.
 257 *
  258 * @perm is a bitmask of BLK_PERM_* constants which describes the permissions
 259 * to request for a block driver node that is attached to this BlockBackend.
 260 * @shared_perm is a bitmask which describes which permissions may be granted
 261 * to other users of the attached node.
 262 * Both sets of permissions can be changed later using blk_set_perm().
 263 *
 264 * Return the new BlockBackend on success, null on failure.
 265 */
 266BlockBackend *blk_new(uint64_t perm, uint64_t shared_perm)
 267{
 268    BlockBackend *blk;
 269
 270    blk = g_new0(BlockBackend, 1);
 271    blk->refcnt = 1;
 272    blk->perm = perm;
 273    blk->shared_perm = shared_perm;
 274    blk_set_enable_write_cache(blk, true);
 275
 276    qemu_co_mutex_init(&blk->public.throttled_reqs_lock);
 277    qemu_co_queue_init(&blk->public.throttled_reqs[0]);
 278    qemu_co_queue_init(&blk->public.throttled_reqs[1]);
 279    block_acct_init(&blk->stats);
 280
 281    notifier_list_init(&blk->remove_bs_notifiers);
 282    notifier_list_init(&blk->insert_bs_notifiers);
 283
 284    QTAILQ_INSERT_TAIL(&block_backends, blk, link);
 285    return blk;
 286}
 287
 288/*
 289 * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
 290 *
 291 * Just as with bdrv_open(), after having called this function the reference to
 292 * @options belongs to the block layer (even on failure).
 293 *
 294 * TODO: Remove @filename and @flags; it should be possible to specify a whole
 295 * BDS tree just by specifying the @options QDict (or @reference,
 296 * alternatively). At the time of adding this function, this is not possible,
 297 * though, so callers of this function have to be able to specify @filename and
 298 * @flags.
 299 */
 300BlockBackend *blk_new_open(const char *filename, const char *reference,
 301                           QDict *options, int flags, Error **errp)
 302{
 303    BlockBackend *blk;
 304    BlockDriverState *bs;
 305    uint64_t perm;
 306
 307    /* blk_new_open() is mainly used in .bdrv_create implementations and the
 308     * tools where sharing isn't a concern because the BDS stays private, so we
 309     * just request permission according to the flags.
 310     *
 311     * The exceptions are xen_disk and blockdev_init(); in these cases, the
 312     * caller of blk_new_open() doesn't make use of the permissions, but they
 313     * shouldn't hurt either. We can still share everything here because the
 314     * guest devices will add their own blockers if they can't share. */
 315    perm = BLK_PERM_CONSISTENT_READ;
 316    if (flags & BDRV_O_RDWR) {
 317        perm |= BLK_PERM_WRITE;
 318    }
 319    if (flags & BDRV_O_RESIZE) {
 320        perm |= BLK_PERM_RESIZE;
 321    }
 322
 323    blk = blk_new(perm, BLK_PERM_ALL);
 324    bs = bdrv_open(filename, reference, options, flags, errp);
 325    if (!bs) {
 326        blk_unref(blk);
 327        return NULL;
 328    }
 329
 330    blk->root = bdrv_root_attach_child(bs, "root", &child_root,
 331                                       perm, BLK_PERM_ALL, blk, errp);
 332    if (!blk->root) {
 333        bdrv_unref(bs);
 334        blk_unref(blk);
 335        return NULL;
 336    }
 337
 338    return blk;
 339}
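/*
 * Illustrative usage sketch (not taken from a real caller): a tool that wants
 * a read-write handle on an image could do roughly the following; the
 * filename is an arbitrary example:
 *
 *     Error *local_err = NULL;
 *     BlockBackend *blk = blk_new_open("disk.qcow2", NULL, NULL,
 *                                      BDRV_O_RDWR, &local_err);
 *     if (!blk) {
 *         error_report_err(local_err);
 *         return -1;
 *     }
 *     ... do I/O with blk_pread()/blk_pwrite() ...
 *     blk_unref(blk);
 */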
 340
 341static void blk_delete(BlockBackend *blk)
 342{
 343    assert(!blk->refcnt);
 344    assert(!blk->name);
 345    assert(!blk->dev);
 346    if (blk->public.throttle_state) {
 347        blk_io_limits_disable(blk);
 348    }
 349    if (blk->root) {
 350        blk_remove_bs(blk);
 351    }
 352    if (blk->vmsh) {
 353        qemu_del_vm_change_state_handler(blk->vmsh);
 354        blk->vmsh = NULL;
 355    }
 356    assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers));
 357    assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers));
 358    QTAILQ_REMOVE(&block_backends, blk, link);
 359    drive_info_del(blk->legacy_dinfo);
 360    block_acct_cleanup(&blk->stats);
 361    g_free(blk);
 362}
 363
 364static void drive_info_del(DriveInfo *dinfo)
 365{
 366    if (!dinfo) {
 367        return;
 368    }
 369    qemu_opts_del(dinfo->opts);
 370    g_free(dinfo->serial);
 371    g_free(dinfo);
 372}
 373
 374int blk_get_refcnt(BlockBackend *blk)
 375{
 376    return blk ? blk->refcnt : 0;
 377}
 378
 379/*
 380 * Increment @blk's reference count.
 381 * @blk must not be null.
 382 */
 383void blk_ref(BlockBackend *blk)
 384{
 385    blk->refcnt++;
 386}
 387
 388/*
 389 * Decrement @blk's reference count.
 390 * If this drops it to zero, destroy @blk.
 391 * For convenience, do nothing if @blk is null.
 392 */
 393void blk_unref(BlockBackend *blk)
 394{
 395    if (blk) {
 396        assert(blk->refcnt > 0);
 397        if (!--blk->refcnt) {
 398            blk_delete(blk);
 399        }
 400    }
 401}
 402
 403/*
 404 * Behaves similarly to blk_next() but iterates over all BlockBackends, even the
 405 * ones which are hidden (i.e. are not referenced by the monitor).
 406 */
 407BlockBackend *blk_all_next(BlockBackend *blk)
 408{
 409    return blk ? QTAILQ_NEXT(blk, link)
 410               : QTAILQ_FIRST(&block_backends);
 411}
 412
 413void blk_remove_all_bs(void)
 414{
 415    BlockBackend *blk = NULL;
 416
 417    while ((blk = blk_all_next(blk)) != NULL) {
 418        AioContext *ctx = blk_get_aio_context(blk);
 419
 420        aio_context_acquire(ctx);
 421        if (blk->root) {
 422            blk_remove_bs(blk);
 423        }
 424        aio_context_release(ctx);
 425    }
 426}
 427
 428/*
 429 * Return the monitor-owned BlockBackend after @blk.
 430 * If @blk is null, return the first one.
 431 * Else, return @blk's next sibling, which may be null.
 432 *
 433 * To iterate over all BlockBackends, do
 434 * for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
 435 *     ...
 436 * }
 437 */
 438BlockBackend *blk_next(BlockBackend *blk)
 439{
 440    return blk ? QTAILQ_NEXT(blk, monitor_link)
 441               : QTAILQ_FIRST(&monitor_block_backends);
 442}
 443
 444/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
 445 * the monitor or attached to a BlockBackend */
 446BlockDriverState *bdrv_next(BdrvNextIterator *it)
 447{
 448    BlockDriverState *bs;
 449
 450    /* First, return all root nodes of BlockBackends. In order to avoid
 451     * returning a BDS twice when multiple BBs refer to it, we only return it
 452     * if the BB is the first one in the parent list of the BDS. */
 453    if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
 454        do {
 455            it->blk = blk_all_next(it->blk);
 456            bs = it->blk ? blk_bs(it->blk) : NULL;
 457        } while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));
 458
 459        if (bs) {
 460            return bs;
 461        }
 462        it->phase = BDRV_NEXT_MONITOR_OWNED;
 463    }
 464
 465    /* Then return the monitor-owned BDSes without a BB attached. Ignore all
 466     * BDSes that are attached to a BlockBackend here; they have been handled
 467     * by the above block already */
 468    do {
 469        it->bs = bdrv_next_monitor_owned(it->bs);
 470        bs = it->bs;
 471    } while (bs && bdrv_has_blk(bs));
 472
 473    return bs;
 474}
 475
 476BlockDriverState *bdrv_first(BdrvNextIterator *it)
 477{
 478    *it = (BdrvNextIterator) {
 479        .phase = BDRV_NEXT_BACKEND_ROOTS,
 480    };
 481
 482    return bdrv_next(it);
 483}
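/*
 * Sketch of the intended iteration pattern over all top-level nodes
 * (analogous to the blk_next() loop documented above):
 *
 *     BdrvNextIterator it;
 *     BlockDriverState *bs;
 *
 *     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 *         ... operate on bs ...
 *     }
 */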
 484
 485/*
 486 * Add a BlockBackend into the list of backends referenced by the monitor, with
 487 * the given @name acting as the handle for the monitor.
 488 * Strictly for use by blockdev.c.
 489 *
 490 * @name must not be null or empty.
 491 *
 492 * Returns true on success and false on failure. In the latter case, an Error
 493 * object is returned through @errp.
 494 */
 495bool monitor_add_blk(BlockBackend *blk, const char *name, Error **errp)
 496{
 497    assert(!blk->name);
 498    assert(name && name[0]);
 499
 500    if (!id_wellformed(name)) {
 501        error_setg(errp, "Invalid device name");
 502        return false;
 503    }
 504    if (blk_by_name(name)) {
 505        error_setg(errp, "Device with id '%s' already exists", name);
 506        return false;
 507    }
 508    if (bdrv_find_node(name)) {
 509        error_setg(errp,
 510                   "Device name '%s' conflicts with an existing node name",
 511                   name);
 512        return false;
 513    }
 514
 515    blk->name = g_strdup(name);
 516    QTAILQ_INSERT_TAIL(&monitor_block_backends, blk, monitor_link);
 517    return true;
 518}
 519
 520/*
 521 * Remove a BlockBackend from the list of backends referenced by the monitor.
 522 * Strictly for use by blockdev.c.
 523 */
 524void monitor_remove_blk(BlockBackend *blk)
 525{
 526    if (!blk->name) {
 527        return;
 528    }
 529
 530    QTAILQ_REMOVE(&monitor_block_backends, blk, monitor_link);
 531    g_free(blk->name);
 532    blk->name = NULL;
 533}
 534
 535/*
 536 * Return @blk's name, a non-null string.
 537 * Returns an empty string iff @blk is not referenced by the monitor.
 538 */
 539const char *blk_name(const BlockBackend *blk)
 540{
 541    return blk->name ?: "";
 542}
 543
 544/*
 545 * Return the BlockBackend with name @name if it exists, else null.
 546 * @name must not be null.
 547 */
 548BlockBackend *blk_by_name(const char *name)
 549{
 550    BlockBackend *blk = NULL;
 551
 552    assert(name);
 553    while ((blk = blk_next(blk)) != NULL) {
 554        if (!strcmp(name, blk->name)) {
 555            return blk;
 556        }
 557    }
 558    return NULL;
 559}
 560
 561/*
 562 * Return the BlockDriverState attached to @blk if any, else null.
 563 */
 564BlockDriverState *blk_bs(BlockBackend *blk)
 565{
 566    return blk->root ? blk->root->bs : NULL;
 567}
 568
 569static BlockBackend *bdrv_first_blk(BlockDriverState *bs)
 570{
 571    BdrvChild *child;
 572    QLIST_FOREACH(child, &bs->parents, next_parent) {
 573        if (child->role == &child_root) {
 574            return child->opaque;
 575        }
 576    }
 577
 578    return NULL;
 579}
 580
 581/*
 582 * Returns true if @bs has an associated BlockBackend.
 583 */
 584bool bdrv_has_blk(BlockDriverState *bs)
 585{
 586    return bdrv_first_blk(bs) != NULL;
 587}
 588
 589/*
 590 * Returns true if @bs has only BlockBackends as parents.
 591 */
 592bool bdrv_is_root_node(BlockDriverState *bs)
 593{
 594    BdrvChild *c;
 595
 596    QLIST_FOREACH(c, &bs->parents, next_parent) {
 597        if (c->role != &child_root) {
 598            return false;
 599        }
 600    }
 601
 602    return true;
 603}
 604
 605/*
 606 * Return @blk's DriveInfo if any, else null.
 607 */
 608DriveInfo *blk_legacy_dinfo(BlockBackend *blk)
 609{
 610    return blk->legacy_dinfo;
 611}
 612
 613/*
 614 * Set @blk's DriveInfo to @dinfo, and return it.
 615 * @blk must not have a DriveInfo set already.
 616 * No other BlockBackend may have the same DriveInfo set.
 617 */
 618DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo)
 619{
 620    assert(!blk->legacy_dinfo);
 621    return blk->legacy_dinfo = dinfo;
 622}
 623
 624/*
 625 * Return the BlockBackend with DriveInfo @dinfo.
 626 * It must exist.
 627 */
 628BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
 629{
 630    BlockBackend *blk = NULL;
 631
 632    while ((blk = blk_next(blk)) != NULL) {
 633        if (blk->legacy_dinfo == dinfo) {
 634            return blk;
 635        }
 636    }
 637    abort();
 638}
 639
 640/*
 641 * Returns a pointer to the publicly accessible fields of @blk.
 642 */
 643BlockBackendPublic *blk_get_public(BlockBackend *blk)
 644{
 645    return &blk->public;
 646}
 647
 648/*
 649 * Returns a BlockBackend given the associated @public fields.
 650 */
 651BlockBackend *blk_by_public(BlockBackendPublic *public)
 652{
 653    return container_of(public, BlockBackend, public);
 654}
 655
 656/*
 657 * Disassociates the currently associated BlockDriverState from @blk.
 658 */
 659void blk_remove_bs(BlockBackend *blk)
 660{
 661    notifier_list_notify(&blk->remove_bs_notifiers, blk);
 662    if (blk->public.throttle_state) {
 663        throttle_timers_detach_aio_context(&blk->public.throttle_timers);
 664    }
 665
 666    blk_update_root_state(blk);
 667
 668    bdrv_root_unref_child(blk->root);
 669    blk->root = NULL;
 670}
 671
 672/*
 673 * Associates a new BlockDriverState with @blk.
 674 */
 675int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
 676{
 677    blk->root = bdrv_root_attach_child(bs, "root", &child_root,
 678                                       blk->perm, blk->shared_perm, blk, errp);
 679    if (blk->root == NULL) {
 680        return -EPERM;
 681    }
 682    bdrv_ref(bs);
 683
 684    notifier_list_notify(&blk->insert_bs_notifiers, blk);
 685    if (blk->public.throttle_state) {
 686        throttle_timers_attach_aio_context(
 687            &blk->public.throttle_timers, bdrv_get_aio_context(bs));
 688    }
 689
 690    return 0;
 691}
 692
 693/*
 694 * Sets the permission bitmasks that the user of the BlockBackend needs.
 695 */
 696int blk_set_perm(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
 697                 Error **errp)
 698{
 699    int ret;
 700
 701    if (blk->root && !blk->disable_perm) {
 702        ret = bdrv_child_try_set_perm(blk->root, perm, shared_perm, errp);
 703        if (ret < 0) {
 704            return ret;
 705        }
 706    }
 707
 708    blk->perm = perm;
 709    blk->shared_perm = shared_perm;
 710
 711    return 0;
 712}
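/*
 * Sketch of the kind of call a guest device that needs write access might
 * make at realize time (the exact permission set depends on the device):
 *
 *     if (blk_set_perm(blk, BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE,
 *                      BLK_PERM_ALL, errp) < 0) {
 *         return;
 *     }
 */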
 713
 714void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm)
 715{
 716    *perm = blk->perm;
 717    *shared_perm = blk->shared_perm;
 718}
 719
 720static int blk_do_attach_dev(BlockBackend *blk, void *dev)
 721{
 722    if (blk->dev) {
 723        return -EBUSY;
 724    }
 725
 726    /* While migration is still incoming, we don't need to apply the
 727     * permissions of guest device BlockBackends. We might still have a block
 728     * job or NBD server writing to the image for storage migration. */
 729    if (runstate_check(RUN_STATE_INMIGRATE)) {
 730        blk->disable_perm = true;
 731    }
 732
 733    blk_ref(blk);
 734    blk->dev = dev;
 735    blk->legacy_dev = false;
 736    blk_iostatus_reset(blk);
 737
 738    return 0;
 739}
 740
 741/*
 742 * Attach device model @dev to @blk.
 743 * Return 0 on success, -EBUSY when a device model is attached already.
 744 */
 745int blk_attach_dev(BlockBackend *blk, DeviceState *dev)
 746{
 747    return blk_do_attach_dev(blk, dev);
 748}
 749
 750/*
 751 * Attach device model @dev to @blk.
 752 * @blk must not have a device model attached already.
 753 * TODO qdevified devices don't use this, remove when devices are qdevified
 754 */
 755void blk_attach_dev_legacy(BlockBackend *blk, void *dev)
 756{
 757    if (blk_do_attach_dev(blk, dev) < 0) {
 758        abort();
 759    }
 760    blk->legacy_dev = true;
 761}
 762
 763/*
 764 * Detach device model @dev from @blk.
 765 * @dev must be currently attached to @blk.
 766 */
 767void blk_detach_dev(BlockBackend *blk, void *dev)
 768/* TODO change to DeviceState *dev when all users are qdevified */
 769{
 770    assert(blk->dev == dev);
 771    blk->dev = NULL;
 772    blk->dev_ops = NULL;
 773    blk->dev_opaque = NULL;
 774    blk->guest_block_size = 512;
 775    blk_set_perm(blk, 0, BLK_PERM_ALL, &error_abort);
 776    blk_unref(blk);
 777}
 778
 779/*
 780 * Return the device model attached to @blk if any, else null.
 781 */
 782void *blk_get_attached_dev(BlockBackend *blk)
 783/* TODO change to return DeviceState * when all users are qdevified */
 784{
 785    return blk->dev;
 786}
 787
 788/* Return the qdev ID, or if no ID is assigned the QOM path, of the block
 789 * device attached to the BlockBackend. */
 790char *blk_get_attached_dev_id(BlockBackend *blk)
 791{
 792    DeviceState *dev;
 793
 794    assert(!blk->legacy_dev);
 795    dev = blk->dev;
 796
 797    if (!dev) {
 798        return g_strdup("");
 799    } else if (dev->id) {
 800        return g_strdup(dev->id);
 801    }
 802    return object_get_canonical_path(OBJECT(dev));
 803}
 804
 805/*
 806 * Return the BlockBackend which has the device model @dev attached if it
 807 * exists, else null.
 808 *
 809 * @dev must not be null.
 810 */
 811BlockBackend *blk_by_dev(void *dev)
 812{
 813    BlockBackend *blk = NULL;
 814
 815    assert(dev != NULL);
 816    while ((blk = blk_all_next(blk)) != NULL) {
 817        if (blk->dev == dev) {
 818            return blk;
 819        }
 820    }
 821    return NULL;
 822}
 823
 824/*
 825 * Set @blk's device model callbacks to @ops.
 826 * @opaque is the opaque argument to pass to the callbacks.
 827 * This is for use by device models.
 828 */
 829void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
 830                     void *opaque)
 831{
 832    /* All drivers that use blk_set_dev_ops() are qdevified and we want to keep
 833     * it that way, so we can assume blk->dev, if present, is a DeviceState if
 834     * blk->dev_ops is set. Non-device users may use dev_ops without device. */
 835    assert(!blk->legacy_dev);
 836
 837    blk->dev_ops = ops;
 838    blk->dev_opaque = opaque;
 839
 840    /* Are we currently quiesced? Should we enforce this right now? */
 841    if (blk->quiesce_counter && ops->drained_begin) {
 842        ops->drained_begin(opaque);
 843    }
 844}
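/*
 * Sketch of a device model registering callbacks (all my_* names are
 * placeholders, not real QEMU symbols):
 *
 *     static const BlockDevOps my_dev_ops = {
 *         .change_media_cb = my_change_media_cb,
 *         .is_tray_open    = my_is_tray_open,
 *     };
 *
 *     blk_set_dev_ops(blk, &my_dev_ops, my_device_state);
 */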
 845
 846/*
 847 * Notify @blk's attached device model of media change.
 848 *
 849 * If @load is true, notify of media load. This action can fail, meaning that
 850 * the medium cannot be loaded. @errp is set then.
 851 *
 852 * If @load is false, notify of media eject. This can never fail.
 853 *
 854 * Also send DEVICE_TRAY_MOVED events as appropriate.
 855 */
 856void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp)
 857{
 858    if (blk->dev_ops && blk->dev_ops->change_media_cb) {
 859        bool tray_was_open, tray_is_open;
 860        Error *local_err = NULL;
 861
 862        assert(!blk->legacy_dev);
 863
 864        tray_was_open = blk_dev_is_tray_open(blk);
 865        blk->dev_ops->change_media_cb(blk->dev_opaque, load, &local_err);
 866        if (local_err) {
 867            assert(load == true);
 868            error_propagate(errp, local_err);
 869            return;
 870        }
 871        tray_is_open = blk_dev_is_tray_open(blk);
 872
 873        if (tray_was_open != tray_is_open) {
 874            char *id = blk_get_attached_dev_id(blk);
 875            qapi_event_send_device_tray_moved(blk_name(blk), id, tray_is_open,
 876                                              &error_abort);
 877            g_free(id);
 878        }
 879    }
 880}
 881
 882static void blk_root_change_media(BdrvChild *child, bool load)
 883{
 884    blk_dev_change_media_cb(child->opaque, load, NULL);
 885}
 886
 887/*
 888 * Does @blk's attached device model have removable media?
 889 * %true if no device model is attached.
 890 */
 891bool blk_dev_has_removable_media(BlockBackend *blk)
 892{
 893    return !blk->dev || (blk->dev_ops && blk->dev_ops->change_media_cb);
 894}
 895
 896/*
 897 * Does @blk's attached device model have a tray?
 898 */
 899bool blk_dev_has_tray(BlockBackend *blk)
 900{
 901    return blk->dev_ops && blk->dev_ops->is_tray_open;
 902}
 903
 904/*
 905 * Notify @blk's attached device model of a media eject request.
 906 * If @force is true, the medium is about to be yanked out forcefully.
 907 */
 908void blk_dev_eject_request(BlockBackend *blk, bool force)
 909{
 910    if (blk->dev_ops && blk->dev_ops->eject_request_cb) {
 911        blk->dev_ops->eject_request_cb(blk->dev_opaque, force);
 912    }
 913}
 914
 915/*
 916 * Does @blk's attached device model have a tray, and is it open?
 917 */
 918bool blk_dev_is_tray_open(BlockBackend *blk)
 919{
 920    if (blk_dev_has_tray(blk)) {
 921        return blk->dev_ops->is_tray_open(blk->dev_opaque);
 922    }
 923    return false;
 924}
 925
 926/*
 927 * Does @blk's attached device model have the medium locked?
 928 * %false if the device model has no such lock.
 929 */
 930bool blk_dev_is_medium_locked(BlockBackend *blk)
 931{
 932    if (blk->dev_ops && blk->dev_ops->is_medium_locked) {
 933        return blk->dev_ops->is_medium_locked(blk->dev_opaque);
 934    }
 935    return false;
 936}
 937
 938/*
 939 * Notify @blk's attached device model of a backend size change.
 940 */
 941static void blk_root_resize(BdrvChild *child)
 942{
 943    BlockBackend *blk = child->opaque;
 944
 945    if (blk->dev_ops && blk->dev_ops->resize_cb) {
 946        blk->dev_ops->resize_cb(blk->dev_opaque);
 947    }
 948}
 949
 950void blk_iostatus_enable(BlockBackend *blk)
 951{
 952    blk->iostatus_enabled = true;
 953    blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
 954}
 955
 956/* The I/O status is only enabled if the drive explicitly
 957 * enables it _and_ the VM is configured to stop on errors */
 958bool blk_iostatus_is_enabled(const BlockBackend *blk)
 959{
 960    return (blk->iostatus_enabled &&
 961           (blk->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
 962            blk->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
 963            blk->on_read_error == BLOCKDEV_ON_ERROR_STOP));
 964}
 965
 966BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk)
 967{
 968    return blk->iostatus;
 969}
 970
 971void blk_iostatus_disable(BlockBackend *blk)
 972{
 973    blk->iostatus_enabled = false;
 974}
 975
 976void blk_iostatus_reset(BlockBackend *blk)
 977{
 978    if (blk_iostatus_is_enabled(blk)) {
 979        BlockDriverState *bs = blk_bs(blk);
 980        blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
 981        if (bs && bs->job) {
 982            block_job_iostatus_reset(bs->job);
 983        }
 984    }
 985}
 986
 987void blk_iostatus_set_err(BlockBackend *blk, int error)
 988{
 989    assert(blk_iostatus_is_enabled(blk));
 990    if (blk->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
 991        blk->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
 992                                          BLOCK_DEVICE_IO_STATUS_FAILED;
 993    }
 994}
 995
 996void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow)
 997{
 998    blk->allow_write_beyond_eof = allow;
 999}
1000
1001static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
1002                                  size_t size)
1003{
1004    int64_t len;
1005
1006    if (size > INT_MAX) {
1007        return -EIO;
1008    }
1009
1010    if (!blk_is_available(blk)) {
1011        return -ENOMEDIUM;
1012    }
1013
1014    if (offset < 0) {
1015        return -EIO;
1016    }
1017
1018    if (!blk->allow_write_beyond_eof) {
1019        len = blk_getlength(blk);
1020        if (len < 0) {
1021            return len;
1022        }
1023
1024        if (offset > len || len - offset < size) {
1025            return -EIO;
1026        }
1027    }
1028
1029    return 0;
1030}
1031
1032int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
1033                               unsigned int bytes, QEMUIOVector *qiov,
1034                               BdrvRequestFlags flags)
1035{
1036    int ret;
1037    BlockDriverState *bs = blk_bs(blk);
1038
1039    trace_blk_co_preadv(blk, bs, offset, bytes, flags);
1040
1041    ret = blk_check_byte_request(blk, offset, bytes);
1042    if (ret < 0) {
1043        return ret;
1044    }
1045
1046    bdrv_inc_in_flight(bs);
1047
1048    /* throttling disk I/O */
1049    if (blk->public.throttle_state) {
1050        throttle_group_co_io_limits_intercept(blk, bytes, false);
1051    }
1052
1053    ret = bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
1054    bdrv_dec_in_flight(bs);
1055    return ret;
1056}
1057
1058int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
1059                                unsigned int bytes, QEMUIOVector *qiov,
1060                                BdrvRequestFlags flags)
1061{
1062    int ret;
1063    BlockDriverState *bs = blk_bs(blk);
1064
1065    trace_blk_co_pwritev(blk, bs, offset, bytes, flags);
1066
1067    ret = blk_check_byte_request(blk, offset, bytes);
1068    if (ret < 0) {
1069        return ret;
1070    }
1071
1072    bdrv_inc_in_flight(bs);
1073
1074    /* throttling disk I/O */
1075    if (blk->public.throttle_state) {
1076        throttle_group_co_io_limits_intercept(blk, bytes, true);
1077    }
1078
1079    if (!blk->enable_write_cache) {
1080        flags |= BDRV_REQ_FUA;
1081    }
1082
1083    ret = bdrv_co_pwritev(blk->root, offset, bytes, qiov, flags);
1084    bdrv_dec_in_flight(bs);
1085    return ret;
1086}
1087
1088typedef struct BlkRwCo {
1089    BlockBackend *blk;
1090    int64_t offset;
1091    QEMUIOVector *qiov;
1092    int ret;
1093    BdrvRequestFlags flags;
1094} BlkRwCo;
1095
1096static void blk_read_entry(void *opaque)
1097{
1098    BlkRwCo *rwco = opaque;
1099
1100    rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, rwco->qiov->size,
1101                              rwco->qiov, rwco->flags);
1102}
1103
1104static void blk_write_entry(void *opaque)
1105{
1106    BlkRwCo *rwco = opaque;
1107
1108    rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, rwco->qiov->size,
1109                               rwco->qiov, rwco->flags);
1110}
1111
1112static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
1113                   int64_t bytes, CoroutineEntry co_entry,
1114                   BdrvRequestFlags flags)
1115{
1116    QEMUIOVector qiov;
1117    struct iovec iov;
1118    BlkRwCo rwco;
1119
1120    iov = (struct iovec) {
1121        .iov_base = buf,
1122        .iov_len = bytes,
1123    };
1124    qemu_iovec_init_external(&qiov, &iov, 1);
1125
1126    rwco = (BlkRwCo) {
1127        .blk    = blk,
1128        .offset = offset,
1129        .qiov   = &qiov,
1130        .flags  = flags,
1131        .ret    = NOT_DONE,
1132    };
1133
1134    if (qemu_in_coroutine()) {
1135        /* Fast-path if already in coroutine context */
1136        co_entry(&rwco);
1137    } else {
1138        Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
1139        bdrv_coroutine_enter(blk_bs(blk), co);
1140        BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
1141    }
1142
1143    return rwco.ret;
1144}
1145
1146int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf,
1147                          int count)
1148{
1149    int ret;
1150
1151    ret = blk_check_byte_request(blk, offset, count);
1152    if (ret < 0) {
1153        return ret;
1154    }
1155
1156    blk_root_drained_begin(blk->root);
1157    ret = blk_pread(blk, offset, buf, count);
1158    blk_root_drained_end(blk->root);
1159    return ret;
1160}
1161
1162int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
1163                      int bytes, BdrvRequestFlags flags)
1164{
1165    return blk_prw(blk, offset, NULL, bytes, blk_write_entry,
1166                   flags | BDRV_REQ_ZERO_WRITE);
1167}
1168
1169int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
1170{
1171    return bdrv_make_zero(blk->root, flags);
1172}
1173
1174static void error_callback_bh(void *opaque)
1175{
1176    struct BlockBackendAIOCB *acb = opaque;
1177
1178    bdrv_dec_in_flight(acb->common.bs);
1179    acb->common.cb(acb->common.opaque, acb->ret);
1180    qemu_aio_unref(acb);
1181}
1182
1183BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
1184                                  BlockCompletionFunc *cb,
1185                                  void *opaque, int ret)
1186{
1187    struct BlockBackendAIOCB *acb;
1188
1189    bdrv_inc_in_flight(blk_bs(blk));
1190    acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque);
1191    acb->blk = blk;
1192    acb->ret = ret;
1193
1194    aio_bh_schedule_oneshot(blk_get_aio_context(blk), error_callback_bh, acb);
1195    return &acb->common;
1196}
1197
1198typedef struct BlkAioEmAIOCB {
1199    BlockAIOCB common;
1200    BlkRwCo rwco;
1201    int bytes;
1202    bool has_returned;
1203} BlkAioEmAIOCB;
1204
1205static const AIOCBInfo blk_aio_em_aiocb_info = {
1206    .aiocb_size         = sizeof(BlkAioEmAIOCB),
1207};
1208
1209static void blk_aio_complete(BlkAioEmAIOCB *acb)
1210{
1211    if (acb->has_returned) {
1212        bdrv_dec_in_flight(acb->common.bs);
1213        acb->common.cb(acb->common.opaque, acb->rwco.ret);
1214        qemu_aio_unref(acb);
1215    }
1216}
1217
1218static void blk_aio_complete_bh(void *opaque)
1219{
1220    BlkAioEmAIOCB *acb = opaque;
1221    assert(acb->has_returned);
1222    blk_aio_complete(acb);
1223}
1224
1225static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
1226                                QEMUIOVector *qiov, CoroutineEntry co_entry,
1227                                BdrvRequestFlags flags,
1228                                BlockCompletionFunc *cb, void *opaque)
1229{
1230    BlkAioEmAIOCB *acb;
1231    Coroutine *co;
1232
1233    bdrv_inc_in_flight(blk_bs(blk));
1234    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
1235    acb->rwco = (BlkRwCo) {
1236        .blk    = blk,
1237        .offset = offset,
1238        .qiov   = qiov,
1239        .flags  = flags,
1240        .ret    = NOT_DONE,
1241    };
1242    acb->bytes = bytes;
1243    acb->has_returned = false;
1244
1245    co = qemu_coroutine_create(co_entry, acb);
1246    bdrv_coroutine_enter(blk_bs(blk), co);
1247
1248    acb->has_returned = true;
1249    if (acb->rwco.ret != NOT_DONE) {
1250        aio_bh_schedule_oneshot(blk_get_aio_context(blk),
1251                                blk_aio_complete_bh, acb);
1252    }
1253
1254    return &acb->common;
1255}
1256
1257static void blk_aio_read_entry(void *opaque)
1258{
1259    BlkAioEmAIOCB *acb = opaque;
1260    BlkRwCo *rwco = &acb->rwco;
1261
1262    assert(rwco->qiov->size == acb->bytes);
1263    rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, acb->bytes,
1264                              rwco->qiov, rwco->flags);
1265    blk_aio_complete(acb);
1266}
1267
1268static void blk_aio_write_entry(void *opaque)
1269{
1270    BlkAioEmAIOCB *acb = opaque;
1271    BlkRwCo *rwco = &acb->rwco;
1272
1273    assert(!rwco->qiov || rwco->qiov->size == acb->bytes);
1274    rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, acb->bytes,
1275                               rwco->qiov, rwco->flags);
1276    blk_aio_complete(acb);
1277}
1278
1279BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
1280                                  int count, BdrvRequestFlags flags,
1281                                  BlockCompletionFunc *cb, void *opaque)
1282{
1283    return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry,
1284                        flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
1285}
1286
1287int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count)
1288{
1289    int ret = blk_prw(blk, offset, buf, count, blk_read_entry, 0);
1290    if (ret < 0) {
1291        return ret;
1292    }
1293    return count;
1294}
1295
1296int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count,
1297               BdrvRequestFlags flags)
1298{
1299    int ret = blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
1300                      flags);
1301    if (ret < 0) {
1302        return ret;
1303    }
1304    return count;
1305}
1306
1307int64_t blk_getlength(BlockBackend *blk)
1308{
1309    if (!blk_is_available(blk)) {
1310        return -ENOMEDIUM;
1311    }
1312
1313    return bdrv_getlength(blk_bs(blk));
1314}
1315
1316void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr)
1317{
1318    if (!blk_bs(blk)) {
1319        *nb_sectors_ptr = 0;
1320    } else {
1321        bdrv_get_geometry(blk_bs(blk), nb_sectors_ptr);
1322    }
1323}
1324
1325int64_t blk_nb_sectors(BlockBackend *blk)
1326{
1327    if (!blk_is_available(blk)) {
1328        return -ENOMEDIUM;
1329    }
1330
1331    return bdrv_nb_sectors(blk_bs(blk));
1332}
1333
1334BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset,
1335                           QEMUIOVector *qiov, BdrvRequestFlags flags,
1336                           BlockCompletionFunc *cb, void *opaque)
1337{
1338    return blk_aio_prwv(blk, offset, qiov->size, qiov,
1339                        blk_aio_read_entry, flags, cb, opaque);
1340}
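/*
 * Hedged example of an asynchronous read (my_read_cb and its opaque are
 * made-up names; the callback receives 0 on success or a negative errno):
 *
 *     static void my_read_cb(void *opaque, int ret)
 *     {
 *         ... complete the guest request here ...
 *     }
 *
 *     QEMUIOVector qiov;
 *     qemu_iovec_init(&qiov, 1);
 *     qemu_iovec_add(&qiov, buf, len);
 *     blk_aio_preadv(blk, offset, &qiov, 0, my_read_cb, opaque);
 */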
1341
1342BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
1343                            QEMUIOVector *qiov, BdrvRequestFlags flags,
1344                            BlockCompletionFunc *cb, void *opaque)
1345{
1346    return blk_aio_prwv(blk, offset, qiov->size, qiov,
1347                        blk_aio_write_entry, flags, cb, opaque);
1348}
1349
1350static void blk_aio_flush_entry(void *opaque)
1351{
1352    BlkAioEmAIOCB *acb = opaque;
1353    BlkRwCo *rwco = &acb->rwco;
1354
1355    rwco->ret = blk_co_flush(rwco->blk);
1356    blk_aio_complete(acb);
1357}
1358
1359BlockAIOCB *blk_aio_flush(BlockBackend *blk,
1360                          BlockCompletionFunc *cb, void *opaque)
1361{
1362    return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque);
1363}
1364
1365static void blk_aio_pdiscard_entry(void *opaque)
1366{
1367    BlkAioEmAIOCB *acb = opaque;
1368    BlkRwCo *rwco = &acb->rwco;
1369
1370    rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, acb->bytes);
1371    blk_aio_complete(acb);
1372}
1373
1374BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk,
1375                             int64_t offset, int bytes,
1376                             BlockCompletionFunc *cb, void *opaque)
1377{
1378    return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_pdiscard_entry, 0,
1379                        cb, opaque);
1380}
1381
1382void blk_aio_cancel(BlockAIOCB *acb)
1383{
1384    bdrv_aio_cancel(acb);
1385}
1386
1387void blk_aio_cancel_async(BlockAIOCB *acb)
1388{
1389    bdrv_aio_cancel_async(acb);
1390}
1391
1392int blk_co_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
1393{
1394    if (!blk_is_available(blk)) {
1395        return -ENOMEDIUM;
1396    }
1397
1398    return bdrv_co_ioctl(blk_bs(blk), req, buf);
1399}
1400
1401static void blk_ioctl_entry(void *opaque)
1402{
1403    BlkRwCo *rwco = opaque;
1404    rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
1405                             rwco->qiov->iov[0].iov_base);
1406}
1407
1408int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
1409{
1410    return blk_prw(blk, req, buf, 0, blk_ioctl_entry, 0);
1411}
1412
1413static void blk_aio_ioctl_entry(void *opaque)
1414{
1415    BlkAioEmAIOCB *acb = opaque;
1416    BlkRwCo *rwco = &acb->rwco;
1417
1418    rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
1419                             rwco->qiov->iov[0].iov_base);
1420    blk_aio_complete(acb);
1421}
1422
1423BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
1424                          BlockCompletionFunc *cb, void *opaque)
1425{
1426    QEMUIOVector qiov;
1427    struct iovec iov;
1428
1429    iov = (struct iovec) {
1430        .iov_base = buf,
1431        .iov_len = 0,
1432    };
1433    qemu_iovec_init_external(&qiov, &iov, 1);
1434
1435    return blk_aio_prwv(blk, req, 0, &qiov, blk_aio_ioctl_entry, 0, cb, opaque);
1436}
1437
1438int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
1439{
1440    int ret = blk_check_byte_request(blk, offset, bytes);
1441    if (ret < 0) {
1442        return ret;
1443    }
1444
1445    return bdrv_co_pdiscard(blk_bs(blk), offset, bytes);
1446}
1447
1448int blk_co_flush(BlockBackend *blk)
1449{
1450    if (!blk_is_available(blk)) {
1451        return -ENOMEDIUM;
1452    }
1453
1454    return bdrv_co_flush(blk_bs(blk));
1455}
1456
1457static void blk_flush_entry(void *opaque)
1458{
1459    BlkRwCo *rwco = opaque;
1460    rwco->ret = blk_co_flush(rwco->blk);
1461}
1462
1463int blk_flush(BlockBackend *blk)
1464{
1465    return blk_prw(blk, 0, NULL, 0, blk_flush_entry, 0);
1466}
1467
1468void blk_drain(BlockBackend *blk)
1469{
1470    if (blk_bs(blk)) {
1471        bdrv_drain(blk_bs(blk));
1472    }
1473}
1474
1475void blk_drain_all(void)
1476{
1477    bdrv_drain_all();
1478}
1479
1480void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error,
1481                      BlockdevOnError on_write_error)
1482{
1483    blk->on_read_error = on_read_error;
1484    blk->on_write_error = on_write_error;
1485}
1486
1487BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read)
1488{
1489    return is_read ? blk->on_read_error : blk->on_write_error;
1490}
1491
1492BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read,
1493                                      int error)
1494{
1495    BlockdevOnError on_err = blk_get_on_error(blk, is_read);
1496
1497    switch (on_err) {
1498    case BLOCKDEV_ON_ERROR_ENOSPC:
1499        return (error == ENOSPC) ?
1500               BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
1501    case BLOCKDEV_ON_ERROR_STOP:
1502        return BLOCK_ERROR_ACTION_STOP;
1503    case BLOCKDEV_ON_ERROR_REPORT:
1504        return BLOCK_ERROR_ACTION_REPORT;
1505    case BLOCKDEV_ON_ERROR_IGNORE:
1506        return BLOCK_ERROR_ACTION_IGNORE;
1507    case BLOCKDEV_ON_ERROR_AUTO:
1508    default:
1509        abort();
1510    }
1511}
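/*
 * Device models typically combine this with blk_error_action() along these
 * lines (sketch; 'ret' is assumed to be a negative errno from an I/O call):
 *
 *     if (ret < 0) {
 *         BlockErrorAction action = blk_get_error_action(blk, is_read, -ret);
 *         blk_error_action(blk, action, is_read, -ret);
 *         if (action == BLOCK_ERROR_ACTION_STOP) {
 *             ... queue the request so it can be retried after "cont" ...
 *         }
 *     }
 */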
1512
1513static void send_qmp_error_event(BlockBackend *blk,
1514                                 BlockErrorAction action,
1515                                 bool is_read, int error)
1516{
1517    IoOperationType optype;
1518
1519    optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
1520    qapi_event_send_block_io_error(blk_name(blk),
1521                                   bdrv_get_node_name(blk_bs(blk)), optype,
1522                                   action, blk_iostatus_is_enabled(blk),
1523                                   error == ENOSPC, strerror(error),
1524                                   &error_abort);
1525}
1526
1527/* This is done by device models because, while the block layer knows
1528 * about the error, it does not know whether an operation comes from
1529 * the device or the block layer (from a job, for example).
1530 */
1531void blk_error_action(BlockBackend *blk, BlockErrorAction action,
1532                      bool is_read, int error)
1533{
1534    assert(error >= 0);
1535
1536    if (action == BLOCK_ERROR_ACTION_STOP) {
1537        /* First set the iostatus, so that "info block" returns an iostatus
1538         * that matches the events raised so far (an additional error iostatus
1539         * is fine, but not a lost one).
1540         */
1541        blk_iostatus_set_err(blk, error);
1542
1543        /* Then raise the request to stop the VM and the event.
1544         * qemu_system_vmstop_request_prepare has two effects.  First,
1545         * it ensures that the STOP event always comes after the
1546         * BLOCK_IO_ERROR event.  Second, it ensures that even if management
1547         * can observe the STOP event and do a "cont" before the STOP
1548         * event is issued, the VM will not stop.  In this case, vm_start()
1549         * also ensures that the STOP/RESUME pair of events is emitted.
1550         */
1551        qemu_system_vmstop_request_prepare();
1552        send_qmp_error_event(blk, action, is_read, error);
1553        qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
1554    } else {
1555        send_qmp_error_event(blk, action, is_read, error);
1556    }
1557}
1558
1559int blk_is_read_only(BlockBackend *blk)
1560{
1561    BlockDriverState *bs = blk_bs(blk);
1562
1563    if (bs) {
1564        return bdrv_is_read_only(bs);
1565    } else {
1566        return blk->root_state.read_only;
1567    }
1568}
1569
1570int blk_is_sg(BlockBackend *blk)
1571{
1572    BlockDriverState *bs = blk_bs(blk);
1573
1574    if (!bs) {
1575        return 0;
1576    }
1577
1578    return bdrv_is_sg(bs);
1579}
1580
1581int blk_enable_write_cache(BlockBackend *blk)
1582{
1583    return blk->enable_write_cache;
1584}
1585
1586void blk_set_enable_write_cache(BlockBackend *blk, bool wce)
1587{
1588    blk->enable_write_cache = wce;
1589}
1590
1591void blk_invalidate_cache(BlockBackend *blk, Error **errp)
1592{
1593    BlockDriverState *bs = blk_bs(blk);
1594
1595    if (!bs) {
1596        error_setg(errp, "Device '%s' has no medium", blk->name);
1597        return;
1598    }
1599
1600    bdrv_invalidate_cache(bs, errp);
1601}
1602
1603bool blk_is_inserted(BlockBackend *blk)
1604{
1605    BlockDriverState *bs = blk_bs(blk);
1606
1607    return bs && bdrv_is_inserted(bs);
1608}
1609
1610bool blk_is_available(BlockBackend *blk)
1611{
1612    return blk_is_inserted(blk) && !blk_dev_is_tray_open(blk);
1613}
1614
1615void blk_lock_medium(BlockBackend *blk, bool locked)
1616{
1617    BlockDriverState *bs = blk_bs(blk);
1618
1619    if (bs) {
1620        bdrv_lock_medium(bs, locked);
1621    }
1622}
1623
1624void blk_eject(BlockBackend *blk, bool eject_flag)
1625{
1626    BlockDriverState *bs = blk_bs(blk);
1627    char *id;
1628
1629    /* blk_eject is only called by qdevified devices */
1630    assert(!blk->legacy_dev);
1631
1632    if (bs) {
1633        bdrv_eject(bs, eject_flag);
1634    }
1635
1636    /* Whether or not we ejected on the backend,
1637     * the frontend experienced a tray event. */
1638    id = blk_get_attached_dev_id(blk);
1639    qapi_event_send_device_tray_moved(blk_name(blk), id,
1640                                      eject_flag, &error_abort);
1641    g_free(id);
1642}
1643
1644int blk_get_flags(BlockBackend *blk)
1645{
1646    BlockDriverState *bs = blk_bs(blk);
1647
1648    if (bs) {
1649        return bdrv_get_flags(bs);
1650    } else {
1651        return blk->root_state.open_flags;
1652    }
1653}
1654
1655/* Returns the maximum transfer length, in bytes; guaranteed nonzero */
1656uint32_t blk_get_max_transfer(BlockBackend *blk)
1657{
1658    BlockDriverState *bs = blk_bs(blk);
1659    uint32_t max = 0;
1660
1661    if (bs) {
1662        max = bs->bl.max_transfer;
1663    }
1664    return MIN_NON_ZERO(max, INT_MAX);
1665}
1666
1667int blk_get_max_iov(BlockBackend *blk)
1668{
1669    return blk->root->bs->bl.max_iov;
1670}
1671
1672void blk_set_guest_block_size(BlockBackend *blk, int align)
1673{
1674    blk->guest_block_size = align;
1675}
1676
1677void *blk_try_blockalign(BlockBackend *blk, size_t size)
1678{
1679    return qemu_try_blockalign(blk ? blk_bs(blk) : NULL, size);
1680}
1681
1682void *blk_blockalign(BlockBackend *blk, size_t size)
1683{
1684    return qemu_blockalign(blk ? blk_bs(blk) : NULL, size);
1685}
1686
1687bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
1688{
1689    BlockDriverState *bs = blk_bs(blk);
1690
1691    if (!bs) {
1692        return false;
1693    }
1694
1695    return bdrv_op_is_blocked(bs, op, errp);
1696}
1697
1698void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason)
1699{
1700    BlockDriverState *bs = blk_bs(blk);
1701
1702    if (bs) {
1703        bdrv_op_unblock(bs, op, reason);
1704    }
1705}
1706
1707void blk_op_block_all(BlockBackend *blk, Error *reason)
1708{
1709    BlockDriverState *bs = blk_bs(blk);
1710
1711    if (bs) {
1712        bdrv_op_block_all(bs, reason);
1713    }
1714}
1715
1716void blk_op_unblock_all(BlockBackend *blk, Error *reason)
1717{
1718    BlockDriverState *bs = blk_bs(blk);
1719
1720    if (bs) {
1721        bdrv_op_unblock_all(bs, reason);
1722    }
1723}
1724
1725AioContext *blk_get_aio_context(BlockBackend *blk)
1726{
1727    BlockDriverState *bs = blk_bs(blk);
1728
1729    if (bs) {
1730        return bdrv_get_aio_context(bs);
1731    } else {
1732        return qemu_get_aio_context();
1733    }
1734}
1735
1736static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
1737{
1738    BlockBackendAIOCB *blk_acb = DO_UPCAST(BlockBackendAIOCB, common, acb);
1739    return blk_get_aio_context(blk_acb->blk);
1740}
1741
1742void blk_set_aio_context(BlockBackend *blk, AioContext *new_context)
1743{
1744    BlockDriverState *bs = blk_bs(blk);
1745
1746    if (bs) {
1747        if (blk->public.throttle_state) {
1748            throttle_timers_detach_aio_context(&blk->public.throttle_timers);
1749        }
1750        bdrv_set_aio_context(bs, new_context);
1751        if (blk->public.throttle_state) {
1752            throttle_timers_attach_aio_context(&blk->public.throttle_timers,
1753                                               new_context);
1754        }
1755    }
1756}
1757
1758void blk_add_aio_context_notifier(BlockBackend *blk,
1759        void (*attached_aio_context)(AioContext *new_context, void *opaque),
1760        void (*detach_aio_context)(void *opaque), void *opaque)
1761{
1762    BlockDriverState *bs = blk_bs(blk);
1763
1764    if (bs) {
1765        bdrv_add_aio_context_notifier(bs, attached_aio_context,
1766                                      detach_aio_context, opaque);
1767    }
1768}
1769
1770void blk_remove_aio_context_notifier(BlockBackend *blk,
1771                                     void (*attached_aio_context)(AioContext *,
1772                                                                  void *),
1773                                     void (*detach_aio_context)(void *),
1774                                     void *opaque)
1775{
1776    BlockDriverState *bs = blk_bs(blk);
1777
1778    if (bs) {
1779        bdrv_remove_aio_context_notifier(bs, attached_aio_context,
1780                                         detach_aio_context, opaque);
1781    }
1782}
1783
1784void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify)
1785{
1786    notifier_list_add(&blk->remove_bs_notifiers, notify);
1787}
1788
1789void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify)
1790{
1791    notifier_list_add(&blk->insert_bs_notifiers, notify);
1792}
1793
1794void blk_io_plug(BlockBackend *blk)
1795{
1796    BlockDriverState *bs = blk_bs(blk);
1797
1798    if (bs) {
1799        bdrv_io_plug(bs);
1800    }
1801}
1802
1803void blk_io_unplug(BlockBackend *blk)
1804{
1805    BlockDriverState *bs = blk_bs(blk);
1806
1807    if (bs) {
1808        bdrv_io_unplug(bs);
1809    }
1810}
1811
1812BlockAcctStats *blk_get_stats(BlockBackend *blk)
1813{
1814    return &blk->stats;
1815}
1816
1817void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
1818                  BlockCompletionFunc *cb, void *opaque)
1819{
1820    return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
1821}
1822
1823int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
1824                                      int bytes, BdrvRequestFlags flags)
1825{
1826    return blk_co_pwritev(blk, offset, bytes, NULL,
1827                          flags | BDRV_REQ_ZERO_WRITE);
1828}
1829
1830int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf,
1831                          int count)
1832{
1833    return blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
1834                   BDRV_REQ_WRITE_COMPRESSED);
1835}
1836
1837int blk_truncate(BlockBackend *blk, int64_t offset, PreallocMode prealloc,
1838                 Error **errp)
1839{
1840    if (!blk_is_available(blk)) {
1841        error_setg(errp, "No medium inserted");
1842        return -ENOMEDIUM;
1843    }
1844
1845    return bdrv_truncate(blk->root, offset, prealloc, errp);
1846}
1847
1848static void blk_pdiscard_entry(void *opaque)
1849{
1850    BlkRwCo *rwco = opaque;
1851    rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, rwco->qiov->size);
1852}
1853
1854int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
1855{
1856    return blk_prw(blk, offset, NULL, bytes, blk_pdiscard_entry, 0);
1857}
1858
1859int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
1860                     int64_t pos, int size)
1861{
1862    int ret;
1863
1864    if (!blk_is_available(blk)) {
1865        return -ENOMEDIUM;
1866    }
1867
1868    ret = bdrv_save_vmstate(blk_bs(blk), buf, pos, size);
1869    if (ret < 0) {
1870        return ret;
1871    }
1872
1873    if (ret == size && !blk->enable_write_cache) {
1874        ret = bdrv_flush(blk_bs(blk));
1875    }
1876
1877    return ret < 0 ? ret : size;
1878}
1879
1880int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size)
1881{
1882    if (!blk_is_available(blk)) {
1883        return -ENOMEDIUM;
1884    }
1885
1886    return bdrv_load_vmstate(blk_bs(blk), buf, pos, size);
1887}
1888
1889int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz)
1890{
1891    if (!blk_is_available(blk)) {
1892        return -ENOMEDIUM;
1893    }
1894
1895    return bdrv_probe_blocksizes(blk_bs(blk), bsz);
1896}
1897
1898int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo)
1899{
1900    if (!blk_is_available(blk)) {
1901        return -ENOMEDIUM;
1902    }
1903
1904    return bdrv_probe_geometry(blk_bs(blk), geo);
1905}
1906
1907/*
1908 * Updates the BlockBackendRootState object with data from the currently
1909 * attached BlockDriverState.
1910 */
1911void blk_update_root_state(BlockBackend *blk)
1912{
1913    assert(blk->root);
1914
1915    blk->root_state.open_flags    = blk->root->bs->open_flags;
1916    blk->root_state.read_only     = blk->root->bs->read_only;
1917    blk->root_state.detect_zeroes = blk->root->bs->detect_zeroes;
1918}
1919
1920/*
1921 * Returns the detect-zeroes setting to be used for bdrv_open() of a
1922 * BlockDriverState which is supposed to inherit the root state.
1923 */
1924bool blk_get_detect_zeroes_from_root_state(BlockBackend *blk)
1925{
1926    return blk->root_state.detect_zeroes;
1927}
1928
1929/*
1930 * Returns the flags to be used for bdrv_open() of a BlockDriverState which is
1931 * supposed to inherit the root state.
1932 */
1933int blk_get_open_flags_from_root_state(BlockBackend *blk)
1934{
1935    int bs_flags;
1936
1937    bs_flags = blk->root_state.read_only ? 0 : BDRV_O_RDWR;
1938    bs_flags |= blk->root_state.open_flags & ~BDRV_O_RDWR;
1939
1940    return bs_flags;
1941}
1942
1943BlockBackendRootState *blk_get_root_state(BlockBackend *blk)
1944{
1945    return &blk->root_state;
1946}
1947
1948int blk_commit_all(void)
1949{
1950    BlockBackend *blk = NULL;
1951
1952    while ((blk = blk_all_next(blk)) != NULL) {
1953        AioContext *aio_context = blk_get_aio_context(blk);
1954
1955        aio_context_acquire(aio_context);
1956        if (blk_is_inserted(blk) && blk->root->bs->backing) {
1957            int ret = bdrv_commit(blk->root->bs);
1958            if (ret < 0) {
1959                aio_context_release(aio_context);
1960                return ret;
1961            }
1962        }
1963        aio_context_release(aio_context);
1964    }
1965    return 0;
1966}
1967
1968
1969/* throttling disk I/O limits */
1970void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
1971{
1972    throttle_group_config(blk, cfg);
1973}
1974
1975void blk_io_limits_disable(BlockBackend *blk)
1976{
1977    assert(blk->public.throttle_state);
1978    bdrv_drained_begin(blk_bs(blk));
1979    throttle_group_unregister_blk(blk);
1980    bdrv_drained_end(blk_bs(blk));
1981}
1982
1983/* should be called before blk_set_io_limits if a limit is set */
1984void blk_io_limits_enable(BlockBackend *blk, const char *group)
1985{
1986    assert(!blk->public.throttle_state);
1987    throttle_group_register_blk(blk, group);
1988}
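/*
 * Sketch of how a limit might be applied (the group name and values are
 * arbitrary examples):
 *
 *     ThrottleConfig cfg;
 *
 *     throttle_config_init(&cfg);
 *     cfg.buckets[THROTTLE_OPS_TOTAL].avg = 100;
 *     blk_io_limits_enable(blk, "group0");
 *     blk_set_io_limits(blk, &cfg);
 */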
1989
1990void blk_io_limits_update_group(BlockBackend *blk, const char *group)
1991{
1992    /* this BB is not part of any group */
1993    if (!blk->public.throttle_state) {
1994        return;
1995    }
1996
 1997    /* this BB is part of the same group as the one we want */
1998    if (!g_strcmp0(throttle_group_get_name(blk), group)) {
1999        return;
2000    }
2001
 2002    /* need to change the group this bs belongs to */
2003    blk_io_limits_disable(blk);
2004    blk_io_limits_enable(blk, group);
2005}
2006
2007static void blk_root_drained_begin(BdrvChild *child)
2008{
2009    BlockBackend *blk = child->opaque;
2010
2011    if (++blk->quiesce_counter == 1) {
2012        if (blk->dev_ops && blk->dev_ops->drained_begin) {
2013            blk->dev_ops->drained_begin(blk->dev_opaque);
2014        }
2015    }
2016
2017    /* Note that blk->root may not be accessible here yet if we are just
2018     * attaching to a BlockDriverState that is drained. Use child instead. */
2019
2020    if (atomic_fetch_inc(&blk->public.io_limits_disabled) == 0) {
2021        throttle_group_restart_blk(blk);
2022    }
2023}
2024
2025static void blk_root_drained_end(BdrvChild *child)
2026{
2027    BlockBackend *blk = child->opaque;
2028    assert(blk->quiesce_counter);
2029
2030    assert(blk->public.io_limits_disabled);
2031    atomic_dec(&blk->public.io_limits_disabled);
2032
2033    if (--blk->quiesce_counter == 0) {
2034        if (blk->dev_ops && blk->dev_ops->drained_end) {
2035            blk->dev_ops->drained_end(blk->dev_opaque);
2036        }
2037    }
2038}
2039