qemu/block/block-backend.c
   1/*
   2 * QEMU Block backends
   3 *
   4 * Copyright (C) 2014-2016 Red Hat, Inc.
   5 *
   6 * Authors:
   7 *  Markus Armbruster <armbru@redhat.com>,
   8 *
   9 * This work is licensed under the terms of the GNU LGPL, version 2.1
  10 * or later.  See the COPYING.LIB file in the top-level directory.
  11 */
  12
  13#include "qemu/osdep.h"
  14#include "sysemu/block-backend.h"
  15#include "block/block_int.h"
  16#include "block/blockjob.h"
  17#include "block/throttle-groups.h"
  18#include "sysemu/blockdev.h"
  19#include "sysemu/sysemu.h"
  20#include "qapi-event.h"
  21#include "qemu/id.h"
  22#include "trace.h"
  23#include "migration/misc.h"
  24
  25/* Number of coroutines to reserve per attached device model */
  26#define COROUTINE_POOL_RESERVATION 64
  27
  28#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
  29
  30static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb);
  31
  32struct BlockBackend {
  33    char *name;
  34    int refcnt;
  35    BdrvChild *root;
  36    DriveInfo *legacy_dinfo;    /* null unless created by drive_new() */
  37    QTAILQ_ENTRY(BlockBackend) link;         /* for block_backends */
  38    QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */
  39    BlockBackendPublic public;
  40
  41    void *dev;                  /* attached device model, if any */
  42    bool legacy_dev;            /* true if dev is not a DeviceState */
  43    /* TODO change to DeviceState when all users are qdevified */
  44    const BlockDevOps *dev_ops;
  45    void *dev_opaque;
  46
  47    /* the block size for which the guest device expects atomicity */
  48    int guest_block_size;
  49
  50    /* If the BDS tree is removed, some of its options are stored here (which
  51     * can be used to restore those options in the new BDS on insert) */
  52    BlockBackendRootState root_state;
  53
  54    bool enable_write_cache;
  55
  56    /* I/O stats (display with "info blockstats"). */
  57    BlockAcctStats stats;
  58
  59    BlockdevOnError on_read_error, on_write_error;
  60    bool iostatus_enabled;
  61    BlockDeviceIoStatus iostatus;
  62
  63    uint64_t perm;
  64    uint64_t shared_perm;
  65    bool disable_perm;
  66
  67    bool allow_write_beyond_eof;
  68
  69    NotifierList remove_bs_notifiers, insert_bs_notifiers;
  70
  71    int quiesce_counter;
  72    VMChangeStateEntry *vmsh;
  73    bool force_allow_inactivate;
  74};
  75
  76typedef struct BlockBackendAIOCB {
  77    BlockAIOCB common;
  78    BlockBackend *blk;
  79    int ret;
  80} BlockBackendAIOCB;
  81
  82static const AIOCBInfo block_backend_aiocb_info = {
  83    .get_aio_context = blk_aiocb_get_aio_context,
  84    .aiocb_size = sizeof(BlockBackendAIOCB),
  85};
  86
  87static void drive_info_del(DriveInfo *dinfo);
  88static BlockBackend *bdrv_first_blk(BlockDriverState *bs);
  89
  90/* All BlockBackends */
  91static QTAILQ_HEAD(, BlockBackend) block_backends =
  92    QTAILQ_HEAD_INITIALIZER(block_backends);
  93
   94/* All BlockBackends referenced by the monitor; these are the ones iterated
   95 * over by blk_next() */
  96static QTAILQ_HEAD(, BlockBackend) monitor_block_backends =
  97    QTAILQ_HEAD_INITIALIZER(monitor_block_backends);
  98
  99static void blk_root_inherit_options(int *child_flags, QDict *child_options,
 100                                     int parent_flags, QDict *parent_options)
 101{
 102    /* We're not supposed to call this function for root nodes */
 103    abort();
 104}
 105static void blk_root_drained_begin(BdrvChild *child);
 106static void blk_root_drained_end(BdrvChild *child);
 107
 108static void blk_root_change_media(BdrvChild *child, bool load);
 109static void blk_root_resize(BdrvChild *child);
 110
 111static char *blk_root_get_parent_desc(BdrvChild *child)
 112{
 113    BlockBackend *blk = child->opaque;
 114    char *dev_id;
 115
 116    if (blk->name) {
 117        return g_strdup(blk->name);
 118    }
 119
 120    dev_id = blk_get_attached_dev_id(blk);
 121    if (*dev_id) {
 122        return dev_id;
 123    } else {
 124        /* TODO Callback into the BB owner for something more detailed */
 125        g_free(dev_id);
 126        return g_strdup("a block device");
 127    }
 128}
 129
 130static const char *blk_root_get_name(BdrvChild *child)
 131{
 132    return blk_name(child->opaque);
 133}
 134
 135static void blk_vm_state_changed(void *opaque, int running, RunState state)
 136{
 137    Error *local_err = NULL;
 138    BlockBackend *blk = opaque;
 139
 140    if (state == RUN_STATE_INMIGRATE) {
 141        return;
 142    }
 143
 144    qemu_del_vm_change_state_handler(blk->vmsh);
 145    blk->vmsh = NULL;
 146    blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
 147    if (local_err) {
 148        error_report_err(local_err);
 149    }
 150}
 151
 152/*
 153 * Notifies the user of the BlockBackend that migration has completed. qdev
 154 * devices can tighten their permissions in response (specifically revoke
 155 * shared write permissions that we needed for storage migration).
 156 *
 157 * If an error is returned, the VM cannot be allowed to be resumed.
 158 */
 159static void blk_root_activate(BdrvChild *child, Error **errp)
 160{
 161    BlockBackend *blk = child->opaque;
 162    Error *local_err = NULL;
 163
 164    if (!blk->disable_perm) {
 165        return;
 166    }
 167
 168    blk->disable_perm = false;
 169
 170    blk_set_perm(blk, blk->perm, BLK_PERM_ALL, &local_err);
 171    if (local_err) {
 172        error_propagate(errp, local_err);
 173        blk->disable_perm = true;
 174        return;
 175    }
 176
 177    if (runstate_check(RUN_STATE_INMIGRATE)) {
 178        /* Activation can happen when migration process is still active, for
 179         * example when nbd_server_add is called during non-shared storage
 180         * migration. Defer the shared_perm update to migration completion. */
 181        if (!blk->vmsh) {
 182            blk->vmsh = qemu_add_vm_change_state_handler(blk_vm_state_changed,
 183                                                         blk);
 184        }
 185        return;
 186    }
 187
 188    blk_set_perm(blk, blk->perm, blk->shared_perm, &local_err);
 189    if (local_err) {
 190        error_propagate(errp, local_err);
 191        blk->disable_perm = true;
 192        return;
 193    }
 194}
 195
 196void blk_set_force_allow_inactivate(BlockBackend *blk)
 197{
 198    blk->force_allow_inactivate = true;
 199}
 200
 201static bool blk_can_inactivate(BlockBackend *blk)
 202{
 203    /* If it is a guest device, inactivate is ok. */
 204    if (blk->dev || blk_name(blk)[0]) {
 205        return true;
 206    }
 207
 208    /* Inactivating means no more writes to the image can be done,
 209     * even if those writes would be changes invisible to the
 210     * guest.  For block job BBs that satisfy this, we can just allow
 211     * it.  This is the case for mirror job source, which is required
 212     * by libvirt non-shared block migration. */
 213    if (!(blk->perm & (BLK_PERM_WRITE | BLK_PERM_WRITE_UNCHANGED))) {
 214        return true;
 215    }
 216
 217    return blk->force_allow_inactivate;
 218}
 219
 220static int blk_root_inactivate(BdrvChild *child)
 221{
 222    BlockBackend *blk = child->opaque;
 223
 224    if (blk->disable_perm) {
 225        return 0;
 226    }
 227
 228    if (!blk_can_inactivate(blk)) {
 229        return -EPERM;
 230    }
 231
 232    blk->disable_perm = true;
 233    if (blk->root) {
 234        bdrv_child_try_set_perm(blk->root, 0, BLK_PERM_ALL, &error_abort);
 235    }
 236
 237    return 0;
 238}
 239
 240static const BdrvChildRole child_root = {
 241    .inherit_options    = blk_root_inherit_options,
 242
 243    .change_media       = blk_root_change_media,
 244    .resize             = blk_root_resize,
 245    .get_name           = blk_root_get_name,
 246    .get_parent_desc    = blk_root_get_parent_desc,
 247
 248    .drained_begin      = blk_root_drained_begin,
 249    .drained_end        = blk_root_drained_end,
 250
 251    .activate           = blk_root_activate,
 252    .inactivate         = blk_root_inactivate,
 253};
 254
 255/*
 256 * Create a new BlockBackend with a reference count of one.
 257 *
  258 * @perm is a bitmask of BLK_PERM_* constants which describes the permissions
 259 * to request for a block driver node that is attached to this BlockBackend.
 260 * @shared_perm is a bitmask which describes which permissions may be granted
 261 * to other users of the attached node.
 262 * Both sets of permissions can be changed later using blk_set_perm().
 263 *
 264 * Return the new BlockBackend on success, null on failure.
 265 */
 266BlockBackend *blk_new(uint64_t perm, uint64_t shared_perm)
 267{
 268    BlockBackend *blk;
 269
 270    blk = g_new0(BlockBackend, 1);
 271    blk->refcnt = 1;
 272    blk->perm = perm;
 273    blk->shared_perm = shared_perm;
 274    blk_set_enable_write_cache(blk, true);
 275
 276    block_acct_init(&blk->stats);
 277
 278    notifier_list_init(&blk->remove_bs_notifiers);
 279    notifier_list_init(&blk->insert_bs_notifiers);
 280
 281    QTAILQ_INSERT_TAIL(&block_backends, blk, link);
 282    return blk;
 283}
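
/*
 * A minimal usage sketch, assuming a hypothetical caller that only needs an
 * anonymous backend: request read/write access for the future root node,
 * allow other users everything, and drop the reference again.  The requested
 * permissions only take effect once a node is attached, e.g. with
 * blk_insert_bs().
 *
 *     BlockBackend *blk;
 *
 *     blk = blk_new(BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE, BLK_PERM_ALL);
 *     ...attach a node and do I/O...
 *     blk_unref(blk);
 *
 * The final blk_unref() drops the refcount to zero, so blk_delete() frees the
 * backend.
 */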
 284
 285/*
 286 * Creates a new BlockBackend, opens a new BlockDriverState, and connects both.
 287 *
 288 * Just as with bdrv_open(), after having called this function the reference to
 289 * @options belongs to the block layer (even on failure).
 290 *
 291 * TODO: Remove @filename and @flags; it should be possible to specify a whole
 292 * BDS tree just by specifying the @options QDict (or @reference,
 293 * alternatively). At the time of adding this function, this is not possible,
 294 * though, so callers of this function have to be able to specify @filename and
 295 * @flags.
 296 */
 297BlockBackend *blk_new_open(const char *filename, const char *reference,
 298                           QDict *options, int flags, Error **errp)
 299{
 300    BlockBackend *blk;
 301    BlockDriverState *bs;
 302    uint64_t perm = 0;
 303
 304    /* blk_new_open() is mainly used in .bdrv_create implementations and the
 305     * tools where sharing isn't a concern because the BDS stays private, so we
 306     * just request permission according to the flags.
 307     *
 308     * The exceptions are xen_disk and blockdev_init(); in these cases, the
 309     * caller of blk_new_open() doesn't make use of the permissions, but they
 310     * shouldn't hurt either. We can still share everything here because the
 311     * guest devices will add their own blockers if they can't share. */
 312    if ((flags & BDRV_O_NO_IO) == 0) {
 313        perm |= BLK_PERM_CONSISTENT_READ;
 314        if (flags & BDRV_O_RDWR) {
 315            perm |= BLK_PERM_WRITE;
 316        }
 317    }
 318    if (flags & BDRV_O_RESIZE) {
 319        perm |= BLK_PERM_RESIZE;
 320    }
 321
 322    blk = blk_new(perm, BLK_PERM_ALL);
 323    bs = bdrv_open(filename, reference, options, flags, errp);
 324    if (!bs) {
 325        blk_unref(blk);
 326        return NULL;
 327    }
 328
 329    blk->root = bdrv_root_attach_child(bs, "root", &child_root,
 330                                       perm, BLK_PERM_ALL, blk, errp);
 331    if (!blk->root) {
 332        bdrv_unref(bs);
 333        blk_unref(blk);
 334        return NULL;
 335    }
 336
 337    return blk;
 338}
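
/*
 * A minimal sketch of typical tool-style usage, assuming a hypothetical
 * helper; the filename and the BDRV_O_* flags are the caller's choice:
 *
 *     static BlockBackend *example_open_image(const char *filename,
 *                                             Error **errp)
 *     {
 *         return blk_new_open(filename, NULL, NULL, BDRV_O_RDWR, errp);
 *     }
 *
 * On success the caller owns one reference and eventually releases it with
 * blk_unref(); on failure NULL is returned and *errp is set.
 */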
 339
 340static void blk_delete(BlockBackend *blk)
 341{
 342    assert(!blk->refcnt);
 343    assert(!blk->name);
 344    assert(!blk->dev);
 345    if (blk->public.throttle_group_member.throttle_state) {
 346        blk_io_limits_disable(blk);
 347    }
 348    if (blk->root) {
 349        blk_remove_bs(blk);
 350    }
 351    if (blk->vmsh) {
 352        qemu_del_vm_change_state_handler(blk->vmsh);
 353        blk->vmsh = NULL;
 354    }
 355    assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers));
 356    assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers));
 357    QTAILQ_REMOVE(&block_backends, blk, link);
 358    drive_info_del(blk->legacy_dinfo);
 359    block_acct_cleanup(&blk->stats);
 360    g_free(blk);
 361}
 362
 363static void drive_info_del(DriveInfo *dinfo)
 364{
 365    if (!dinfo) {
 366        return;
 367    }
 368    qemu_opts_del(dinfo->opts);
 369    g_free(dinfo->serial);
 370    g_free(dinfo);
 371}
 372
 373int blk_get_refcnt(BlockBackend *blk)
 374{
 375    return blk ? blk->refcnt : 0;
 376}
 377
 378/*
 379 * Increment @blk's reference count.
 380 * @blk must not be null.
 381 */
 382void blk_ref(BlockBackend *blk)
 383{
 384    blk->refcnt++;
 385}
 386
 387/*
 388 * Decrement @blk's reference count.
 389 * If this drops it to zero, destroy @blk.
 390 * For convenience, do nothing if @blk is null.
 391 */
 392void blk_unref(BlockBackend *blk)
 393{
 394    if (blk) {
 395        assert(blk->refcnt > 0);
 396        if (!--blk->refcnt) {
 397            blk_delete(blk);
 398        }
 399    }
 400}
 401
 402/*
 403 * Behaves similarly to blk_next() but iterates over all BlockBackends, even the
 404 * ones which are hidden (i.e. are not referenced by the monitor).
 405 */
 406BlockBackend *blk_all_next(BlockBackend *blk)
 407{
 408    return blk ? QTAILQ_NEXT(blk, link)
 409               : QTAILQ_FIRST(&block_backends);
 410}
 411
 412void blk_remove_all_bs(void)
 413{
 414    BlockBackend *blk = NULL;
 415
 416    while ((blk = blk_all_next(blk)) != NULL) {
 417        AioContext *ctx = blk_get_aio_context(blk);
 418
 419        aio_context_acquire(ctx);
 420        if (blk->root) {
 421            blk_remove_bs(blk);
 422        }
 423        aio_context_release(ctx);
 424    }
 425}
 426
 427/*
 428 * Return the monitor-owned BlockBackend after @blk.
 429 * If @blk is null, return the first one.
 430 * Else, return @blk's next sibling, which may be null.
 431 *
 432 * To iterate over all BlockBackends, do
 433 * for (blk = blk_next(NULL); blk; blk = blk_next(blk)) {
 434 *     ...
 435 * }
 436 */
 437BlockBackend *blk_next(BlockBackend *blk)
 438{
 439    return blk ? QTAILQ_NEXT(blk, monitor_link)
 440               : QTAILQ_FIRST(&monitor_block_backends);
 441}
 442
 443/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
 444 * the monitor or attached to a BlockBackend */
 445BlockDriverState *bdrv_next(BdrvNextIterator *it)
 446{
 447    BlockDriverState *bs, *old_bs;
 448
 449    /* Must be called from the main loop */
 450    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 451
 452    /* First, return all root nodes of BlockBackends. In order to avoid
 453     * returning a BDS twice when multiple BBs refer to it, we only return it
 454     * if the BB is the first one in the parent list of the BDS. */
 455    if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
 456        BlockBackend *old_blk = it->blk;
 457
 458        old_bs = old_blk ? blk_bs(old_blk) : NULL;
 459
 460        do {
 461            it->blk = blk_all_next(it->blk);
 462            bs = it->blk ? blk_bs(it->blk) : NULL;
 463        } while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));
 464
 465        if (it->blk) {
 466            blk_ref(it->blk);
 467        }
 468        blk_unref(old_blk);
 469
 470        if (bs) {
 471            bdrv_ref(bs);
 472            bdrv_unref(old_bs);
 473            return bs;
 474        }
 475        it->phase = BDRV_NEXT_MONITOR_OWNED;
 476    } else {
 477        old_bs = it->bs;
 478    }
 479
 480    /* Then return the monitor-owned BDSes without a BB attached. Ignore all
 481     * BDSes that are attached to a BlockBackend here; they have been handled
 482     * by the above block already */
 483    do {
 484        it->bs = bdrv_next_monitor_owned(it->bs);
 485        bs = it->bs;
 486    } while (bs && bdrv_has_blk(bs));
 487
 488    if (bs) {
 489        bdrv_ref(bs);
 490    }
 491    bdrv_unref(old_bs);
 492
 493    return bs;
 494}
 495
 496static void bdrv_next_reset(BdrvNextIterator *it)
 497{
 498    *it = (BdrvNextIterator) {
 499        .phase = BDRV_NEXT_BACKEND_ROOTS,
 500    };
 501}
 502
 503BlockDriverState *bdrv_first(BdrvNextIterator *it)
 504{
 505    bdrv_next_reset(it);
 506    return bdrv_next(it);
 507}
 508
 509/* Must be called when aborting a bdrv_next() iteration before
 510 * bdrv_next() returns NULL */
 511void bdrv_next_cleanup(BdrvNextIterator *it)
 512{
 513    /* Must be called from the main loop */
 514    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
 515
 516    if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
 517        if (it->blk) {
 518            bdrv_unref(blk_bs(it->blk));
 519            blk_unref(it->blk);
 520        }
 521    } else {
 522        bdrv_unref(it->bs);
 523    }
 524
 525    bdrv_next_reset(it);
 526}
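
/*
 * A sketch of the iteration contract above, assuming a hypothetical predicate
 * some_condition(): when the loop stops before bdrv_next() has returned NULL,
 * bdrv_next_cleanup() must be called to drop the references still held by the
 * iterator.
 *
 *     BdrvNextIterator it;
 *     BlockDriverState *bs;
 *
 *     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 *         if (some_condition(bs)) {
 *             bdrv_next_cleanup(&it);
 *             break;
 *         }
 *     }
 */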
 527
 528/*
 529 * Add a BlockBackend into the list of backends referenced by the monitor, with
 530 * the given @name acting as the handle for the monitor.
 531 * Strictly for use by blockdev.c.
 532 *
 533 * @name must not be null or empty.
 534 *
 535 * Returns true on success and false on failure. In the latter case, an Error
 536 * object is returned through @errp.
 537 */
 538bool monitor_add_blk(BlockBackend *blk, const char *name, Error **errp)
 539{
 540    assert(!blk->name);
 541    assert(name && name[0]);
 542
 543    if (!id_wellformed(name)) {
 544        error_setg(errp, "Invalid device name");
 545        return false;
 546    }
 547    if (blk_by_name(name)) {
 548        error_setg(errp, "Device with id '%s' already exists", name);
 549        return false;
 550    }
 551    if (bdrv_find_node(name)) {
 552        error_setg(errp,
 553                   "Device name '%s' conflicts with an existing node name",
 554                   name);
 555        return false;
 556    }
 557
 558    blk->name = g_strdup(name);
 559    QTAILQ_INSERT_TAIL(&monitor_block_backends, blk, monitor_link);
 560    return true;
 561}
 562
 563/*
 564 * Remove a BlockBackend from the list of backends referenced by the monitor.
 565 * Strictly for use by blockdev.c.
 566 */
 567void monitor_remove_blk(BlockBackend *blk)
 568{
 569    if (!blk->name) {
 570        return;
 571    }
 572
 573    QTAILQ_REMOVE(&monitor_block_backends, blk, monitor_link);
 574    g_free(blk->name);
 575    blk->name = NULL;
 576}
 577
 578/*
 579 * Return @blk's name, a non-null string.
 580 * Returns an empty string iff @blk is not referenced by the monitor.
 581 */
 582const char *blk_name(const BlockBackend *blk)
 583{
 584    return blk->name ?: "";
 585}
 586
 587/*
 588 * Return the BlockBackend with name @name if it exists, else null.
 589 * @name must not be null.
 590 */
 591BlockBackend *blk_by_name(const char *name)
 592{
 593    BlockBackend *blk = NULL;
 594
 595    assert(name);
 596    while ((blk = blk_next(blk)) != NULL) {
 597        if (!strcmp(name, blk->name)) {
 598            return blk;
 599        }
 600    }
 601    return NULL;
 602}
 603
 604/*
 605 * Return the BlockDriverState attached to @blk if any, else null.
 606 */
 607BlockDriverState *blk_bs(BlockBackend *blk)
 608{
 609    return blk->root ? blk->root->bs : NULL;
 610}
 611
 612static BlockBackend *bdrv_first_blk(BlockDriverState *bs)
 613{
 614    BdrvChild *child;
 615    QLIST_FOREACH(child, &bs->parents, next_parent) {
 616        if (child->role == &child_root) {
 617            return child->opaque;
 618        }
 619    }
 620
 621    return NULL;
 622}
 623
 624/*
 625 * Returns true if @bs has an associated BlockBackend.
 626 */
 627bool bdrv_has_blk(BlockDriverState *bs)
 628{
 629    return bdrv_first_blk(bs) != NULL;
 630}
 631
 632/*
 633 * Returns true if @bs has only BlockBackends as parents.
 634 */
 635bool bdrv_is_root_node(BlockDriverState *bs)
 636{
 637    BdrvChild *c;
 638
 639    QLIST_FOREACH(c, &bs->parents, next_parent) {
 640        if (c->role != &child_root) {
 641            return false;
 642        }
 643    }
 644
 645    return true;
 646}
 647
 648/*
 649 * Return @blk's DriveInfo if any, else null.
 650 */
 651DriveInfo *blk_legacy_dinfo(BlockBackend *blk)
 652{
 653    return blk->legacy_dinfo;
 654}
 655
 656/*
 657 * Set @blk's DriveInfo to @dinfo, and return it.
 658 * @blk must not have a DriveInfo set already.
 659 * No other BlockBackend may have the same DriveInfo set.
 660 */
 661DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo)
 662{
 663    assert(!blk->legacy_dinfo);
 664    return blk->legacy_dinfo = dinfo;
 665}
 666
 667/*
 668 * Return the BlockBackend with DriveInfo @dinfo.
 669 * It must exist.
 670 */
 671BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo)
 672{
 673    BlockBackend *blk = NULL;
 674
 675    while ((blk = blk_next(blk)) != NULL) {
 676        if (blk->legacy_dinfo == dinfo) {
 677            return blk;
 678        }
 679    }
 680    abort();
 681}
 682
 683/*
 684 * Returns a pointer to the publicly accessible fields of @blk.
 685 */
 686BlockBackendPublic *blk_get_public(BlockBackend *blk)
 687{
 688    return &blk->public;
 689}
 690
 691/*
 692 * Returns a BlockBackend given the associated @public fields.
 693 */
 694BlockBackend *blk_by_public(BlockBackendPublic *public)
 695{
 696    return container_of(public, BlockBackend, public);
 697}
 698
 699/*
 700 * Disassociates the currently associated BlockDriverState from @blk.
 701 */
 702void blk_remove_bs(BlockBackend *blk)
 703{
 704    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
 705    BlockDriverState *bs;
 706
 707    notifier_list_notify(&blk->remove_bs_notifiers, blk);
 708    if (tgm->throttle_state) {
 709        bs = blk_bs(blk);
 710        bdrv_drained_begin(bs);
 711        throttle_group_detach_aio_context(tgm);
 712        throttle_group_attach_aio_context(tgm, qemu_get_aio_context());
 713        bdrv_drained_end(bs);
 714    }
 715
 716    blk_update_root_state(blk);
 717
 718    bdrv_root_unref_child(blk->root);
 719    blk->root = NULL;
 720}
 721
 722/*
 723 * Associates a new BlockDriverState with @blk.
 724 */
 725int blk_insert_bs(BlockBackend *blk, BlockDriverState *bs, Error **errp)
 726{
 727    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
 728    blk->root = bdrv_root_attach_child(bs, "root", &child_root,
 729                                       blk->perm, blk->shared_perm, blk, errp);
 730    if (blk->root == NULL) {
 731        return -EPERM;
 732    }
 733    bdrv_ref(bs);
 734
 735    notifier_list_notify(&blk->insert_bs_notifiers, blk);
 736    if (tgm->throttle_state) {
 737        throttle_group_detach_aio_context(tgm);
 738        throttle_group_attach_aio_context(tgm, bdrv_get_aio_context(bs));
 739    }
 740
 741    return 0;
 742}
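
/*
 * A minimal sketch of pairing blk_insert_bs() with blk_remove_bs(), assuming
 * a hypothetical caller that already holds its own reference to @bs.
 * blk_insert_bs() takes an additional reference to the node, so the caller's
 * reference is untouched:
 *
 *     BlockBackend *blk = blk_new(BLK_PERM_CONSISTENT_READ, BLK_PERM_ALL);
 *     Error *local_err = NULL;
 *
 *     if (blk_insert_bs(blk, bs, &local_err) < 0) {
 *         error_report_err(local_err);
 *         blk_unref(blk);
 *         return;
 *     }
 *     ...do I/O through blk...
 *     blk_remove_bs(blk);
 *     blk_unref(blk);
 */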
 743
 744/*
 745 * Sets the permission bitmasks that the user of the BlockBackend needs.
 746 */
 747int blk_set_perm(BlockBackend *blk, uint64_t perm, uint64_t shared_perm,
 748                 Error **errp)
 749{
 750    int ret;
 751
 752    if (blk->root && !blk->disable_perm) {
 753        ret = bdrv_child_try_set_perm(blk->root, perm, shared_perm, errp);
 754        if (ret < 0) {
 755            return ret;
 756        }
 757    }
 758
 759    blk->perm = perm;
 760    blk->shared_perm = shared_perm;
 761
 762    return 0;
 763}
 764
 765void blk_get_perm(BlockBackend *blk, uint64_t *perm, uint64_t *shared_perm)
 766{
 767    *perm = blk->perm;
 768    *shared_perm = blk->shared_perm;
 769}
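
/*
 * A sketch of tightening permissions at runtime, assuming a user that
 * temporarily had to share writes (as described for storage migration above)
 * and now revokes that again; the required permissions stay unchanged:
 *
 *     uint64_t perm, shared_perm;
 *     Error *local_err = NULL;
 *
 *     blk_get_perm(blk, &perm, &shared_perm);
 *     shared_perm &= ~BLK_PERM_WRITE;
 *     if (blk_set_perm(blk, perm, shared_perm, &local_err) < 0) {
 *         error_report_err(local_err);
 *     }
 */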
 770
 771static int blk_do_attach_dev(BlockBackend *blk, void *dev)
 772{
 773    if (blk->dev) {
 774        return -EBUSY;
 775    }
 776
 777    /* While migration is still incoming, we don't need to apply the
 778     * permissions of guest device BlockBackends. We might still have a block
 779     * job or NBD server writing to the image for storage migration. */
 780    if (runstate_check(RUN_STATE_INMIGRATE)) {
 781        blk->disable_perm = true;
 782    }
 783
 784    blk_ref(blk);
 785    blk->dev = dev;
 786    blk->legacy_dev = false;
 787    blk_iostatus_reset(blk);
 788
 789    return 0;
 790}
 791
 792/*
 793 * Attach device model @dev to @blk.
 794 * Return 0 on success, -EBUSY when a device model is attached already.
 795 */
 796int blk_attach_dev(BlockBackend *blk, DeviceState *dev)
 797{
 798    return blk_do_attach_dev(blk, dev);
 799}
 800
 801/*
 802 * Attach device model @dev to @blk.
 803 * @blk must not have a device model attached already.
 804 * TODO qdevified devices don't use this, remove when devices are qdevified
 805 */
 806void blk_attach_dev_legacy(BlockBackend *blk, void *dev)
 807{
 808    if (blk_do_attach_dev(blk, dev) < 0) {
 809        abort();
 810    }
 811    blk->legacy_dev = true;
 812}
 813
 814/*
 815 * Detach device model @dev from @blk.
 816 * @dev must be currently attached to @blk.
 817 */
 818void blk_detach_dev(BlockBackend *blk, void *dev)
 819/* TODO change to DeviceState *dev when all users are qdevified */
 820{
 821    assert(blk->dev == dev);
 822    blk->dev = NULL;
 823    blk->dev_ops = NULL;
 824    blk->dev_opaque = NULL;
 825    blk->guest_block_size = 512;
 826    blk_set_perm(blk, 0, BLK_PERM_ALL, &error_abort);
 827    blk_unref(blk);
 828}
 829
 830/*
 831 * Return the device model attached to @blk if any, else null.
 832 */
 833void *blk_get_attached_dev(BlockBackend *blk)
 834/* TODO change to return DeviceState * when all users are qdevified */
 835{
 836    return blk->dev;
 837}
 838
  839/* Return the qdev ID, or, if no ID is assigned, the QOM path of the block
  840 * device attached to the BlockBackend. */
 841char *blk_get_attached_dev_id(BlockBackend *blk)
 842{
 843    DeviceState *dev;
 844
 845    assert(!blk->legacy_dev);
 846    dev = blk->dev;
 847
 848    if (!dev) {
 849        return g_strdup("");
 850    } else if (dev->id) {
 851        return g_strdup(dev->id);
 852    }
 853    return object_get_canonical_path(OBJECT(dev));
 854}
 855
 856/*
 857 * Return the BlockBackend which has the device model @dev attached if it
 858 * exists, else null.
 859 *
 860 * @dev must not be null.
 861 */
 862BlockBackend *blk_by_dev(void *dev)
 863{
 864    BlockBackend *blk = NULL;
 865
 866    assert(dev != NULL);
 867    while ((blk = blk_all_next(blk)) != NULL) {
 868        if (blk->dev == dev) {
 869            return blk;
 870        }
 871    }
 872    return NULL;
 873}
 874
 875/*
 876 * Set @blk's device model callbacks to @ops.
 877 * @opaque is the opaque argument to pass to the callbacks.
 878 * This is for use by device models.
 879 */
 880void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
 881                     void *opaque)
 882{
 883    /* All drivers that use blk_set_dev_ops() are qdevified and we want to keep
 884     * it that way, so we can assume blk->dev, if present, is a DeviceState if
  885     * blk->dev_ops is set. Non-device users may use dev_ops without a device. */
 886    assert(!blk->legacy_dev);
 887
 888    blk->dev_ops = ops;
 889    blk->dev_opaque = opaque;
 890
 891    /* Are we currently quiesced? Should we enforce this right now? */
 892    if (blk->quiesce_counter && ops->drained_begin) {
 893        ops->drained_begin(opaque);
 894    }
 895}
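
/*
 * A sketch of how a device model typically registers callbacks, assuming a
 * hypothetical ExampleDevice with an example_resize_cb(); only the callbacks
 * the device actually implements need to be filled in, the rest stay NULL:
 *
 *     static void example_resize_cb(void *opaque)
 *     {
 *         ExampleDevice *s = opaque;
 *         ...re-read blk_getlength(s->blk) and update the guest-visible size...
 *     }
 *
 *     static const BlockDevOps example_block_ops = {
 *         .resize_cb = example_resize_cb,
 *     };
 *
 *     blk_set_dev_ops(s->blk, &example_block_ops, s);
 */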
 896
 897/*
 898 * Notify @blk's attached device model of media change.
 899 *
 900 * If @load is true, notify of media load. This action can fail, meaning that
  901 * the medium cannot be loaded; in that case, @errp is set.
 902 *
 903 * If @load is false, notify of media eject. This can never fail.
 904 *
 905 * Also send DEVICE_TRAY_MOVED events as appropriate.
 906 */
 907void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp)
 908{
 909    if (blk->dev_ops && blk->dev_ops->change_media_cb) {
 910        bool tray_was_open, tray_is_open;
 911        Error *local_err = NULL;
 912
 913        assert(!blk->legacy_dev);
 914
 915        tray_was_open = blk_dev_is_tray_open(blk);
 916        blk->dev_ops->change_media_cb(blk->dev_opaque, load, &local_err);
 917        if (local_err) {
 918            assert(load == true);
 919            error_propagate(errp, local_err);
 920            return;
 921        }
 922        tray_is_open = blk_dev_is_tray_open(blk);
 923
 924        if (tray_was_open != tray_is_open) {
 925            char *id = blk_get_attached_dev_id(blk);
 926            qapi_event_send_device_tray_moved(blk_name(blk), id, tray_is_open,
 927                                              &error_abort);
 928            g_free(id);
 929        }
 930    }
 931}
 932
 933static void blk_root_change_media(BdrvChild *child, bool load)
 934{
 935    blk_dev_change_media_cb(child->opaque, load, NULL);
 936}
 937
 938/*
 939 * Does @blk's attached device model have removable media?
 940 * %true if no device model is attached.
 941 */
 942bool blk_dev_has_removable_media(BlockBackend *blk)
 943{
 944    return !blk->dev || (blk->dev_ops && blk->dev_ops->change_media_cb);
 945}
 946
 947/*
 948 * Does @blk's attached device model have a tray?
 949 */
 950bool blk_dev_has_tray(BlockBackend *blk)
 951{
 952    return blk->dev_ops && blk->dev_ops->is_tray_open;
 953}
 954
 955/*
 956 * Notify @blk's attached device model of a media eject request.
 957 * If @force is true, the medium is about to be yanked out forcefully.
 958 */
 959void blk_dev_eject_request(BlockBackend *blk, bool force)
 960{
 961    if (blk->dev_ops && blk->dev_ops->eject_request_cb) {
 962        blk->dev_ops->eject_request_cb(blk->dev_opaque, force);
 963    }
 964}
 965
 966/*
 967 * Does @blk's attached device model have a tray, and is it open?
 968 */
 969bool blk_dev_is_tray_open(BlockBackend *blk)
 970{
 971    if (blk_dev_has_tray(blk)) {
 972        return blk->dev_ops->is_tray_open(blk->dev_opaque);
 973    }
 974    return false;
 975}
 976
 977/*
 978 * Does @blk's attached device model have the medium locked?
 979 * %false if the device model has no such lock.
 980 */
 981bool blk_dev_is_medium_locked(BlockBackend *blk)
 982{
 983    if (blk->dev_ops && blk->dev_ops->is_medium_locked) {
 984        return blk->dev_ops->is_medium_locked(blk->dev_opaque);
 985    }
 986    return false;
 987}
 988
 989/*
 990 * Notify @blk's attached device model of a backend size change.
 991 */
 992static void blk_root_resize(BdrvChild *child)
 993{
 994    BlockBackend *blk = child->opaque;
 995
 996    if (blk->dev_ops && blk->dev_ops->resize_cb) {
 997        blk->dev_ops->resize_cb(blk->dev_opaque);
 998    }
 999}
1000
1001void blk_iostatus_enable(BlockBackend *blk)
1002{
1003    blk->iostatus_enabled = true;
1004    blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
1005}
1006
1007/* The I/O status is only enabled if the drive explicitly
1008 * enables it _and_ the VM is configured to stop on errors */
1009bool blk_iostatus_is_enabled(const BlockBackend *blk)
1010{
1011    return (blk->iostatus_enabled &&
1012           (blk->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC ||
1013            blk->on_write_error == BLOCKDEV_ON_ERROR_STOP   ||
1014            blk->on_read_error == BLOCKDEV_ON_ERROR_STOP));
1015}
1016
1017BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk)
1018{
1019    return blk->iostatus;
1020}
1021
1022void blk_iostatus_disable(BlockBackend *blk)
1023{
1024    blk->iostatus_enabled = false;
1025}
1026
1027void blk_iostatus_reset(BlockBackend *blk)
1028{
1029    if (blk_iostatus_is_enabled(blk)) {
1030        BlockDriverState *bs = blk_bs(blk);
1031        blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK;
1032        if (bs && bs->job) {
1033            block_job_iostatus_reset(bs->job);
1034        }
1035    }
1036}
1037
1038void blk_iostatus_set_err(BlockBackend *blk, int error)
1039{
1040    assert(blk_iostatus_is_enabled(blk));
1041    if (blk->iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
1042        blk->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE :
1043                                          BLOCK_DEVICE_IO_STATUS_FAILED;
1044    }
1045}
1046
1047void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow)
1048{
1049    blk->allow_write_beyond_eof = allow;
1050}
1051
1052static int blk_check_byte_request(BlockBackend *blk, int64_t offset,
1053                                  size_t size)
1054{
1055    int64_t len;
1056
1057    if (size > INT_MAX) {
1058        return -EIO;
1059    }
1060
1061    if (!blk_is_available(blk)) {
1062        return -ENOMEDIUM;
1063    }
1064
1065    if (offset < 0) {
1066        return -EIO;
1067    }
1068
1069    if (!blk->allow_write_beyond_eof) {
1070        len = blk_getlength(blk);
1071        if (len < 0) {
1072            return len;
1073        }
1074
1075        if (offset > len || len - offset < size) {
1076            return -EIO;
1077        }
1078    }
1079
1080    return 0;
1081}
1082
1083int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset,
1084                               unsigned int bytes, QEMUIOVector *qiov,
1085                               BdrvRequestFlags flags)
1086{
1087    int ret;
1088    BlockDriverState *bs = blk_bs(blk);
1089
1090    trace_blk_co_preadv(blk, bs, offset, bytes, flags);
1091
1092    ret = blk_check_byte_request(blk, offset, bytes);
1093    if (ret < 0) {
1094        return ret;
1095    }
1096
1097    bdrv_inc_in_flight(bs);
1098
1099    /* throttling disk I/O */
1100    if (blk->public.throttle_group_member.throttle_state) {
1101        throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
1102                bytes, false);
1103    }
1104
1105    ret = bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
1106    bdrv_dec_in_flight(bs);
1107    return ret;
1108}
1109
1110int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
1111                                unsigned int bytes, QEMUIOVector *qiov,
1112                                BdrvRequestFlags flags)
1113{
1114    int ret;
1115    BlockDriverState *bs = blk_bs(blk);
1116
1117    trace_blk_co_pwritev(blk, bs, offset, bytes, flags);
1118
1119    ret = blk_check_byte_request(blk, offset, bytes);
1120    if (ret < 0) {
1121        return ret;
1122    }
1123
1124    bdrv_inc_in_flight(bs);
1125    /* throttling disk I/O */
1126    if (blk->public.throttle_group_member.throttle_state) {
1127        throttle_group_co_io_limits_intercept(&blk->public.throttle_group_member,
1128                bytes, true);
1129    }
1130
1131    if (!blk->enable_write_cache) {
1132        flags |= BDRV_REQ_FUA;
1133    }
1134
1135    ret = bdrv_co_pwritev(blk->root, offset, bytes, qiov, flags);
1136    bdrv_dec_in_flight(bs);
1137    return ret;
1138}
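
/*
 * A sketch of calling the coroutine interface directly, assuming a
 * hypothetical coroutine_fn caller that wraps a plain buffer in a one-element
 * QEMUIOVector (the same pattern blk_prw() uses below):
 *
 *     static int coroutine_fn example_co_read(BlockBackend *blk, int64_t offset,
 *                                             void *buf, unsigned int bytes)
 *     {
 *         QEMUIOVector qiov;
 *         struct iovec iov = {
 *             .iov_base = buf,
 *             .iov_len  = bytes,
 *         };
 *
 *         qemu_iovec_init_external(&qiov, &iov, 1);
 *         return blk_co_preadv(blk, offset, bytes, &qiov, 0);
 *     }
 */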
1139
1140typedef struct BlkRwCo {
1141    BlockBackend *blk;
1142    int64_t offset;
1143    QEMUIOVector *qiov;
1144    int ret;
1145    BdrvRequestFlags flags;
1146} BlkRwCo;
1147
1148static void blk_read_entry(void *opaque)
1149{
1150    BlkRwCo *rwco = opaque;
1151
1152    rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, rwco->qiov->size,
1153                              rwco->qiov, rwco->flags);
1154}
1155
1156static void blk_write_entry(void *opaque)
1157{
1158    BlkRwCo *rwco = opaque;
1159
1160    rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, rwco->qiov->size,
1161                               rwco->qiov, rwco->flags);
1162}
1163
1164static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf,
1165                   int64_t bytes, CoroutineEntry co_entry,
1166                   BdrvRequestFlags flags)
1167{
1168    QEMUIOVector qiov;
1169    struct iovec iov;
1170    BlkRwCo rwco;
1171
1172    iov = (struct iovec) {
1173        .iov_base = buf,
1174        .iov_len = bytes,
1175    };
1176    qemu_iovec_init_external(&qiov, &iov, 1);
1177
1178    rwco = (BlkRwCo) {
1179        .blk    = blk,
1180        .offset = offset,
1181        .qiov   = &qiov,
1182        .flags  = flags,
1183        .ret    = NOT_DONE,
1184    };
1185
1186    if (qemu_in_coroutine()) {
1187        /* Fast-path if already in coroutine context */
1188        co_entry(&rwco);
1189    } else {
1190        Coroutine *co = qemu_coroutine_create(co_entry, &rwco);
1191        bdrv_coroutine_enter(blk_bs(blk), co);
1192        BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
1193    }
1194
1195    return rwco.ret;
1196}
1197
1198int blk_pread_unthrottled(BlockBackend *blk, int64_t offset, uint8_t *buf,
1199                          int count)
1200{
1201    int ret;
1202
1203    ret = blk_check_byte_request(blk, offset, count);
1204    if (ret < 0) {
1205        return ret;
1206    }
1207
1208    blk_root_drained_begin(blk->root);
1209    ret = blk_pread(blk, offset, buf, count);
1210    blk_root_drained_end(blk->root);
1211    return ret;
1212}
1213
1214int blk_pwrite_zeroes(BlockBackend *blk, int64_t offset,
1215                      int bytes, BdrvRequestFlags flags)
1216{
1217    return blk_prw(blk, offset, NULL, bytes, blk_write_entry,
1218                   flags | BDRV_REQ_ZERO_WRITE);
1219}
1220
1221int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
1222{
1223    return bdrv_make_zero(blk->root, flags);
1224}
1225
1226static void error_callback_bh(void *opaque)
1227{
1228    struct BlockBackendAIOCB *acb = opaque;
1229
1230    bdrv_dec_in_flight(acb->common.bs);
1231    acb->common.cb(acb->common.opaque, acb->ret);
1232    qemu_aio_unref(acb);
1233}
1234
1235BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
1236                                  BlockCompletionFunc *cb,
1237                                  void *opaque, int ret)
1238{
1239    struct BlockBackendAIOCB *acb;
1240
1241    bdrv_inc_in_flight(blk_bs(blk));
1242    acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque);
1243    acb->blk = blk;
1244    acb->ret = ret;
1245
1246    aio_bh_schedule_oneshot(blk_get_aio_context(blk), error_callback_bh, acb);
1247    return &acb->common;
1248}
1249
1250typedef struct BlkAioEmAIOCB {
1251    BlockAIOCB common;
1252    BlkRwCo rwco;
1253    int bytes;
1254    bool has_returned;
1255} BlkAioEmAIOCB;
1256
1257static const AIOCBInfo blk_aio_em_aiocb_info = {
1258    .aiocb_size         = sizeof(BlkAioEmAIOCB),
1259};
1260
1261static void blk_aio_complete(BlkAioEmAIOCB *acb)
1262{
1263    if (acb->has_returned) {
1264        bdrv_dec_in_flight(acb->common.bs);
1265        acb->common.cb(acb->common.opaque, acb->rwco.ret);
1266        qemu_aio_unref(acb);
1267    }
1268}
1269
1270static void blk_aio_complete_bh(void *opaque)
1271{
1272    BlkAioEmAIOCB *acb = opaque;
1273    assert(acb->has_returned);
1274    blk_aio_complete(acb);
1275}
1276
1277static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes,
1278                                QEMUIOVector *qiov, CoroutineEntry co_entry,
1279                                BdrvRequestFlags flags,
1280                                BlockCompletionFunc *cb, void *opaque)
1281{
1282    BlkAioEmAIOCB *acb;
1283    Coroutine *co;
1284
1285    bdrv_inc_in_flight(blk_bs(blk));
1286    acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque);
1287    acb->rwco = (BlkRwCo) {
1288        .blk    = blk,
1289        .offset = offset,
1290        .qiov   = qiov,
1291        .flags  = flags,
1292        .ret    = NOT_DONE,
1293    };
1294    acb->bytes = bytes;
1295    acb->has_returned = false;
1296
1297    co = qemu_coroutine_create(co_entry, acb);
1298    bdrv_coroutine_enter(blk_bs(blk), co);
1299
1300    acb->has_returned = true;
1301    if (acb->rwco.ret != NOT_DONE) {
1302        aio_bh_schedule_oneshot(blk_get_aio_context(blk),
1303                                blk_aio_complete_bh, acb);
1304    }
1305
1306    return &acb->common;
1307}
1308
1309static void blk_aio_read_entry(void *opaque)
1310{
1311    BlkAioEmAIOCB *acb = opaque;
1312    BlkRwCo *rwco = &acb->rwco;
1313
1314    assert(rwco->qiov->size == acb->bytes);
1315    rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, acb->bytes,
1316                              rwco->qiov, rwco->flags);
1317    blk_aio_complete(acb);
1318}
1319
1320static void blk_aio_write_entry(void *opaque)
1321{
1322    BlkAioEmAIOCB *acb = opaque;
1323    BlkRwCo *rwco = &acb->rwco;
1324
1325    assert(!rwco->qiov || rwco->qiov->size == acb->bytes);
1326    rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, acb->bytes,
1327                               rwco->qiov, rwco->flags);
1328    blk_aio_complete(acb);
1329}
1330
1331BlockAIOCB *blk_aio_pwrite_zeroes(BlockBackend *blk, int64_t offset,
1332                                  int count, BdrvRequestFlags flags,
1333                                  BlockCompletionFunc *cb, void *opaque)
1334{
1335    return blk_aio_prwv(blk, offset, count, NULL, blk_aio_write_entry,
1336                        flags | BDRV_REQ_ZERO_WRITE, cb, opaque);
1337}
1338
1339int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count)
1340{
1341    int ret = blk_prw(blk, offset, buf, count, blk_read_entry, 0);
1342    if (ret < 0) {
1343        return ret;
1344    }
1345    return count;
1346}
1347
1348int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count,
1349               BdrvRequestFlags flags)
1350{
1351    int ret = blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
1352                      flags);
1353    if (ret < 0) {
1354        return ret;
1355    }
1356    return count;
1357}
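
/*
 * A sketch of a synchronous read-modify-write of the first 512 bytes using
 * the blocking wrappers above (hypothetical snippet); both helpers return the
 * byte count on success and a negative errno on failure:
 *
 *     uint8_t buf[512];
 *     int ret;
 *
 *     ret = blk_pread(blk, 0, buf, sizeof(buf));
 *     if (ret < 0) {
 *         return ret;
 *     }
 *     ...modify buf...
 *     ret = blk_pwrite(blk, 0, buf, sizeof(buf), 0);
 *     return ret < 0 ? ret : 0;
 */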
1358
1359int64_t blk_getlength(BlockBackend *blk)
1360{
1361    if (!blk_is_available(blk)) {
1362        return -ENOMEDIUM;
1363    }
1364
1365    return bdrv_getlength(blk_bs(blk));
1366}
1367
1368void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr)
1369{
1370    if (!blk_bs(blk)) {
1371        *nb_sectors_ptr = 0;
1372    } else {
1373        bdrv_get_geometry(blk_bs(blk), nb_sectors_ptr);
1374    }
1375}
1376
1377int64_t blk_nb_sectors(BlockBackend *blk)
1378{
1379    if (!blk_is_available(blk)) {
1380        return -ENOMEDIUM;
1381    }
1382
1383    return bdrv_nb_sectors(blk_bs(blk));
1384}
1385
1386BlockAIOCB *blk_aio_preadv(BlockBackend *blk, int64_t offset,
1387                           QEMUIOVector *qiov, BdrvRequestFlags flags,
1388                           BlockCompletionFunc *cb, void *opaque)
1389{
1390    return blk_aio_prwv(blk, offset, qiov->size, qiov,
1391                        blk_aio_read_entry, flags, cb, opaque);
1392}
1393
1394BlockAIOCB *blk_aio_pwritev(BlockBackend *blk, int64_t offset,
1395                            QEMUIOVector *qiov, BdrvRequestFlags flags,
1396                            BlockCompletionFunc *cb, void *opaque)
1397{
1398    return blk_aio_prwv(blk, offset, qiov->size, qiov,
1399                        blk_aio_write_entry, flags, cb, opaque);
1400}
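
/*
 * A sketch of submitting an asynchronous read, assuming a hypothetical
 * ExampleRequest that keeps its QEMUIOVector alive until the completion
 * callback has run:
 *
 *     static void example_read_done(void *opaque, int ret)
 *     {
 *         ExampleRequest *req = opaque;
 *         ...ret is 0 on success, a negative errno on failure...
 *     }
 *
 *     blk_aio_preadv(blk, req->offset, &req->qiov, 0, example_read_done, req);
 */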
1401
1402static void blk_aio_flush_entry(void *opaque)
1403{
1404    BlkAioEmAIOCB *acb = opaque;
1405    BlkRwCo *rwco = &acb->rwco;
1406
1407    rwco->ret = blk_co_flush(rwco->blk);
1408    blk_aio_complete(acb);
1409}
1410
1411BlockAIOCB *blk_aio_flush(BlockBackend *blk,
1412                          BlockCompletionFunc *cb, void *opaque)
1413{
1414    return blk_aio_prwv(blk, 0, 0, NULL, blk_aio_flush_entry, 0, cb, opaque);
1415}
1416
1417static void blk_aio_pdiscard_entry(void *opaque)
1418{
1419    BlkAioEmAIOCB *acb = opaque;
1420    BlkRwCo *rwco = &acb->rwco;
1421
1422    rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, acb->bytes);
1423    blk_aio_complete(acb);
1424}
1425
1426BlockAIOCB *blk_aio_pdiscard(BlockBackend *blk,
1427                             int64_t offset, int bytes,
1428                             BlockCompletionFunc *cb, void *opaque)
1429{
1430    return blk_aio_prwv(blk, offset, bytes, NULL, blk_aio_pdiscard_entry, 0,
1431                        cb, opaque);
1432}
1433
1434void blk_aio_cancel(BlockAIOCB *acb)
1435{
1436    bdrv_aio_cancel(acb);
1437}
1438
1439void blk_aio_cancel_async(BlockAIOCB *acb)
1440{
1441    bdrv_aio_cancel_async(acb);
1442}
1443
1444int blk_co_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
1445{
1446    if (!blk_is_available(blk)) {
1447        return -ENOMEDIUM;
1448    }
1449
1450    return bdrv_co_ioctl(blk_bs(blk), req, buf);
1451}
1452
1453static void blk_ioctl_entry(void *opaque)
1454{
1455    BlkRwCo *rwco = opaque;
1456    rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
1457                             rwco->qiov->iov[0].iov_base);
1458}
1459
1460int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf)
1461{
1462    return blk_prw(blk, req, buf, 0, blk_ioctl_entry, 0);
1463}
1464
1465static void blk_aio_ioctl_entry(void *opaque)
1466{
1467    BlkAioEmAIOCB *acb = opaque;
1468    BlkRwCo *rwco = &acb->rwco;
1469
1470    rwco->ret = blk_co_ioctl(rwco->blk, rwco->offset,
1471                             rwco->qiov->iov[0].iov_base);
1472    blk_aio_complete(acb);
1473}
1474
1475BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf,
1476                          BlockCompletionFunc *cb, void *opaque)
1477{
1478    QEMUIOVector qiov;
1479    struct iovec iov;
1480
1481    iov = (struct iovec) {
1482        .iov_base = buf,
1483        .iov_len = 0,
1484    };
1485    qemu_iovec_init_external(&qiov, &iov, 1);
1486
1487    return blk_aio_prwv(blk, req, 0, &qiov, blk_aio_ioctl_entry, 0, cb, opaque);
1488}
1489
1490int blk_co_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
1491{
1492    int ret = blk_check_byte_request(blk, offset, bytes);
1493    if (ret < 0) {
1494        return ret;
1495    }
1496
1497    return bdrv_co_pdiscard(blk_bs(blk), offset, bytes);
1498}
1499
1500int blk_co_flush(BlockBackend *blk)
1501{
1502    if (!blk_is_available(blk)) {
1503        return -ENOMEDIUM;
1504    }
1505
1506    return bdrv_co_flush(blk_bs(blk));
1507}
1508
1509static void blk_flush_entry(void *opaque)
1510{
1511    BlkRwCo *rwco = opaque;
1512    rwco->ret = blk_co_flush(rwco->blk);
1513}
1514
1515int blk_flush(BlockBackend *blk)
1516{
1517    return blk_prw(blk, 0, NULL, 0, blk_flush_entry, 0);
1518}
1519
1520void blk_drain(BlockBackend *blk)
1521{
1522    if (blk_bs(blk)) {
1523        bdrv_drain(blk_bs(blk));
1524    }
1525}
1526
1527void blk_drain_all(void)
1528{
1529    bdrv_drain_all();
1530}
1531
1532void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error,
1533                      BlockdevOnError on_write_error)
1534{
1535    blk->on_read_error = on_read_error;
1536    blk->on_write_error = on_write_error;
1537}
1538
1539BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read)
1540{
1541    return is_read ? blk->on_read_error : blk->on_write_error;
1542}
1543
1544BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read,
1545                                      int error)
1546{
1547    BlockdevOnError on_err = blk_get_on_error(blk, is_read);
1548
1549    switch (on_err) {
1550    case BLOCKDEV_ON_ERROR_ENOSPC:
1551        return (error == ENOSPC) ?
1552               BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT;
1553    case BLOCKDEV_ON_ERROR_STOP:
1554        return BLOCK_ERROR_ACTION_STOP;
1555    case BLOCKDEV_ON_ERROR_REPORT:
1556        return BLOCK_ERROR_ACTION_REPORT;
1557    case BLOCKDEV_ON_ERROR_IGNORE:
1558        return BLOCK_ERROR_ACTION_IGNORE;
1559    case BLOCKDEV_ON_ERROR_AUTO:
1560    default:
1561        abort();
1562    }
1563}
1564
1565static void send_qmp_error_event(BlockBackend *blk,
1566                                 BlockErrorAction action,
1567                                 bool is_read, int error)
1568{
1569    IoOperationType optype;
1570
1571    optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE;
1572    qapi_event_send_block_io_error(blk_name(blk),
1573                                   bdrv_get_node_name(blk_bs(blk)), optype,
1574                                   action, blk_iostatus_is_enabled(blk),
1575                                   error == ENOSPC, strerror(error),
1576                                   &error_abort);
1577}
1578
1579/* This is done by device models because, while the block layer knows
1580 * about the error, it does not know whether an operation comes from
1581 * the device or the block layer (from a job, for example).
1582 */
1583void blk_error_action(BlockBackend *blk, BlockErrorAction action,
1584                      bool is_read, int error)
1585{
1586    assert(error >= 0);
1587
1588    if (action == BLOCK_ERROR_ACTION_STOP) {
1589        /* First set the iostatus, so that "info block" returns an iostatus
1590         * that matches the events raised so far (an additional error iostatus
1591         * is fine, but not a lost one).
1592         */
1593        blk_iostatus_set_err(blk, error);
1594
1595        /* Then raise the request to stop the VM and the event.
1596         * qemu_system_vmstop_request_prepare has two effects.  First,
1597         * it ensures that the STOP event always comes after the
1598         * BLOCK_IO_ERROR event.  Second, it ensures that even if management
1599         * can observe the STOP event and do a "cont" before the STOP
1600         * event is issued, the VM will not stop.  In this case, vm_start()
1601         * also ensures that the STOP/RESUME pair of events is emitted.
1602         */
1603        qemu_system_vmstop_request_prepare();
1604        send_qmp_error_event(blk, action, is_read, error);
1605        qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
1606    } else {
1607        send_qmp_error_event(blk, action, is_read, error);
1608    }
1609}
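
/*
 * A sketch of the usual error handling pattern in a device model's completion
 * path (hypothetical snippet; @ret is the negative errno of the failed
 * request, @is_read its direction):
 *
 *     BlockErrorAction action = blk_get_error_action(blk, is_read, -ret);
 *
 *     if (action == BLOCK_ERROR_ACTION_STOP) {
 *         ...queue the request so it can be retried after "cont"...
 *     }
 *     blk_error_action(blk, action, is_read, -ret);
 *     if (action == BLOCK_ERROR_ACTION_REPORT) {
 *         ...complete the request with an error status...
 *     }
 */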
1610
1611int blk_is_read_only(BlockBackend *blk)
1612{
1613    BlockDriverState *bs = blk_bs(blk);
1614
1615    if (bs) {
1616        return bdrv_is_read_only(bs);
1617    } else {
1618        return blk->root_state.read_only;
1619    }
1620}
1621
1622int blk_is_sg(BlockBackend *blk)
1623{
1624    BlockDriverState *bs = blk_bs(blk);
1625
1626    if (!bs) {
1627        return 0;
1628    }
1629
1630    return bdrv_is_sg(bs);
1631}
1632
1633int blk_enable_write_cache(BlockBackend *blk)
1634{
1635    return blk->enable_write_cache;
1636}
1637
1638void blk_set_enable_write_cache(BlockBackend *blk, bool wce)
1639{
1640    blk->enable_write_cache = wce;
1641}
1642
1643void blk_invalidate_cache(BlockBackend *blk, Error **errp)
1644{
1645    BlockDriverState *bs = blk_bs(blk);
1646
1647    if (!bs) {
1648        error_setg(errp, "Device '%s' has no medium", blk->name);
1649        return;
1650    }
1651
1652    bdrv_invalidate_cache(bs, errp);
1653}
1654
1655bool blk_is_inserted(BlockBackend *blk)
1656{
1657    BlockDriverState *bs = blk_bs(blk);
1658
1659    return bs && bdrv_is_inserted(bs);
1660}
1661
1662bool blk_is_available(BlockBackend *blk)
1663{
1664    return blk_is_inserted(blk) && !blk_dev_is_tray_open(blk);
1665}
1666
1667void blk_lock_medium(BlockBackend *blk, bool locked)
1668{
1669    BlockDriverState *bs = blk_bs(blk);
1670
1671    if (bs) {
1672        bdrv_lock_medium(bs, locked);
1673    }
1674}
1675
1676void blk_eject(BlockBackend *blk, bool eject_flag)
1677{
1678    BlockDriverState *bs = blk_bs(blk);
1679    char *id;
1680
1681    /* blk_eject is only called by qdevified devices */
1682    assert(!blk->legacy_dev);
1683
1684    if (bs) {
1685        bdrv_eject(bs, eject_flag);
1686    }
1687
1688    /* Whether or not we ejected on the backend,
1689     * the frontend experienced a tray event. */
1690    id = blk_get_attached_dev_id(blk);
1691    qapi_event_send_device_tray_moved(blk_name(blk), id,
1692                                      eject_flag, &error_abort);
1693    g_free(id);
1694}
1695
1696int blk_get_flags(BlockBackend *blk)
1697{
1698    BlockDriverState *bs = blk_bs(blk);
1699
1700    if (bs) {
1701        return bdrv_get_flags(bs);
1702    } else {
1703        return blk->root_state.open_flags;
1704    }
1705}
1706
1707/* Returns the maximum transfer length, in bytes; guaranteed nonzero */
1708uint32_t blk_get_max_transfer(BlockBackend *blk)
1709{
1710    BlockDriverState *bs = blk_bs(blk);
1711    uint32_t max = 0;
1712
1713    if (bs) {
1714        max = bs->bl.max_transfer;
1715    }
1716    return MIN_NON_ZERO(max, INT_MAX);
1717}
1718
1719int blk_get_max_iov(BlockBackend *blk)
1720{
1721    return blk->root->bs->bl.max_iov;
1722}
1723
1724void blk_set_guest_block_size(BlockBackend *blk, int align)
1725{
1726    blk->guest_block_size = align;
1727}
1728
1729void *blk_try_blockalign(BlockBackend *blk, size_t size)
1730{
1731    return qemu_try_blockalign(blk ? blk_bs(blk) : NULL, size);
1732}
1733
1734void *blk_blockalign(BlockBackend *blk, size_t size)
1735{
1736    return qemu_blockalign(blk ? blk_bs(blk) : NULL, size);
1737}
1738
1739bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp)
1740{
1741    BlockDriverState *bs = blk_bs(blk);
1742
1743    if (!bs) {
1744        return false;
1745    }
1746
1747    return bdrv_op_is_blocked(bs, op, errp);
1748}
1749
1750void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason)
1751{
1752    BlockDriverState *bs = blk_bs(blk);
1753
1754    if (bs) {
1755        bdrv_op_unblock(bs, op, reason);
1756    }
1757}
1758
1759void blk_op_block_all(BlockBackend *blk, Error *reason)
1760{
1761    BlockDriverState *bs = blk_bs(blk);
1762
1763    if (bs) {
1764        bdrv_op_block_all(bs, reason);
1765    }
1766}
1767
1768void blk_op_unblock_all(BlockBackend *blk, Error *reason)
1769{
1770    BlockDriverState *bs = blk_bs(blk);
1771
1772    if (bs) {
1773        bdrv_op_unblock_all(bs, reason);
1774    }
1775}
1776
1777AioContext *blk_get_aio_context(BlockBackend *blk)
1778{
1779    BlockDriverState *bs = blk_bs(blk);
1780
1781    if (bs) {
1782        return bdrv_get_aio_context(bs);
1783    } else {
1784        return qemu_get_aio_context();
1785    }
1786}
1787
1788static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb)
1789{
1790    BlockBackendAIOCB *blk_acb = DO_UPCAST(BlockBackendAIOCB, common, acb);
1791    return blk_get_aio_context(blk_acb->blk);
1792}
1793
1794void blk_set_aio_context(BlockBackend *blk, AioContext *new_context)
1795{
1796    BlockDriverState *bs = blk_bs(blk);
1797    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
1798
1799    if (bs) {
1800        if (tgm->throttle_state) {
1801            bdrv_drained_begin(bs);
1802            throttle_group_detach_aio_context(tgm);
1803            throttle_group_attach_aio_context(tgm, new_context);
1804            bdrv_drained_end(bs);
1805        }
1806        bdrv_set_aio_context(bs, new_context);
1807    }
1808}
1809
1810void blk_add_aio_context_notifier(BlockBackend *blk,
1811        void (*attached_aio_context)(AioContext *new_context, void *opaque),
1812        void (*detach_aio_context)(void *opaque), void *opaque)
1813{
1814    BlockDriverState *bs = blk_bs(blk);
1815
1816    if (bs) {
1817        bdrv_add_aio_context_notifier(bs, attached_aio_context,
1818                                      detach_aio_context, opaque);
1819    }
1820}
1821
1822void blk_remove_aio_context_notifier(BlockBackend *blk,
1823                                     void (*attached_aio_context)(AioContext *,
1824                                                                  void *),
1825                                     void (*detach_aio_context)(void *),
1826                                     void *opaque)
1827{
1828    BlockDriverState *bs = blk_bs(blk);
1829
1830    if (bs) {
1831        bdrv_remove_aio_context_notifier(bs, attached_aio_context,
1832                                         detach_aio_context, opaque);
1833    }
1834}
1835
1836void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify)
1837{
1838    notifier_list_add(&blk->remove_bs_notifiers, notify);
1839}
1840
1841void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify)
1842{
1843    notifier_list_add(&blk->insert_bs_notifiers, notify);
1844}
1845
1846void blk_io_plug(BlockBackend *blk)
1847{
1848    BlockDriverState *bs = blk_bs(blk);
1849
1850    if (bs) {
1851        bdrv_io_plug(bs);
1852    }
1853}
1854
1855void blk_io_unplug(BlockBackend *blk)
1856{
1857    BlockDriverState *bs = blk_bs(blk);
1858
1859    if (bs) {
1860        bdrv_io_unplug(bs);
1861    }
1862}
1863
1864BlockAcctStats *blk_get_stats(BlockBackend *blk)
1865{
1866    return &blk->stats;
1867}
1868
1869void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
1870                  BlockCompletionFunc *cb, void *opaque)
1871{
1872    return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque);
1873}
1874
1875int coroutine_fn blk_co_pwrite_zeroes(BlockBackend *blk, int64_t offset,
1876                                      int bytes, BdrvRequestFlags flags)
1877{
1878    return blk_co_pwritev(blk, offset, bytes, NULL,
1879                          flags | BDRV_REQ_ZERO_WRITE);
1880}
1881
1882int blk_pwrite_compressed(BlockBackend *blk, int64_t offset, const void *buf,
1883                          int count)
1884{
1885    return blk_prw(blk, offset, (void *) buf, count, blk_write_entry,
1886                   BDRV_REQ_WRITE_COMPRESSED);
1887}
1888
1889int blk_truncate(BlockBackend *blk, int64_t offset, PreallocMode prealloc,
1890                 Error **errp)
1891{
1892    if (!blk_is_available(blk)) {
1893        error_setg(errp, "No medium inserted");
1894        return -ENOMEDIUM;
1895    }
1896
1897    return bdrv_truncate(blk->root, offset, prealloc, errp);
1898}
1899
1900static void blk_pdiscard_entry(void *opaque)
1901{
1902    BlkRwCo *rwco = opaque;
1903    rwco->ret = blk_co_pdiscard(rwco->blk, rwco->offset, rwco->qiov->size);
1904}
1905
1906int blk_pdiscard(BlockBackend *blk, int64_t offset, int bytes)
1907{
1908    return blk_prw(blk, offset, NULL, bytes, blk_pdiscard_entry, 0);
1909}
1910
1911int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf,
1912                     int64_t pos, int size)
1913{
1914    int ret;
1915
1916    if (!blk_is_available(blk)) {
1917        return -ENOMEDIUM;
1918    }
1919
1920    ret = bdrv_save_vmstate(blk_bs(blk), buf, pos, size);
1921    if (ret < 0) {
1922        return ret;
1923    }
1924
1925    if (ret == size && !blk->enable_write_cache) {
1926        ret = bdrv_flush(blk_bs(blk));
1927    }
1928
1929    return ret < 0 ? ret : size;
1930}
1931
1932int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size)
1933{
1934    if (!blk_is_available(blk)) {
1935        return -ENOMEDIUM;
1936    }
1937
1938    return bdrv_load_vmstate(blk_bs(blk), buf, pos, size);
1939}
1940
1941int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz)
1942{
1943    if (!blk_is_available(blk)) {
1944        return -ENOMEDIUM;
1945    }
1946
1947    return bdrv_probe_blocksizes(blk_bs(blk), bsz);
1948}
1949
1950int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo)
1951{
1952    if (!blk_is_available(blk)) {
1953        return -ENOMEDIUM;
1954    }
1955
1956    return bdrv_probe_geometry(blk_bs(blk), geo);
1957}
1958
1959/*
1960 * Updates the BlockBackendRootState object with data from the currently
1961 * attached BlockDriverState.
1962 */
1963void blk_update_root_state(BlockBackend *blk)
1964{
1965    assert(blk->root);
1966
1967    blk->root_state.open_flags    = blk->root->bs->open_flags;
1968    blk->root_state.read_only     = blk->root->bs->read_only;
1969    blk->root_state.detect_zeroes = blk->root->bs->detect_zeroes;
1970}
1971
1972/*
1973 * Returns the detect-zeroes setting to be used for bdrv_open() of a
1974 * BlockDriverState which is supposed to inherit the root state.
1975 */
1976bool blk_get_detect_zeroes_from_root_state(BlockBackend *blk)
1977{
1978    return blk->root_state.detect_zeroes;
1979}
1980
1981/*
1982 * Returns the flags to be used for bdrv_open() of a BlockDriverState which is
1983 * supposed to inherit the root state.
1984 */
1985int blk_get_open_flags_from_root_state(BlockBackend *blk)
1986{
1987    int bs_flags;
1988
1989    bs_flags = blk->root_state.read_only ? 0 : BDRV_O_RDWR;
1990    bs_flags |= blk->root_state.open_flags & ~BDRV_O_RDWR;
1991
1992    return bs_flags;
1993}
1994
1995BlockBackendRootState *blk_get_root_state(BlockBackend *blk)
1996{
1997    return &blk->root_state;
1998}
1999
2000int blk_commit_all(void)
2001{
2002    BlockBackend *blk = NULL;
2003
2004    while ((blk = blk_all_next(blk)) != NULL) {
2005        AioContext *aio_context = blk_get_aio_context(blk);
2006
2007        aio_context_acquire(aio_context);
2008        if (blk_is_inserted(blk) && blk->root->bs->backing) {
2009            int ret = bdrv_commit(blk->root->bs);
2010            if (ret < 0) {
2011                aio_context_release(aio_context);
2012                return ret;
2013            }
2014        }
2015        aio_context_release(aio_context);
2016    }
2017    return 0;
2018}
2019
2020
2021/* throttling disk I/O limits */
2022void blk_set_io_limits(BlockBackend *blk, ThrottleConfig *cfg)
2023{
2024    throttle_group_config(&blk->public.throttle_group_member, cfg);
2025}
2026
2027void blk_io_limits_disable(BlockBackend *blk)
2028{
2029    BlockDriverState *bs = blk_bs(blk);
2030    ThrottleGroupMember *tgm = &blk->public.throttle_group_member;
2031    assert(tgm->throttle_state);
2032    if (bs) {
2033        bdrv_drained_begin(bs);
2034    }
2035    throttle_group_unregister_tgm(tgm);
2036    if (bs) {
2037        bdrv_drained_end(bs);
2038    }
2039}
2040
2041/* should be called before blk_set_io_limits if a limit is set */
2042void blk_io_limits_enable(BlockBackend *blk, const char *group)
2043{
2044    assert(!blk->public.throttle_group_member.throttle_state);
2045    throttle_group_register_tgm(&blk->public.throttle_group_member,
2046                                group, blk_get_aio_context(blk));
2047}
2048
2049void blk_io_limits_update_group(BlockBackend *blk, const char *group)
2050{
2051    /* this BB is not part of any group */
2052    if (!blk->public.throttle_group_member.throttle_state) {
2053        return;
2054    }
2055
 2056    /* this BB is a part of the same group as the one we want */
2057    if (!g_strcmp0(throttle_group_get_name(&blk->public.throttle_group_member),
2058                group)) {
2059        return;
2060    }
2061
 2062    /* need to change the group this bs belongs to */
2063    blk_io_limits_disable(blk);
2064    blk_io_limits_enable(blk, group);
2065}
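
/*
 * A sketch of enabling throttling for a backend, assuming a hypothetical
 * group name; throttle_config_init() and the ThrottleConfig bucket layout
 * come from the generic throttling code:
 *
 *     ThrottleConfig cfg;
 *
 *     blk_io_limits_enable(blk, "example-group");
 *     throttle_config_init(&cfg);
 *     cfg.buckets[THROTTLE_OPS_TOTAL].avg = 100;
 *     blk_set_io_limits(blk, &cfg);
 */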
2066
2067static void blk_root_drained_begin(BdrvChild *child)
2068{
2069    BlockBackend *blk = child->opaque;
2070
2071    if (++blk->quiesce_counter == 1) {
2072        if (blk->dev_ops && blk->dev_ops->drained_begin) {
2073            blk->dev_ops->drained_begin(blk->dev_opaque);
2074        }
2075    }
2076
2077    /* Note that blk->root may not be accessible here yet if we are just
2078     * attaching to a BlockDriverState that is drained. Use child instead. */
2079
2080    if (atomic_fetch_inc(&blk->public.throttle_group_member.io_limits_disabled) == 0) {
2081        throttle_group_restart_tgm(&blk->public.throttle_group_member);
2082    }
2083}
2084
2085static void blk_root_drained_end(BdrvChild *child)
2086{
2087    BlockBackend *blk = child->opaque;
2088    assert(blk->quiesce_counter);
2089
2090    assert(blk->public.throttle_group_member.io_limits_disabled);
2091    atomic_dec(&blk->public.throttle_group_member.io_limits_disabled);
2092
2093    if (--blk->quiesce_counter == 0) {
2094        if (blk->dev_ops && blk->dev_ops->drained_end) {
2095            blk->dev_ops->drained_end(blk->dev_opaque);
2096        }
2097    }
2098}
2099