qemu/include/block/block_int.h
<<
>>
Prefs
   1/*
   2 * QEMU System Emulator block driver
   3 *
   4 * Copyright (c) 2003 Fabrice Bellard
   5 *
   6 * Permission is hereby granted, free of charge, to any person obtaining a copy
   7 * of this software and associated documentation files (the "Software"), to deal
   8 * in the Software without restriction, including without limitation the rights
   9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10 * copies of the Software, and to permit persons to whom the Software is
  11 * furnished to do so, subject to the following conditions:
  12 *
  13 * The above copyright notice and this permission notice shall be included in
  14 * all copies or substantial portions of the Software.
  15 *
  16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22 * THE SOFTWARE.
  23 */
  24#ifndef BLOCK_INT_H
  25#define BLOCK_INT_H
  26
  27#include "block/accounting.h"
  28#include "block/block.h"
  29#include "qemu/option.h"
  30#include "qemu/queue.h"
  31#include "qemu/coroutine.h"
  32#include "qemu/stats64.h"
  33#include "qemu/timer.h"
  34#include "qapi-types.h"
  35#include "qemu/hbitmap.h"
  36#include "block/snapshot.h"
  37#include "qemu/main-loop.h"
  38#include "qemu/throttle.h"
  39
  40#define BLOCK_FLAG_LAZY_REFCOUNTS   8
  41/* BLOCK_OPT_*: option name strings (presumably the keys used in drivers' create_opts -- confirm) */
  42#define BLOCK_OPT_SIZE              "size"
  43#define BLOCK_OPT_ENCRYPT           "encryption"
  44#define BLOCK_OPT_ENCRYPT_FORMAT    "encrypt.format"
  45#define BLOCK_OPT_COMPAT6           "compat6"
  46#define BLOCK_OPT_HWVERSION         "hwversion"
  47#define BLOCK_OPT_BACKING_FILE      "backing_file"
  48#define BLOCK_OPT_BACKING_FMT       "backing_fmt"
  49#define BLOCK_OPT_CLUSTER_SIZE      "cluster_size"
  50#define BLOCK_OPT_TABLE_SIZE        "table_size"
  51#define BLOCK_OPT_PREALLOC          "preallocation"
  52#define BLOCK_OPT_SUBFMT            "subformat"
  53#define BLOCK_OPT_COMPAT_LEVEL      "compat"
  54#define BLOCK_OPT_LAZY_REFCOUNTS    "lazy_refcounts"
  55#define BLOCK_OPT_ADAPTER_TYPE      "adapter_type"
  56#define BLOCK_OPT_REDUNDANCY        "redundancy"
  57#define BLOCK_OPT_NOCOW             "nocow"
  58#define BLOCK_OPT_OBJECT_SIZE       "object_size"
  59#define BLOCK_OPT_REFCOUNT_BITS     "refcount_bits"
  60/* presumably the size in bytes of the buffer passed to .bdrv_probe() -- confirm */
  61#define BLOCK_PROBE_BUF_SIZE        512
  62
  63enum BdrvTrackedRequestType { /* kind of operation recorded in BdrvTrackedRequest.type */
  64    BDRV_TRACKED_READ,
  65    BDRV_TRACKED_WRITE,
  66    BDRV_TRACKED_DISCARD,
  67};
  68
  69typedef struct BdrvTrackedRequest { /* element of BlockDriverState.tracked_requests */
  70    BlockDriverState *bs;
  71    int64_t offset; /* start of the request (presumably a byte offset -- confirm) */
  72    unsigned int bytes; /* length of the request in bytes */
  73    enum BdrvTrackedRequestType type;
  74/* presumably the range checked against other requests once serialising is set -- confirm in io.c */
  75    bool serialising;
  76    int64_t overlap_offset;
  77    unsigned int overlap_bytes;
  78
  79    QLIST_ENTRY(BdrvTrackedRequest) list;
  80    Coroutine *co; /* owner, used for deadlock detection */
  81    CoQueue wait_queue; /* coroutines blocked on this request */
  82/* presumably the request this one is currently blocked on, if any -- confirm in io.c */
  83    struct BdrvTrackedRequest *waiting_for;
  84} BdrvTrackedRequest;
  85
  86struct BlockDriver { /* name, properties and callback table of one block driver */
  87    const char *format_name;
  88    int instance_size;
  89
  90    /* set to true if the BlockDriver is a block filter */
  91    bool is_filter;
  92    /* For snapshots, block filters like Quorum can implement the
  93     * following recursive callback.
  94     * Its purpose is to recurse on the filter children while calling
  95     * bdrv_recurse_is_first_non_filter on them.
  96     * For a sample implementation look in the future Quorum block filter.
  97     */
  98    bool (*bdrv_recurse_is_first_non_filter)(BlockDriverState *bs,
  99                                             BlockDriverState *candidate);
 100
 101    int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename);
 102    int (*bdrv_probe_device)(const char *filename);
 103
 104    /* Any driver implementing this callback is expected to be able to handle
 105     * NULL file names in its .bdrv_open() implementation */
 106    void (*bdrv_parse_filename)(const char *filename, QDict *options, Error **errp);
 107    /* Drivers not implementing bdrv_parse_filename nor bdrv_open should have
 108     * this field set to true, except ones that are defined only by their
 109     * child's bs.
 110     * An example of the last type will be the quorum block driver.
 111     */
 112    bool bdrv_needs_filename;
 113
 114    /* Set if a driver can support backing files */
 115    bool supports_backing;
 116
 117    /* For handling image reopen for split or non-split files */
 118    int (*bdrv_reopen_prepare)(BDRVReopenState *reopen_state,
 119                               BlockReopenQueue *queue, Error **errp);
 120    void (*bdrv_reopen_commit)(BDRVReopenState *reopen_state);
 121    void (*bdrv_reopen_abort)(BDRVReopenState *reopen_state);
 122    void (*bdrv_join_options)(QDict *options, QDict *old_options);
 123
 124    int (*bdrv_open)(BlockDriverState *bs, QDict *options, int flags,
 125                     Error **errp);
 126    int (*bdrv_file_open)(BlockDriverState *bs, QDict *options, int flags,
 127                          Error **errp);
 128    void (*bdrv_close)(BlockDriverState *bs);
 129    int (*bdrv_create)(const char *filename, QemuOpts *opts, Error **errp);
 130    int (*bdrv_make_empty)(BlockDriverState *bs);
 131
 132    void (*bdrv_refresh_filename)(BlockDriverState *bs, QDict *options);
 133
 134    /* aio */
 135    BlockAIOCB *(*bdrv_aio_readv)(BlockDriverState *bs,
 136        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
 137        BlockCompletionFunc *cb, void *opaque);
 138    BlockAIOCB *(*bdrv_aio_writev)(BlockDriverState *bs,
 139        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
 140        BlockCompletionFunc *cb, void *opaque);
 141    BlockAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
 142        BlockCompletionFunc *cb, void *opaque);
 143    BlockAIOCB *(*bdrv_aio_pdiscard)(BlockDriverState *bs,
 144        int64_t offset, int bytes,
 145        BlockCompletionFunc *cb, void *opaque);
 146
 147    int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs,
 148        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
 149    int coroutine_fn (*bdrv_co_preadv)(BlockDriverState *bs,
 150        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags);
 151    int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,
 152        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
 153    int coroutine_fn (*bdrv_co_writev_flags)(BlockDriverState *bs,
 154        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int flags);
 155    int coroutine_fn (*bdrv_co_pwritev)(BlockDriverState *bs,
 156        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags);
 157
 158    /*
 159     * Efficiently zero a region of the disk image.  Typically an image format
 160     * would use a compact metadata representation to implement this.  This
 161     * function pointer may be NULL or return -ENOTSUP and .bdrv_co_writev()
 162     * will be called instead.
 163     */
 164    int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs,
 165        int64_t offset, int bytes, BdrvRequestFlags flags);
 166    int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs,
 167        int64_t offset, int bytes);
 168
 169    /*
 170     * Building block for bdrv_block_status[_above]. The driver should
 171     * answer only according to the current layer, and should not
 172     * set BDRV_BLOCK_ALLOCATED, but may set BDRV_BLOCK_RAW.  See block.h
 173     * for the meaning of _DATA, _ZERO, and _OFFSET_VALID.
 174     */
 175    int64_t coroutine_fn (*bdrv_co_get_block_status)(BlockDriverState *bs,
 176        int64_t sector_num, int nb_sectors, int *pnum,
 177        BlockDriverState **file);
 178
 179    /*
 180     * Invalidate any cached meta-data.
 181     */
 182    void (*bdrv_invalidate_cache)(BlockDriverState *bs, Error **errp);
 183    int (*bdrv_inactivate)(BlockDriverState *bs);
 184
 185    /*
 186     * Flushes all data for all layers by calling bdrv_co_flush for underlying
 187     * layers, if needed. This function is needed for deterministic
 188     * synchronization of the flush finishing callback.
 189     */
 190    int coroutine_fn (*bdrv_co_flush)(BlockDriverState *bs);
 191
 192    /*
 193     * Flushes all data that was already written to the OS all the way down to
 194     * the disk (for example file-posix.c calls fsync()).
 195     */
 196    int coroutine_fn (*bdrv_co_flush_to_disk)(BlockDriverState *bs);
 197
 198    /*
 199     * Flushes all internal caches to the OS. The data may still sit in a
 200     * writeback cache of the host OS, but it will survive a crash of the qemu
 201     * process.
 202     */
 203    int coroutine_fn (*bdrv_co_flush_to_os)(BlockDriverState *bs);
 204
 205    const char *protocol_name;
 206    int (*bdrv_truncate)(BlockDriverState *bs, int64_t offset,
 207                         PreallocMode prealloc, Error **errp);
 208
 209    int64_t (*bdrv_getlength)(BlockDriverState *bs);
 210    bool has_variable_length;
 211    int64_t (*bdrv_get_allocated_file_size)(BlockDriverState *bs);
 212    BlockMeasureInfo *(*bdrv_measure)(QemuOpts *opts, BlockDriverState *in_bs,
 213                                      Error **errp);
 214
 215    int coroutine_fn (*bdrv_co_pwritev_compressed)(BlockDriverState *bs,
 216        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov);
 217
 218    int (*bdrv_snapshot_create)(BlockDriverState *bs,
 219                                QEMUSnapshotInfo *sn_info);
 220    int (*bdrv_snapshot_goto)(BlockDriverState *bs,
 221                              const char *snapshot_id);
 222    int (*bdrv_snapshot_delete)(BlockDriverState *bs,
 223                                const char *snapshot_id,
 224                                const char *name,
 225                                Error **errp);
 226    int (*bdrv_snapshot_list)(BlockDriverState *bs,
 227                              QEMUSnapshotInfo **psn_info);
 228    int (*bdrv_snapshot_load_tmp)(BlockDriverState *bs,
 229                                  const char *snapshot_id,
 230                                  const char *name,
 231                                  Error **errp);
 232    int (*bdrv_get_info)(BlockDriverState *bs, BlockDriverInfo *bdi);
 233    ImageInfoSpecific *(*bdrv_get_specific_info)(BlockDriverState *bs);
 234
 235    int coroutine_fn (*bdrv_save_vmstate)(BlockDriverState *bs,
 236                                          QEMUIOVector *qiov,
 237                                          int64_t pos);
 238    int coroutine_fn (*bdrv_load_vmstate)(BlockDriverState *bs,
 239                                          QEMUIOVector *qiov,
 240                                          int64_t pos);
 241
 242    int (*bdrv_change_backing_file)(BlockDriverState *bs,
 243        const char *backing_file, const char *backing_fmt);
 244
 245    /* removable device specific */
 246    bool (*bdrv_is_inserted)(BlockDriverState *bs);
 247    int (*bdrv_media_changed)(BlockDriverState *bs);
 248    void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
 249    void (*bdrv_lock_medium)(BlockDriverState *bs, bool locked);
 250
 251    /* to control generic scsi devices */
 252    BlockAIOCB *(*bdrv_aio_ioctl)(BlockDriverState *bs,
 253        unsigned long int req, void *buf,
 254        BlockCompletionFunc *cb, void *opaque);
 255    int coroutine_fn (*bdrv_co_ioctl)(BlockDriverState *bs,
 256                                      unsigned long int req, void *buf);
 257
 258    /* List of options for creating images, terminated by name == NULL */
 259    QemuOptsList *create_opts;
 260
 261    /*
 262     * Returns 0 for completed check, -errno for internal errors.
 263     * The check results are stored in result.
 264     */
 265    int (*bdrv_check)(BlockDriverState *bs, BdrvCheckResult *result,
 266        BdrvCheckMode fix);
 267
 268    int (*bdrv_amend_options)(BlockDriverState *bs, QemuOpts *opts,
 269                              BlockDriverAmendStatusCB *status_cb,
 270                              void *cb_opaque);
 271
 272    void (*bdrv_debug_event)(BlockDriverState *bs, BlkdebugEvent event);
 273
 274    /* TODO Better pass a option string/QDict/QemuOpts to add any rule? */
 275    int (*bdrv_debug_breakpoint)(BlockDriverState *bs, const char *event,
 276        const char *tag);
 277    int (*bdrv_debug_remove_breakpoint)(BlockDriverState *bs,
 278        const char *tag);
 279    int (*bdrv_debug_resume)(BlockDriverState *bs, const char *tag);
 280    bool (*bdrv_debug_is_suspended)(BlockDriverState *bs, const char *tag);
 281
 282    void (*bdrv_refresh_limits)(BlockDriverState *bs, Error **errp);
 283
 284    /*
 285     * Returns 1 if newly created images are guaranteed to contain only
 286     * zeros, 0 otherwise.
 287     */
 288    int (*bdrv_has_zero_init)(BlockDriverState *bs);
 289
 290    /* Remove fd handlers, timers, and other event loop callbacks so the event
 291     * loop is no longer in use.  Called with no in-flight requests and in
 292     * depth-first traversal order with parents before child nodes.
 293     */
 294    void (*bdrv_detach_aio_context)(BlockDriverState *bs);
 295
 296    /* Add fd handlers, timers, and other event loop callbacks so I/O requests
 297     * can be processed again.  Called with no in-flight requests and in
 298     * depth-first traversal order with child nodes before parent nodes.
 299     */
 300    void (*bdrv_attach_aio_context)(BlockDriverState *bs,
 301                                    AioContext *new_context);
 302
 303    /* io queue for linux-aio */
 304    void (*bdrv_io_plug)(BlockDriverState *bs);
 305    void (*bdrv_io_unplug)(BlockDriverState *bs);
 306
 307    /**
 308     * Try to get @bs's logical and physical block size.
 309     * On success, store them in @bsz and return zero.
 310     * On failure, return negative errno.
 311     */
 312    int (*bdrv_probe_blocksizes)(BlockDriverState *bs, BlockSizes *bsz);
 313    /**
 314     * Try to get @bs's geometry (cyls, heads, sectors).
 315     * On success, store them in @geo and return 0.
 316     * On failure return -errno.
 317     * Only drivers that want to override guest geometry implement this
 318     * callback; see hd_geometry_guess().
 319     */
 320    int (*bdrv_probe_geometry)(BlockDriverState *bs, HDGeometry *geo);
 321
 322    /**
 323     * Drain and stop any internal sources of requests in the driver, and
 324     * remain so until the next I/O callback (e.g. bdrv_co_writev) is called.
 325     */
 326    void coroutine_fn (*bdrv_co_drain)(BlockDriverState *bs);
 327
 328    void (*bdrv_add_child)(BlockDriverState *parent, BlockDriverState *child,
 329                           Error **errp);
 330    void (*bdrv_del_child)(BlockDriverState *parent, BdrvChild *child,
 331                           Error **errp);
 332
 333    /**
 334     * Informs the block driver that a permission change is intended. The
 335     * driver checks whether the change is permissible and may take other
 336     * preparations for the change (e.g. get file system locks). This operation
 337     * is always followed by a call to either .bdrv_set_perm or
 338     * .bdrv_abort_perm_update.
 339     *
 340     * Checks whether the requested set of cumulative permissions in @perm
 341     * can be granted for accessing @bs and whether no other users are using
 342     * permissions other than those given in @shared (both arguments take
 343     * BLK_PERM_* bitmasks).
 344     *
 345     * If both conditions are met, 0 is returned. Otherwise, -errno is returned
 346     * and errp is set to an error describing the conflict.
 347     */
 348    int (*bdrv_check_perm)(BlockDriverState *bs, uint64_t perm,
 349                           uint64_t shared, Error **errp);
 350
 351    /**
 352     * Called to inform the driver that the cumulative set of used
 353     * permissions for @bs has changed to @perm, and the set of shareable
 354     * permissions to @shared. The driver can use this to propagate changes to
 355     * its children (i.e. request permissions only if a parent actually needs
 356     * them).
 357     *
 358     * This function is only invoked after bdrv_check_perm(), so block drivers
 359     * may rely on preparations made in their .bdrv_check_perm implementation.
 360     */
 361    void (*bdrv_set_perm)(BlockDriverState *bs, uint64_t perm, uint64_t shared);
 362
 363    /*
 364     * Called to inform the driver that after a previous bdrv_check_perm()
 365     * call, the permission update is not performed and any preparations made
 366     * for it (e.g. taken file locks) need to be undone.
 367     *
 368     * This function can be called even for nodes that never saw a
 369     * bdrv_check_perm() call. It is a no-op then.
 370     */
 371    void (*bdrv_abort_perm_update)(BlockDriverState *bs);
 372
 373    /**
 374     * Returns in @nperm and @nshared the permissions that the driver for @bs
 375     * needs on its child @c, based on the cumulative permissions requested by
 376     * the parents in @parent_perm and @parent_shared.
 377     *
 378     * If @c is NULL, return the permissions for attaching a new child for the
 379     * given @role.
 380     */
 381     void (*bdrv_child_perm)(BlockDriverState *bs, BdrvChild *c,
 382                             const BdrvChildRole *role,
 383                             uint64_t parent_perm, uint64_t parent_shared,
 384                             uint64_t *nperm, uint64_t *nshared);
 385
 386    /**
 387     * Bitmaps should be marked as 'IN_USE' in the image when the image is
 388     * reopened as rw. This handler should implement that. It should also
 389     * unset the readonly field of the BlockDirtyBitmaps in case of success.
 390     */
 391    int (*bdrv_reopen_bitmaps_rw)(BlockDriverState *bs, Error **errp);
 392    bool (*bdrv_can_store_new_dirty_bitmap)(BlockDriverState *bs,
 393                                            const char *name,
 394                                            uint32_t granularity,
 395                                            Error **errp);
 396    void (*bdrv_remove_persistent_dirty_bitmap)(BlockDriverState *bs,
 397                                                const char *name,
 398                                                Error **errp);
 399
 400    QLIST_ENTRY(BlockDriver) list;
 401};
 402
 403typedef struct BlockLimits { /* I/O limits of a node; stored in BlockDriverState.bl */
 404    /* Alignment requirement, in bytes, for offset/length of I/O
 405     * requests. Must be a power of 2 less than INT_MAX; defaults to
 406     * 1 for drivers with modern byte interfaces, and to 512
 407     * otherwise. */
 408    uint32_t request_alignment;
 409
 410    /* Maximum number of bytes that can be discarded at once (since it
 411     * is signed, it must be < 2G, if set). Must be multiple of
 412     * pdiscard_alignment, but need not be power of 2. May be 0 if no
 413     * inherent 32-bit limit */
 414    int32_t max_pdiscard;
 415
 416    /* Optimal alignment for discard requests in bytes. A power of 2
 417     * is best but not mandatory.  Must be a multiple of
 418     * bl.request_alignment, and must be less than max_pdiscard if
 419     * that is set. May be 0 if bl.request_alignment is good enough */
 420    uint32_t pdiscard_alignment;
 421
 422    /* Maximum number of bytes that can be zeroized at once (since it is
 423     * signed, it must be < 2G, if set). Must be multiple of
 424     * pwrite_zeroes_alignment. May be 0 if no inherent 32-bit limit */
 425    int32_t max_pwrite_zeroes;
 426
 427    /* Optimal alignment for write zeroes requests in bytes. A power
 428     * of 2 is best but not mandatory.  Must be a multiple of
 429     * bl.request_alignment, and must be less than max_pwrite_zeroes
 430     * if that is set. May be 0 if bl.request_alignment is good
 431     * enough */
 432    uint32_t pwrite_zeroes_alignment;
 433
 434    /* Optimal transfer length in bytes.  A power of 2 is best but not
 435     * mandatory.  Must be a multiple of bl.request_alignment, or 0 if
 436     * no preferred size */
 437    uint32_t opt_transfer;
 438
 439    /* Maximal transfer length in bytes.  Need not be power of 2, but
 440     * must be multiple of opt_transfer and bl.request_alignment, or 0
 441     * for no 32-bit limit.  For now, anything larger than INT_MAX is
 442     * clamped down. */
 443    uint32_t max_transfer;
 444
 445    /* memory alignment, in bytes, so that no bounce buffer is needed */
 446    size_t min_mem_alignment;
 447
 448    /* memory alignment, in bytes, for bounce buffer */
 449    size_t opt_mem_alignment;
 450
 451    /* maximum number of iovec elements */
 452    int max_iov;
 453} BlockLimits;
 454
 455typedef struct BdrvOpBlocker BdrvOpBlocker; /* incomplete here; element type of BlockDriverState.op_blockers[] lists */
 456
 457typedef struct BdrvAioNotifier { /* element of BlockDriverState.aio_notifiers */
 458    void (*attached_aio_context)(AioContext *new_context, void *opaque);
 459    void (*detach_aio_context)(void *opaque);
 460/* deleted: presumably marks an entry removed while the list is walked (cf. walking_aio_notifiers) -- confirm */
 461    void *opaque;
 462    bool deleted;
 463
 464    QLIST_ENTRY(BdrvAioNotifier) list;
 465} BdrvAioNotifier;
 466
 467struct BdrvChildRole { /* parent-side behaviour attached to a BdrvChild (see BdrvChild.role) */
 468    /* If true, bdrv_replace_node() doesn't change the node this BdrvChild
 469     * points to. */
 470    bool stay_at_node;
 471
 472    void (*inherit_options)(int *child_flags, QDict *child_options,
 473                            int parent_flags, QDict *parent_options);
 474
 475    void (*change_media)(BdrvChild *child, bool load);
 476    void (*resize)(BdrvChild *child);
 477
 478    /* Returns a name that is supposedly more useful for human users than the
 479     * node name for identifying the node in question (in particular, a BB
 480     * name), or NULL if the parent can't provide a better name. */
 481    const char *(*get_name)(BdrvChild *child);
 482
 483    /* Returns a malloced string that describes the parent of the child for a
 484     * human reader. This could be a node-name, BlockBackend name, qdev ID or
 485     * QOM path of the device owning the BlockBackend, job type and ID etc. The
 486     * caller is responsible for freeing the memory. */
 487    char *(*get_parent_desc)(BdrvChild *child);
 488
 489    /*
 490     * If this pair of functions is implemented, the parent doesn't issue new
 491     * requests after returning from .drained_begin() until .drained_end() is
 492     * called.
 493     *
 494     * Note that this can be nested. If drained_begin() was called twice, new
 495     * I/O is allowed only after drained_end() was called twice, too.
 496     */
 497    void (*drained_begin)(BdrvChild *child);
 498    void (*drained_end)(BdrvChild *child);
 499
 500    /* Notifies the parent that the child has been activated/inactivated (e.g.
 501     * when migration is completing) and it can start/stop requesting
 502     * permissions and doing I/O on it. */
 503    void (*activate)(BdrvChild *child, Error **errp);
 504    int (*inactivate)(BdrvChild *child);
 505/* presumably called when the child is connected to / disconnected from a node -- confirm in block.c */
 506    void (*attach)(BdrvChild *child);
 507    void (*detach)(BdrvChild *child);
 508};
 509
 510extern const BdrvChildRole child_file; /* these three roles are defined outside this header */
 511extern const BdrvChildRole child_format;
 512extern const BdrvChildRole child_backing;
 513
 514struct BdrvChild { /* link to node @bs; element type of BlockDriverState.children and .parents */
 515    BlockDriverState *bs;
 516    char *name;
 517    const BdrvChildRole *role;
 518    void *opaque;
 519
 520    /**
 521     * Granted permissions for operating on this BdrvChild (BLK_PERM_* bitmask)
 522     */
 523    uint64_t perm;
 524
 525    /**
 526     * Permissions that can still be granted to other users of @bs while this
 527     * BdrvChild is still attached to it. (BLK_PERM_* bitmask)
 528     */
 529    uint64_t shared_perm;
 530/* presumably next links a parent's .children list and next_parent links @bs's .parents list -- confirm */
 531    QLIST_ENTRY(BdrvChild) next;
 532    QLIST_ENTRY(BdrvChild) next_parent;
 533};
 534
 535/*
 536 * Note: the function bdrv_append() copies and swaps contents of
 537 * BlockDriverStates, so if you add new fields to this struct, please
 538 * inspect bdrv_append() to determine if the new fields need to be
 539 * copied as well.
 540 */
 541struct BlockDriverState { /* state of one node of the block graph */
 542    /* Protected by big QEMU lock or read-only after opening.  No special
 543     * locking needed during I/O...
 544     */
 545    int open_flags; /* flags used to open the file, re-used for re-open */
 546    bool read_only; /* if true, the media is read only */
 547    bool encrypted; /* if true, the media is encrypted */
 548    bool sg;        /* if true, the device is a /dev/sg* */
 549    bool probed;    /* if true, format was probed rather than specified */
 550    bool force_share; /* if true, always allow all shared permissions */
 551    bool implicit;  /* if true, this filter node was automatically inserted */
 552
 553    BlockDriver *drv; /* NULL means no media */
 554    void *opaque;
 555
 556    AioContext *aio_context; /* event loop used for fd handlers, timers, etc */
 557    /* long-running tasks intended to always use the same AioContext as this
 558     * BDS may register themselves in this list to be notified of changes
 559     * regarding this BDS's context */
 560    QLIST_HEAD(, BdrvAioNotifier) aio_notifiers;
 561    bool walking_aio_notifiers; /* to make removal during iteration safe */
 562
 563    char filename[PATH_MAX];
 564    char backing_file[PATH_MAX]; /* if non zero, the image is a diff of
 565                                    this file image */
 566    char backing_format[16]; /* if non-zero and backing_file exists */
 567
 568    QDict *full_open_options;
 569    char exact_filename[PATH_MAX];
 570
 571    BdrvChild *backing;
 572    BdrvChild *file;
 573
 574    /* I/O Limits */
 575    BlockLimits bl;
 576
 577    /* Flags honored during pwrite (so far: BDRV_REQ_FUA) */
 578    unsigned int supported_write_flags;
 579    /* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA,
 580     * BDRV_REQ_MAY_UNMAP) */
 581    unsigned int supported_zero_flags;
 582
 583    /* the following member gives a name to every node on the bs graph. */
 584    char node_name[32];
 585    /* element of the list of named nodes building the graph */
 586    QTAILQ_ENTRY(BlockDriverState) node_list;
 587    /* element of the list of all BlockDriverStates (all_bdrv_states) */
 588    QTAILQ_ENTRY(BlockDriverState) bs_list;
 589    /* element of the list of monitor-owned BDS */
 590    QTAILQ_ENTRY(BlockDriverState) monitor_list;
 591    int refcnt;
 592
 593    /* operation blockers */
 594    QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX];
 595
 596    /* long-running background operation */
 597    BlockJob *job;
 598
 599    /* The node that this node inherited default options from (and a reopen on
 600     * which can affect this node by changing these defaults). This is always a
 601     * parent node of this node. */
 602    BlockDriverState *inherits_from;
 603    QLIST_HEAD(, BdrvChild) children;
 604    QLIST_HEAD(, BdrvChild) parents;
 605
 606    QDict *options;
 607    QDict *explicit_options;
 608    BlockdevDetectZeroesOptions detect_zeroes;
 609
 610    /* The error object in use for blocking operations on backing_hd */
 611    Error *backing_blocker;
 612
 613    /* Protected by AioContext lock */
 614
 615    /* If we are reading a disk image, give its size in sectors.
 616     * Generally read-only; it is written to by load_snapshot and
 617     * save_snapshot, but the block layer is quiescent during those.
 618     */
 619    int64_t total_sectors;
 620
 621    /* Callback before write request is processed */
 622    NotifierWithReturnList before_write_notifiers;
 623
 624    /* threshold limit for writes, in bytes. "High water mark". */
 625    uint64_t write_threshold_offset;
 626    NotifierWithReturn write_threshold_notifier;
 627
 628    /* Writing to the list requires the BQL _and_ the dirty_bitmap_mutex.
 629     * Reading from the list can be done with either the BQL or the
 630     * dirty_bitmap_mutex.  Modifying a bitmap only requires
 631     * dirty_bitmap_mutex.  */
 632    QemuMutex dirty_bitmap_mutex;
 633    QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;
 634
 635    /* Offset after the highest byte written to */
 636    Stat64 wr_highest_offset;
 637
 638    /* If true, copy read backing sectors into image.  Can be >1 if more
 639     * than one client has requested copy-on-read.  Accessed with atomic
 640     * ops.
 641     */
 642    int copy_on_read;
 643
 644    /* number of in-flight requests; overall and serialising.
 645     * Accessed with atomic ops.
 646     */
 647    unsigned int in_flight;
 648    unsigned int serialising_in_flight;
 649
 650    /* Internal to BDRV_POLL_WHILE and bdrv_wakeup.  Accessed with atomic
 651     * ops.
 652     */
 653    bool wakeup;
 654
 655    /* counter for nested bdrv_io_plug.
 656     * Accessed with atomic ops.
 657     */
 658    unsigned io_plugged;
 659
 660    /* do we need to tell the guest if we have a volatile write cache? */
 661    int enable_write_cache;
 662
 663    /* Accessed with atomic ops.  */
 664    int quiesce_counter;
 665    unsigned int write_gen;               /* Current data generation */
 666
 667    /* Protected by reqs_lock.  */
 668    CoMutex reqs_lock;
 669    QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
 670    CoQueue flush_queue;                  /* Serializing flush queue */
 671    bool active_flush_req;                /* Flush request in flight? */
 672
 673    /* Only read/written by whoever has set active_flush_req to true.  */
 674    unsigned int flushed_gen;             /* Flushed write generation */
 675};
 676
 677struct BlockBackendRootState { /* same-named, same-typed subset of BlockDriverState fields */
 678    int open_flags;
 679    bool read_only;
 680    BlockdevDetectZeroesOptions detect_zeroes;
 681};
 682
 683typedef enum BlockMirrorBackingMode { /* what happens to the mirror target's backing chain */
 684    /* Reuse the existing backing chain from the source for the target.
 685     * - sync=full: Set backing BDS to NULL.
 686     * - sync=top:  Use source's backing BDS.
 687     * - sync=none: Use source as the backing BDS. */
 688    MIRROR_SOURCE_BACKING_CHAIN,
 689
 690    /* Open the target's backing chain completely anew */
 691    MIRROR_OPEN_BACKING_CHAIN,
 692
 693    /* Do not change the target's backing BDS after job completion */
 694    MIRROR_LEAVE_BACKING_CHAIN,
 695} BlockMirrorBackingMode;
 696/* Returns the node behind @bs's backing child, or NULL if bs->backing is NULL. */
 697static inline BlockDriverState *backing_bs(BlockDriverState *bs)
 698{
 699    return bs->backing ? bs->backing->bs : NULL;
 700}
 701
 702
 703/* Essential block drivers which must always be statically linked into QEMU, and
 704 * which can therefore be accessed without using bdrv_find_format() */
 705extern BlockDriver bdrv_file;
 706extern BlockDriver bdrv_raw;
 707extern BlockDriver bdrv_qcow2;
 708/* Coroutine read/write on @child; @offset/@bytes presumably describe a byte range -- confirm in io.c */
 709int coroutine_fn bdrv_co_preadv(BdrvChild *child,
 710    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
 711    BdrvRequestFlags flags);
 712int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
 713    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
 714    BdrvRequestFlags flags);
 715/* presumably writes a temporary file name into @filename (capacity @size) -- confirm in block.c */
 716int get_tmp_filename(char *filename, int size);
 717BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
 718                            const char *filename); /* same inputs as .bdrv_probe; presumably picks the best-matching driver -- confirm */
 719/* presumably strips @prefix from @filename and records the remainder in @options -- confirm in block.c */
 720void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix,
 721                                      QDict *options);
 722
 723
 724/**
 725 * bdrv_add_before_write_notifier:
 726 *
 727 * Register a callback that is invoked before write requests are processed but
 728 * after any throttling or waiting for overlapping requests.
 729 */
 730void bdrv_add_before_write_notifier(BlockDriverState *bs,
 731                                    NotifierWithReturn *notifier); /* presumably added to bs->before_write_notifiers -- confirm */
 732
 733/**
 734 * bdrv_detach_aio_context:
 735 *
 736 * May be called from .bdrv_detach_aio_context() to detach children from the
 737 * current #AioContext.  This is only needed by block drivers that manage their
 738 * own children.  Both ->file and ->backing are automatically handled and
 739 * block drivers should not call this function on them explicitly.
 740 */
 741void bdrv_detach_aio_context(BlockDriverState *bs);
 742
 743/**
 744 * bdrv_attach_aio_context:
 745 *
 746 * May be called from .bdrv_attach_aio_context() to attach children to the new
 747 * #AioContext.  This is only needed by block drivers that manage their own
 748 * children.  Both ->file and ->backing are automatically handled and block
 749 * drivers should not call this function on them explicitly.
 750 */
 751void bdrv_attach_aio_context(BlockDriverState *bs,
 752                             AioContext *new_context);
 753
 754/**
 755 * bdrv_add_aio_context_notifier:
 756 *
 757 * If a long-running job intends to be always run in the same AioContext as a
 758 * certain BDS, it may use this function to be notified of changes regarding the
 759 * association of the BDS to an AioContext.
 760 *
 761 * attached_aio_context() is called after the target BDS has been attached to a
 762 * new AioContext; detach_aio_context() is called before the target BDS is being
 763 * detached from its old AioContext.
 764 */
 765void bdrv_add_aio_context_notifier(BlockDriverState *bs,
 766        void (*attached_aio_context)(AioContext *new_context, void *opaque),
 767        void (*detach_aio_context)(void *opaque), void *opaque);
 768
 769/**
 770 * bdrv_remove_aio_context_notifier:
 771 *
 772 * Unsubscribe of change notifications regarding the BDS's AioContext. The
 773 * parameters given here have to be the same as those given to
 774 * bdrv_add_aio_context_notifier().
 775 */
 776void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
 777                                      void (*aio_context_attached)(AioContext *,
 778                                                                   void *),
 779                                      void (*aio_context_detached)(void *),
 780                                      void *opaque);
 781
 782/**
 783 * bdrv_wakeup:
 784 * @bs: The BlockDriverState for which an I/O operation has been completed.
 785 *
 786 * Wake up the main thread if it is waiting on BDRV_POLL_WHILE.  During
 787 * synchronous I/O on a BlockDriverState that is attached to another
 788 * I/O thread, the main thread lets the I/O thread's event loop run,
 789 * waiting for the I/O operation to complete.  A bdrv_wakeup will wake
 790 * up the main thread if necessary.
 791 *
 792 * Manual calls to bdrv_wakeup are rarely necessary, because
 793 * bdrv_dec_in_flight already calls it.
 794 */
 795void bdrv_wakeup(BlockDriverState *bs);
 796
 797#ifdef _WIN32
 798int is_windows_drive(const char *filename);
 799#endif
 800
 801/**
 802 * stream_start:
 803 * @job_id: The id of the newly-created job, or %NULL to use the
 804 * device name of @bs.
 805 * @bs: Block device to operate on.
 806 * @base: Block device that will become the new base, or %NULL to
 807 * flatten the whole backing file chain onto @bs.
 808 * @backing_file_str: The file name that will be written to @bs as the
 809 * the new backing file if the job completes. Ignored if @base is %NULL.
 810 * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
 811 * @on_error: The action to take upon error.
 812 * @errp: Error object.
 813 *
 814 * Start a streaming operation on @bs.  Clusters that are unallocated
 815 * in @bs, but allocated in any image between @base and @bs (both
 816 * exclusive) will be written to @bs.  At the end of a successful
 817 * streaming job, the backing file of @bs will be changed to
 818 * @backing_file_str in the written image and to @base in the live
 819 * BlockDriverState.
 820 */
 821void stream_start(const char *job_id, BlockDriverState *bs,
 822                  BlockDriverState *base, const char *backing_file_str,
 823                  int64_t speed, BlockdevOnError on_error, Error **errp);
 824
 825/**
 826 * commit_start:
 827 * @job_id: The id of the newly-created job, or %NULL to use the
 828 * device name of @bs.
 829 * @bs: Active block device.
 830 * @top: Top block device to be committed.
 831 * @base: Block device that will be written into, and become the new top.
 832 * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
 833 * @on_error: The action to take upon error.
 834 * @backing_file_str: String to use as the backing file in @top's overlay
 835 * @filter_node_name: The node name that should be assigned to the filter
 836 * driver that the commit job inserts into the graph above @top. NULL means
 837 * that a node name should be autogenerated.
 838 * @errp: Error object.
 839 *
 840 */
 841void commit_start(const char *job_id, BlockDriverState *bs,
 842                  BlockDriverState *base, BlockDriverState *top, int64_t speed,
 843                  BlockdevOnError on_error, const char *backing_file_str,
 844                  const char *filter_node_name, Error **errp);
 845/**
 846 * commit_active_start:
 847 * @job_id: The id of the newly-created job, or %NULL to use the
 848 * device name of @bs.
 849 * @bs: Active block device to be committed.
 850 * @base: Block device that will be written into, and become the new top.
 851 * @creation_flags: Flags that control the behavior of the Job lifetime.
 852 *                  See @BlockJobCreateFlags
 853 * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
 854 * @on_error: The action to take upon error.
 855 * @filter_node_name: The node name that should be assigned to the filter
 856 * driver that the commit job inserts into the graph above @bs. NULL means that
 857 * a node name should be autogenerated.
 858 * @cb: Completion function for the job.
 859 * @opaque: Opaque pointer value passed to @cb.
 860 * @auto_complete: Auto complete the job.
 861 * @errp: Error object.
 862 *
 863 */
 864void commit_active_start(const char *job_id, BlockDriverState *bs,
 865                         BlockDriverState *base, int creation_flags,
 866                         int64_t speed, BlockdevOnError on_error,
 867                         const char *filter_node_name,
 868                         BlockCompletionFunc *cb, void *opaque,
 869                         bool auto_complete, Error **errp);
 870/*
 871 * mirror_start:
 872 * @job_id: The id of the newly-created job, or %NULL to use the
 873 * device name of @bs.
 874 * @bs: Block device to operate on.
 875 * @target: Block device to write to.
 876 * @replaces: Block graph node name to replace once the mirror is done. Can
 877 *            only be used when full mirroring is selected.
 878 * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
 879 * @granularity: The chosen granularity for the dirty bitmap.
 880 * @buf_size: The amount of data that can be in flight at one time.
 881 * @mode: Whether to collapse all images in the chain to the target.
 882 * @backing_mode: How to establish the target's backing chain after completion.
 883 * @on_source_error: The action to take upon error reading from the source.
 884 * @on_target_error: The action to take upon error writing to the target.
 885 * @unmap: Whether to unmap target where source sectors only contain zeroes.
 886 * @filter_node_name: The node name that should be assigned to the filter
 887 * driver that the mirror job inserts into the graph above @bs. NULL means that
 888 * a node name should be autogenerated.
 889 * @errp: Error object.
 890 *
 891 * Start a mirroring operation on @bs.  Clusters that are allocated
 892 * in @bs will be written to @target until the job is cancelled or
 893 * manually completed.  At the end of a successful mirroring job,
 894 * @bs will be switched to read from @target.
 895 */
 896void mirror_start(const char *job_id, BlockDriverState *bs,
 897                  BlockDriverState *target, const char *replaces,
 898                  int64_t speed, uint32_t granularity, int64_t buf_size,
 899                  MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
 900                  BlockdevOnError on_source_error,
 901                  BlockdevOnError on_target_error,
 902                  bool unmap, const char *filter_node_name, Error **errp);
 903
 904/*
 905 * backup_job_create:
 906 * @job_id: The id of the newly-created job, or %NULL to use the
 907 * device name of @bs.
 908 * @bs: Block device to operate on.
 909 * @target: Block device to write to.
 910 * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
 911 * @sync_mode: What parts of the disk image should be copied to the destination.
 912 * @sync_bitmap: The dirty bitmap if sync_mode is MIRROR_SYNC_MODE_INCREMENTAL.
 913 * @on_source_error: The action to take upon error reading from the source.
 914 * @on_target_error: The action to take upon error writing to the target.
 915 * @creation_flags: Flags that control the behavior of the Job lifetime.
 916 *                  See @BlockJobCreateFlags
 917 * @cb: Completion function for the job.
 918 * @opaque: Opaque pointer value passed to @cb.
 919 * @txn: Transaction that this job is part of (may be NULL).
 920 *
 921 * Create a backup operation on @bs.  Clusters in @bs are written to @target
 922 * until the job is cancelled or manually completed.
 923 */
 924BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
 925                            BlockDriverState *target, int64_t speed,
 926                            MirrorSyncMode sync_mode,
 927                            BdrvDirtyBitmap *sync_bitmap,
 928                            bool compress,
 929                            BlockdevOnError on_source_error,
 930                            BlockdevOnError on_target_error,
 931                            int creation_flags,
 932                            BlockCompletionFunc *cb, void *opaque,
 933                            BlockJobTxn *txn, Error **errp);
 934
 935void hmp_drive_add_node(Monitor *mon, const char *optstr);
 936
 937BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
 938                                  const char *child_name,
 939                                  const BdrvChildRole *child_role,
 940                                  uint64_t perm, uint64_t shared_perm,
 941                                  void *opaque, Error **errp);
 942void bdrv_root_unref_child(BdrvChild *child);
 943
 944int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
 945                            Error **errp);
 946
 947/* Default implementation for BlockDriver.bdrv_child_perm() that can be used by
 948 * block filters: Forward CONSISTENT_READ, WRITE, WRITE_UNCHANGED and RESIZE to
 949 * all children */
 950void bdrv_filter_default_perms(BlockDriverState *bs, BdrvChild *c,
 951                               const BdrvChildRole *role,
 952                               uint64_t perm, uint64_t shared,
 953                               uint64_t *nperm, uint64_t *nshared);
 954
 955/* Default implementation for BlockDriver.bdrv_child_perm() that can be used by
 956 * (non-raw) image formats: Like above for bs->backing, but for bs->file it
 957 * requires WRITE | RESIZE for read-write images, always requires
 958 * CONSISTENT_READ and doesn't share WRITE. */
 959void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
 960                               const BdrvChildRole *role,
 961                               uint64_t perm, uint64_t shared,
 962                               uint64_t *nperm, uint64_t *nshared);
 963
 964const char *bdrv_get_parent_name(const BlockDriverState *bs);
 965void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp);
 966bool blk_dev_has_removable_media(BlockBackend *blk);
 967bool blk_dev_has_tray(BlockBackend *blk);
 968void blk_dev_eject_request(BlockBackend *blk, bool force);
 969bool blk_dev_is_tray_open(BlockBackend *blk);
 970bool blk_dev_is_medium_locked(BlockBackend *blk);
 971
 972void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int64_t nr_sect);
 973bool bdrv_requests_pending(BlockDriverState *bs);
 974
 975void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
 976void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in);
 977
 978void bdrv_inc_in_flight(BlockDriverState *bs);
 979void bdrv_dec_in_flight(BlockDriverState *bs);
 980
 981void blockdev_close_all_bdrv_states(void);
 982
 983#endif /* BLOCK_INT_H */
 984