/* qemu/include/block/block_int.h */
/*
 * QEMU System Emulator block driver
 *
 * Copyright (c) 2003 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
  24#ifndef BLOCK_INT_H
  25#define BLOCK_INT_H
  26
  27#include "block/accounting.h"
  28#include "block/block.h"
  29#include "qemu/option.h"
  30#include "qemu/queue.h"
  31#include "qemu/coroutine.h"
  32#include "qemu/stats64.h"
  33#include "qemu/timer.h"
  34#include "qapi-types.h"
  35#include "qemu/hbitmap.h"
  36#include "block/snapshot.h"
  37#include "qemu/main-loop.h"
  38#include "qemu/throttle.h"
  39
  40#define BLOCK_FLAG_LAZY_REFCOUNTS   8
  41
  42#define BLOCK_OPT_SIZE              "size"
  43#define BLOCK_OPT_ENCRYPT           "encryption"
  44#define BLOCK_OPT_ENCRYPT_FORMAT    "encrypt.format"
  45#define BLOCK_OPT_COMPAT6           "compat6"
  46#define BLOCK_OPT_HWVERSION         "hwversion"
  47#define BLOCK_OPT_BACKING_FILE      "backing_file"
  48#define BLOCK_OPT_BACKING_FMT       "backing_fmt"
  49#define BLOCK_OPT_CLUSTER_SIZE      "cluster_size"
  50#define BLOCK_OPT_TABLE_SIZE        "table_size"
  51#define BLOCK_OPT_PREALLOC          "preallocation"
  52#define BLOCK_OPT_SUBFMT            "subformat"
  53#define BLOCK_OPT_COMPAT_LEVEL      "compat"
  54#define BLOCK_OPT_LAZY_REFCOUNTS    "lazy_refcounts"
  55#define BLOCK_OPT_ADAPTER_TYPE      "adapter_type"
  56#define BLOCK_OPT_REDUNDANCY        "redundancy"
  57#define BLOCK_OPT_NOCOW             "nocow"
  58#define BLOCK_OPT_OBJECT_SIZE       "object_size"
  59#define BLOCK_OPT_REFCOUNT_BITS     "refcount_bits"
  60
  61#define BLOCK_PROBE_BUF_SIZE        512
  62
  63enum BdrvTrackedRequestType {
  64    BDRV_TRACKED_READ,
  65    BDRV_TRACKED_WRITE,
  66    BDRV_TRACKED_DISCARD,
  67};
  68
  69typedef struct BdrvTrackedRequest {
  70    BlockDriverState *bs;
  71    int64_t offset;
  72    unsigned int bytes;
  73    enum BdrvTrackedRequestType type;
  74
  75    bool serialising;
  76    int64_t overlap_offset;
  77    unsigned int overlap_bytes;
  78
  79    QLIST_ENTRY(BdrvTrackedRequest) list;
  80    Coroutine *co; /* owner, used for deadlock detection */
  81    CoQueue wait_queue; /* coroutines blocked on this request */
  82
  83    struct BdrvTrackedRequest *waiting_for;
  84} BdrvTrackedRequest;
  85
  86struct BlockDriver {
  87    const char *format_name;
  88    int instance_size;
  89
  90    /* set to true if the BlockDriver is a block filter. Block filters pass
  91     * certain callbacks that refer to data (see block.c) to their bs->file if
  92     * the driver doesn't implement them. Drivers that do not wish to forward
  93     * must implement them and return -ENOTSUP.
  94     */
  95    bool is_filter;
  96    /* for snapshots block filter like Quorum can implement the
  97     * following recursive callback.
  98     * It's purpose is to recurse on the filter children while calling
  99     * bdrv_recurse_is_first_non_filter on them.
 100     * For a sample implementation look in the future Quorum block filter.
 101     */
 102    bool (*bdrv_recurse_is_first_non_filter)(BlockDriverState *bs,
 103                                             BlockDriverState *candidate);
 104
 105    int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename);
 106    int (*bdrv_probe_device)(const char *filename);
 107
 108    /* Any driver implementing this callback is expected to be able to handle
 109     * NULL file names in its .bdrv_open() implementation */
 110    void (*bdrv_parse_filename)(const char *filename, QDict *options, Error **errp);
 111    /* Drivers not implementing bdrv_parse_filename nor bdrv_open should have
 112     * this field set to true, except ones that are defined only by their
 113     * child's bs.
 114     * An example of the last type will be the quorum block driver.
 115     */
 116    bool bdrv_needs_filename;
 117
 118    /* Set if a driver can support backing files */
 119    bool supports_backing;
 120
 121    /* For handling image reopen for split or non-split files */
 122    int (*bdrv_reopen_prepare)(BDRVReopenState *reopen_state,
 123                               BlockReopenQueue *queue, Error **errp);
 124    void (*bdrv_reopen_commit)(BDRVReopenState *reopen_state);
 125    void (*bdrv_reopen_abort)(BDRVReopenState *reopen_state);
 126    void (*bdrv_join_options)(QDict *options, QDict *old_options);
 127
 128    int (*bdrv_open)(BlockDriverState *bs, QDict *options, int flags,
 129                     Error **errp);
 130    int (*bdrv_file_open)(BlockDriverState *bs, QDict *options, int flags,
 131                          Error **errp);
 132    void (*bdrv_close)(BlockDriverState *bs);
 133    int (*bdrv_create)(const char *filename, QemuOpts *opts, Error **errp);
 134    int (*bdrv_make_empty)(BlockDriverState *bs);
 135
 136    void (*bdrv_refresh_filename)(BlockDriverState *bs, QDict *options);
 137
 138    /* aio */
 139    BlockAIOCB *(*bdrv_aio_readv)(BlockDriverState *bs,
 140        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
 141        BlockCompletionFunc *cb, void *opaque);
 142    BlockAIOCB *(*bdrv_aio_writev)(BlockDriverState *bs,
 143        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
 144        BlockCompletionFunc *cb, void *opaque);
 145    BlockAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
 146        BlockCompletionFunc *cb, void *opaque);
 147    BlockAIOCB *(*bdrv_aio_pdiscard)(BlockDriverState *bs,
 148        int64_t offset, int bytes,
 149        BlockCompletionFunc *cb, void *opaque);
 150
 151    int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs,
 152        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
 153
 154    /**
 155     * @offset: position in bytes to read at
 156     * @bytes: number of bytes to read
 157     * @qiov: the buffers to fill with read data
 158     * @flags: currently unused, always 0
 159     *
 160     * @offset and @bytes will be a multiple of 'request_alignment',
 161     * but the length of individual @qiov elements does not have to
 162     * be a multiple.
 163     *
 164     * @bytes will always equal the total size of @qiov, and will be
 165     * no larger than 'max_transfer'.
 166     *
 167     * The buffer in @qiov may point directly to guest memory.
 168     */
 169    int coroutine_fn (*bdrv_co_preadv)(BlockDriverState *bs,
 170        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags);
 171    int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,
 172        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
 173    int coroutine_fn (*bdrv_co_writev_flags)(BlockDriverState *bs,
 174        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int flags);
 175    /**
 176     * @offset: position in bytes to write at
 177     * @bytes: number of bytes to write
 178     * @qiov: the buffers containing data to write
 179     * @flags: zero or more bits allowed by 'supported_write_flags'
 180     *
 181     * @offset and @bytes will be a multiple of 'request_alignment',
 182     * but the length of individual @qiov elements does not have to
 183     * be a multiple.
 184     *
 185     * @bytes will always equal the total size of @qiov, and will be
 186     * no larger than 'max_transfer'.
 187     *
 188     * The buffer in @qiov may point directly to guest memory.
 189     */
 190    int coroutine_fn (*bdrv_co_pwritev)(BlockDriverState *bs,
 191        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags);
 192
 193    /*
 194     * Efficiently zero a region of the disk image.  Typically an image format
 195     * would use a compact metadata representation to implement this.  This
 196     * function pointer may be NULL or return -ENOSUP and .bdrv_co_writev()
 197     * will be called instead.
 198     */
 199    int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs,
 200        int64_t offset, int bytes, BdrvRequestFlags flags);
 201    int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs,
 202        int64_t offset, int bytes);
 203
 204    /*
 205     * Building block for bdrv_block_status[_above] and
 206     * bdrv_is_allocated[_above].  The driver should answer only
 207     * according to the current layer, and should not set
 208     * BDRV_BLOCK_ALLOCATED, but may set BDRV_BLOCK_RAW.  See block.h
 209     * for the meaning of _DATA, _ZERO, and _OFFSET_VALID.  The block
 210     * layer guarantees input aligned to request_alignment, as well as
 211     * non-NULL pnum and file.
 212     */
 213    int64_t coroutine_fn (*bdrv_co_get_block_status)(BlockDriverState *bs,
 214        int64_t sector_num, int nb_sectors, int *pnum,
 215        BlockDriverState **file);
 216
 217    /*
 218     * Invalidate any cached meta-data.
 219     */
 220    void (*bdrv_invalidate_cache)(BlockDriverState *bs, Error **errp);
 221    int (*bdrv_inactivate)(BlockDriverState *bs);
 222
 223    /*
 224     * Flushes all data for all layers by calling bdrv_co_flush for underlying
 225     * layers, if needed. This function is needed for deterministic
 226     * synchronization of the flush finishing callback.
 227     */
 228    int coroutine_fn (*bdrv_co_flush)(BlockDriverState *bs);
 229
 230    /*
 231     * Flushes all data that was already written to the OS all the way down to
 232     * the disk (for example file-posix.c calls fsync()).
 233     */
 234    int coroutine_fn (*bdrv_co_flush_to_disk)(BlockDriverState *bs);
 235
 236    /*
 237     * Flushes all internal caches to the OS. The data may still sit in a
 238     * writeback cache of the host OS, but it will survive a crash of the qemu
 239     * process.
 240     */
 241    int coroutine_fn (*bdrv_co_flush_to_os)(BlockDriverState *bs);
 242
 243    const char *protocol_name;
 244    int (*bdrv_truncate)(BlockDriverState *bs, int64_t offset,
 245                         PreallocMode prealloc, Error **errp);
 246
 247    int64_t (*bdrv_getlength)(BlockDriverState *bs);
 248    bool has_variable_length;
 249    int64_t (*bdrv_get_allocated_file_size)(BlockDriverState *bs);
 250    BlockMeasureInfo *(*bdrv_measure)(QemuOpts *opts, BlockDriverState *in_bs,
 251                                      Error **errp);
 252
 253    int coroutine_fn (*bdrv_co_pwritev_compressed)(BlockDriverState *bs,
 254        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov);
 255
 256    int (*bdrv_snapshot_create)(BlockDriverState *bs,
 257                                QEMUSnapshotInfo *sn_info);
 258    int (*bdrv_snapshot_goto)(BlockDriverState *bs,
 259                              const char *snapshot_id);
 260    int (*bdrv_snapshot_delete)(BlockDriverState *bs,
 261                                const char *snapshot_id,
 262                                const char *name,
 263                                Error **errp);
 264    int (*bdrv_snapshot_list)(BlockDriverState *bs,
 265                              QEMUSnapshotInfo **psn_info);
 266    int (*bdrv_snapshot_load_tmp)(BlockDriverState *bs,
 267                                  const char *snapshot_id,
 268                                  const char *name,
 269                                  Error **errp);
 270    int (*bdrv_get_info)(BlockDriverState *bs, BlockDriverInfo *bdi);
 271    ImageInfoSpecific *(*bdrv_get_specific_info)(BlockDriverState *bs);
 272
 273    int coroutine_fn (*bdrv_save_vmstate)(BlockDriverState *bs,
 274                                          QEMUIOVector *qiov,
 275                                          int64_t pos);
 276    int coroutine_fn (*bdrv_load_vmstate)(BlockDriverState *bs,
 277                                          QEMUIOVector *qiov,
 278                                          int64_t pos);
 279
 280    int (*bdrv_change_backing_file)(BlockDriverState *bs,
 281        const char *backing_file, const char *backing_fmt);
 282
 283    /* removable device specific */
 284    bool (*bdrv_is_inserted)(BlockDriverState *bs);
 285    void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
 286    void (*bdrv_lock_medium)(BlockDriverState *bs, bool locked);
 287
 288    /* to control generic scsi devices */
 289    BlockAIOCB *(*bdrv_aio_ioctl)(BlockDriverState *bs,
 290        unsigned long int req, void *buf,
 291        BlockCompletionFunc *cb, void *opaque);
 292    int coroutine_fn (*bdrv_co_ioctl)(BlockDriverState *bs,
 293                                      unsigned long int req, void *buf);
 294
 295    /* List of options for creating images, terminated by name == NULL */
 296    QemuOptsList *create_opts;
 297
 298    /*
 299     * Returns 0 for completed check, -errno for internal errors.
 300     * The check results are stored in result.
 301     */
 302    int (*bdrv_check)(BlockDriverState *bs, BdrvCheckResult *result,
 303        BdrvCheckMode fix);
 304
 305    int (*bdrv_amend_options)(BlockDriverState *bs, QemuOpts *opts,
 306                              BlockDriverAmendStatusCB *status_cb,
 307                              void *cb_opaque);
 308
 309    void (*bdrv_debug_event)(BlockDriverState *bs, BlkdebugEvent event);
 310
 311    /* TODO Better pass a option string/QDict/QemuOpts to add any rule? */
 312    int (*bdrv_debug_breakpoint)(BlockDriverState *bs, const char *event,
 313        const char *tag);
 314    int (*bdrv_debug_remove_breakpoint)(BlockDriverState *bs,
 315        const char *tag);
 316    int (*bdrv_debug_resume)(BlockDriverState *bs, const char *tag);
 317    bool (*bdrv_debug_is_suspended)(BlockDriverState *bs, const char *tag);
 318
 319    void (*bdrv_refresh_limits)(BlockDriverState *bs, Error **errp);
 320
 321    /*
 322     * Returns 1 if newly created images are guaranteed to contain only
 323     * zeros, 0 otherwise.
 324     */
 325    int (*bdrv_has_zero_init)(BlockDriverState *bs);
 326
 327    /* Remove fd handlers, timers, and other event loop callbacks so the event
 328     * loop is no longer in use.  Called with no in-flight requests and in
 329     * depth-first traversal order with parents before child nodes.
 330     */
 331    void (*bdrv_detach_aio_context)(BlockDriverState *bs);
 332
 333    /* Add fd handlers, timers, and other event loop callbacks so I/O requests
 334     * can be processed again.  Called with no in-flight requests and in
 335     * depth-first traversal order with child nodes before parent nodes.
 336     */
 337    void (*bdrv_attach_aio_context)(BlockDriverState *bs,
 338                                    AioContext *new_context);
 339
 340    /* io queue for linux-aio */
 341    void (*bdrv_io_plug)(BlockDriverState *bs);
 342    void (*bdrv_io_unplug)(BlockDriverState *bs);
 343
 344    /**
 345     * Try to get @bs's logical and physical block size.
 346     * On success, store them in @bsz and return zero.
 347     * On failure, return negative errno.
 348     */
 349    int (*bdrv_probe_blocksizes)(BlockDriverState *bs, BlockSizes *bsz);
 350    /**
 351     * Try to get @bs's geometry (cyls, heads, sectors)
 352     * On success, store them in @geo and return 0.
 353     * On failure return -errno.
 354     * Only drivers that want to override guest geometry implement this
 355     * callback; see hd_geometry_guess().
 356     */
 357    int (*bdrv_probe_geometry)(BlockDriverState *bs, HDGeometry *geo);
 358
 359    /**
 360     * bdrv_co_drain_begin is called if implemented in the beginning of a
 361     * drain operation to drain and stop any internal sources of requests in
 362     * the driver.
 363     * bdrv_co_drain_end is called if implemented at the end of the drain.
 364     *
 365     * They should be used by the driver to e.g. manage scheduled I/O
 366     * requests, or toggle an internal state. After the end of the drain new
 367     * requests will continue normally.
 368     */
 369    void coroutine_fn (*bdrv_co_drain_begin)(BlockDriverState *bs);
 370    void coroutine_fn (*bdrv_co_drain_end)(BlockDriverState *bs);
 371
 372    void (*bdrv_add_child)(BlockDriverState *parent, BlockDriverState *child,
 373                           Error **errp);
 374    void (*bdrv_del_child)(BlockDriverState *parent, BdrvChild *child,
 375                           Error **errp);
 376
 377    /**
 378     * Informs the block driver that a permission change is intended. The
 379     * driver checks whether the change is permissible and may take other
 380     * preparations for the change (e.g. get file system locks). This operation
 381     * is always followed either by a call to either .bdrv_set_perm or
 382     * .bdrv_abort_perm_update.
 383     *
 384     * Checks whether the requested set of cumulative permissions in @perm
 385     * can be granted for accessing @bs and whether no other users are using
 386     * permissions other than those given in @shared (both arguments take
 387     * BLK_PERM_* bitmasks).
 388     *
 389     * If both conditions are met, 0 is returned. Otherwise, -errno is returned
 390     * and errp is set to an error describing the conflict.
 391     */
 392    int (*bdrv_check_perm)(BlockDriverState *bs, uint64_t perm,
 393                           uint64_t shared, Error **errp);
 394
 395    /**
 396     * Called to inform the driver that the set of cumulative set of used
 397     * permissions for @bs has changed to @perm, and the set of sharable
 398     * permission to @shared. The driver can use this to propagate changes to
 399     * its children (i.e. request permissions only if a parent actually needs
 400     * them).
 401     *
 402     * This function is only invoked after bdrv_check_perm(), so block drivers
 403     * may rely on preparations made in their .bdrv_check_perm implementation.
 404     */
 405    void (*bdrv_set_perm)(BlockDriverState *bs, uint64_t perm, uint64_t shared);
 406
 407    /*
 408     * Called to inform the driver that after a previous bdrv_check_perm()
 409     * call, the permission update is not performed and any preparations made
 410     * for it (e.g. taken file locks) need to be undone.
 411     *
 412     * This function can be called even for nodes that never saw a
 413     * bdrv_check_perm() call. It is a no-op then.
 414     */
 415    void (*bdrv_abort_perm_update)(BlockDriverState *bs);
 416
 417    /**
 418     * Returns in @nperm and @nshared the permissions that the driver for @bs
 419     * needs on its child @c, based on the cumulative permissions requested by
 420     * the parents in @parent_perm and @parent_shared.
 421     *
 422     * If @c is NULL, return the permissions for attaching a new child for the
 423     * given @role.
 424     *
 425     * If @reopen_queue is non-NULL, don't return the currently needed
 426     * permissions, but those that will be needed after applying the
 427     * @reopen_queue.
 428     */
 429     void (*bdrv_child_perm)(BlockDriverState *bs, BdrvChild *c,
 430                             const BdrvChildRole *role,
 431                             BlockReopenQueue *reopen_queue,
 432                             uint64_t parent_perm, uint64_t parent_shared,
 433                             uint64_t *nperm, uint64_t *nshared);
 434
 435    /**
 436     * Bitmaps should be marked as 'IN_USE' in the image on reopening image
 437     * as rw. This handler should realize it. It also should unset readonly
 438     * field of BlockDirtyBitmap's in case of success.
 439     */
 440    int (*bdrv_reopen_bitmaps_rw)(BlockDriverState *bs, Error **errp);
 441    bool (*bdrv_can_store_new_dirty_bitmap)(BlockDriverState *bs,
 442                                            const char *name,
 443                                            uint32_t granularity,
 444                                            Error **errp);
 445    void (*bdrv_remove_persistent_dirty_bitmap)(BlockDriverState *bs,
 446                                                const char *name,
 447                                                Error **errp);
 448
 449    QLIST_ENTRY(BlockDriver) list;
 450};
 451
 452typedef struct BlockLimits {
 453    /* Alignment requirement, in bytes, for offset/length of I/O
 454     * requests. Must be a power of 2 less than INT_MAX; defaults to
 455     * 1 for drivers with modern byte interfaces, and to 512
 456     * otherwise. */
 457    uint32_t request_alignment;
 458
 459    /* Maximum number of bytes that can be discarded at once (since it
 460     * is signed, it must be < 2G, if set). Must be multiple of
 461     * pdiscard_alignment, but need not be power of 2. May be 0 if no
 462     * inherent 32-bit limit */
 463    int32_t max_pdiscard;
 464
 465    /* Optimal alignment for discard requests in bytes. A power of 2
 466     * is best but not mandatory.  Must be a multiple of
 467     * bl.request_alignment, and must be less than max_pdiscard if
 468     * that is set. May be 0 if bl.request_alignment is good enough */
 469    uint32_t pdiscard_alignment;
 470
 471    /* Maximum number of bytes that can zeroized at once (since it is
 472     * signed, it must be < 2G, if set). Must be multiple of
 473     * pwrite_zeroes_alignment. May be 0 if no inherent 32-bit limit */
 474    int32_t max_pwrite_zeroes;
 475
 476    /* Optimal alignment for write zeroes requests in bytes. A power
 477     * of 2 is best but not mandatory.  Must be a multiple of
 478     * bl.request_alignment, and must be less than max_pwrite_zeroes
 479     * if that is set. May be 0 if bl.request_alignment is good
 480     * enough */
 481    uint32_t pwrite_zeroes_alignment;
 482
 483    /* Optimal transfer length in bytes.  A power of 2 is best but not
 484     * mandatory.  Must be a multiple of bl.request_alignment, or 0 if
 485     * no preferred size */
 486    uint32_t opt_transfer;
 487
 488    /* Maximal transfer length in bytes.  Need not be power of 2, but
 489     * must be multiple of opt_transfer and bl.request_alignment, or 0
 490     * for no 32-bit limit.  For now, anything larger than INT_MAX is
 491     * clamped down. */
 492    uint32_t max_transfer;
 493
 494    /* memory alignment, in bytes so that no bounce buffer is needed */
 495    size_t min_mem_alignment;
 496
 497    /* memory alignment, in bytes, for bounce buffer */
 498    size_t opt_mem_alignment;
 499
 500    /* maximum number of iovec elements */
 501    int max_iov;
 502} BlockLimits;
 503
 504typedef struct BdrvOpBlocker BdrvOpBlocker;
 505
 506typedef struct BdrvAioNotifier {
 507    void (*attached_aio_context)(AioContext *new_context, void *opaque);
 508    void (*detach_aio_context)(void *opaque);
 509
 510    void *opaque;
 511    bool deleted;
 512
 513    QLIST_ENTRY(BdrvAioNotifier) list;
 514} BdrvAioNotifier;
 515
 516struct BdrvChildRole {
 517    /* If true, bdrv_replace_node() doesn't change the node this BdrvChild
 518     * points to. */
 519    bool stay_at_node;
 520
 521    void (*inherit_options)(int *child_flags, QDict *child_options,
 522                            int parent_flags, QDict *parent_options);
 523
 524    void (*change_media)(BdrvChild *child, bool load);
 525    void (*resize)(BdrvChild *child);
 526
 527    /* Returns a name that is supposedly more useful for human users than the
 528     * node name for identifying the node in question (in particular, a BB
 529     * name), or NULL if the parent can't provide a better name. */
 530    const char *(*get_name)(BdrvChild *child);
 531
 532    /* Returns a malloced string that describes the parent of the child for a
 533     * human reader. This could be a node-name, BlockBackend name, qdev ID or
 534     * QOM path of the device owning the BlockBackend, job type and ID etc. The
 535     * caller is responsible for freeing the memory. */
 536    char *(*get_parent_desc)(BdrvChild *child);
 537
 538    /*
 539     * If this pair of functions is implemented, the parent doesn't issue new
 540     * requests after returning from .drained_begin() until .drained_end() is
 541     * called.
 542     *
 543     * Note that this can be nested. If drained_begin() was called twice, new
 544     * I/O is allowed only after drained_end() was called twice, too.
 545     */
 546    void (*drained_begin)(BdrvChild *child);
 547    void (*drained_end)(BdrvChild *child);
 548
 549    /* Notifies the parent that the child has been activated/inactivated (e.g.
 550     * when migration is completing) and it can start/stop requesting
 551     * permissions and doing I/O on it. */
 552    void (*activate)(BdrvChild *child, Error **errp);
 553    int (*inactivate)(BdrvChild *child);
 554
 555    void (*attach)(BdrvChild *child);
 556    void (*detach)(BdrvChild *child);
 557
 558    /* Notifies the parent that the filename of its child has changed (e.g.
 559     * because the direct child was removed from the backing chain), so that it
 560     * can update its reference. */
 561    int (*update_filename)(BdrvChild *child, BlockDriverState *new_base,
 562                           const char *filename, Error **errp);
 563};
 564
 565extern const BdrvChildRole child_file;
 566extern const BdrvChildRole child_format;
 567extern const BdrvChildRole child_backing;
 568
 569struct BdrvChild {
 570    BlockDriverState *bs;
 571    char *name;
 572    const BdrvChildRole *role;
 573    void *opaque;
 574
 575    /**
 576     * Granted permissions for operating on this BdrvChild (BLK_PERM_* bitmask)
 577     */
 578    uint64_t perm;
 579
 580    /**
 581     * Permissions that can still be granted to other users of @bs while this
 582     * BdrvChild is still attached to it. (BLK_PERM_* bitmask)
 583     */
 584    uint64_t shared_perm;
 585
 586    QLIST_ENTRY(BdrvChild) next;
 587    QLIST_ENTRY(BdrvChild) next_parent;
 588};
 589
 590/*
 591 * Note: the function bdrv_append() copies and swaps contents of
 592 * BlockDriverStates, so if you add new fields to this struct, please
 593 * inspect bdrv_append() to determine if the new fields need to be
 594 * copied as well.
 595 */
 596struct BlockDriverState {
 597    /* Protected by big QEMU lock or read-only after opening.  No special
 598     * locking needed during I/O...
 599     */
 600    int open_flags; /* flags used to open the file, re-used for re-open */
 601    bool read_only; /* if true, the media is read only */
 602    bool encrypted; /* if true, the media is encrypted */
 603    bool sg;        /* if true, the device is a /dev/sg* */
 604    bool probed;    /* if true, format was probed rather than specified */
 605    bool force_share; /* if true, always allow all shared permissions */
 606    bool implicit;  /* if true, this filter node was automatically inserted */
 607
 608    BlockDriver *drv; /* NULL means no media */
 609    void *opaque;
 610
 611    AioContext *aio_context; /* event loop used for fd handlers, timers, etc */
 612    /* long-running tasks intended to always use the same AioContext as this
 613     * BDS may register themselves in this list to be notified of changes
 614     * regarding this BDS's context */
 615    QLIST_HEAD(, BdrvAioNotifier) aio_notifiers;
 616    bool walking_aio_notifiers; /* to make removal during iteration safe */
 617
 618    char filename[PATH_MAX];
 619    char backing_file[PATH_MAX]; /* if non zero, the image is a diff of
 620                                    this file image */
 621    char backing_format[16]; /* if non-zero and backing_file exists */
 622
 623    QDict *full_open_options;
 624    char exact_filename[PATH_MAX];
 625
 626    BdrvChild *backing;
 627    BdrvChild *file;
 628
 629    /* I/O Limits */
 630    BlockLimits bl;
 631
 632    /* Flags honored during pwrite (so far: BDRV_REQ_FUA) */
 633    unsigned int supported_write_flags;
 634    /* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA,
 635     * BDRV_REQ_MAY_UNMAP) */
 636    unsigned int supported_zero_flags;
 637
 638    /* the following member gives a name to every node on the bs graph. */
 639    char node_name[32];
 640    /* element of the list of named nodes building the graph */
 641    QTAILQ_ENTRY(BlockDriverState) node_list;
 642    /* element of the list of all BlockDriverStates (all_bdrv_states) */
 643    QTAILQ_ENTRY(BlockDriverState) bs_list;
 644    /* element of the list of monitor-owned BDS */
 645    QTAILQ_ENTRY(BlockDriverState) monitor_list;
 646    int refcnt;
 647
 648    /* operation blockers */
 649    QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX];
 650
 651    /* long-running background operation */
 652    BlockJob *job;
 653
 654    /* The node that this node inherited default options from (and a reopen on
 655     * which can affect this node by changing these defaults). This is always a
 656     * parent node of this node. */
 657    BlockDriverState *inherits_from;
 658    QLIST_HEAD(, BdrvChild) children;
 659    QLIST_HEAD(, BdrvChild) parents;
 660
 661    QDict *options;
 662    QDict *explicit_options;
 663    BlockdevDetectZeroesOptions detect_zeroes;
 664
 665    /* The error object in use for blocking operations on backing_hd */
 666    Error *backing_blocker;
 667
 668    /* Protected by AioContext lock */
 669
 670    /* If we are reading a disk image, give its size in sectors.
 671     * Generally read-only; it is written to by load_snapshot and
 672     * save_snaphost, but the block layer is quiescent during those.
 673     */
 674    int64_t total_sectors;
 675
 676    /* Callback before write request is processed */
 677    NotifierWithReturnList before_write_notifiers;
 678
 679    /* threshold limit for writes, in bytes. "High water mark". */
 680    uint64_t write_threshold_offset;
 681    NotifierWithReturn write_threshold_notifier;
 682
 683    /* Writing to the list requires the BQL _and_ the dirty_bitmap_mutex.
 684     * Reading from the list can be done with either the BQL or the
 685     * dirty_bitmap_mutex.  Modifying a bitmap only requires
 686     * dirty_bitmap_mutex.  */
 687    QemuMutex dirty_bitmap_mutex;
 688    QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;
 689
 690    /* Offset after the highest byte written to */
 691    Stat64 wr_highest_offset;
 692
 693    /* If true, copy read backing sectors into image.  Can be >1 if more
 694     * than one client has requested copy-on-read.  Accessed with atomic
 695     * ops.
 696     */
 697    int copy_on_read;
 698
 699    /* number of in-flight requests; overall and serialising.
 700     * Accessed with atomic ops.
 701     */
 702    unsigned int in_flight;
 703    unsigned int serialising_in_flight;
 704
 705    /* Internal to BDRV_POLL_WHILE and bdrv_wakeup.  Accessed with atomic
 706     * ops.
 707     */
 708    bool wakeup;
 709
 710    /* counter for nested bdrv_io_plug.
 711     * Accessed with atomic ops.
 712    */
 713    unsigned io_plugged;
 714
 715    /* do we need to tell the quest if we have a volatile write cache? */
 716    int enable_write_cache;
 717
 718    /* Accessed with atomic ops.  */
 719    int quiesce_counter;
 720    unsigned int write_gen;               /* Current data generation */
 721
 722    /* Protected by reqs_lock.  */
 723    CoMutex reqs_lock;
 724    QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
 725    CoQueue flush_queue;                  /* Serializing flush queue */
 726    bool active_flush_req;                /* Flush request in flight? */
 727
 728    /* Only read/written by whoever has set active_flush_req to true.  */
 729    unsigned int flushed_gen;             /* Flushed write generation */
 730};
 731
 732struct BlockBackendRootState {
        /*
         * Groups three settings under the name "root state": a set of open
         * flags, a read-only flag and a zero-detection mode.
         * NOTE(review): who fills this structure in and when it is consulted
         * is not visible in this header -- confirm against the BlockBackend
         * code before relying on it.
         */
 733    int open_flags;     /* presumably BDRV_O_* open flags -- TODO confirm */
 734    bool read_only;     /* read-only flag, kept separately from open_flags */
        /* Same type as the detect_zeroes field of the struct declared above. */
 735    BlockdevDetectZeroesOptions detect_zeroes;
 736};
 737
 738typedef enum BlockMirrorBackingMode {
        /*
         * Selects how the mirror target's backing chain is established after
         * the job completes; passed to mirror_start() as @backing_mode.
         */

 739    /* Reuse the existing backing chain from the source for the target.
 740     * - sync=full: Set backing BDS to NULL.
 741     * - sync=top:  Use source's backing BDS.
 742     * - sync=none: Use source as the backing BDS. */
 743    MIRROR_SOURCE_BACKING_CHAIN,
 744
 745    /* Open the target's backing chain completely anew */
 746    MIRROR_OPEN_BACKING_CHAIN,
 747
 748    /* Do not change the target's backing BDS after job completion */
 749    MIRROR_LEAVE_BACKING_CHAIN,
 750} BlockMirrorBackingMode;
 751
 752static inline BlockDriverState *backing_bs(BlockDriverState *bs)
 753{
        /*
         * Resolve the node reached through @bs's backing link: follow
         * bs->backing to its ->bs member.  When the link itself is NULL
         * there is nothing to follow and NULL is returned.  @bs is
         * dereferenced unconditionally, so it must not be NULL.
         */
 754    if (!bs->backing) {
            return NULL;
        }

        return bs->backing->bs;
 755}
 756
 757
 758/* Essential block drivers which must always be statically linked into qemu, and
 759 * which therefore can be accessed without using bdrv_find_format() */
 760extern BlockDriver bdrv_file;
 761extern BlockDriver bdrv_raw;
 762extern BlockDriver bdrv_qcow2;
 763
 764int coroutine_fn bdrv_co_preadv(BdrvChild *child,
 765    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
 766    BdrvRequestFlags flags);
 767int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
 768    int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
 769    BdrvRequestFlags flags);
 770
 771int get_tmp_filename(char *filename, int size);
 772BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
 773                            const char *filename);
 774
 775void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix,
 776                                      QDict *options);
 777
 778
 779/**
 780 * bdrv_add_before_write_notifier:
 781 *
 782 * Register a callback that is invoked before write requests are processed but
 783 * after any throttling or waiting for overlapping requests.
 784 */
 785void bdrv_add_before_write_notifier(BlockDriverState *bs,
 786                                    NotifierWithReturn *notifier);
 787
 788/**
 789 * bdrv_detach_aio_context:
 790 *
 791 * May be called from .bdrv_detach_aio_context() to detach children from the
 792 * current #AioContext.  This is only needed by block drivers that manage their
 793 * own children.  Both ->file and ->backing are automatically handled and
 794 * block drivers should not call this function on them explicitly.
 795 */
 796void bdrv_detach_aio_context(BlockDriverState *bs);
 797
 798/**
 799 * bdrv_attach_aio_context:
 800 *
 801 * May be called from .bdrv_attach_aio_context() to attach children to the new
 802 * #AioContext.  This is only needed by block drivers that manage their own
 803 * children.  Both ->file and ->backing are automatically handled and block
 804 * drivers should not call this function on them explicitly.
 805 */
 806void bdrv_attach_aio_context(BlockDriverState *bs,
 807                             AioContext *new_context);
 808
 809/**
 810 * bdrv_add_aio_context_notifier:
 811 *
 812 * If a long-running job intends to be always run in the same AioContext as a
 813 * certain BDS, it may use this function to be notified of changes regarding the
 814 * association of the BDS to an AioContext.
 815 *
 816 * attached_aio_context() is called after the target BDS has been attached to a
 817 * new AioContext; detach_aio_context() is called before the target BDS is being
 818 * detached from its old AioContext.
 819 */
 820void bdrv_add_aio_context_notifier(BlockDriverState *bs,
 821        void (*attached_aio_context)(AioContext *new_context, void *opaque),
 822        void (*detach_aio_context)(void *opaque), void *opaque);
 823
 824/**
 825 * bdrv_remove_aio_context_notifier:
 826 *
 827 * Unsubscribe from change notifications regarding the BDS's AioContext. The
 828 * parameters given here have to be the same as those given to
 829 * bdrv_add_aio_context_notifier().
     *
     * @aio_context_attached and @aio_context_detached take the place of the
     * attached_aio_context and detach_aio_context arguments of that function
     * (same position and callback type, different spelling).
 830 */
 831void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
 832                                      void (*aio_context_attached)(AioContext *,
 833                                                                   void *),
 834                                      void (*aio_context_detached)(void *),
 835                                      void *opaque);
 836
 837/**
 838 * bdrv_wakeup:
 839 * @bs: The BlockDriverState for which an I/O operation has been completed.
 840 *
 841 * Wake up the main thread if it is waiting on BDRV_POLL_WHILE.  During
 842 * synchronous I/O on a BlockDriverState that is attached to another
 843 * I/O thread, the main thread lets the I/O thread's event loop run,
 844 * waiting for the I/O operation to complete.  A bdrv_wakeup will wake
 845 * up the main thread if necessary.
 846 *
 847 * Manual calls to bdrv_wakeup are rarely necessary, because
 848 * bdrv_dec_in_flight already calls it.
 849 */
 850void bdrv_wakeup(BlockDriverState *bs);
 851
 852#ifdef _WIN32
 853int is_windows_drive(const char *filename);
 854#endif
 855
 856/**
 857 * stream_start:
 858 * @job_id: The id of the newly-created job, or %NULL to use the
 859 * device name of @bs.
 860 * @bs: Block device to operate on.
 861 * @base: Block device that will become the new base, or %NULL to
 862 * flatten the whole backing file chain onto @bs.
 863 * @backing_file_str: The file name that will be written to @bs as the
 864 * new backing file if the job completes. Ignored if @base is %NULL.
 865 * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
 866 * @on_error: The action to take upon error.
 867 * @errp: Error object.
 868 *
 869 * Start a streaming operation on @bs.  Clusters that are unallocated
 870 * in @bs, but allocated in any image between @base and @bs (both
 871 * exclusive) will be written to @bs.  At the end of a successful
 872 * streaming job, the backing file of @bs will be changed to
 873 * @backing_file_str in the written image and to @base in the live
 874 * BlockDriverState.
 875 */
 876void stream_start(const char *job_id, BlockDriverState *bs,
 877                  BlockDriverState *base, const char *backing_file_str,
 878                  int64_t speed, BlockdevOnError on_error, Error **errp);
 879
 880/**
 881 * commit_start:
 882 * @job_id: The id of the newly-created job, or %NULL to use the
 883 * device name of @bs.
 884 * @bs: Active block device.
 885 * @base: Block device that will be written into, and become the new top.
 886 * @top: Top block device to be committed.
 887 * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
 888 * @on_error: The action to take upon error.
 889 * @backing_file_str: String to use as the backing file in @top's overlay
 890 * @filter_node_name: The node name that should be assigned to the filter
 891 * driver that the commit job inserts into the graph above @top. NULL means
 892 * that a node name should be autogenerated.
 893 * @errp: Error object.
 894 *
     * Note the argument order: @base is passed before @top.  Both have the
     * same type, so swapping them is not caught by the compiler.
 895 */
 896void commit_start(const char *job_id, BlockDriverState *bs,
 897                  BlockDriverState *base, BlockDriverState *top, int64_t speed,
 898                  BlockdevOnError on_error, const char *backing_file_str,
 899                  const char *filter_node_name, Error **errp);
 900/**
 901 * commit_active_start:
 902 * @job_id: The id of the newly-created job, or %NULL to use the
 903 * device name of @bs.
 904 * @bs: Active block device to be committed.
 905 * @base: Block device that will be written into, and become the new top.
 906 * @creation_flags: Flags that control the behavior of the Job lifetime.
 907 *                  See @BlockJobCreateFlags
 908 * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
 909 * @on_error: The action to take upon error.
 910 * @filter_node_name: The node name that should be assigned to the filter
 911 * driver that the commit job inserts into the graph above @bs. NULL means that
 912 * a node name should be autogenerated.
 913 * @cb: Completion function for the job.
 914 * @opaque: Opaque pointer value passed to @cb.
 915 * @auto_complete: Auto complete the job.
 916 * @errp: Error object.
 917 *
 918 */
 919void commit_active_start(const char *job_id, BlockDriverState *bs,
 920                         BlockDriverState *base, int creation_flags,
 921                         int64_t speed, BlockdevOnError on_error,
 922                         const char *filter_node_name,
 923                         BlockCompletionFunc *cb, void *opaque,
 924                         bool auto_complete, Error **errp);
 925/**
 926 * mirror_start:
 927 * @job_id: The id of the newly-created job, or %NULL to use the
 928 * device name of @bs.
 929 * @bs: Block device to operate on.
 930 * @target: Block device to write to.
 931 * @replaces: Block graph node name to replace once the mirror is done. Can
 932 *            only be used when full mirroring is selected.
 933 * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
 934 * @granularity: The chosen granularity for the dirty bitmap.
 935 * @buf_size: The amount of data that can be in flight at one time.
 936 * @mode: Whether to collapse all images in the chain to the target.
 937 * @backing_mode: How to establish the target's backing chain after completion.
 938 * @on_source_error: The action to take upon error reading from the source.
 939 * @on_target_error: The action to take upon error writing to the target.
 940 * @unmap: Whether to unmap target where source sectors only contain zeroes.
 941 * @filter_node_name: The node name that should be assigned to the filter
 942 * driver that the mirror job inserts into the graph above @bs. NULL means that
 943 * a node name should be autogenerated.
 944 * @errp: Error object.
 945 *
 946 * Start a mirroring operation on @bs.  Clusters that are allocated
 947 * in @bs will be written to @target until the job is cancelled or
 948 * manually completed.  At the end of a successful mirroring job,
 949 * @bs will be switched to read from @target.
 950 */
 951void mirror_start(const char *job_id, BlockDriverState *bs,
 952                  BlockDriverState *target, const char *replaces,
 953                  int64_t speed, uint32_t granularity, int64_t buf_size,
 954                  MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
 955                  BlockdevOnError on_source_error,
 956                  BlockdevOnError on_target_error,
 957                  bool unmap, const char *filter_node_name, Error **errp);
 958
 959/**
 960 * backup_job_create:
 961 * @job_id: The id of the newly-created job, or %NULL to use the
 962 * device name of @bs.
 963 * @bs: Block device to operate on.
 964 * @target: Block device to write to.
 965 * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
 966 * @sync_mode: What parts of the disk image should be copied to the destination.
 967 * @sync_bitmap: The dirty bitmap if sync_mode is MIRROR_SYNC_MODE_INCREMENTAL.
     * @compress: NOTE(review): not documented in the original comment;
     *            presumably whether data written to @target is compressed --
     *            confirm against the backup job implementation.
 968 * @on_source_error: The action to take upon error reading from the source.
 969 * @on_target_error: The action to take upon error writing to the target.
 970 * @creation_flags: Flags that control the behavior of the Job lifetime.
 971 *                  See @BlockJobCreateFlags
 972 * @cb: Completion function for the job.
 973 * @opaque: Opaque pointer value passed to @cb.
 974 * @txn: Transaction that this job is part of (may be NULL).
     * @errp: Error object.
 975 *
 976 * Create a backup operation on @bs.  Clusters in @bs are written to @target
 977 * until the job is cancelled or manually completed.
     *
     * Returns a BlockJob pointer.  NOTE(review): the value returned on failure
     * (presumably %NULL with @errp set) is not visible in this header --
     * confirm against the implementation.
 978 */
 979BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
 980                            BlockDriverState *target, int64_t speed,
 981                            MirrorSyncMode sync_mode,
 982                            BdrvDirtyBitmap *sync_bitmap,
 983                            bool compress,
 984                            BlockdevOnError on_source_error,
 985                            BlockdevOnError on_target_error,
 986                            int creation_flags,
 987                            BlockCompletionFunc *cb, void *opaque,
 988                            BlockJobTxn *txn, Error **errp);
 989
 990void hmp_drive_add_node(Monitor *mon, const char *optstr);
 991
 992BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
 993                                  const char *child_name,
 994                                  const BdrvChildRole *child_role,
 995                                  uint64_t perm, uint64_t shared_perm,
 996                                  void *opaque, Error **errp);
 997void bdrv_root_unref_child(BdrvChild *child);
 998
 999int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
1000                            Error **errp);
1001
1002/* Default implementation for BlockDriver.bdrv_child_perm() that can be used by
1003 * block filters: Forward CONSISTENT_READ, WRITE, WRITE_UNCHANGED and RESIZE to
1004 * all children */
1005void bdrv_filter_default_perms(BlockDriverState *bs, BdrvChild *c,
1006                               const BdrvChildRole *role,
1007                               BlockReopenQueue *reopen_queue,
1008                               uint64_t perm, uint64_t shared,
1009                               uint64_t *nperm, uint64_t *nshared);
1010
1011/* Default implementation for BlockDriver.bdrv_child_perm() that can be used by
1012 * (non-raw) image formats: Like above for bs->backing, but for bs->file it
1013 * requires WRITE | RESIZE for read-write images, always requires
1014 * CONSISTENT_READ and doesn't share WRITE. */
1015void bdrv_format_default_perms(BlockDriverState *bs, BdrvChild *c,
1016                               const BdrvChildRole *role,
1017                               BlockReopenQueue *reopen_queue,
1018                               uint64_t perm, uint64_t shared,
1019                               uint64_t *nperm, uint64_t *nshared);
1020
1021/*
1022 * Default implementation for drivers to pass bdrv_co_get_block_status() to
1023 * their file.
1024 */
1025int64_t coroutine_fn bdrv_co_get_block_status_from_file(BlockDriverState *bs,
1026                                                        int64_t sector_num,
1027                                                        int nb_sectors,
1028                                                        int *pnum,
1029                                                        BlockDriverState **file);
1030/*
1031 * Default implementation for drivers to pass bdrv_co_get_block_status() to
1032 * their backing file.
1033 */
1034int64_t coroutine_fn bdrv_co_get_block_status_from_backing(BlockDriverState *bs,
1035                                                           int64_t sector_num,
1036                                                           int nb_sectors,
1037                                                           int *pnum,
1038                                                           BlockDriverState **file);
1039const char *bdrv_get_parent_name(const BlockDriverState *bs);
1040void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp);
1041bool blk_dev_has_removable_media(BlockBackend *blk);
1042bool blk_dev_has_tray(BlockBackend *blk);
1043void blk_dev_eject_request(BlockBackend *blk, bool force);
1044bool blk_dev_is_tray_open(BlockBackend *blk);
1045bool blk_dev_is_medium_locked(BlockBackend *blk);
1046
1047void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes);
1048bool bdrv_requests_pending(BlockDriverState *bs);
1049
1050void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
1051void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in);
1052
1053void bdrv_inc_in_flight(BlockDriverState *bs);
1054void bdrv_dec_in_flight(BlockDriverState *bs);
1055
1056void blockdev_close_all_bdrv_states(void);
1057
1058#endif /* BLOCK_INT_H */
1059