/* qemu/block/blkio.c */
   1/* SPDX-License-Identifier: LGPL-2.1-or-later */
   2/*
   3 * libblkio BlockDriver
   4 *
   5 * Copyright Red Hat, Inc.
   6 *
   7 * Author:
   8 *   Stefan Hajnoczi <stefanha@redhat.com>
   9 */
  10
  11#include "qemu/osdep.h"
  12#include <blkio.h>
  13#include "block/block_int.h"
  14#include "exec/memory.h"
  15#include "exec/cpu-common.h" /* for qemu_ram_get_fd() */
  16#include "qapi/error.h"
  17#include "qemu/error-report.h"
  18#include "qapi/qmp/qdict.h"
  19#include "qemu/module.h"
  20#include "exec/memory.h" /* for ram_block_discard_disable() */
  21
  22#include "block/block-io.h"
  23
/*
 * Keep the QEMU BlockDriver names identical to the libblkio driver names.
 * Using macros instead of typing out the string literals avoids typos.
 *
 * Each name serves double duty: it is the .format_name/.protocol_name of the
 * QEMU BlockDriver (see BLKIO_DRIVER) and the driver argument passed to
 * blkio_create() in blkio_file_open().
 */
#define DRIVER_IO_URING "io_uring"
#define DRIVER_NVME_IO_URING "nvme-io_uring"
#define DRIVER_VIRTIO_BLK_VFIO_PCI "virtio-blk-vfio-pci"
#define DRIVER_VIRTIO_BLK_VHOST_USER "virtio-blk-vhost-user"
#define DRIVER_VIRTIO_BLK_VHOST_VDPA "virtio-blk-vhost-vdpa"
  33
  34/*
  35 * Allocated bounce buffers are kept in a list sorted by buffer address.
  36 */
typedef struct BlkioBounceBuf {
    /* Entry in BDRVBlkioState->bounce_bufs, kept sorted by buffer address */
    QLIST_ENTRY(BlkioBounceBuf) next;

    /* The bounce buffer (a sub-range of BDRVBlkioState->bounce_pool) */
    struct iovec buf;
} BlkioBounceBuf;
  43
  44typedef struct {
  45    /*
  46     * libblkio is not thread-safe so this lock protects ->blkio and
  47     * ->blkioq.
  48     */
  49    QemuMutex blkio_lock;
  50    struct blkio *blkio;
  51    struct blkioq *blkioq; /* make this multi-queue in the future... */
  52    int completion_fd;
  53
  54    /*
  55     * Polling fetches the next completion into this field.
  56     *
  57     * No lock is necessary since only one thread calls aio_poll() and invokes
  58     * fd and poll handlers.
  59     */
  60    struct blkio_completion poll_completion;
  61
  62    /*
  63     * Protects ->bounce_pool, ->bounce_bufs, ->bounce_available.
  64     *
  65     * Lock ordering: ->bounce_lock before ->blkio_lock.
  66     */
  67    CoMutex bounce_lock;
  68
  69    /* Bounce buffer pool */
  70    struct blkio_mem_region bounce_pool;
  71
  72    /* Sorted list of allocated bounce buffers */
  73    QLIST_HEAD(, BlkioBounceBuf) bounce_bufs;
  74
  75    /* Queue for coroutines waiting for bounce buffer space */
  76    CoQueue bounce_available;
  77
  78    /* The value of the "mem-region-alignment" property */
  79    size_t mem_region_alignment;
  80
  81    /* Can we skip adding/deleting blkio_mem_regions? */
  82    bool needs_mem_regions;
  83
  84    /* Are file descriptors necessary for blkio_mem_regions? */
  85    bool needs_mem_region_fd;
  86
  87    /* Are madvise(MADV_DONTNEED)-style operations unavailable? */
  88    bool may_pin_mem_regions;
  89} BDRVBlkioState;
  90
  91/* Called with s->bounce_lock held */
  92static int blkio_resize_bounce_pool(BDRVBlkioState *s, int64_t bytes)
  93{
  94    /* There can be no allocated bounce buffers during resize */
  95    assert(QLIST_EMPTY(&s->bounce_bufs));
  96
  97    /* Pad size to reduce frequency of resize calls */
  98    bytes += 128 * 1024;
  99
 100    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
 101        int ret;
 102
 103        if (s->bounce_pool.addr) {
 104            blkio_unmap_mem_region(s->blkio, &s->bounce_pool);
 105            blkio_free_mem_region(s->blkio, &s->bounce_pool);
 106            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
 107        }
 108
 109        /* Automatically freed when s->blkio is destroyed */
 110        ret = blkio_alloc_mem_region(s->blkio, &s->bounce_pool, bytes);
 111        if (ret < 0) {
 112            return ret;
 113        }
 114
 115        ret = blkio_map_mem_region(s->blkio, &s->bounce_pool);
 116        if (ret < 0) {
 117            blkio_free_mem_region(s->blkio, &s->bounce_pool);
 118            memset(&s->bounce_pool, 0, sizeof(s->bounce_pool));
 119            return ret;
 120        }
 121    }
 122
 123    return 0;
 124}
 125
 126/* Called with s->bounce_lock held */
 127static bool
 128blkio_do_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
 129                             int64_t bytes)
 130{
 131    void *addr = s->bounce_pool.addr;
 132    BlkioBounceBuf *cur = NULL;
 133    BlkioBounceBuf *prev = NULL;
 134    ptrdiff_t space;
 135
 136    /*
 137     * This is just a linear search over the holes between requests. An
 138     * efficient allocator would be nice.
 139     */
 140    QLIST_FOREACH(cur, &s->bounce_bufs, next) {
 141        space = cur->buf.iov_base - addr;
 142        if (bytes <= space) {
 143            QLIST_INSERT_BEFORE(cur, bounce, next);
 144            bounce->buf.iov_base = addr;
 145            bounce->buf.iov_len = bytes;
 146            return true;
 147        }
 148
 149        addr = cur->buf.iov_base + cur->buf.iov_len;
 150        prev = cur;
 151    }
 152
 153    /* Is there space after the last request? */
 154    space = s->bounce_pool.addr + s->bounce_pool.len - addr;
 155    if (bytes > space) {
 156        return false;
 157    }
 158    if (prev) {
 159        QLIST_INSERT_AFTER(prev, bounce, next);
 160    } else {
 161        QLIST_INSERT_HEAD(&s->bounce_bufs, bounce, next);
 162    }
 163    bounce->buf.iov_base = addr;
 164    bounce->buf.iov_len = bytes;
 165    return true;
 166}
 167
/*
 * Allocate a bounce buffer of @bytes bytes, growing the pool if necessary and
 * yielding until space becomes available. Returns 0 on success or a negative
 * errno (from blkio_resize_bounce_pool()) on failure.
 *
 * Takes s->bounce_lock internally; must be called from coroutine context.
 */
static int coroutine_fn
blkio_alloc_bounce_buffer(BDRVBlkioState *s, BlkioBounceBuf *bounce,
                          int64_t bytes)
{
    /*
     * Ensure fairness: first time around we join the back of the queue,
     * subsequently we join the front so we don't lose our place.
     */
    CoQueueWaitFlags wait_flags = 0;

    QEMU_LOCK_GUARD(&s->bounce_lock);

    /* Ensure fairness: don't even try if other requests are already waiting */
    if (!qemu_co_queue_empty(&s->bounce_available)) {
        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }

    while (true) {
        if (blkio_do_alloc_bounce_buffer(s, bounce, bytes)) {
            /* Kick the next queued request since there may be space */
            qemu_co_queue_next(&s->bounce_available);
            return 0;
        }

        /*
         * If there are no in-flight requests then the pool was simply too
         * small.
         */
        if (QLIST_EMPTY(&s->bounce_bufs)) {
            bool ok;
            int ret;

            ret = blkio_resize_bounce_pool(s, bytes);
            if (ret < 0) {
                /* Kick the next queued request since that may fail too */
                qemu_co_queue_next(&s->bounce_available);
                return ret;
            }

            ok = blkio_do_alloc_bounce_buffer(s, bounce, bytes);
            assert(ok); /* must have space this time */
            return 0;
        }

        /* Sleep until blkio_free_bounce_buffer() signals available space */
        qemu_co_queue_wait_flags(&s->bounce_available, &s->bounce_lock,
                                 wait_flags);
        wait_flags = CO_QUEUE_WAIT_FRONT;
    }
}
 219
 220static void coroutine_fn blkio_free_bounce_buffer(BDRVBlkioState *s,
 221                                                  BlkioBounceBuf *bounce)
 222{
 223    QEMU_LOCK_GUARD(&s->bounce_lock);
 224
 225    QLIST_REMOVE(bounce, next);
 226
 227    /* Wake up waiting coroutines since space may now be available */
 228    qemu_co_queue_next(&s->bounce_available);
 229}
 230
/* For async to .bdrv_co_*() conversion */
typedef struct {
    Coroutine *coroutine; /* woken via aio_co_wake() when the request completes */
    int ret;              /* completion status filled in by the fd handler */
} BlkioCoData;
 236
/*
 * Completion fd handler: drain completions from the libblkio queue and wake
 * the coroutine attached to each one. @opaque is the BlockDriverState.
 */
static void blkio_completion_fd_read(void *opaque)
{
    BlockDriverState *bs = opaque;
    BDRVBlkioState *s = bs->opaque;
    uint64_t val;
    int ret;

    /* Polling may have already fetched a completion */
    if (s->poll_completion.user_data != NULL) {
        BlkioCoData *cod = s->poll_completion.user_data;
        cod->ret = s->poll_completion.ret;

        /* Clear it in case aio_co_wake() enters a nested event loop */
        s->poll_completion.user_data = NULL;

        aio_co_wake(cod->coroutine);
    }

    /* Reset completion fd status */
    ret = read(s->completion_fd, &val, sizeof(val));

    /* Ignore errors, there's nothing we can do */
    (void)ret;

    /*
     * Reading one completion at a time makes nested event loop re-entrancy
     * simple. Change this loop to get multiple completions in one go if it
     * becomes a performance bottleneck.
     */
    while (true) {
        struct blkio_completion completion;

        WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
            ret = blkioq_do_io(s->blkioq, &completion, 0, 1, NULL);
        }
        if (ret != 1) {
            break; /* no more completions pending */
        }

        BlkioCoData *cod = completion.user_data;
        cod->ret = completion.ret;
        aio_co_wake(cod->coroutine);
    }
}
 281
 282static bool blkio_completion_fd_poll(void *opaque)
 283{
 284    BlockDriverState *bs = opaque;
 285    BDRVBlkioState *s = bs->opaque;
 286    int ret;
 287
 288    /* Just in case we already fetched a completion */
 289    if (s->poll_completion.user_data != NULL) {
 290        return true;
 291    }
 292
 293    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
 294        ret = blkioq_do_io(s->blkioq, &s->poll_completion, 0, 1, NULL);
 295    }
 296    return ret == 1;
 297}
 298
/* Invoked after blkio_completion_fd_poll() reports a ready completion */
static void blkio_completion_fd_poll_ready(void *opaque)
{
    blkio_completion_fd_read(opaque);
}
 303
/* Start monitoring the completion fd in @new_context */
static void blkio_attach_aio_context(BlockDriverState *bs,
                                     AioContext *new_context)
{
    BDRVBlkioState *s = bs->opaque;

    aio_set_fd_handler(new_context,
                       s->completion_fd,
                       false,
                       blkio_completion_fd_read,
                       NULL, /* no write handler needed */
                       blkio_completion_fd_poll,
                       blkio_completion_fd_poll_ready,
                       bs);
}
 318
/* Stop monitoring the completion fd in the current AioContext */
static void blkio_detach_aio_context(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    /* Passing all-NULL handlers removes the fd from the event loop */
    aio_set_fd_handler(bdrv_get_aio_context(bs),
                       s->completion_fd,
                       false, NULL, NULL, NULL, NULL, NULL);
}
 327
 328/* Call with s->blkio_lock held to submit I/O after enqueuing a new request */
 329static void blkio_submit_io(BlockDriverState *bs)
 330{
 331    if (qatomic_read(&bs->io_plugged) == 0) {
 332        BDRVBlkioState *s = bs->opaque;
 333
 334        blkioq_do_io(s->blkioq, NULL, 0, 0, NULL);
 335    }
 336}
 337
 338static int coroutine_fn
 339blkio_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
 340{
 341    BDRVBlkioState *s = bs->opaque;
 342    BlkioCoData cod = {
 343        .coroutine = qemu_coroutine_self(),
 344    };
 345
 346    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
 347        blkioq_discard(s->blkioq, offset, bytes, &cod, 0);
 348        blkio_submit_io(bs);
 349    }
 350
 351    qemu_coroutine_yield();
 352    return cod.ret;
 353}
 354
/*
 * Read @bytes at @offset into @qiov. When the libblkio driver requires
 * registered memory and the caller's buffer is not registered, the read goes
 * through a bounce buffer and is copied out afterwards.
 */
static int coroutine_fn
blkio_co_preadv(BlockDriverState *bs, int64_t offset, int64_t bytes,
                QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        /* Read into the single bounce buffer instead of the caller's iovecs */
        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_readv(s->blkioq, offset, iov, iovcnt, &cod, 0);
        blkio_submit_io(bs);
    }

    /* Woken by the completion fd handler once cod.ret is set */
    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        /* Copy data back to the caller only on success */
        if (cod.ret == 0) {
            qemu_iovec_from_buf(qiov, 0,
                                bounce.buf.iov_base,
                                bounce.buf.iov_len);
        }

        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}
 398
/*
 * Write @bytes at @offset from @qiov. Mirrors blkio_co_preadv(): data is
 * staged into a bounce buffer first when the driver requires registered
 * memory and the caller's buffer is not registered.
 */
static int coroutine_fn blkio_co_pwritev(BlockDriverState *bs, int64_t offset,
        int64_t bytes, QEMUIOVector *qiov, BdrvRequestFlags flags)
{
    /* BDRV_REQ_FUA is the only write flag libblkio needs translated here */
    uint32_t blkio_flags = (flags & BDRV_REQ_FUA) ? BLKIO_REQ_FUA : 0;
    BlkioCoData cod = {
        .coroutine = qemu_coroutine_self(),
    };
    BDRVBlkioState *s = bs->opaque;
    bool use_bounce_buffer =
        s->needs_mem_regions && !(flags & BDRV_REQ_REGISTERED_BUF);
    BlkioBounceBuf bounce;
    struct iovec *iov = qiov->iov;
    int iovcnt = qiov->niov;

    if (use_bounce_buffer) {
        int ret = blkio_alloc_bounce_buffer(s, &bounce, bytes);
        if (ret < 0) {
            return ret;
        }

        /* Stage the payload into the bounce buffer before submission */
        qemu_iovec_to_buf(qiov, 0, bounce.buf.iov_base, bytes);
        iov = &bounce.buf;
        iovcnt = 1;
    }

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkioq_writev(s->blkioq, offset, iov, iovcnt, &cod, blkio_flags);
        blkio_submit_io(bs);
    }

    /* Woken by the completion fd handler once cod.ret is set */
    qemu_coroutine_yield();

    if (use_bounce_buffer) {
        blkio_free_bounce_buffer(s, &bounce);
    }

    return cod.ret;
}
 437
 438static int coroutine_fn blkio_co_flush(BlockDriverState *bs)
 439{
 440    BDRVBlkioState *s = bs->opaque;
 441    BlkioCoData cod = {
 442        .coroutine = qemu_coroutine_self(),
 443    };
 444
 445    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
 446        blkioq_flush(s->blkioq, &cod, 0);
 447        blkio_submit_io(bs);
 448    }
 449
 450    qemu_coroutine_yield();
 451    return cod.ret;
 452}
 453
 454static int coroutine_fn blkio_co_pwrite_zeroes(BlockDriverState *bs,
 455    int64_t offset, int64_t bytes, BdrvRequestFlags flags)
 456{
 457    BDRVBlkioState *s = bs->opaque;
 458    BlkioCoData cod = {
 459        .coroutine = qemu_coroutine_self(),
 460    };
 461    uint32_t blkio_flags = 0;
 462
 463    if (flags & BDRV_REQ_FUA) {
 464        blkio_flags |= BLKIO_REQ_FUA;
 465    }
 466    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
 467        blkio_flags |= BLKIO_REQ_NO_UNMAP;
 468    }
 469    if (flags & BDRV_REQ_NO_FALLBACK) {
 470        blkio_flags |= BLKIO_REQ_NO_FALLBACK;
 471    }
 472
 473    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
 474        blkioq_write_zeroes(s->blkioq, offset, bytes, &cod, blkio_flags);
 475        blkio_submit_io(bs);
 476    }
 477
 478    qemu_coroutine_yield();
 479    return cod.ret;
 480}
 481
/* Submit any requests that were queued while bs->io_plugged was raised */
static void coroutine_fn blkio_co_io_unplug(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
        blkio_submit_io(bs);
    }
}
 490
/* Outcome of blkio_mem_region_from_host() */
typedef enum {
    BMRR_OK,   /* region filled in and usable with blkio_map_mem_region() */
    BMRR_SKIP, /* memory has no fd backing it; silently skip registration */
    BMRR_FAIL, /* invalid buffer; errp has been set */
} BlkioMemRegionResult;
 496
/*
 * Produce a struct blkio_mem_region for a given address and size.
 *
 * This function produces identical results when called multiple times with the
 * same arguments. This property is necessary because blkio_unmap_mem_region()
 * must receive the same struct blkio_mem_region field values that were passed
 * to blkio_map_mem_region().
 *
 * Returns BMRR_OK and fills in *region on success, BMRR_SKIP for fd-less
 * memory that should be silently ignored, or BMRR_FAIL (with *errp set) for
 * invalid buffers.
 */
static BlkioMemRegionResult
blkio_mem_region_from_host(BlockDriverState *bs,
                           void *host, size_t size,
                           struct blkio_mem_region *region,
                           Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    int fd = -1;
    ram_addr_t fd_offset = 0;

    /* Both the address and the size must honor mem-region-alignment */
    if (((uintptr_t)host | size) % s->mem_region_alignment) {
        error_setg(errp, "unaligned buf %p with size %zu", host, size);
        return BMRR_FAIL;
    }

    /* Attempt to find the fd for the underlying memory */
    if (s->needs_mem_region_fd) {
        RAMBlock *ram_block;
        RAMBlock *end_block;
        ram_addr_t offset;

        /*
         * bdrv_register_buf() is called with the BQL held so mr lives at least
         * until this function returns.
         */
        ram_block = qemu_ram_block_from_host(host, false, &fd_offset);
        if (ram_block) {
            fd = qemu_ram_get_fd(ram_block);
        }
        if (fd == -1) {
            /*
             * Ideally every RAMBlock would have an fd. pc-bios and other
             * things don't. Luckily they are usually not I/O buffers and we
             * can just ignore them.
             */
            return BMRR_SKIP;
        }

        /* Make sure the fd covers the entire range */
        end_block = qemu_ram_block_from_host(host + size - 1, false, &offset);
        if (ram_block != end_block) {
            error_setg(errp, "registered buffer at %p with size %zu extends "
                       "beyond RAMBlock", host, size);
            return BMRR_FAIL;
        }
    }

    *region = (struct blkio_mem_region){
        .addr = host,
        .len = size,
        .fd = fd,
        .fd_offset = fd_offset,
    };
    return BMRR_OK;
}
 560
 561static bool blkio_register_buf(BlockDriverState *bs, void *host, size_t size,
 562                               Error **errp)
 563{
 564    BDRVBlkioState *s = bs->opaque;
 565    struct blkio_mem_region region;
 566    BlkioMemRegionResult region_result;
 567    int ret;
 568
 569    /*
 570     * Mapping memory regions conflicts with RAM discard (virtio-mem) when
 571     * there is pinning, so only do it when necessary.
 572     */
 573    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
 574        return true;
 575    }
 576
 577    region_result = blkio_mem_region_from_host(bs, host, size, &region, errp);
 578    if (region_result == BMRR_SKIP) {
 579        return true;
 580    } else if (region_result != BMRR_OK) {
 581        return false;
 582    }
 583
 584    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
 585        ret = blkio_map_mem_region(s->blkio, &region);
 586    }
 587
 588    if (ret < 0) {
 589        error_setg(errp, "Failed to add blkio mem region %p with size %zu: %s",
 590                   host, size, blkio_get_error_msg());
 591        return false;
 592    }
 593    return true;
 594}
 595
 596static void blkio_unregister_buf(BlockDriverState *bs, void *host, size_t size)
 597{
 598    BDRVBlkioState *s = bs->opaque;
 599    struct blkio_mem_region region;
 600
 601    /* See blkio_register_buf() */
 602    if (!s->needs_mem_regions && s->may_pin_mem_regions) {
 603        return;
 604    }
 605
 606    if (blkio_mem_region_from_host(bs, host, size, &region, NULL) != BMRR_OK) {
 607        return;
 608    }
 609
 610    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
 611        blkio_unmap_mem_region(s->blkio, &region);
 612    }
 613}
 614
 615static int blkio_io_uring_open(BlockDriverState *bs, QDict *options, int flags,
 616                               Error **errp)
 617{
 618    const char *filename = qdict_get_str(options, "filename");
 619    BDRVBlkioState *s = bs->opaque;
 620    int ret;
 621
 622    ret = blkio_set_str(s->blkio, "path", filename);
 623    qdict_del(options, "filename");
 624    if (ret < 0) {
 625        error_setg_errno(errp, -ret, "failed to set path: %s",
 626                         blkio_get_error_msg());
 627        return ret;
 628    }
 629
 630    if (flags & BDRV_O_NOCACHE) {
 631        ret = blkio_set_bool(s->blkio, "direct", true);
 632        if (ret < 0) {
 633            error_setg_errno(errp, -ret, "failed to set direct: %s",
 634                             blkio_get_error_msg());
 635            return ret;
 636        }
 637    }
 638
 639    return 0;
 640}
 641
 642static int blkio_nvme_io_uring(BlockDriverState *bs, QDict *options, int flags,
 643                               Error **errp)
 644{
 645    const char *path = qdict_get_try_str(options, "path");
 646    BDRVBlkioState *s = bs->opaque;
 647    int ret;
 648
 649    if (!path) {
 650        error_setg(errp, "missing 'path' option");
 651        return -EINVAL;
 652    }
 653
 654    ret = blkio_set_str(s->blkio, "path", path);
 655    qdict_del(options, "path");
 656    if (ret < 0) {
 657        error_setg_errno(errp, -ret, "failed to set path: %s",
 658                         blkio_get_error_msg());
 659        return ret;
 660    }
 661
 662    if (!(flags & BDRV_O_NOCACHE)) {
 663        error_setg(errp, "cache.direct=off is not supported");
 664        return -EINVAL;
 665    }
 666
 667    return 0;
 668}
 669
 670static int blkio_virtio_blk_common_open(BlockDriverState *bs,
 671        QDict *options, int flags, Error **errp)
 672{
 673    const char *path = qdict_get_try_str(options, "path");
 674    BDRVBlkioState *s = bs->opaque;
 675    int ret;
 676
 677    if (!path) {
 678        error_setg(errp, "missing 'path' option");
 679        return -EINVAL;
 680    }
 681
 682    ret = blkio_set_str(s->blkio, "path", path);
 683    qdict_del(options, "path");
 684    if (ret < 0) {
 685        error_setg_errno(errp, -ret, "failed to set path: %s",
 686                         blkio_get_error_msg());
 687        return ret;
 688    }
 689
 690    if (!(flags & BDRV_O_NOCACHE)) {
 691        error_setg(errp, "cache.direct=off is not supported");
 692        return -EINVAL;
 693    }
 694    return 0;
 695}
 696
/*
 * Open a libblkio-backed BlockDriverState: create the libblkio instance,
 * apply driver-specific options, connect, query capabilities, and start the
 * queue. On every failure path the libblkio instance is destroyed before
 * returning a negative errno.
 */
static int blkio_file_open(BlockDriverState *bs, QDict *options, int flags,
                           Error **errp)
{
    /* protocol_name matches the libblkio driver name by construction */
    const char *blkio_driver = bs->drv->protocol_name;
    BDRVBlkioState *s = bs->opaque;
    int ret;

    ret = blkio_create(blkio_driver, &s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_create failed: %s",
                         blkio_get_error_msg());
        return ret;
    }

    /* Dispatch to the per-driver option handler */
    if (strcmp(blkio_driver, DRIVER_IO_URING) == 0) {
        ret = blkio_io_uring_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_NVME_IO_URING) == 0) {
        ret = blkio_nvme_io_uring(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VFIO_PCI) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_USER) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_VDPA) == 0) {
        ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
    } else {
        g_assert_not_reached();
    }
    if (ret < 0) {
        blkio_destroy(&s->blkio);
        return ret;
    }

    if (!(flags & BDRV_O_RDWR)) {
        ret = blkio_set_bool(s->blkio, "read-only", true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "failed to set read-only: %s",
                             blkio_get_error_msg());
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_connect(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_connect failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    /* Query the driver capabilities used by the I/O and buffer paths */
    ret = blkio_get_bool(s->blkio,
                         "needs-mem-regions",
                         &s->needs_mem_regions);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-regions: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "needs-mem-region-fd",
                         &s->needs_mem_region_fd);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get needs-mem-region-fd: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_uint64(s->blkio,
                           "mem-region-alignment",
                           &s->mem_region_alignment);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get mem-region-alignment: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        return ret;
    }

    ret = blkio_get_bool(s->blkio,
                         "may-pin-mem-regions",
                         &s->may_pin_mem_regions);
    if (ret < 0) {
        /* Be conservative (assume pinning) if the property is not supported */
        s->may_pin_mem_regions = s->needs_mem_regions;
    }

    /*
     * Notify if libblkio drivers pin memory and prevent features like
     * virtio-mem from working.
     */
    if (s->may_pin_mem_regions) {
        ret = ram_block_discard_disable(true);
        if (ret < 0) {
            error_setg_errno(errp, -ret, "ram_block_discard_disable() failed");
            blkio_destroy(&s->blkio);
            return ret;
        }
    }

    ret = blkio_start(s->blkio);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "blkio_start failed: %s",
                         blkio_get_error_msg());
        blkio_destroy(&s->blkio);
        /* Undo ram_block_discard_disable(true) from just above */
        if (s->may_pin_mem_regions) {
            ram_block_discard_disable(false);
        }
        return ret;
    }

    bs->supported_write_flags = BDRV_REQ_FUA | BDRV_REQ_REGISTERED_BUF;
    bs->supported_zero_flags = BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP |
                               BDRV_REQ_NO_FALLBACK;

    /* Set up the single queue and its completion fd handler */
    qemu_mutex_init(&s->blkio_lock);
    qemu_co_mutex_init(&s->bounce_lock);
    qemu_co_queue_init(&s->bounce_available);
    QLIST_INIT(&s->bounce_bufs);
    s->blkioq = blkio_get_queue(s->blkio, 0);
    s->completion_fd = blkioq_get_completion_fd(s->blkioq);

    blkio_attach_aio_context(bs, bdrv_get_aio_context(bs));
    return 0;
}
 826
/* Tear down everything set up by blkio_file_open() */
static void blkio_close(BlockDriverState *bs)
{
    BDRVBlkioState *s = bs->opaque;

    /* There is no destroy() API for s->bounce_lock */

    qemu_mutex_destroy(&s->blkio_lock);
    blkio_detach_aio_context(bs);
    blkio_destroy(&s->blkio);

    /* Balance the ram_block_discard_disable(true) from blkio_file_open() */
    if (s->may_pin_mem_regions) {
        ram_block_discard_disable(false);
    }
}
 841
 842static int64_t coroutine_fn blkio_co_getlength(BlockDriverState *bs)
 843{
 844    BDRVBlkioState *s = bs->opaque;
 845    uint64_t capacity;
 846    int ret;
 847
 848    WITH_QEMU_LOCK_GUARD(&s->blkio_lock) {
 849        ret = blkio_get_uint64(s->blkio, "capacity", &capacity);
 850    }
 851    if (ret < 0) {
 852        return -ret;
 853    }
 854
 855    return capacity;
 856}
 857
 858static int coroutine_fn blkio_truncate(BlockDriverState *bs, int64_t offset,
 859                                       bool exact, PreallocMode prealloc,
 860                                       BdrvRequestFlags flags, Error **errp)
 861{
 862    int64_t current_length;
 863
 864    if (prealloc != PREALLOC_MODE_OFF) {
 865        error_setg(errp, "Unsupported preallocation mode '%s'",
 866                   PreallocMode_str(prealloc));
 867        return -ENOTSUP;
 868    }
 869
 870    current_length = blkio_co_getlength(bs);
 871
 872    if (offset > current_length) {
 873        error_setg(errp, "Cannot grow device");
 874        return -EINVAL;
 875    } else if (exact && offset != current_length) {
 876        error_setg(errp, "Cannot resize device");
 877        return -ENOTSUP;
 878    }
 879
 880    return 0;
 881}
 882
/* No driver-specific BlockDriverInfo fields to report; succeed trivially */
static int coroutine_fn
blkio_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
    return 0;
}
 888
/*
 * Populate bs->bl from libblkio properties, validating each value. On any
 * failure *errp is set and the function returns early, leaving the limits
 * filled in so far.
 */
static void blkio_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVBlkioState *s = bs->opaque;
    QEMU_LOCK_GUARD(&s->blkio_lock); /* held for all property reads below */
    int value;
    int ret;

    ret = blkio_get_int(s->blkio, "request-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"request-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.request_alignment = value;
    if (bs->bl.request_alignment < 1 ||
        bs->bl.request_alignment >= INT_MAX ||
        !is_power_of_2(bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"request-alignment\" value %" PRIu32 ", "
                   "must be a power of 2 less than INT_MAX",
                   bs->bl.request_alignment);
        return;
    }

    /* opt_transfer must be a multiple of the request alignment */
    ret = blkio_get_int(s->blkio, "optimal-io-size", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"optimal-io-size\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.opt_transfer = value;
    if (bs->bl.opt_transfer > INT_MAX ||
        (bs->bl.opt_transfer % bs->bl.request_alignment)) {
        error_setg(errp, "invalid \"optimal-io-size\" value %" PRIu32 ", must "
                   "be a multiple of %" PRIu32, bs->bl.opt_transfer,
                   bs->bl.request_alignment);
        return;
    }

    /* max_transfer must be a multiple of both alignment and opt_transfer */
    ret = blkio_get_int(s->blkio, "max-transfer", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-transfer\": %s",
                         blkio_get_error_msg());
        return;
    }
    bs->bl.max_transfer = value;
    if ((bs->bl.max_transfer % bs->bl.request_alignment) ||
        (bs->bl.opt_transfer && (bs->bl.max_transfer % bs->bl.opt_transfer))) {
        error_setg(errp, "invalid \"max-transfer\" value %" PRIu32 ", must be "
                   "a multiple of %" PRIu32 " and %" PRIu32 " (if non-zero)",
                   bs->bl.max_transfer, bs->bl.request_alignment,
                   bs->bl.opt_transfer);
        return;
    }

    ret = blkio_get_int(s->blkio, "buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"buf-alignment\" value %d, must be "
                   "positive", value);
        return;
    }
    bs->bl.min_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "optimal-buf-alignment", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret,
                         "failed to get \"optimal-buf-alignment\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"optimal-buf-alignment\" value %d, "
                   "must be positive", value);
        return;
    }
    bs->bl.opt_mem_alignment = value;

    ret = blkio_get_int(s->blkio, "max-segments", &value);
    if (ret < 0) {
        error_setg_errno(errp, -ret, "failed to get \"max-segments\": %s",
                         blkio_get_error_msg());
        return;
    }
    if (value < 1) {
        error_setg(errp, "invalid \"max-segments\" value %d, must be positive",
                   value);
        return;
    }
    bs->bl.max_iov = value;
}
 983
/*
 * TODO
 * Missing libblkio APIs:
 * - block_status
 * - co_invalidate_cache
 *
 * Out of scope?
 * - create
 * - truncate
 */

/*
 * Common BlockDriver initializer shared by all libblkio drivers. @name is
 * used as both .format_name and .protocol_name (it must match the libblkio
 * driver name — see blkio_file_open()). Extra designated initializers can be
 * appended through __VA_ARGS__.
 */
#define BLKIO_DRIVER(name, ...) \
    { \
        .format_name             = name, \
        .protocol_name           = name, \
        .instance_size           = sizeof(BDRVBlkioState), \
        .bdrv_file_open          = blkio_file_open, \
        .bdrv_close              = blkio_close, \
        .bdrv_co_getlength       = blkio_co_getlength, \
        .bdrv_co_truncate        = blkio_truncate, \
        .bdrv_co_get_info        = blkio_co_get_info, \
        .bdrv_attach_aio_context = blkio_attach_aio_context, \
        .bdrv_detach_aio_context = blkio_detach_aio_context, \
        .bdrv_co_pdiscard        = blkio_co_pdiscard, \
        .bdrv_co_preadv          = blkio_co_preadv, \
        .bdrv_co_pwritev         = blkio_co_pwritev, \
        .bdrv_co_flush_to_disk   = blkio_co_flush, \
        .bdrv_co_pwrite_zeroes   = blkio_co_pwrite_zeroes, \
        .bdrv_co_io_unplug       = blkio_co_io_unplug, \
        .bdrv_refresh_limits     = blkio_refresh_limits, \
        .bdrv_register_buf       = blkio_register_buf, \
        .bdrv_unregister_buf     = blkio_unregister_buf, \
        __VA_ARGS__ \
    }
1018
/* io_uring takes a plain filename, hence bdrv_needs_filename */
static BlockDriver bdrv_io_uring = BLKIO_DRIVER(
    DRIVER_IO_URING,
    .bdrv_needs_filename = true,
);

/* The remaining drivers take a "path" option instead of a filename */
static BlockDriver bdrv_nvme_io_uring = BLKIO_DRIVER(
    DRIVER_NVME_IO_URING,
);

static BlockDriver bdrv_virtio_blk_vfio_pci = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VFIO_PCI
);

static BlockDriver bdrv_virtio_blk_vhost_user = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_USER
);

static BlockDriver bdrv_virtio_blk_vhost_vdpa = BLKIO_DRIVER(
    DRIVER_VIRTIO_BLK_VHOST_VDPA
);
1039
/* Register all libblkio-backed BlockDrivers at module load time */
static void bdrv_blkio_init(void)
{
    bdrv_register(&bdrv_io_uring);
    bdrv_register(&bdrv_nvme_io_uring);
    bdrv_register(&bdrv_virtio_blk_vfio_pci);
    bdrv_register(&bdrv_virtio_blk_vhost_user);
    bdrv_register(&bdrv_virtio_blk_vhost_vdpa);
}

block_init(bdrv_blkio_init);
1050