qemu/block/nvme.c
   1/*
   2 * NVMe block driver based on vfio
   3 *
   4 * Copyright 2016 - 2018 Red Hat, Inc.
   5 *
   6 * Authors:
   7 *   Fam Zheng <famz@redhat.com>
   8 *   Paolo Bonzini <pbonzini@redhat.com>
   9 *
  10 * This work is licensed under the terms of the GNU GPL, version 2 or later.
  11 * See the COPYING file in the top-level directory.
  12 */
  13
  14#include "qemu/osdep.h"
  15#include <linux/vfio.h>
  16#include "qapi/error.h"
  17#include "qapi/qmp/qdict.h"
  18#include "qapi/qmp/qstring.h"
  19#include "qemu/error-report.h"
  20#include "qemu/main-loop.h"
  21#include "qemu/module.h"
  22#include "qemu/cutils.h"
  23#include "qemu/option.h"
  24#include "qemu/vfio-helpers.h"
  25#include "block/block_int.h"
  26#include "sysemu/replay.h"
  27#include "trace.h"
  28
  29#include "block/nvme.h"
  30
  31#define NVME_SQ_ENTRY_BYTES 64
  32#define NVME_CQ_ENTRY_BYTES 16
  33#define NVME_QUEUE_SIZE 128
  34#define NVME_BAR_SIZE 8192
  35
  36typedef struct {
  37    int32_t  head, tail;
  38    uint8_t  *queue;
  39    uint64_t iova;
  40    /* Hardware MMIO register */
  41    volatile uint32_t *doorbell;
  42} NVMeQueue;
  43
  44typedef struct {
  45    BlockCompletionFunc *cb;
  46    void *opaque;
  47    int cid;
  48    void *prp_list_page;
  49    uint64_t prp_list_iova;
  50    bool busy;
  51} NVMeRequest;
  52
  53typedef struct {
  54    CoQueue     free_req_queue;
  55    QemuMutex   lock;
  56
  57    /* Fields protected by BQL */
  58    int         index;
  59    uint8_t     *prp_list_pages;
  60
  61    /* Fields protected by @lock */
  62    NVMeQueue   sq, cq;
  63    int         cq_phase;
  64    NVMeRequest reqs[NVME_QUEUE_SIZE];
  65    bool        busy;
  66    int         need_kick;
  67    int         inflight;
  68} NVMeQueuePair;
  69
  70/* Memory mapped registers */
  71typedef volatile struct {
  72    uint64_t cap;
  73    uint32_t vs;
  74    uint32_t intms;
  75    uint32_t intmc;
  76    uint32_t cc;
  77    uint32_t reserved0;
  78    uint32_t csts;
  79    uint32_t nssr;
  80    uint32_t aqa;
  81    uint64_t asq;
  82    uint64_t acq;
  83    uint32_t cmbloc;
  84    uint32_t cmbsz;
  85    uint8_t  reserved1[0xec0];
  86    uint8_t  cmd_set_specific[0x100];
  87    uint32_t doorbells[];
  88} NVMeRegs;
  89
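     /* Per the NVMe specification the doorbell registers start at BAR0 offset
      * 0x1000; the reserved and command-set-specific padding above is sized so
      * that the build breaks if this struct ever drifts from that layout. */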
  90QEMU_BUILD_BUG_ON(offsetof(NVMeRegs, doorbells) != 0x1000);
  91
  92typedef struct {
  93    AioContext *aio_context;
  94    QEMUVFIOState *vfio;
  95    NVMeRegs *regs;
  96    /* The submission/completion queue pairs.
  97     * [0]: admin queue.
  98     * [1..]: io queues.
  99     */
 100    NVMeQueuePair **queues;
 101    int nr_queues;
 102    size_t page_size;
 103    /* How many uint32_t elements does each doorbell entry take. */
 104    size_t doorbell_scale;
 105    bool write_cache_supported;
 106    EventNotifier irq_notifier;
 107
 108    uint64_t nsze; /* Namespace size reported by identify command */
 109    int nsid;      /* The namespace id to read/write data. */
 110    int blkshift;
 111
 112    uint64_t max_transfer;
 113    bool plugged;
 114
 115    bool supports_write_zeroes;
 116    bool supports_discard;
 117
 118    CoMutex dma_map_lock;
 119    CoQueue dma_flush_queue;
 120
 121    /* Total size of mapped qiov, accessed under dma_map_lock */
 122    int dma_map_count;
 123
 124    /* PCI address (required for nvme_refresh_filename()) */
 125    char *device;
 126} BDRVNVMeState;
 127
 128#define NVME_BLOCK_OPT_DEVICE "device"
 129#define NVME_BLOCK_OPT_NAMESPACE "namespace"
 130
 131static QemuOptsList runtime_opts = {
 132    .name = "nvme",
 133    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
 134    .desc = {
 135        {
 136            .name = NVME_BLOCK_OPT_DEVICE,
 137            .type = QEMU_OPT_STRING,
 138            .help = "NVMe PCI device address",
 139        },
 140        {
 141            .name = NVME_BLOCK_OPT_NAMESPACE,
 142            .type = QEMU_OPT_NUMBER,
 143            .help = "NVMe namespace",
 144        },
 145        { /* end of list */ }
 146    },
 147};
 148
 149static void nvme_init_queue(BlockDriverState *bs, NVMeQueue *q,
 150                            int nentries, int entry_bytes, Error **errp)
 151{
 152    BDRVNVMeState *s = bs->opaque;
 153    size_t bytes;
 154    int r;
 155
 156    bytes = ROUND_UP(nentries * entry_bytes, s->page_size);
 157    q->head = q->tail = 0;
 158    q->queue = qemu_try_blockalign0(bs, bytes);
 159
 160    if (!q->queue) {
 161        error_setg(errp, "Cannot allocate queue");
 162        return;
 163    }
 164    r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova);
 165    if (r) {
 166        error_setg(errp, "Cannot map queue");
 167    }
 168}
 169
 170static void nvme_free_queue_pair(BlockDriverState *bs, NVMeQueuePair *q)
 171{
 172    qemu_vfree(q->prp_list_pages);
 173    qemu_vfree(q->sq.queue);
 174    qemu_vfree(q->cq.queue);
 175    qemu_mutex_destroy(&q->lock);
 176    g_free(q);
 177}
 178
 179static void nvme_free_req_queue_cb(void *opaque)
 180{
 181    NVMeQueuePair *q = opaque;
 182
 183    qemu_mutex_lock(&q->lock);
 184    while (qemu_co_enter_next(&q->free_req_queue, &q->lock)) {
 185        /* Retry all pending requests */
 186    }
 187    qemu_mutex_unlock(&q->lock);
 188}
 189
 190static NVMeQueuePair *nvme_create_queue_pair(BlockDriverState *bs,
 191                                             int idx, int size,
 192                                             Error **errp)
 193{
 194    int i, r;
 195    BDRVNVMeState *s = bs->opaque;
 196    Error *local_err = NULL;
 197    NVMeQueuePair *q = g_new0(NVMeQueuePair, 1);
 198    uint64_t prp_list_iova;
 199
 200    qemu_mutex_init(&q->lock);
 201    q->index = idx;
 202    qemu_co_queue_init(&q->free_req_queue);
 203    q->prp_list_pages = qemu_blockalign0(bs, s->page_size * NVME_QUEUE_SIZE);
 204    r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages,
 205                          s->page_size * NVME_QUEUE_SIZE,
 206                          false, &prp_list_iova);
 207    if (r) {
 208        goto fail;
 209    }
 210    for (i = 0; i < NVME_QUEUE_SIZE; i++) {
 211        NVMeRequest *req = &q->reqs[i];
 212        req->cid = i + 1;
 213        req->prp_list_page = q->prp_list_pages + i * s->page_size;
 214        req->prp_list_iova = prp_list_iova + i * s->page_size;
 215    }
 216    nvme_init_queue(bs, &q->sq, size, NVME_SQ_ENTRY_BYTES, &local_err);
 217    if (local_err) {
 218        error_propagate(errp, local_err);
 219        goto fail;
 220    }
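         /* Doorbells are laid out as SQ-tail/CQ-head pairs: queue @idx uses
          * doorbells[2 * idx * scale] and doorbells[(2 * idx + 1) * scale],
          * where scale is (4 << CAP.DSTRD) / sizeof(uint32_t), i.e.
          * s->doorbell_scale. */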
 221    q->sq.doorbell = &s->regs->doorbells[idx * 2 * s->doorbell_scale];
 222
 223    nvme_init_queue(bs, &q->cq, size, NVME_CQ_ENTRY_BYTES, &local_err);
 224    if (local_err) {
 225        error_propagate(errp, local_err);
 226        goto fail;
 227    }
 228    q->cq.doorbell = &s->regs->doorbells[(idx * 2 + 1) * s->doorbell_scale];
 229
 230    return q;
 231fail:
 232    nvme_free_queue_pair(bs, q);
 233    return NULL;
 234}
 235
 236/* With q->lock */
 237static void nvme_kick(BDRVNVMeState *s, NVMeQueuePair *q)
 238{
 239    if (s->plugged || !q->need_kick) {
 240        return;
 241    }
 242    trace_nvme_kick(s, q->index);
 243    assert(!(q->sq.tail & 0xFF00));
 244    /* Fence the write to submission queue entry before notifying the device. */
 245    smp_wmb();
 246    *q->sq.doorbell = cpu_to_le32(q->sq.tail);
 247    q->inflight += q->need_kick;
 248    q->need_kick = 0;
 249}
 250
  251/* Find a free request element if any; otherwise:
  252 * a) if in coroutine context, wait for one to become available;
  253 * b) if not in coroutine context, return NULL.
  254 */
 255static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
 256{
 257    int i;
 258    NVMeRequest *req = NULL;
 259
 260    qemu_mutex_lock(&q->lock);
 261    while (q->inflight + q->need_kick > NVME_QUEUE_SIZE - 2) {
 262        /* We have to leave one slot empty as that is the full queue case (head
 263         * == tail + 1). */
 264        if (qemu_in_coroutine()) {
 265            trace_nvme_free_req_queue_wait(q);
 266            qemu_co_queue_wait(&q->free_req_queue, &q->lock);
 267        } else {
 268            qemu_mutex_unlock(&q->lock);
 269            return NULL;
 270        }
 271    }
 272    for (i = 0; i < NVME_QUEUE_SIZE; i++) {
 273        if (!q->reqs[i].busy) {
 274            q->reqs[i].busy = true;
 275            req = &q->reqs[i];
 276            break;
 277        }
 278    }
 279    /* We have checked inflight and need_kick while holding q->lock, so one
 280     * free req must be available. */
 281    assert(req);
 282    qemu_mutex_unlock(&q->lock);
 283    return req;
 284}
 285
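     /* Bit 0 of the completion status word is the phase tag; bits 1..8 hold
      * the Status Code (SC).  SC 0x01 is "invalid command opcode" and 0x02
      * "invalid field in command", hence the -ENOSYS/-EINVAL mapping below. */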
 286static inline int nvme_translate_error(const NvmeCqe *c)
 287{
 288    uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
 289    if (status) {
 290        trace_nvme_error(le32_to_cpu(c->result),
 291                         le16_to_cpu(c->sq_head),
 292                         le16_to_cpu(c->sq_id),
 293                         le16_to_cpu(c->cid),
 294                         le16_to_cpu(status));
 295    }
 296    switch (status) {
 297    case 0:
 298        return 0;
 299    case 1:
 300        return -ENOSYS;
 301    case 2:
 302        return -EINVAL;
 303    default:
 304        return -EIO;
 305    }
 306}
 307
 308/* With q->lock */
 309static bool nvme_process_completion(BDRVNVMeState *s, NVMeQueuePair *q)
 310{
 311    bool progress = false;
 312    NVMeRequest *preq;
 313    NVMeRequest req;
 314    NvmeCqe *c;
 315
 316    trace_nvme_process_completion(s, q->index, q->inflight);
 317    if (q->busy || s->plugged) {
 318        trace_nvme_process_completion_queue_busy(s, q->index);
 319        return false;
 320    }
 321    q->busy = true;
 322    assert(q->inflight >= 0);
 323    while (q->inflight) {
  324        uint16_t cid;
 325        c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES];
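             /* A completion entry is new only if its phase bit differs from
              * q->cq_phase: the controller inverts the bit on every pass over
              * the ring, and q->cq_phase is toggled below when the head wraps. */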
 326        if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) {
 327            break;
 328        }
 329        q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
 330        if (!q->cq.head) {
 331            q->cq_phase = !q->cq_phase;
 332        }
 333        cid = le16_to_cpu(c->cid);
 334        if (cid == 0 || cid > NVME_QUEUE_SIZE) {
  335            fprintf(stderr, "Unexpected CID in completion queue: %" PRIu16 "\n",
  336                    cid);
 337            continue;
 338        }
 339        assert(cid <= NVME_QUEUE_SIZE);
 340        trace_nvme_complete_command(s, q->index, cid);
 341        preq = &q->reqs[cid - 1];
 342        req = *preq;
 343        assert(req.cid == cid);
 344        assert(req.cb);
 345        preq->busy = false;
 346        preq->cb = preq->opaque = NULL;
 347        qemu_mutex_unlock(&q->lock);
 348        req.cb(req.opaque, nvme_translate_error(c));
 349        qemu_mutex_lock(&q->lock);
 350        q->inflight--;
 351        progress = true;
 352    }
 353    if (progress) {
 354        /* Notify the device so it can post more completions. */
 355        smp_mb_release();
 356        *q->cq.doorbell = cpu_to_le32(q->cq.head);
 357        if (!qemu_co_queue_empty(&q->free_req_queue)) {
 358            replay_bh_schedule_oneshot_event(s->aio_context,
 359                                             nvme_free_req_queue_cb, q);
 360        }
 361    }
 362    q->busy = false;
 363    return progress;
 364}
 365
 366static void nvme_trace_command(const NvmeCmd *cmd)
 367{
 368    int i;
 369
 370    for (i = 0; i < 8; ++i) {
 371        uint8_t *cmdp = (uint8_t *)cmd + i * 8;
 372        trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3],
 373                                      cmdp[4], cmdp[5], cmdp[6], cmdp[7]);
 374    }
 375}
 376
 377static void nvme_submit_command(BDRVNVMeState *s, NVMeQueuePair *q,
 378                                NVMeRequest *req,
 379                                NvmeCmd *cmd, BlockCompletionFunc cb,
 380                                void *opaque)
 381{
 382    assert(!req->cb);
 383    req->cb = cb;
 384    req->opaque = opaque;
  385    cmd->cid = cpu_to_le16(req->cid);
 386
 387    trace_nvme_submit_command(s, q->index, req->cid);
 388    nvme_trace_command(cmd);
 389    qemu_mutex_lock(&q->lock);
 390    memcpy((uint8_t *)q->sq.queue +
 391           q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
 392    q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
 393    q->need_kick++;
 394    nvme_kick(s, q);
 395    nvme_process_completion(s, q);
 396    qemu_mutex_unlock(&q->lock);
 397}
 398
 399static void nvme_cmd_sync_cb(void *opaque, int ret)
 400{
 401    int *pret = opaque;
 402    *pret = ret;
 403    aio_wait_kick();
 404}
 405
 406static int nvme_cmd_sync(BlockDriverState *bs, NVMeQueuePair *q,
 407                         NvmeCmd *cmd)
 408{
 409    NVMeRequest *req;
 410    BDRVNVMeState *s = bs->opaque;
 411    int ret = -EINPROGRESS;
 412    req = nvme_get_free_req(q);
 413    if (!req) {
 414        return -EBUSY;
 415    }
 416    nvme_submit_command(s, q, req, cmd, nvme_cmd_sync_cb, &ret);
 417
 418    BDRV_POLL_WHILE(bs, ret == -EINPROGRESS);
 419    return ret;
 420}
 421
 422static void nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
 423{
 424    BDRVNVMeState *s = bs->opaque;
 425    NvmeIdCtrl *idctrl;
 426    NvmeIdNs *idns;
 427    NvmeLBAF *lbaf;
 428    uint8_t *resp;
 429    uint16_t oncs;
 430    int r;
 431    uint64_t iova;
 432    NvmeCmd cmd = {
 433        .opcode = NVME_ADM_CMD_IDENTIFY,
 434        .cdw10 = cpu_to_le32(0x1),
 435    };
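         /* CDW10.CNS = 0x1 selects "identify controller"; the same command is
          * reissued below with CNS = 0 and a namespace ID to identify the
          * namespace. */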
 436
 437    resp = qemu_try_blockalign0(bs, sizeof(NvmeIdCtrl));
 438    if (!resp) {
 439        error_setg(errp, "Cannot allocate buffer for identify response");
 440        goto out;
 441    }
 442    idctrl = (NvmeIdCtrl *)resp;
 443    idns = (NvmeIdNs *)resp;
 444    r = qemu_vfio_dma_map(s->vfio, resp, sizeof(NvmeIdCtrl), true, &iova);
 445    if (r) {
 446        error_setg(errp, "Cannot map buffer for DMA");
 447        goto out;
 448    }
 449    cmd.prp1 = cpu_to_le64(iova);
 450
 451    if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
 452        error_setg(errp, "Failed to identify controller");
 453        goto out;
 454    }
 455
 456    if (le32_to_cpu(idctrl->nn) < namespace) {
 457        error_setg(errp, "Invalid namespace");
 458        goto out;
 459    }
 460    s->write_cache_supported = le32_to_cpu(idctrl->vwc) & 0x1;
 461    s->max_transfer = (idctrl->mdts ? 1 << idctrl->mdts : 0) * s->page_size;
 462    /* For now the page list buffer per command is one page, to hold at most
 463     * s->page_size / sizeof(uint64_t) entries. */
 464    s->max_transfer = MIN_NON_ZERO(s->max_transfer,
 465                          s->page_size / sizeof(uint64_t) * s->page_size);
 466
 467    oncs = le16_to_cpu(idctrl->oncs);
 468    s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROS);
 469    s->supports_discard = !!(oncs & NVME_ONCS_DSM);
 470
 471    memset(resp, 0, 4096);
 472
 473    cmd.cdw10 = 0;
 474    cmd.nsid = cpu_to_le32(namespace);
 475    if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
 476        error_setg(errp, "Failed to identify namespace");
 477        goto out;
 478    }
 479
 480    s->nsze = le64_to_cpu(idns->nsze);
 481    lbaf = &idns->lbaf[NVME_ID_NS_FLBAS_INDEX(idns->flbas)];
 482
 483    if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(idns->dlfeat) &&
 484            NVME_ID_NS_DLFEAT_READ_BEHAVIOR(idns->dlfeat) ==
 485                    NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES) {
 486        bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP;
 487    }
 488
 489    if (lbaf->ms) {
 490        error_setg(errp, "Namespaces with metadata are not yet supported");
 491        goto out;
 492    }
 493
 494    if (lbaf->ds < BDRV_SECTOR_BITS || lbaf->ds > 12 ||
 495        (1 << lbaf->ds) > s->page_size)
 496    {
 497        error_setg(errp, "Namespace has unsupported block size (2^%d)",
 498                   lbaf->ds);
 499        goto out;
 500    }
 501
 502    s->blkshift = lbaf->ds;
 503out:
 504    qemu_vfio_dma_unmap(s->vfio, resp);
 505    qemu_vfree(resp);
 506}
 507
 508static bool nvme_poll_queues(BDRVNVMeState *s)
 509{
 510    bool progress = false;
 511    int i;
 512
 513    for (i = 0; i < s->nr_queues; i++) {
 514        NVMeQueuePair *q = s->queues[i];
 515        qemu_mutex_lock(&q->lock);
 516        while (nvme_process_completion(s, q)) {
 517            /* Keep polling */
 518            progress = true;
 519        }
 520        qemu_mutex_unlock(&q->lock);
 521    }
 522    return progress;
 523}
 524
 525static void nvme_handle_event(EventNotifier *n)
 526{
 527    BDRVNVMeState *s = container_of(n, BDRVNVMeState, irq_notifier);
 528
 529    trace_nvme_handle_event(s);
 530    event_notifier_test_and_clear(n);
 531    nvme_poll_queues(s);
 532}
 533
 534static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
 535{
 536    BDRVNVMeState *s = bs->opaque;
 537    int n = s->nr_queues;
 538    NVMeQueuePair *q;
 539    NvmeCmd cmd;
 540    int queue_size = NVME_QUEUE_SIZE;
 541
 542    q = nvme_create_queue_pair(bs, n, queue_size, errp);
 543    if (!q) {
 544        return false;
 545    }
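         /* Create I/O Completion Queue: CDW10 carries the 0-based queue size
          * (bits 16..31) and the queue ID (bits 0..15); CDW11 = 0x3 asks for a
          * physically contiguous queue with interrupts enabled on vector 0. */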
 546    cmd = (NvmeCmd) {
 547        .opcode = NVME_ADM_CMD_CREATE_CQ,
 548        .prp1 = cpu_to_le64(q->cq.iova),
 549        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
 550        .cdw11 = cpu_to_le32(0x3),
 551    };
 552    if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
 553        error_setg(errp, "Failed to create io queue [%d]", n);
 554        nvme_free_queue_pair(bs, q);
 555        return false;
 556    }
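         /* Create I/O Submission Queue: CDW11 bit 0 marks it physically
          * contiguous and bits 16..31 give the ID of the completion queue it
          * pairs with. */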
 557    cmd = (NvmeCmd) {
 558        .opcode = NVME_ADM_CMD_CREATE_SQ,
 559        .prp1 = cpu_to_le64(q->sq.iova),
 560        .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
 561        .cdw11 = cpu_to_le32(0x1 | (n << 16)),
 562    };
 563    if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
 564        error_setg(errp, "Failed to create io queue [%d]", n);
 565        nvme_free_queue_pair(bs, q);
 566        return false;
 567    }
 568    s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
 569    s->queues[n] = q;
 570    s->nr_queues++;
 571    return true;
 572}
 573
 574static bool nvme_poll_cb(void *opaque)
 575{
 576    EventNotifier *e = opaque;
 577    BDRVNVMeState *s = container_of(e, BDRVNVMeState, irq_notifier);
 578    bool progress = false;
 579
 580    trace_nvme_poll_cb(s);
 581    progress = nvme_poll_queues(s);
 582    return progress;
 583}
 584
 585static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
 586                     Error **errp)
 587{
 588    BDRVNVMeState *s = bs->opaque;
 589    int ret;
 590    uint64_t cap;
 591    uint64_t timeout_ms;
 592    uint64_t deadline, now;
 593    Error *local_err = NULL;
 594
 595    qemu_co_mutex_init(&s->dma_map_lock);
 596    qemu_co_queue_init(&s->dma_flush_queue);
 597    s->device = g_strdup(device);
 598    s->nsid = namespace;
 599    s->aio_context = bdrv_get_aio_context(bs);
 600    ret = event_notifier_init(&s->irq_notifier, 0);
 601    if (ret) {
 602        error_setg(errp, "Failed to init event notifier");
 603        return ret;
 604    }
 605
 606    s->vfio = qemu_vfio_open_pci(device, errp);
 607    if (!s->vfio) {
 608        ret = -EINVAL;
 609        goto out;
 610    }
 611
 612    s->regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, NVME_BAR_SIZE, errp);
 613    if (!s->regs) {
 614        ret = -EINVAL;
 615        goto out;
 616    }
 617
  618    /* Perform the initialization sequence described in NVMe spec section
  619     * 7.6.1 "Initialization". */
 620
 621    cap = le64_to_cpu(s->regs->cap);
 622    if (!(cap & (1ULL << 37))) {
 623        error_setg(errp, "Device doesn't support NVMe command set");
 624        ret = -EINVAL;
 625        goto out;
 626    }
 627
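         /* Decode CAP: MPSMIN (bits 48..51) gives the minimum page size as
          * 2^(12 + MPSMIN), DSTRD (bits 32..35) the doorbell stride as
          * 4 << DSTRD bytes, and TO (bits 24..31) the worst-case enable/disable
          * timeout in units of 500 ms. */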
 628    s->page_size = MAX(4096, 1 << (12 + ((cap >> 48) & 0xF)));
 629    s->doorbell_scale = (4 << (((cap >> 32) & 0xF))) / sizeof(uint32_t);
 630    bs->bl.opt_mem_alignment = s->page_size;
 631    timeout_ms = MIN(500 * ((cap >> 24) & 0xFF), 30000);
 632
 633    /* Reset device to get a clean state. */
 634    s->regs->cc = cpu_to_le32(le32_to_cpu(s->regs->cc) & 0xFE);
 635    /* Wait for CSTS.RDY = 0. */
 636    deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * 1000000ULL;
 637    while (le32_to_cpu(s->regs->csts) & 0x1) {
 638        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
 639            error_setg(errp, "Timeout while waiting for device to reset (%"
 640                             PRId64 " ms)",
 641                       timeout_ms);
 642            ret = -ETIMEDOUT;
 643            goto out;
 644        }
 645    }
 646
 647    /* Set up admin queue. */
 648    s->queues = g_new(NVMeQueuePair *, 1);
 649    s->queues[0] = nvme_create_queue_pair(bs, 0, NVME_QUEUE_SIZE, errp);
 650    if (!s->queues[0]) {
 651        ret = -EINVAL;
 652        goto out;
 653    }
 654    s->nr_queues = 1;
 655    QEMU_BUILD_BUG_ON(NVME_QUEUE_SIZE & 0xF000);
  656    s->regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << 16) | (NVME_QUEUE_SIZE - 1));
 657    s->regs->asq = cpu_to_le64(s->queues[0]->sq.iova);
 658    s->regs->acq = cpu_to_le64(s->queues[0]->cq.iova);
 659
 660    /* After setting up all control registers we can enable device now. */
 661    s->regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << 20) |
 662                              (ctz32(NVME_SQ_ENTRY_BYTES) << 16) |
 663                              0x1);
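         /* CC.IOCQES/CC.IOSQES (bits 20..23 / 16..19) are the log2 of the
          * CQ/SQ entry sizes; CC.MPS stays 0 (4 KiB pages) and CC.EN (bit 0)
          * starts the controller. */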
 664    /* Wait for CSTS.RDY = 1. */
 665    now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 666    deadline = now + timeout_ms * 1000000;
 667    while (!(le32_to_cpu(s->regs->csts) & 0x1)) {
 668        if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
 669            error_setg(errp, "Timeout while waiting for device to start (%"
 670                             PRId64 " ms)",
 671                       timeout_ms);
 672            ret = -ETIMEDOUT;
 673            goto out;
 674        }
 675    }
 676
 677    ret = qemu_vfio_pci_init_irq(s->vfio, &s->irq_notifier,
 678                                 VFIO_PCI_MSIX_IRQ_INDEX, errp);
 679    if (ret) {
 680        goto out;
 681    }
 682    aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
 683                           false, nvme_handle_event, nvme_poll_cb);
 684
 685    nvme_identify(bs, namespace, &local_err);
 686    if (local_err) {
 687        error_propagate(errp, local_err);
 688        ret = -EIO;
 689        goto out;
 690    }
 691
 692    /* Set up command queues. */
 693    if (!nvme_add_io_queue(bs, errp)) {
 694        ret = -EIO;
 695    }
 696out:
 697    /* Cleaning up is done in nvme_file_open() upon error. */
 698    return ret;
 699}
 700
 701/* Parse a filename in the format of nvme://XXXX:XX:XX.X/X. Example:
 702 *
 703 *     nvme://0000:44:00.0/1
 704 *
 705 * where the "nvme://" is a fixed form of the protocol prefix, the middle part
 706 * is the PCI address, and the last part is the namespace number starting from
 707 * 1 according to the NVMe spec. */
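     /* For example, assuming the controller has been bound to vfio-pci, a
      * typical invocation would look something like:
      *
      *     -drive file=nvme://0000:44:00.0/1,if=none,id=nvme0,format=raw
      */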
 708static void nvme_parse_filename(const char *filename, QDict *options,
 709                                Error **errp)
 710{
 711    int pref = strlen("nvme://");
 712
 713    if (strlen(filename) > pref && !strncmp(filename, "nvme://", pref)) {
 714        const char *tmp = filename + pref;
 715        char *device;
 716        const char *namespace;
 717        unsigned long ns;
 718        const char *slash = strchr(tmp, '/');
 719        if (!slash) {
 720            qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, tmp);
 721            return;
 722        }
 723        device = g_strndup(tmp, slash - tmp);
 724        qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, device);
 725        g_free(device);
 726        namespace = slash + 1;
 727        if (*namespace && qemu_strtoul(namespace, NULL, 10, &ns)) {
 728            error_setg(errp, "Invalid namespace '%s', positive number expected",
 729                       namespace);
 730            return;
 731        }
 732        qdict_put_str(options, NVME_BLOCK_OPT_NAMESPACE,
 733                      *namespace ? namespace : "1");
 734    }
 735}
 736
 737static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
 738                                           Error **errp)
 739{
 740    int ret;
 741    BDRVNVMeState *s = bs->opaque;
 742    NvmeCmd cmd = {
 743        .opcode = NVME_ADM_CMD_SET_FEATURES,
 744        .nsid = cpu_to_le32(s->nsid),
 745        .cdw10 = cpu_to_le32(0x06),
 746        .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
 747    };
 748
 749    ret = nvme_cmd_sync(bs, s->queues[0], &cmd);
 750    if (ret) {
 751        error_setg(errp, "Failed to configure NVMe write cache");
 752    }
 753    return ret;
 754}
 755
 756static void nvme_close(BlockDriverState *bs)
 757{
 758    int i;
 759    BDRVNVMeState *s = bs->opaque;
 760
 761    for (i = 0; i < s->nr_queues; ++i) {
 762        nvme_free_queue_pair(bs, s->queues[i]);
 763    }
 764    g_free(s->queues);
 765    aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
 766                           false, NULL, NULL);
 767    event_notifier_cleanup(&s->irq_notifier);
 768    qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)s->regs, 0, NVME_BAR_SIZE);
 769    qemu_vfio_close(s->vfio);
 770
 771    g_free(s->device);
 772}
 773
 774static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
 775                          Error **errp)
 776{
 777    const char *device;
 778    QemuOpts *opts;
 779    int namespace;
 780    int ret;
 781    BDRVNVMeState *s = bs->opaque;
 782
 783    bs->supported_write_flags = BDRV_REQ_FUA;
 784
 785    opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
 786    qemu_opts_absorb_qdict(opts, options, &error_abort);
 787    device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
 788    if (!device) {
 789        error_setg(errp, "'" NVME_BLOCK_OPT_DEVICE "' option is required");
 790        qemu_opts_del(opts);
 791        return -EINVAL;
 792    }
 793
 794    namespace = qemu_opt_get_number(opts, NVME_BLOCK_OPT_NAMESPACE, 1);
 795    ret = nvme_init(bs, device, namespace, errp);
 796    qemu_opts_del(opts);
 797    if (ret) {
 798        goto fail;
 799    }
 800    if (flags & BDRV_O_NOCACHE) {
 801        if (!s->write_cache_supported) {
 802            error_setg(errp,
 803                       "NVMe controller doesn't support write cache configuration");
 804            ret = -EINVAL;
 805        } else {
 806            ret = nvme_enable_disable_write_cache(bs, !(flags & BDRV_O_NOCACHE),
 807                                                  errp);
 808        }
 809        if (ret) {
 810            goto fail;
 811        }
 812    }
 813    return 0;
 814fail:
 815    nvme_close(bs);
 816    return ret;
 817}
 818
 819static int64_t nvme_getlength(BlockDriverState *bs)
 820{
 821    BDRVNVMeState *s = bs->opaque;
 822    return s->nsze << s->blkshift;
 823}
 824
 825static uint32_t nvme_get_blocksize(BlockDriverState *bs)
 826{
 827    BDRVNVMeState *s = bs->opaque;
 828    assert(s->blkshift >= BDRV_SECTOR_BITS && s->blkshift <= 12);
 829    return UINT32_C(1) << s->blkshift;
 830}
 831
 832static int nvme_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
 833{
 834    uint32_t blocksize = nvme_get_blocksize(bs);
 835    bsz->phys = blocksize;
 836    bsz->log = blocksize;
 837    return 0;
 838}
 839
 840/* Called with s->dma_map_lock */
 841static coroutine_fn int nvme_cmd_unmap_qiov(BlockDriverState *bs,
 842                                            QEMUIOVector *qiov)
 843{
 844    int r = 0;
 845    BDRVNVMeState *s = bs->opaque;
 846
 847    s->dma_map_count -= qiov->size;
 848    if (!s->dma_map_count && !qemu_co_queue_empty(&s->dma_flush_queue)) {
 849        r = qemu_vfio_dma_reset_temporary(s->vfio);
 850        if (!r) {
 851            qemu_co_queue_restart_all(&s->dma_flush_queue);
 852        }
 853    }
 854    return r;
 855}
 856
 857/* Called with s->dma_map_lock */
 858static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
 859                                          NVMeRequest *req, QEMUIOVector *qiov)
 860{
 861    BDRVNVMeState *s = bs->opaque;
 862    uint64_t *pagelist = req->prp_list_page;
 863    int i, j, r;
 864    int entries = 0;
 865
 866    assert(qiov->size);
 867    assert(QEMU_IS_ALIGNED(qiov->size, s->page_size));
 868    assert(qiov->size / s->page_size <= s->page_size / sizeof(uint64_t));
 869    for (i = 0; i < qiov->niov; ++i) {
 870        bool retry = true;
 871        uint64_t iova;
 872try_map:
 873        r = qemu_vfio_dma_map(s->vfio,
 874                              qiov->iov[i].iov_base,
 875                              qiov->iov[i].iov_len,
 876                              true, &iova);
 877        if (r == -ENOMEM && retry) {
 878            retry = false;
 879            trace_nvme_dma_flush_queue_wait(s);
 880            if (s->dma_map_count) {
 881                trace_nvme_dma_map_flush(s);
 882                qemu_co_queue_wait(&s->dma_flush_queue, &s->dma_map_lock);
 883            } else {
 884                r = qemu_vfio_dma_reset_temporary(s->vfio);
 885                if (r) {
 886                    goto fail;
 887                }
 888            }
 889            goto try_map;
 890        }
 891        if (r) {
 892            goto fail;
 893        }
 894
 895        for (j = 0; j < qiov->iov[i].iov_len / s->page_size; j++) {
 896            pagelist[entries++] = cpu_to_le64(iova + j * s->page_size);
 897        }
 898        trace_nvme_cmd_map_qiov_iov(s, i, qiov->iov[i].iov_base,
 899                                    qiov->iov[i].iov_len / s->page_size);
 900    }
 901
 902    s->dma_map_count += qiov->size;
 903
 904    assert(entries <= s->page_size / sizeof(uint64_t));
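         /* NVMe PRP rules: PRP1 always points at the first data page.  With
          * exactly two pages, PRP2 is simply the second page address; with
          * more, PRP2 points at a PRP list.  pagelist[0] doubles as PRP1, so
          * the list the controller walks starts one entry into the page, hence
          * the "+ sizeof(uint64_t)" below. */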
 905    switch (entries) {
 906    case 0:
 907        abort();
 908    case 1:
 909        cmd->prp1 = pagelist[0];
 910        cmd->prp2 = 0;
 911        break;
 912    case 2:
 913        cmd->prp1 = pagelist[0];
 914        cmd->prp2 = pagelist[1];
 915        break;
 916    default:
 917        cmd->prp1 = pagelist[0];
 918        cmd->prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
 919        break;
 920    }
 921    trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
 922    for (i = 0; i < entries; ++i) {
 923        trace_nvme_cmd_map_qiov_pages(s, i, pagelist[i]);
 924    }
 925    return 0;
 926fail:
 927    /* No need to unmap [0 - i) iovs even if we've failed, since we don't
 928     * increment s->dma_map_count. This is okay for fixed mapping memory areas
 929     * because they are already mapped before calling this function; for
 930     * temporary mappings, a later nvme_cmd_(un)map_qiov will reclaim by
 931     * calling qemu_vfio_dma_reset_temporary when necessary. */
 932    return r;
 933}
 934
 935typedef struct {
 936    Coroutine *co;
 937    int ret;
 938    AioContext *ctx;
 939} NVMeCoData;
 940
 941static void nvme_rw_cb_bh(void *opaque)
 942{
 943    NVMeCoData *data = opaque;
 944    qemu_coroutine_enter(data->co);
 945}
 946
 947static void nvme_rw_cb(void *opaque, int ret)
 948{
 949    NVMeCoData *data = opaque;
 950    data->ret = ret;
 951    if (!data->co) {
 952        /* The rw coroutine hasn't yielded, don't try to enter. */
 953        return;
 954    }
 955    replay_bh_schedule_oneshot_event(data->ctx, nvme_rw_cb_bh, data);
 956}
 957
 958static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
 959                                            uint64_t offset, uint64_t bytes,
 960                                            QEMUIOVector *qiov,
 961                                            bool is_write,
 962                                            int flags)
 963{
 964    int r;
 965    BDRVNVMeState *s = bs->opaque;
 966    NVMeQueuePair *ioq = s->queues[1];
 967    NVMeRequest *req;
 968
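         /* CDW10/CDW11 hold the starting LBA (low/high 32 bits); CDW12 bits
          * 0..15 hold the 0-based block count and bit 30 the FUA flag. */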
 969    uint32_t cdw12 = (((bytes >> s->blkshift) - 1) & 0xFFFF) |
 970                       (flags & BDRV_REQ_FUA ? 1 << 30 : 0);
 971    NvmeCmd cmd = {
 972        .opcode = is_write ? NVME_CMD_WRITE : NVME_CMD_READ,
 973        .nsid = cpu_to_le32(s->nsid),
 974        .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
 975        .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
 976        .cdw12 = cpu_to_le32(cdw12),
 977    };
 978    NVMeCoData data = {
 979        .ctx = bdrv_get_aio_context(bs),
 980        .ret = -EINPROGRESS,
 981    };
 982
 983    trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov);
 984    assert(s->nr_queues > 1);
 985    req = nvme_get_free_req(ioq);
 986    assert(req);
 987
 988    qemu_co_mutex_lock(&s->dma_map_lock);
 989    r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
 990    qemu_co_mutex_unlock(&s->dma_map_lock);
 991    if (r) {
 992        req->busy = false;
 993        return r;
 994    }
 995    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
 996
 997    data.co = qemu_coroutine_self();
 998    while (data.ret == -EINPROGRESS) {
 999        qemu_coroutine_yield();
1000    }
1001
1002    qemu_co_mutex_lock(&s->dma_map_lock);
1003    r = nvme_cmd_unmap_qiov(bs, qiov);
1004    qemu_co_mutex_unlock(&s->dma_map_lock);
1005    if (r) {
1006        return r;
1007    }
1008
1009    trace_nvme_rw_done(s, is_write, offset, bytes, data.ret);
1010    return data.ret;
1011}
1012
1013static inline bool nvme_qiov_aligned(BlockDriverState *bs,
1014                                     const QEMUIOVector *qiov)
1015{
1016    int i;
1017    BDRVNVMeState *s = bs->opaque;
1018
1019    for (i = 0; i < qiov->niov; ++i) {
1020        if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base, s->page_size) ||
1021            !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, s->page_size)) {
1022            trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
1023                                      qiov->iov[i].iov_len, s->page_size);
1024            return false;
1025        }
1026    }
1027    return true;
1028}
1029
1030static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
1031                       QEMUIOVector *qiov, bool is_write, int flags)
1032{
1033    BDRVNVMeState *s = bs->opaque;
1034    int r;
1035    uint8_t *buf = NULL;
1036    QEMUIOVector local_qiov;
1037
1038    assert(QEMU_IS_ALIGNED(offset, s->page_size));
1039    assert(QEMU_IS_ALIGNED(bytes, s->page_size));
1040    assert(bytes <= s->max_transfer);
1041    if (nvme_qiov_aligned(bs, qiov)) {
1042        return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags);
1043    }
1044    trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
1045    buf = qemu_try_blockalign(bs, bytes);
1046
1047    if (!buf) {
1048        return -ENOMEM;
1049    }
1050    qemu_iovec_init(&local_qiov, 1);
1051    if (is_write) {
1052        qemu_iovec_to_buf(qiov, 0, buf, bytes);
1053    }
1054    qemu_iovec_add(&local_qiov, buf, bytes);
1055    r = nvme_co_prw_aligned(bs, offset, bytes, &local_qiov, is_write, flags);
1056    qemu_iovec_destroy(&local_qiov);
1057    if (!r && !is_write) {
1058        qemu_iovec_from_buf(qiov, 0, buf, bytes);
1059    }
1060    qemu_vfree(buf);
1061    return r;
1062}
1063
1064static coroutine_fn int nvme_co_preadv(BlockDriverState *bs,
1065                                       uint64_t offset, uint64_t bytes,
1066                                       QEMUIOVector *qiov, int flags)
1067{
1068    return nvme_co_prw(bs, offset, bytes, qiov, false, flags);
1069}
1070
1071static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs,
1072                                        uint64_t offset, uint64_t bytes,
1073                                        QEMUIOVector *qiov, int flags)
1074{
1075    return nvme_co_prw(bs, offset, bytes, qiov, true, flags);
1076}
1077
1078static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
1079{
1080    BDRVNVMeState *s = bs->opaque;
1081    NVMeQueuePair *ioq = s->queues[1];
1082    NVMeRequest *req;
1083    NvmeCmd cmd = {
1084        .opcode = NVME_CMD_FLUSH,
1085        .nsid = cpu_to_le32(s->nsid),
1086    };
1087    NVMeCoData data = {
1088        .ctx = bdrv_get_aio_context(bs),
1089        .ret = -EINPROGRESS,
1090    };
1091
1092    assert(s->nr_queues > 1);
1093    req = nvme_get_free_req(ioq);
1094    assert(req);
1095    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
1096
1097    data.co = qemu_coroutine_self();
1098    if (data.ret == -EINPROGRESS) {
1099        qemu_coroutine_yield();
1100    }
1101
1102    return data.ret;
1103}
1104
1105
1106static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
1107                                              int64_t offset,
1108                                              int bytes,
1109                                              BdrvRequestFlags flags)
1110{
1111    BDRVNVMeState *s = bs->opaque;
1112    NVMeQueuePair *ioq = s->queues[1];
1113    NVMeRequest *req;
1114
1115    uint32_t cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF;
1116
1117    if (!s->supports_write_zeroes) {
1118        return -ENOTSUP;
1119    }
1120
1121    NvmeCmd cmd = {
1122        .opcode = NVME_CMD_WRITE_ZEROS,
1123        .nsid = cpu_to_le32(s->nsid),
1124        .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
1125        .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
1126    };
1127
1128    NVMeCoData data = {
1129        .ctx = bdrv_get_aio_context(bs),
1130        .ret = -EINPROGRESS,
1131    };
1132
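         /* Write Zeroes CDW12: bit 25 is the deallocate hint and bit 30 FUA,
          * mirroring BDRV_REQ_MAY_UNMAP and BDRV_REQ_FUA. */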
1133    if (flags & BDRV_REQ_MAY_UNMAP) {
1134        cdw12 |= (1 << 25);
1135    }
1136
1137    if (flags & BDRV_REQ_FUA) {
1138        cdw12 |= (1 << 30);
1139    }
1140
1141    cmd.cdw12 = cpu_to_le32(cdw12);
1142
1143    trace_nvme_write_zeroes(s, offset, bytes, flags);
1144    assert(s->nr_queues > 1);
1145    req = nvme_get_free_req(ioq);
1146    assert(req);
1147
1148    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
1149
1150    data.co = qemu_coroutine_self();
1151    while (data.ret == -EINPROGRESS) {
1152        qemu_coroutine_yield();
1153    }
1154
1155    trace_nvme_rw_done(s, true, offset, bytes, data.ret);
1156    return data.ret;
1157}
1158
1159
1160static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
1161                                         int64_t offset,
1162                                         int bytes)
1163{
1164    BDRVNVMeState *s = bs->opaque;
1165    NVMeQueuePair *ioq = s->queues[1];
1166    NVMeRequest *req;
1167    NvmeDsmRange *buf;
1168    QEMUIOVector local_qiov;
1169    int ret;
1170
1171    NvmeCmd cmd = {
1172        .opcode = NVME_CMD_DSM,
1173        .nsid = cpu_to_le32(s->nsid),
 1174        .cdw10 = cpu_to_le32(0), /* number of ranges - 0 based */
 1175        .cdw11 = cpu_to_le32(1 << 2), /* deallocate bit */
1176    };
1177
1178    NVMeCoData data = {
1179        .ctx = bdrv_get_aio_context(bs),
1180        .ret = -EINPROGRESS,
1181    };
1182
1183    if (!s->supports_discard) {
1184        return -ENOTSUP;
1185    }
1186
1187    assert(s->nr_queues > 1);
1188
1189    buf = qemu_try_blockalign0(bs, s->page_size);
1190    if (!buf) {
1191        return -ENOMEM;
1192    }
1193
1194    buf->nlb = cpu_to_le32(bytes >> s->blkshift);
1195    buf->slba = cpu_to_le64(offset >> s->blkshift);
1196    buf->cattr = 0;
1197
1198    qemu_iovec_init(&local_qiov, 1);
 1199    qemu_iovec_add(&local_qiov, buf, s->page_size);
1200
1201    req = nvme_get_free_req(ioq);
1202    assert(req);
1203
1204    qemu_co_mutex_lock(&s->dma_map_lock);
1205    ret = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
1206    qemu_co_mutex_unlock(&s->dma_map_lock);
1207
1208    if (ret) {
1209        req->busy = false;
1210        goto out;
1211    }
1212
1213    trace_nvme_dsm(s, offset, bytes);
1214
1215    nvme_submit_command(s, ioq, req, &cmd, nvme_rw_cb, &data);
1216
1217    data.co = qemu_coroutine_self();
1218    while (data.ret == -EINPROGRESS) {
1219        qemu_coroutine_yield();
1220    }
1221
1222    qemu_co_mutex_lock(&s->dma_map_lock);
1223    ret = nvme_cmd_unmap_qiov(bs, &local_qiov);
1224    qemu_co_mutex_unlock(&s->dma_map_lock);
1225
1226    if (ret) {
1227        goto out;
1228    }
1229
1230    ret = data.ret;
1231    trace_nvme_dsm_done(s, offset, bytes, ret);
1232out:
1233    qemu_iovec_destroy(&local_qiov);
1234    qemu_vfree(buf);
1235    return ret;
1237}
1238
1239
1240static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
1241                               BlockReopenQueue *queue, Error **errp)
1242{
1243    return 0;
1244}
1245
1246static void nvme_refresh_filename(BlockDriverState *bs)
1247{
1248    BDRVNVMeState *s = bs->opaque;
1249
1250    snprintf(bs->exact_filename, sizeof(bs->exact_filename), "nvme://%s/%i",
1251             s->device, s->nsid);
1252}
1253
1254static void nvme_refresh_limits(BlockDriverState *bs, Error **errp)
1255{
1256    BDRVNVMeState *s = bs->opaque;
1257
1258    bs->bl.opt_mem_alignment = s->page_size;
1259    bs->bl.request_alignment = s->page_size;
1260    bs->bl.max_transfer = s->max_transfer;
1261}
1262
1263static void nvme_detach_aio_context(BlockDriverState *bs)
1264{
1265    BDRVNVMeState *s = bs->opaque;
1266
1267    aio_set_event_notifier(bdrv_get_aio_context(bs), &s->irq_notifier,
1268                           false, NULL, NULL);
1269}
1270
1271static void nvme_attach_aio_context(BlockDriverState *bs,
1272                                    AioContext *new_context)
1273{
1274    BDRVNVMeState *s = bs->opaque;
1275
1276    s->aio_context = new_context;
1277    aio_set_event_notifier(new_context, &s->irq_notifier,
1278                           false, nvme_handle_event, nvme_poll_cb);
1279}
1280
1281static void nvme_aio_plug(BlockDriverState *bs)
1282{
1283    BDRVNVMeState *s = bs->opaque;
1284    assert(!s->plugged);
1285    s->plugged = true;
1286}
1287
1288static void nvme_aio_unplug(BlockDriverState *bs)
1289{
1290    int i;
1291    BDRVNVMeState *s = bs->opaque;
1292    assert(s->plugged);
1293    s->plugged = false;
1294    for (i = 1; i < s->nr_queues; i++) {
1295        NVMeQueuePair *q = s->queues[i];
1296        qemu_mutex_lock(&q->lock);
1297        nvme_kick(s, q);
1298        nvme_process_completion(s, q);
1299        qemu_mutex_unlock(&q->lock);
1300    }
1301}
1302
1303static void nvme_register_buf(BlockDriverState *bs, void *host, size_t size)
1304{
1305    int ret;
1306    BDRVNVMeState *s = bs->opaque;
1307
1308    ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL);
1309    if (ret) {
1310        /* FIXME: we may run out of IOVA addresses after repeated
 1311         * bdrv_register_buf/bdrv_unregister_buf, because qemu_vfio_dma_unmap
1312         * doesn't reclaim addresses for fixed mappings. */
1313        error_report("nvme_register_buf failed: %s", strerror(-ret));
1314    }
1315}
1316
1317static void nvme_unregister_buf(BlockDriverState *bs, void *host)
1318{
1319    BDRVNVMeState *s = bs->opaque;
1320
1321    qemu_vfio_dma_unmap(s->vfio, host);
1322}
1323
1324static const char *const nvme_strong_runtime_opts[] = {
1325    NVME_BLOCK_OPT_DEVICE,
1326    NVME_BLOCK_OPT_NAMESPACE,
1327
1328    NULL
1329};
1330
1331static BlockDriver bdrv_nvme = {
1332    .format_name              = "nvme",
1333    .protocol_name            = "nvme",
1334    .instance_size            = sizeof(BDRVNVMeState),
1335
1336    .bdrv_parse_filename      = nvme_parse_filename,
1337    .bdrv_file_open           = nvme_file_open,
1338    .bdrv_close               = nvme_close,
1339    .bdrv_getlength           = nvme_getlength,
1340    .bdrv_probe_blocksizes    = nvme_probe_blocksizes,
1341
1342    .bdrv_co_preadv           = nvme_co_preadv,
1343    .bdrv_co_pwritev          = nvme_co_pwritev,
1344
1345    .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
1346    .bdrv_co_pdiscard         = nvme_co_pdiscard,
1347
1348    .bdrv_co_flush_to_disk    = nvme_co_flush,
1349    .bdrv_reopen_prepare      = nvme_reopen_prepare,
1350
1351    .bdrv_refresh_filename    = nvme_refresh_filename,
1352    .bdrv_refresh_limits      = nvme_refresh_limits,
1353    .strong_runtime_opts      = nvme_strong_runtime_opts,
1354
1355    .bdrv_detach_aio_context  = nvme_detach_aio_context,
1356    .bdrv_attach_aio_context  = nvme_attach_aio_context,
1357
1358    .bdrv_io_plug             = nvme_aio_plug,
1359    .bdrv_io_unplug           = nvme_aio_unplug,
1360
1361    .bdrv_register_buf        = nvme_register_buf,
1362    .bdrv_unregister_buf      = nvme_unregister_buf,
1363};
1364
1365static void bdrv_nvme_init(void)
1366{
1367    bdrv_register(&bdrv_nvme);
1368}
1369
1370block_init(bdrv_nvme_init);
1371